The purpose of this notebook is to code a routine to perform feature removal on a dataset containing categorical data. This routine is not written in scikit-learn and involved encoding the features using a One Hot Encoding scheme if categorical features are contained within the data set

In [1]:
import itertools
from time import time
from copy import deepcopy

from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
from get_data import data

In [3]:
df = data()

X = df[ [col for col in df if col not in ['label', 'class']]]
y = df['class']

In [4]:
def n_elt_subset(S,n):
    """
    Return all of the n element subsets of S
    """
    return set(itertools.combinations(S, n))

In [5]:
def feature_removal(model, X, y):
    """
    Parameters
    ----------
    X : pandas DataFrame
        DataFrame containing the features
    y : pandas Series
        Series containing the class labels
    """
    total_time = 0
    
    features = X.columns
    #print 'Features: {}'.format(features)
    
    # Binarize the categorical data using a DictVectorizer
    # This requires the data be fed in the form of Python dicts
    vectorizer = DV(sparse=False)
    X_one_hot = vectorizer.fit_transform( X.to_dict(orient='records') )
    enc_feats = vectorizer.get_feature_names()
    
    # Split into test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, random_state=0)
    
    # Calculate first score (using error as criteria)
    classifier = deepcopy(model)
    t0 = time()
    classifier.fit(X_train, y_train)
    error = (1 - classifier.score(X_test, y_test))
    t1 = time()
    
    total_time = total_time + (t1-t0)
    
    n_element_test_error = [(features, error)]
    
    for n in range(X.shape[1]-1, 0, -1):
        # Time for this number of features
        n_feature_time = 0
        
        # Get list of all n-element subsets of column labels
        n_feature_subsets = n_elt_subset(features, n)
        errors = []
        
        # Loop through each list of n-element column labels
        for feature_list in n_feature_subsets:
            
            # Get list of indices of the features
            # This is non-trivial because of the way the OneHotEncoder 
            # encodes the categorical data
            feature_indices = []
            
            for feature in feature_list:
                if X[feature].dtype == object:
                    enc_format = feature + '='
                    cat_indices = [enc_feats.index(col) for col in enc_feats if enc_format in col]
                    feature_indices += cat_indices
                else:
                    feature_indices.append(enc_feats.index(feature))
                    
            # Get subset of X_train and X_test corresponding to the right features
            X_train_sub = X_train[:, feature_indices]
            X_test_sub = X_test[:, feature_indices]
            
            # Fit and get testing error
            # Calculate first score (using error as criteria)
            classifier = deepcopy(model)
            t0 = time()
            classifier.fit(X_train_sub, y_train)
            error = (1 - classifier.score(X_test_sub, y_test))
            t1 = time()
            
            n_feature_time = n_feature_time + (t1-t0)
            total_time = total_time + (t1-t0)
            
            errors.append((feature_list, error))
            
        # Sort the list of tuples
        errors.sort(key=lambda x:x[1])
        # Get the best performing subset of features and the associated test error
        best_subset = errors[0]
        # Get the best model with one more feature
        previous_best = n_element_test_error[-1]
        
        print 'Number features: {} \tError: {}'.format(n, best_subset[1])
        print 'Time {}'.format(n_feature_time)
        
        #if best_subset[1] <= previous_best[1]:
        #    features = best_subset[0]
        #    n_element_test_error.append(best_subset)
        #else:
        #    print 'Test error increased with {} features'.format(n, best_subset)
        #    break
        
        # Don't break out of this, get the best model for each num_feats
        features = best_subset[0]
        n_element_test_error.append(best_subset)
        
    print 'Total time taken: {}\n'.format(total_time)
    
    n_element_test_error.sort(key=lambda x:x[1])
    
    return len(n_element_test_error[0][0])

In [16]:
print 'Logistic Regression L1'
feature_removal(LogisticRegression(penalty='l1'), X, y)

Logistic Regression L1
Number features: 13 	Error: 0.148261884289
Time 12.4773414135
Number features: 12 	Error: 0.147893379192
Time 13.8209190369
Number features: 11 	Error: 0.147647709127
Time 5.10096788406
Number features: 10 	Error: 0.148507554355
Time 5.71622419357
Number features: 9 	Error: 0.148876059452
Time 6.00334906578
Number features: 8 	Error: 0.149613069647
Time 7.68797016144
Number features: 7 	Error: 0.149981574745
Time 6.82283711433
Number features: 6 	Error: 0.151455595136
Time 8.57038283348
Number features: 5 	Error: 0.152561110429
Time 3.97136473656
Number features: 4 	Error: 0.156123326373
Time 3.55817890167
Number features: 3 	Error: 0.16275641813
Time 3.47835922241
Number features: 2 	Error: 0.181918683208
Time 2.67846369743
Number features: 1 	Error: 0.219629038202
Time 1.60918569565
Total time taken: 81.8102560043



11

In [7]:
print 'Logistic Regression L2'
print
feature_removal(LogisticRegression(penalty='l2'), X, y)

Logistic Regression L2

Number features: 13 	Error: 0.149121729517
Time 5.74490332603
Number features: 12 	Error: 0.146542193834
Time 8.87190127373
Number features: 11 	Error: 0.149121729517
Time 7.48032784462
Number features: 10 	Error: 0.149121729517
Time 6.07888197899
Number features: 9 	Error: 0.14973590468
Time 4.89407992363
Number features: 8 	Error: 0.150104409778
Time 3.58520698547


KeyboardInterrupt: 

## Get best 12 features using Logistic Regression

In [98]:
n = 12
features = X.columns    

# Binarize the categorical data using a DictVectorizer
# This requires the data be fed in the form of Python dicts
vectorizer = DV(sparse=False)
X_one_hot = vectorizer.fit_transform( X.to_dict(orient='records') )
enc_feats = vectorizer.get_feature_names()

# Split into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y)


# Get list of all n-element subsets of column labels
n_feature_subsets = n_elt_subset(features, n)
errors = []

# Loop through each list of n-element column labels
for feature_list in n_feature_subsets:

    # Get list of indices of the features
    # This is non-trivial because of the way the OneHotEncoder 
    # encodes the categorical data
    feature_indices = []

    for feature in feature_list:
        if X[feature].dtype == object:
            enc_format = feature + '='
            cat_indices = [enc_feats.index(col) for col in enc_feats if enc_format in col]
            feature_indices += cat_indices
        else:
            feature_indices.append(enc_feats.index(feature))

    # Get subset of X_train and X_test corresponding to the right features
    X_train_sub = X_train[:, feature_indices]
    X_test_sub = X_test[:, feature_indices]

    # Fit and get testing error
    # Calculate first score (using error as criteria)
    classifier = LogisticRegression()
    t0 = time()
    classifier.fit(X_train_sub, y_train)
    error = (1 - classifier.score(X_test_sub, y_test))
    t1 = time()

    errors.append((feature_list, error))

# Sort the list of tuples
errors.sort(key=lambda x:x[1])
# Get the best performing subset of features and the associated test error
best_subset = errors[0]

In [99]:
errors[0]

(('age',
  'workclass',
  'education',
  'education-num',
  'marital-status',
  'occupation',
  'race',
  'sex',
  'capital-gain',
  'capital-loss',
  'hours-per-week',
  'native-country'),
 0.14678786389878395)