In [59]:
from time import time

from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

from get_data import data

In [43]:
# Load the dataset through the project-local get_data helper.
# NOTE(review): presumably returns a pandas DataFrame (later cells select
# columns from it and call to_dict on the selection) -- confirm in get_data.
df = data()

In [48]:
# Features: every column other than 'label' and 'class'; target: 'class'.
# NOTE(review): 'label' is excluded alongside the target -- presumably another
# encoding of it; confirm against get_data.
X = df[[col for col in df.columns if col not in ('label', 'class')]]
y = df['class']

In [51]:
# DictVectorizer consumes one {column: value} mapping per row
X_dict = X.to_dict(orient='records')

# Learn the feature mapping first, then encode with it. String-valued columns
# come out as 'column=value' indicator features; sparse=False requests a
# dense array instead of a sparse matrix.
vectorizer = DV(sparse=False).fit(X_dict)
X_one_hot = vectorizer.transform(X_dict)

In [53]:
# Column counts before and after encoding (shape[1] is the column axis).
# A parenthesised single-argument print gives the same output under Python 2.
print('Number of features: {}'.format(X.shape[1]))
print('Number of features after One Hot Encoding: {}'.format(X_one_hot.shape[1]))

Number of features: 14
Number of features after One Hot Encoding: 108


In [76]:
# Inspect the names the vectorizer assigned to the encoded columns
# (categorical values show up as 'column=value').
vectorizer.get_feature_names()

['age',
 'capital-gain',
 'capital-loss',
 'education-num',
 'education=10th',
 'education=11th',
 'education=12th',
 'education=1st-4th',
 'education=5th-6th',
 'education=7th-8th',
 'education=9th',
 'education=Assoc-acdm',
 'education=Assoc-voc',
 'education=Bachelors',
 'education=Doctorate',
 'education=HS-grad',
 'education=Masters',
 'education=Preschool',
 'education=Prof-school',
 'education=Some-college',
 'fnlwgt',
 'hours-per-week',
 'marital-status=Divorced',
 'marital-status=Married-AF-spouse',
 'marital-status=Married-civ-spouse',
 'marital-status=Married-spouse-absent',
 'marital-status=Never-married',
 'marital-status=Separated',
 'marital-status=Widowed',
 'native-country=?',
 'native-country=Cambodia',
 'native-country=Canada',
 'native-country=China',
 'native-country=Columbia',
 'native-country=Cuba',
 'native-country=Dominican-Republic',
 'native-country=Ecuador',
 'native-country=El-Salvador',
 'native-country=England',
 'native-country=France',
 'native-country=

In [55]:
# Split into test and train sets.
# random_state=0 pins the shuffle so the same split is produced on every run;
# no test_size is passed, so the library's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, random_state=0)

In [61]:
# Logistic regression with the library's default settings
classifier = LogisticRegression()

# Time only the fit, so prediction and scoring are excluded from the figure
t0 = time()
classifier.fit(X_train, y_train)
t1 = time()

print('Time to train classifier: {} seconds'.format(t1 - t0))

# Predictions for the held-out rows are stored but not evaluated in this cell
prediction = classifier.predict(X_test)

# NOTE(review): this reports accuracy on the training rows only; the held-out
# score is not printed here (log_reg below reports both train and test error).
print('Train accuracy: {}'.format(classifier.score(X_train, y_train)))

Time to train classifier: 0.435038805008 seconds
Train accuracy: 0.797747747748


## Automate the above procedure

In [66]:
def log_reg(df, cols):
    """Fit a logistic regression on selected columns and print its error rates.

    The chosen columns are one-hot encoded with a DictVectorizer, split into
    train and test sets (random_state=0), and used to fit a default
    LogisticRegression that predicts ``df['class']``.

    Parameters
    ----------
    df : frame supporting ``df[cols]`` and ``df['class']``
        Source data; must contain a 'class' column and every name in ``cols``.
    cols : list of column names
        Columns of ``df`` to use as features.

    Returns
    -------
    None
        The test error, train error and fit time are printed, not returned.
    """
    features = df[cols]
    target = df['class']

    # One {column: value} mapping per row, then a dense one-hot encoding
    records = features.to_dict(orient='records')
    encoder = DV(sparse=False)
    encoded = encoder.fit_transform(records)

    # Fixed random_state keeps the split identical across calls
    X_train, X_test, y_train, y_test = train_test_split(encoded, target, random_state=0)

    model = LogisticRegression()

    # Time the fit alone
    started = time()
    model.fit(X_train, y_train)
    elapsed = time() - started

    # Error rate is the complement of the mean accuracy returned by score()
    train_error = 1 - model.score(X_train, y_train)
    test_error = 1 - model.score(X_test, y_test)

    # Leading blank line separates the report from any earlier output
    print('')
    print('Test error: {}'.format(test_error))
    print('Train error: {}'.format(train_error))
    print('Time to train: {} seconds'.format(elapsed))

In [67]:
# Single-feature run: age only
log_reg(df, ['age'])


Test error: 0.25758506326
Train error: 0.25171990172
Time to train: 0.0858459472656 seconds


In [68]:
# Single-feature run: education only (one-hot encoded inside log_reg)
log_reg(df, ['education'])


Test error: 0.219629038202
Train error: 0.22067977068
Time to train: 0.127322912216 seconds


In [69]:
# Combine the two features tried individually above
log_reg(df, ['age', 'education'])


Test error: 0.218646357941
Train error: 0.217772317772
Time to train: 0.0922818183899 seconds


In [70]:
# Single-feature run: occupation only, for comparison with the runs above
log_reg(df, ['occupation'])


Test error: 0.243459034517
Train error: 0.239926289926
Time to train: 0.101222038269 seconds


In [71]:
# Two-feature run: occupation together with age
log_reg(df, ['occupation', 'age'])


Test error: 0.235229087336
Train error: 0.229934479934
Time to train: 0.138720989227 seconds


In [72]:
# Three-feature run: occupation, age and education
log_reg(df, ['occupation', 'age', 'education'])


Test error: 0.210170740695
Train error: 0.207534807535
Time to train: 0.104037046432 seconds


In [75]:
# Three-feature run with sex in place of education
log_reg(df, ['occupation', 'age', 'sex'])


Test error: 0.205503009458
Train error: 0.203849303849
Time to train: 0.0964479446411 seconds


In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation, wrapped around a
# default logistic regression. step=1 removes one feature per elimination
# round; cv=5 scores each candidate feature set with 5-fold cross-validation.
estimator = LogisticRegression()

selector = RFECV(estimator, step=1, cv=5)
# Fit on the one-hot encoded matrix, not the raw frame X: X still holds its
# categorical columns as strings, which LogisticRegression cannot be fitted
# on. Every other model in this notebook is trained on the encoded data too.
selector = selector.fit(X_one_hot, y)