In [1]:
from time import time

import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.linear_model import LogisticRegression

from get_data import data

In [2]:
df = data()

X = df[ [col for col in df if col not in ['class']]]
y = df['class'].values

# Binarize the categorical data using a DictVectorizer
# This requires the data be fed in the form of Python dicts
vectorizer = DV(sparse=False)
X_binarized = vectorizer.fit_transform( X.to_dict(orient='records') )


# Split into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, train_size=.8, random_state=42)

In [3]:
C = np.linspace(.1, 10, 100).tolist() + np.linspace(20,100, 5).tolist() + np.linspace(200, 1000, 9).tolist()

In [4]:
param_grid = list(ParameterGrid({'C': C, 'penalty': ['l1']}))

In [19]:
for params in param_grid:
    
    classifier = LogisticRegression(**params)

    # Fit the model to the training data
    t0 = time()
    classifier.fit(X_train, y_train)
    t1 = time()

    accuracy = classifier.score(X_test, y_test)
    error = (1 - accuracy)

    print '\nTest error: {} Time to train: {}'.format(error, (t1-t0))
    print 'Params: {}'.format(params)


Test error: 0.143559035775 Time to train: 0.336972951889
Params: {'penalty': 'l1', 'C': 0.1}

Test error: 0.142023645018 Time to train: 0.267817020416
Params: {'penalty': 'l1', 'C': 0.2}

Test error: 0.141409488715 Time to train: 0.168585062027
Params: {'penalty': 'l1', 'C': 0.30000000000000004}

Test error: 0.141102410563 Time to train: 0.754909038544
Params: {'penalty': 'l1', 'C': 0.4}

Test error: 0.141870105942 Time to train: 0.103921175003
Params: {'penalty': 'l1', 'C': 0.5}

Test error: 0.141563027791 Time to train: 0.513739109039
Params: {'penalty': 'l1', 'C': 0.6}

Test error: 0.141563027791 Time to train: 0.145889043808
Params: {'penalty': 'l1', 'C': 0.7000000000000001}

Test error: 0.140948871488 Time to train: 0.720541000366
Params: {'penalty': 'l1', 'C': 0.8}

Test error: 0.140795332412 Time to train: 0.602434158325
Params: {'penalty': 'l1', 'C': 0.9}

Test error: 0.140948871488 Time to train: 0.167183876038
Params: {'penalty': 'l1', 'C': 1.0}

Test error: 0.141255949639 T

In [6]:
results = []

for params in param_grid:
    
    classifier = LogisticRegression(**params)

    # Fit the model to the training data
    t0 = time()
    classifier.fit(X_train, y_train)
    t1 = time()

    accuracy = classifier.score(X_test, y_test)
    error = (1 - accuracy)
    
    results.append((params['C'], error))

In [8]:
min(results, key=lambda x: x[1])

(1.1, 0.14064179333640414)

In [14]:
for c,e in results:
    print('{},'.format(e)),

0.14371257485, 0.142330723169, 0.141409488715, 0.141255949639, 0.142330723169, 0.141870105942, 0.141870105942, 0.141102410563, 0.140795332412, 0.141255949639, 0.140641793336, 0.141255949639, 0.141255949639, 0.141255949639, 0.141409488715, 0.140795332412, 0.141102410563, 0.141716566866, 0.141716566866, 0.141870105942, 0.140641793336, 0.140948871488, 0.142177184093, 0.141563027791, 0.142484262245, 0.141870105942, 0.141409488715, 0.141102410563, 0.142023645018, 0.141409488715, 0.141255949639, 0.143251957623, 0.141716566866, 0.142330723169, 0.142330723169, 0.141870105942, 0.142023645018, 0.142177184093, 0.142023645018, 0.142177184093, 0.142023645018, 0.141716566866, 0.142484262245, 0.142177184093, 0.141255949639, 0.142023645018, 0.142330723169, 0.141870105942, 0.142330723169, 0.14371257485, 0.141255949639, 0.142023645018, 0.142791340396, 0.141102410563, 0.141563027791, 0.142944879472, 0.142330723169, 0.141870105942, 0.142330723169, 0.142023645018, 0.142330723169, 0.141102410563, 0.14187010