# Logistic Regression Parameter Tuning
Training an optimal logistic regression model on the pruned dataset, exploring different solvers and hyperparameters.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
df = pd.read_csv('ExtractedFinalDataset.csv')

In [4]:
bad_features = []
for i in range(8):
    langevin = str(i) + "__max_langevin_fixed_point__m_3__r_30"
    bad_features.append(langevin)
    for j in range(9):
        quantile = (j+1)*0.1
        if quantile != 0.5:
            feature_name = str(i) + "__index_mass_quantile__q_" + str(quantile)
            bad_features.append(feature_name)

In [5]:
df = df.drop(bad_features, axis=1)

In [6]:
df.index = df['9']
df = df.drop(['9'], axis=1)
df['Label'] = "One"
df['Label'][2001.0 <= df.index ] = "Two"
df['Label'][4001.0 <= df.index ] = "Three"
df['Label'][6001.0 <= df.index ] = "Four"
df['Label'][8001.0 <= df.index ] = "Five"
df['Label'][10001.0 <= df.index ] = "Six"

In [7]:
df = df[1:]

In [8]:
df.columns = df.columns.map(lambda t: str(t))
df = df.sort_index(axis=1)

In [9]:
extracted_features = df

In [10]:
subsample = extracted_features.sample(frac=0.05).reset_index(drop=True)
subsample.shape

(599, 1705)

In [11]:
X = subsample.drop(['Label'], 1)
y = subsample['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Test some of the different solvers and see how they perform comparatively.

LBFGS Solver

In [15]:
param_grid = {'C': [0.01, 0.1, 1, 10],
             'class_weight': ['balanced', None],
             }
model = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_

Training accuracy: 0.754176610979
Test accuracy: 0.572222222222
{'C': 0.01, 'class_weight': None}


Sag Solver

In [16]:
param_grid = {'C': [0.01, 0.1, 1, 10],
             'class_weight': ['balanced', None],
             }
model = GridSearchCV(LogisticRegression(solver='sag'), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_

Training accuracy: 0.603818615752
Test accuracy: 0.522222222222
{'C': 0.01, 'class_weight': 'balanced'}


Newton-CG Solver

In [17]:
param_grid = {'C': [0.01, 0.1, 1, 10],
             'class_weight': ['balanced', None],
             }
model = GridSearchCV(LogisticRegression(solver='newton-cg'), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_awful_features = []
for i in range(8):
    langevin = str(i) + "__max_langevin_fixed_point__m_3__r_30"
    awful_features.append(langevin)
    for j in range(9):
        quantile = (j+1)*0.1
        if quantile != 0.5:
            feature_name = str(i) + "__index_mass_quantile__q_" + str(quantile)
            awful_features.append(feature_name)
sample = sample.drop(awful_features, axis=1)

Training accuracy: 0.949880668258
Test accuracy: 0.666666666667
{'C': 0.1, 'class_weight': 'balanced'}


Default Solver (liblinear)

In [18]:
param_grid = {'C': [0.01, 0.1, 1, 10],
             'class_weight': ['balanced', None],
             }
model = GridSearchCV(LogisticRegression(), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_

Training accuracy: 0.856801909308
Test accuracy: 0.616666666667
{'C': 1, 'class_weight': 'balanced'}


In [19]:
param_grid = {'C': [0.01, 0.1, 1, 10],
             'class_weight': ['balanced', None],
             }
model = GridSearchCV(LogisticRegression(penalty='l1'), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_

Training accuracy: 1.0
Test accuracy: 0.627777777778
{'C': 10, 'class_weight': 'balanced'}


It seems as though newton-cg and the default liblinear have the best potential performance. Try more grid searching on their parameters.

In [21]:
param_grid = {'C': [0.01, 1, 10],
             'class_weight': ['balanced', None],
              'multi_class' : ['ovr', 'multinomial']
             }
model = GridSearchCV(LogisticRegression(solver='newton-cg', max_iter=1000), param_grid)
model.fit(X_train, y_train)

print 'Training accuracy:', model.score(np.array(X_train),np.array(y_train))
print 'Test accuracy:', model.score(np.array(X_test), np.array(y_test))
print model.best_params_

Training accuracy: 0.99522673031
Test accuracy: 0.622222222222
{'multi_class': 'multinomial', 'C': 10, 'class_weight': None}


Now, try training a model on a 70-30 split of the full dataset, and see how it performs. This might take a while.

In [12]:
fullset = extracted_features.sample(frac=1).reset_index(drop=True)
X = fullset.drop(['Label'], 1)
y = fullset['Label']
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
model = LogisticRegression(solver='newton-cg', C=1, class_weight='balanced')
model.fit(np.array(X_train_full),np.array(y_train_full))

print 'Training accuracy:', model.score(np.array(X_train_full),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_full), np.array(y_test_full))

Training accuracy: 0.795113230036
Test accuracy: 0.793103448276


In [26]:
model = LogisticRegression(solver='newton-cg', C=10, multi_class='multinomial')
model.fit(np.array(X_train_full),np.array(y_train_full))

print 'Training accuracy:', model.score(np.array(X_train_full),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_full), np.array(y_test_full))

Training accuracy: 0.779499404052
Test accuracy: 0.778364849833


In [13]:
model = LogisticRegression(C=1, class_weight='balanced')
model.fit(np.array(X_train_full),np.array(y_train_full))

print 'Training accuracy:', model.score(np.array(X_train_full),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_full), np.array(y_test_full))

Training accuracy: 0.764123957092
Test accuracy: 0.761401557286


In [14]:
model = LogisticRegression(penalty='l1', C=0.001)
model.fit(X_train_full, y_train_full)

print 'Training accuracy:', model.score(np.array(X_train_full),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_full), np.array(y_test_full))

Training accuracy: 0.835637663886
Test accuracy: 0.812013348165


Seems like this 81% is about the best accuracy I can achieve using logistic regression, so I'll focus on my other models instead, since they are both getting 83+% accuracy.

In [15]:
from sklearn.externals import joblib
model = LogisticRegression(penalty='l1', C=0.001)
model.fit(X, y)

joblib.dump(model, 'bestlogisticregression.pkl')

['bestlogisticregression.pkl']

Scaling in theory is not good for logistic regression, let's test that theory on the dataset:

In [20]:
from sklearn.preprocessing import StandardScaler
model = LogisticRegression(penalty='l1', C=0.001)
sc = StandardScaler()
X = fullset.drop(['Label'], 1)
y = fullset['Label']
sc.fit(X)
X = sc.transform(X)
X_train_sc, X_test_sc, y_train_full, y_test_full = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train_sc, y_train_full)

print 'Training accuracy:', model.score(np.array(X_train_sc),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_sc), np.array(y_test_full))

Training accuracy: 0.580691299166
Test accuracy: 0.568409343715


How about using PCA on feature space and comparing performance:

In [22]:
from sklearn.decomposition import PCA
model = LogisticRegression(penalty='l1', C=0.001)
pca = PCA(n_components=500)
X = fullset.drop(['Label'], 1)
y = fullset['Label']
pca.fit(X)
X = pca.transform(X)
X_train_pca, X_test_pca, y_train_full, y_test_full = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train_pca, y_train_full)

print 'Training accuracy:', model.score(np.array(X_train_pca),np.array(y_train_full))
print 'Test accuracy:', model.score(np.array(X_test_pca), np.array(y_test_full))

Training accuracy: 0.808462455304
Test accuracy: 0.790322580645


As expected, there's a drop in performance. Since we're optimizing for accuracy, I'm going to just use the entire dataset. Might be worth investigating for speed reasons if trying to do real-time classification.