*Note: the libraries are imported in this notebook as well as in the `.py` file. Imports made inside the `.py` file cover only the helper/wrapper functions defined there; any library name used directly in a notebook cell (e.g. `StandardScaler`, `LogisticRegression`, `pd`, `np`) must also be imported in the notebook itself for the code to work.*

In [1]:
from lib.project_5 import load_data_from_database, make_data_dict, general_transformer, general_model
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Step 1 - Benchmarking


**Please refer to Report Writeup \*.pdf for further details**

## Implementation

The following code pipeline implements the functions found in `lib/project_5.py`.

<img src="assets/benchmarking.png" width="600px">

### Loading the data from the database

In [2]:
# Load the dataset through the project helper imported from lib/project_5.py;
# the result is inspected as a pandas DataFrame in the cells that follow.
madelon_df = load_data_from_database()

**Getting an overview of the data**

In [3]:
# Preview the first rows to see the column layout (index, feat_*, label).
madelon_df.head()

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
0,0,485,477,537,479,452,471,491,476,475,...,481,477,485,511,485,481,479,475,496,-1
1,1,483,458,460,487,587,475,526,479,485,...,478,487,338,513,486,483,492,510,517,-1
2,2,487,542,499,468,448,471,442,478,480,...,481,492,650,506,501,480,489,499,498,-1
3,3,480,491,510,485,495,472,417,474,502,...,480,474,572,454,469,475,482,494,461,1
4,4,484,502,528,489,466,481,402,478,487,...,479,452,435,486,508,481,504,495,511,1


In [4]:
# Summarize row count, column count, dtypes and memory footprint.
madelon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 502 entries, index to label
dtypes: int64(502)
memory usage: 7.7 MB


In [5]:
# (rows, columns) of the raw frame, before any columns are dropped.
madelon_df.shape

(2000, 502)

**Creating our target vector**

In [6]:
# The 'label' column is the classification target.
y = madelon_df['label']

In [7]:
# The target takes exactly two values, 1 and -1 (binary classification).

y.unique()

array([-1,  1])

In [8]:
# Count rows per class; the two classes are evenly balanced.

y.value_counts()

 1    1000
-1    1000
Name: label, dtype: int64

**Creating the feature matrix**

In [9]:
# Feature matrix: every column except the target ('label') and the
# carried-over 'index' column.
X = madelon_df.drop(labels=['label', 'index'], axis='columns')
X.head(1)

Unnamed: 0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_490,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496


**Confirming that our target vector and feature matrix have the appropriate shapes for sklearn**

In [10]:
# X should be 2-D (samples, features) and y 1-D with one entry per sample.
X.shape, y.shape

((2000, 500), (2000,))

**Splitting the data into Train-Test portions and initializing our data dictionary**

In [11]:
# Split into train/test portions and bundle them into a dict (its 'X_train'
# and 'X_test' entries are read in later cells). 0.25 corresponds to the
# 500-of-2000 test split confirmed in the following cells; 82 is presumably
# the random seed -- TODO confirm against make_data_dict in lib/project_5.py.
madelon_data_dict = make_data_dict(X, y, 0.25, 82)

**Confirming the completion of the Train-Test-Split**

In [12]:
# Wrap in a DataFrame only to read off the shape of the test portion.
pd.DataFrame(madelon_data_dict['X_test']).shape

(500, 500)

In [13]:
# Wrap in a DataFrame only to read off the shape of the training portion.
pd.DataFrame(madelon_data_dict['X_train']).shape

(1500, 500)

### Transforming the data using StandardScaler

In [14]:
# Apply StandardScaler through the project helper. The returned dict exposes
# the scaled 'X_train' / 'X_test' and the 'transformer' used (all inspected in
# later cells). Presumably the scaler is fit on the training split only --
# TODO confirm against general_transformer in lib/project_5.py.
scaled_dictionary = general_transformer(StandardScaler(), madelon_data_dict)

**Confirming transformation of `X_train` and `X_test`**

In [15]:
# Spot-check that the training features are now on a standardized scale.
pd.DataFrame(scaled_dictionary['X_train']).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,1.136431,0.610539,-0.217152,-1.80433,-0.072199,0.395568,1.264948,-0.383904,1.209344,-2.177202,...,0.642616,-0.695112,-0.965624,0.248084,-0.28981,-1.338782,0.658693,-0.365804,-1.764246,0.575991
1,0.203911,-0.58664,1.96842,0.834651,-2.953655,-0.495351,0.375311,-0.383904,-1.115211,0.059554,...,1.693163,-1.200648,-1.382716,1.732952,-1.359683,-0.672545,-1.768071,0.596839,1.648343,-1.465802
2,0.048491,0.311244,1.094191,2.044184,0.831308,1.138,0.350599,-0.383904,0.013859,0.059554,...,-0.551187,-0.94788,0.41078,0.943128,-1.157275,1.059673,-1.24805,-0.884151,1.272152,0.575991


In [16]:
# Scaling must not change the dimensions of the training set.
pd.DataFrame(scaled_dictionary['X_train']).shape

(1500, 500)

In [17]:
# Spot-check that the test features were transformed as well.
pd.DataFrame(scaled_dictionary['X_test']).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-1.350289,-0.45362,-0.834254,-1.254542,0.855727,-1.237783,0.721281,0.341355,1.010097,0.75854,...,0.881377,0.821496,1.078127,-1.134108,-0.492219,-2.671257,0.658693,-0.662002,-1.092476,-0.65679
1,-0.106929,0.277989,-0.937105,-0.48484,0.611536,0.098595,0.696569,-1.834421,-0.451052,-0.499635,...,-0.264674,0.821496,0.619326,0.848349,-1.7645,0.366786,-0.381349,0.522789,-1.092476,1.61615
2,0.514751,-0.38711,0.631365,0.174906,-0.121037,-1.980216,1.240235,1.066614,-0.783132,-0.499635,...,-0.598939,0.568728,0.619326,1.867222,0.433077,1.006374,0.138672,1.263284,0.573512,1.885821


In [18]:
# Scaling must not change the dimensions of the test set.
pd.DataFrame(scaled_dictionary['X_test']).shape

(500, 500)

In [19]:
# Show the transformer instance stored in the dict by general_transformer.
scaled_dictionary['transformer']

StandardScaler(copy=True, with_mean=True, with_std=True)

### Getting a Benchmark Score from 'Out-of-the-Box' Logistic Regression

Train and Test Score from "out-of-the-box" Logistic Regression

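- effectively no regularization -- setting C=1E6 (C is the inverse regularization strength, so a very large C makes the penalty negligible)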

In [20]:
# Benchmark model: C=1E6 makes the L1 penalty negligible, i.e. the fit is
# effectively unregularized. The solver is pinned to 'liblinear' (the solver
# reported in the fitted estimator's repr) instead of relying on the library
# default, because newer scikit-learn releases default to 'lbfgs', which
# rejects penalty='l1'.
# NOTE: liblinear ignores n_jobs; the argument is kept only so the estimator
# repr recorded in this notebook stays unchanged.
log_reg_dict = general_model(
    LogisticRegression(C=1E6, penalty='l1', solver='liblinear', n_jobs=-1),
    scaled_dictionary,
)

In [21]:
# Steps recorded so far, in order: the scaler followed by the model.
log_reg_dict['processes']

[StandardScaler(copy=True, with_mean=True, with_std=True),
 LogisticRegression(C=1000000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]

In [22]:
# The estimator stored under 'model'; its coef_ is examined further down.
log_reg_dict['model']

LogisticRegression(C=1000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

**Benchmark Train and Test Scores**

In [23]:
# Report both benchmark accuracies to four decimals, one per line.
print(
    "Train Score: {:.4f}".format(log_reg_dict['train_score']),
    "Test Score:  {:.4f}".format(log_reg_dict['test_score']),
    sep="\n",
)

Train Score: 0.7953
Test Score:  0.5280


**Examining the coefficients from Logistic Regression - Top 10 Features by Size of Coefficient**

In [24]:
# Pair every feature with its fitted coefficient. 'abscoef' holds the magnitude
# so features can be ranked by strength regardless of sign. The column order is
# given explicitly so the table layout does not depend on how the installed
# pandas version orders dict keys.
log_reg_coef_df = pd.DataFrame({'coef': log_reg_dict['model'].coef_[0],
                                'variable': X.columns,
                                'abscoef': np.abs(log_reg_dict['model'].coef_[0])
                               },
                               columns=['abscoef', 'coef', 'variable'])

# Sort by reassignment instead of inplace=True, then show the ten strongest
# features, matching the "Top 10" heading of this section.
log_reg_coef_df = log_reg_coef_df.sort_values('abscoef', ascending=False)
log_reg_coef_df.head(10)

Unnamed: 0,abscoef,coef,variable
475,1.957598,1.957598,feat_475
48,1.766165,1.766165,feat_048
442,1.667147,-1.667147,feat_442
318,1.493372,-1.493372,feat_318
281,1.433678,-1.433678,feat_281
453,1.392728,1.392728,feat_453
451,1.385501,1.385501,feat_451
105,1.24004,-1.24004,feat_105
493,1.169382,1.169382,feat_493
378,1.091859,-1.091859,feat_378


In [25]:
# Count the features whose coefficient is non-zero. With C this large the L1
# penalty is too weak to zero anything out, so all 500 features are retained.

log_reg_coef_df[log_reg_coef_df['coef'] != 0].shape

(500, 3)

**Features with Non-Zero Coefficients: C=1E6**

In [26]:
# Show only the leading rows of the non-zero-coefficient features. The frame
# is already sorted by |coef|, and dumping one row per retained feature would
# bury the notebook in output.
log_reg_coef_df[log_reg_coef_df['coef'] != 0].head(10)

Unnamed: 0,abscoef,coef,variable
475,1.957598,1.957598,feat_475
48,1.766165,1.766165,feat_048
442,1.667147,-1.667147,feat_442
318,1.493372,-1.493372,feat_318
281,1.433678,-1.433678,feat_281
453,1.392728,1.392728,feat_453
451,1.385501,1.385501,feat_451
105,1.240040,-1.240040,feat_105
493,1.169382,1.169382,feat_493
378,1.091859,-1.091859,feat_378
