In [1]:
from lib.project_5 import general_process, load_data_from_database, make_data_dict, general_model, general_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from __future__ import print_function

# Step 1 - Benchmarking


**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 1 - BENCHMARKING**

### Domain and Data

**TODO:** Write a simple statement about the domain of your problem and the dataset upon which you will be working. 

### Problem Statement

**TODO:** Write a simple problem statement with regard to benchmarking your work only.

### Solution Statement

**TODO:** Write a simple solution statement with regard to benchmarking your work only.

### Metric

**TODO**: Write a statement about the metric you will be using. This section is global as it will be the metric you will use throughout this project. Provide a brief justification for choosing this metric.

### Benchmark

**TODO**: Write a statement explaining that this is the process by which you identify a benchmark for your project.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/benchmarking.png" width="600px">

In [2]:
madelon_df = load_data_from_database()

In [3]:
madelon_df.head()

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
0,0,485,477,537,479,452,471,491,476,475,...,481,477,485,511,485,481,479,475,496,-1
1,1,483,458,460,487,587,475,526,479,485,...,478,487,338,513,486,483,492,510,517,-1
2,2,487,542,499,468,448,471,442,478,480,...,481,492,650,506,501,480,489,499,498,-1
3,3,480,491,510,485,495,472,417,474,502,...,480,474,572,454,469,475,482,494,461,1
4,4,484,502,528,489,466,481,402,478,487,...,479,452,435,486,508,481,504,495,511,1


In [4]:
madelon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 502 entries, index to label
dtypes: int64(502)
memory usage: 7.7 MB


In [5]:
madelon_df.shape

(2000, 502)

**Creating our target vector**

In [6]:
y = madelon_df['label']

In [7]:
# target vector is comprised of '1's and '-1's

y.unique()

array([-1,  1])

In [8]:
# the target vector is evenly distributed

y.value_counts()

 1    1000
-1    1000
Name: label, dtype: int64

**Creating the feature matrix**

In [9]:
X = madelon_df.drop(['label', 'index'], axis=1)
X.head(1)

Unnamed: 0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_490,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496


**Confirming that our target vector and feature matrix have the appropriate shapes for sklearn**

In [10]:
X.shape, y.shape

((2000, 500), (2000,))

**Making data dictionary**

In [11]:
madelon_data_dict = make_data_dict(X, y, 0.25, 82)

**Confirming the Train-Test-Split**

In [12]:
pd.DataFrame(madelon_data_dict['X_train']).shape

(1500, 500)

In [13]:
pd.DataFrame(madelon_data_dict['X_test']).shape

(500, 500)

**Standard Scaler**

In [14]:
madelon_scaled_dict = general_transformer(StandardScaler(), madelon_data_dict)

In [15]:
madelon_data_dict['processes']

[StandardScaler(copy=True, with_mean=True, with_std=True)]

In [16]:
scaler = madelon_scaled_dict['transformer']
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
scaler.mean_

array([ 481.688     ,  483.64066667,  511.44533333,  483.40933333,
        501.95666667,  479.336     ,  479.81266667,  476.52933333,
        486.79133333,  478.574     ,  486.248     ,  490.96533333,
        478.74133333,  482.70666667,  485.17133333,  479.906     ,
        479.16266667,  494.94533333,  483.51066667,  477.47333333,
        484.47666667,  494.24733333,  476.426     ,  479.38733333,
        500.00666667,  504.08333333,  484.518     ,  482.242     ,
        480.548     ,  491.65733333,  481.92666667,  499.77333333,
        500.84733333,  490.66333333,  486.40866667,  489.25      ,
        479.28866667,  480.64666667,  479.73333333,  476.12266667,
        476.49666667,  489.59666667,  476.30466667,  487.72933333,
        500.57466667,  480.58      ,  496.20933333,  494.418     ,
        485.254     ,  479.01733333,  517.442     ,  504.33866667,
        484.84933333,  479.952     ,  496.92733333,  494.98133333,
        480.36933333,  485.288     ,  509.16133333,  477.61933

## Test of General Model - Logistic Regression

In [18]:
log_reg_dict = general_model(LogisticRegression(C=1E10, penalty='l1', n_jobs=-1), madelon_scaled_dict)

In [19]:
log_reg_dict['processes']

[StandardScaler(copy=True, with_mean=True, with_std=True),
 LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]

In [20]:
log_reg_dict['model']

LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

### Getting a Benchmark Score

Train and Test Score from "out-of-the-box" Logistic Regression
- no Regularlization (C=1E10)

In [21]:
print("Train Score: {:.4f}".format(log_reg_dict['train_score']))
print("Test Score:  {:.4f}".format(log_reg_dict['test_score']))

Train Score: 0.7953
Test Score:  0.5260


**Examining the coefficients from Logistic Regression**

In [22]:
log_reg_coef_df = pd.DataFrame({'coef': log_reg_dict['model'].coef_[0],
                                'variable': X.columns,
                                'abscoef': np.abs(log_reg_dict['model'].coef_[0])
                               })

log_reg_coef_df.sort_values('abscoef', ascending=False, inplace=True)
log_reg_coef_df.head(5)

Unnamed: 0,abscoef,coef,variable
475,1.959267,1.959267,feat_475
48,1.765176,1.765176,feat_048
442,1.667132,-1.667132,feat_442
318,1.495201,-1.495201,feat_318
281,1.430702,-1.430702,feat_281


In [23]:
log_reg_coef_df.shape

(500, 3)

In [24]:
# confirm all features retained -- no regularization given C=1E10

log_reg_coef_df[log_reg_coef_df['coef'] != 0].shape

(500, 3)

## Pipeline 2 - Identifying Features via Lasso

In [77]:
madelon_scaled_dict_copy = madelon_scaled_dict

In [91]:
C_list = [0.0001, 0.055, 0.01, 0.1, 1]

In [94]:
results = {}

for C in C_list:
    
    general_model(LogisticRegression(C=C, penalty='l1', n_jobs=-1), madelon_scaled_dict_copy)
    results['C'] = C
    results['train_score'] = madelon_scaled_dict_copy['train_score']
    results['test_score'] = madelon_scaled_dict_copy['test_score']

    print (results)
    

{'test_score': 0.52800000000000002, 'C': 1, 'train_score': 0.77733333333333332}


In [78]:
log_reg_dict1 = general_model(LogisticRegression(C=0.0001, penalty='l1', n_jobs=-1), madelon_scaled_dict_copy)

In [79]:
log_reg_dict1['processes']

[StandardScaler(copy=True, with_mean=True, with_std=True),
 LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=

In [80]:
log_reg_dict1['model']

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [81]:
print("Train Score: {:.4f}".format(log_reg_dict1['train_score']))
print("Test Score:  {:.4f}".format(log_reg_dict1['test_score']))

Train Score: 0.5047
Test Score:  0.4860


**Examining the Impact of Increasing Regularization on Coefficients -- Feature Selection**

In [82]:
log_reg_coef1_df = pd.DataFrame({'coef': log_reg_dict1['model'].coef_[0],
                                 'variable': X.columns,
                                 'abscoef': np.abs(log_reg_dict1['model'].coef_[0])
                                })

log_reg_coef1_df.sort_values('abscoef', ascending=False, inplace=True)
log_reg_coef1_df.head(5)

Unnamed: 0,abscoef,coef,variable
0,0.0,0.0,feat_000
329,0.0,0.0,feat_329
342,0.0,0.0,feat_342
341,0.0,0.0,feat_341
340,0.0,0.0,feat_340


In [83]:
lasso_features_removed = log_reg_coef1_df[log_reg_coef1_df['coef'] == 0]
lasso_features_removed.count()

abscoef     500
coef        500
variable    500
dtype: int64

In [84]:
# Baseline Values: Train: 0.7953, Test: 0.5280

In [85]:
# C=1: Train: 0.7767, Test: 0.5280 -- 31 features removed

In [86]:
# C=0.1: Train: 0.7487, Test: 0.5860 -- 235 features removed

In [87]:
# C=0.01: Train: 0.6180, Test: 0.6060 -- 499 features removed

In [88]:
# C=0.055: Train: 0.7013, Test: 0.5780 -- 349 features removed -- split the difference between the last case

In [90]:
# C=0.0001: Train: 0.5047, Test: 0.4860 -- effectively all of the features removed

In [92]:
results = [general_model(LogisticRegression(C=C, penalty='l1', n_jobs=-1), madelon_scaled_dict_copy) \
           for C in C_list]

In [93]:
results

[{'X_test': array([[-1.35028888, -0.45361972, -0.83425415, ..., -0.66200238,
          -1.09247646, -0.65678969],
         [-0.10692895,  0.27798923, -0.93710458, ...,  0.52278935,
          -1.09247646,  1.61615002],
         [ 0.51475101, -0.38710982,  0.63136453, ...,  1.26328418,
           0.57351208,  1.88582084],
         ..., 
         [-0.88402891,  1.54167742,  1.71129407, ..., -0.143656  ,
           1.70208496,  0.11369835],
         [ 0.35933102, -1.5842881 ,  1.4798806 , ..., -0.81010134,
           0.41228738, -0.23302127],
         [ 0.35933102,  1.17587295,  0.65707713, ...,  0.22659142,
          -0.34009454,  1.34647921]]),
  'X_train': array([[ 1.13643097,  0.61053876, -0.21715155, ..., -0.36580445,
          -1.76424604,  0.57599117],
         [ 0.20391103, -0.58663953,  1.96842015, ...,  0.59683883,
           1.64834339, -1.46580213],
         [ 0.04849104,  0.31124418,  1.09419147, ..., -0.88415083,
           1.27215243,  0.57599117],
         ..., 
         [-