# Investigation into Sklearn Pipelines for Scaling and Model Selection

Using http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

and data https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks

In [1]:
import numpy as np
import pandas as pd
import re
import os
from pandas.plotting import scatter_matrix

# Cap the number of OpenMP worker threads for this kernel through the %env
# line magic (it also echoes "env: OMP_NUM_THREADS=2" in the cell output).
# run_line_magic() is the supported entry point; InteractiveShell.magic() is
# deprecated in IPython.
# NOTE(review): numpy is imported above this line; whether an OpenMP runtime
# that is already loaded honours the new value cannot be verified from here.
get_ipython().run_line_magic('env', 'OMP_NUM_THREADS=2')

from IPython.display import display, HTML

import sklearn
import sklearn.model_selection
import requests
import io
import random

# Set the random seed used for the whole program to allow reproducibility.
# This seeds NumPy's global generator only.
np.random.seed(3214412)

# NOTE(review): DEBUG is not referenced anywhere else in the visible notebook — confirm it is still needed.
DEBUG = True # If true, pull a sample of the dataset for development

env: OMP_NUM_THREADS=2


In [2]:
# Load the APS failure training set. The first run downloads it and writes a
# cleaned copy (licence preamble stripped) next to the notebook; later runs
# read that local copy instead of hitting the network.
local_archive = "aps_failure_training_set.csv"
if not os.path.exists(local_archive):
    print("Downloading contents")
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv"
    # Bound the wait and fail loudly on an HTTP error. Without this an error
    # page would be parsed as CSV and written to the cache, and every later
    # run would silently load the corrupt local file.
    response = requests.get(data_url, timeout=60)
    response.raise_for_status()
    contents = response.content
    # First 20 rows of the file is a header with licensing info
    # The header with column names is on row 21
    raw_data_df = pd.read_csv(io.StringIO(contents.decode('utf-8')), skiprows=20, na_values="na")
    # Cache without the index so the reload branch sees the same columns.
    raw_data_df.to_csv(local_archive, index=False)
else:
    print("Loading from local")
    raw_data_df = pd.read_csv(local_archive, na_values="na")

raw_data_df.head()

Loading from local


Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,neg,33058,,0.0,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,neg,41040,,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,neg,12,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,neg,60874,,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [3]:
# This is a POC on sklearn pipelines, so work with a narrow slice of the data:
# rank every column by how many missing values it has, keep the 11 columns
# with the fewest (10 features plus "class"), then drop rows that still
# contain a missing value.
# NOTE(review): "class" is never selected explicitly; it is kept only because
# it ranks among those 11 columns — confirm this holds for the input file.
null_counts = raw_data_df.isnull().sum()
most_complete_columns = null_counts.sort_values().index[:11]
data_df = raw_data_df[most_complete_columns].dropna()
data_df.head()

Unnamed: 0,class,aa_000,bt_000,ck_000,cj_000,ci_000,by_000,aq_000,bi_000,bj_000,ao_000
0,neg,76698,76698.08,916567.68,0.0,5245752.0,43566.0,1132040.0,947550.0,799478.0,3655166.0
1,neg,33058,33057.51,643536.96,0.0,2291079.36,17733.0,338544.0,688314.0,392208.0,2127150.0
2,neg,41040,41040.08,236099.52,0.0,2322692.16,15439.0,153698.0,160176.0,139730.0,2173634.0
3,neg,12,12.69,4525.44,0.0,2135.04,32.0,1014.0,7632.0,3090.0,7554.0
4,neg,60874,60874.03,379111.68,0.0,3565684.8,24793.0,551022.0,653692.0,399410.0,2618878.0


In [4]:
# Report the total number of rows and how many fall into each class.
# Summing a boolean mask counts the rows where the comparison is true.
is_neg = data_df['class'] == 'neg'
is_pos = data_df['class'] == 'pos'
print("Count of rows: {}".format(len(data_df.index)))
print("Count of rows with class 'neg': {}".format(is_neg.sum()))
print("Count of rows with class 'pos': {}".format(is_pos.sum()))

Count of rows: 58888
Count of rows with class 'neg': 57932
Count of rows with class 'pos': 956


In [5]:
# Describe all the feature columns at once: one single-column summary frame
# per feature, glued together side by side. The label column is summarised
# separately by its value counts.
feature_summaries = []
for col in data_df.columns:
    if col == 'class':
        continue
    feature_summaries.append(data_df[col].describe().to_frame(name=col))
display(pd.concat(feature_summaries, axis=1))
display(data_df['class'].value_counts())

Unnamed: 0,aa_000,bt_000,ck_000,cj_000,ci_000,by_000,aq_000,bi_000,bj_000,ao_000
count,58888.0,58888.0,58888.0,58888.0,58888.0,58888.0,58888.0,58888.0,58888.0,58888.0
mean,59426.03,59426.03,712834.4,99939.76,3484558.0,22013.18,444126.4,494365.2,512368.4,3012701.0
std,145268.4,145268.4,2164271.0,1090789.0,8345773.0,53824.52,1264976.0,1489898.0,1826194.0,6830351.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,876.0,875.1775,14704.56,0.0,49216.08,219.0,4167.5,15969.5,8519.5,65708.0
50%,30952.0,30952.14,252449.3,0.0,1869525.0,12718.0,180057.0,180682.0,155115.0,1649212.0
75%,48898.5,48897.76,551467.4,0.0,2956215.0,20411.0,378592.5,381219.5,334944.0,2686392.0
max,2746564.0,2746565.0,55428670.0,60949670.0,140986100.0,1002003.0,25562650.0,44937500.0,45736320.0,122201800.0


neg    57932
pos      956
Name: class, dtype: int64

## Pipeline POC work

In [6]:
# Separate the target column from the predictors the pipelines are fitted on.
labels_srs = data_df['class']
train_df = data_df.drop('class', axis=1)

### Basic scaling

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn import svm
# Baseline pipeline: keep the 8 best chi2-scored features, standardise them,
# then fit a linear-kernel SVM on the result.
# NOTE(review): selection presumably runs before scaling because chi2 expects
# non-negative inputs — confirm before reordering the steps.
chi_f_selection = SelectKBest(score_func=chi2, k=8)
scaling = StandardScaler()
svm_model = svm.SVC(kernel='linear')
pipeline_model = Pipeline(steps=[
    ('chi_selection', chi_f_selection),
    ('scaling', scaling),
    ('svc', svm_model),
])
# Fit on the full feature frame; no rows are held out in this notebook.
fit_model = pipeline_model.fit(train_df, y=labels_srs)

In [8]:
# Now try it out
# NOTE(review): the score is computed on the same rows the pipeline was fitted
# on, so it is a training score rather than a held-out estimate.
training_score = fit_model.score(train_df, y=labels_srs)
sample_predictions = fit_model.predict(train_df.sample(n=50))
print("score: {}".format(training_score))
print("Sample of Predictions:\n{}".format(sample_predictions))

score: 0.9852601548702622
Sample of Predictions:
['neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg']


### Grid Search over Pipeline parameters
1. Feature selection functions and parameters

In [27]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.model_selection import GridSearchCV

# Rebuild the select -> scale -> SVM pipeline and wrap it in a grid search.
chi_f_selection = SelectKBest(score_func=chi2, k=8)
scaling = StandardScaler()
svm_model = svm.SVC(kernel='linear')
pipeline_model = Pipeline(steps=[
    ('chi_selection', chi_f_selection),
    ('scaling', scaling),
    ('svc', svm_model),
])

# Grid keys follow the "<pipeline step name>__<parameter name>" convention.
# Commented a few options out in the interest of POC and time
param_grid = [
    dict(
        chi_selection__score_func=[chi2],  # , f_regression, f_classif
        scaling__with_mean=[True],  # , False
        scaling__with_std=[True],  # , False
        svc__C=[0.5],  # , 1, 5, 10
    )
]

# 3-fold search, run with up to 3 parallel jobs.
grid = GridSearchCV(estimator=pipeline_model, param_grid=param_grid, cv=3, n_jobs=3)
grid_fit_model = grid.fit(train_df, labels_srs)

In [28]:
# Now try it out
# NOTE(review): scored on the same rows the grid search was fitted on.
training_score = grid_fit_model.score(train_df, y=labels_srs)
sample_predictions = grid_fit_model.predict(train_df.sample(n=50))
print("score: {}".format(training_score))
print("Sample of Predictions:\n{}".format(sample_predictions))

score: 0.9852601548702622
Sample of Predictions:
['neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg']


In [29]:
# Show the parameter combination the grid search selected; the bare last
# expression is rendered as the cell's output.
print("Best Parameters")
grid_fit_model.best_params_

Best Parameters


{'chi_selection__score_func': <function sklearn.feature_selection.univariate_selection.chi2>,
 'scaling__with_mean': True,
 'scaling__with_std': True,
 'svc__C': 0.5}

### Cross Validation score prediction

In [31]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated scores for the grid-searched pipeline.
# NOTE(review): grid_fit_model is the GridSearchCV object itself, so each fold
# presumably reruns the whole parameter search — confirm that is intended.
cross_val_score(grid_fit_model, X=train_df, y=labels_srs, cv=3)

array([0.98517575, 0.98507387, 0.98537803])

## Boosting

In [39]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over decision trees, tuned through the same
# select -> scale -> classify pipeline layout used above.
chi_f_selection = SelectKBest(score_func=chi2, k=8)
scaling = StandardScaler()
dtc_model = DecisionTreeClassifier()
ab_dtc_model = AdaBoostClassifier(dtc_model)
pipeline_model = Pipeline(steps=[
    ('chi_selection', chi_f_selection),
    ('scaling', scaling),
    ('boost', ab_dtc_model),
])

# Grid keys follow the "<pipeline step name>__<parameter name>" convention.
# Commented a few options out in the interest of POC and time
param_grid = [
    dict(
        chi_selection__score_func=[chi2],  # , f_regression, f_classif
        scaling__with_mean=[True],  # , False
        scaling__with_std=[True],  # , False
        boost__n_estimators=[10, 50, 200],
        # Two candidate weak learners: a depth-1 stump and a deeper tree with
        # a minimum leaf size. The search swaps these in for dtc_model.
        boost__base_estimator=[
            DecisionTreeClassifier(max_depth=1, min_samples_leaf=1),
            DecisionTreeClassifier(max_depth=5, min_samples_leaf=3),
        ],
    )
]

# 3-fold search, run with up to 6 parallel jobs.
grid = GridSearchCV(estimator=pipeline_model, param_grid=param_grid, cv=3, n_jobs=6)
grid_fit_model = grid.fit(train_df, labels_srs)

# Now try it out
# NOTE(review): scored on the same rows the grid search was fitted on.
training_score = grid_fit_model.score(train_df, y=labels_srs)
sample_predictions = grid_fit_model.predict(train_df.sample(n=50))
print("score: {}".format(training_score))
print("Sample of Predictions:\n{}".format(sample_predictions))

print("Best Parameters")
grid_fit_model.best_params_

score: 1.0
Sample of Predictions:
['neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg']
Best Parameters


{'boost__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=3, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'boost__n_estimators': 200,
 'chi_selection__score_func': <function sklearn.feature_selection.univariate_selection.chi2>,
 'scaling__with_mean': True,
 'scaling__with_std': True}

## Model Selection

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Candidate models to compare. Each key doubles as the name of the final
# pipeline step when the model is handed to get_score below.
classifiers = dict(
    # k-nearest neighbours with k=3
    knn3=KNeighborsClassifier(3),
    # linear-kernel SVM with C=0.025
    svc=SVC(kernel="linear", C=0.025),
    # single decision tree capped at depth 5
    tree=DecisionTreeClassifier(max_depth=5),
    # forest of 10 depth-5 trees, one candidate feature per split
    forest=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    # AdaBoost with library defaults
    boost=AdaBoostClassifier()
)

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn import svm
# Shared preprocessing steps that get_score places in front of every classifier.
chi_f_selection = SelectKBest(score_func=chi2, k=8)
scaling = StandardScaler()
# NOTE(review): svm_model is not used by get_score or by the loop below — confirm it is still needed.
svm_model = svm.SVC(kernel='linear')

def get_score(name, model):
    """Fit ``model`` behind the shared selection and scaling steps and score it.

    The pipeline is trained on the notebook-level ``train_df`` / ``labels_srs``
    and then scored on those same rows, so the value is a training score.
    The module-level ``chi_f_selection`` and ``scaling`` instances are reused
    (and therefore refitted) on every call.

    Args:
        name: Step name given to the classifier inside the pipeline.
        model: Estimator placed as the final pipeline step.

    Returns:
        The value returned by the fitted pipeline's ``score`` method on the
        training data.
    """
    steps = [
        ('chi_selection', chi_f_selection),
        ('scaling', scaling),
        (name, model),
    ]
    candidate = Pipeline(steps)
    candidate.fit(train_df, labels_srs)
    return candidate.score(train_df, y=labels_srs)

# Score every candidate classifier, then print one line per model.
results = {}
for name, model in classifiers.items():
    results[name] = get_score(name, model)

for name, score in results.items():
    print("{name}: score={score}".format(name=name, score=score))

forest: score=0.9868564053797039
svc: score=0.9852261920934655
boost: score=0.9847337318299144
knn3: score=0.9925281891047412
tree: score=0.9874337725852466
