In [1]:
##### Pipelines w/ Column Transformers, and Grid-Search w/ Cross-Validation

In [2]:
# https://scikit-learn.org/stable/modules/grid_search.html
#
# Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as 
# arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector 
# Classifier, alpha for Lasso, etc.
#
# It is possible and recommended to search the hyper-parameter space for the best cross validation score.
# 
# Any parameter provided when constructing an estimator may be optimized in this manner. 
#
# A search consists of:
# - an estimator (regressor or classifier such as sklearn.svm.SVC());
# - a parameter space;
# - a method for searching or sampling candidates;
# - a cross-validation scheme; and
# - a score function.
#
# Some models allow for specialized, efficient parameter search strategies; see the scikit-learn guide linked above.
# Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, GridSearchCV 
# exhaustively considers all parameter combinations, while RandomizedSearchCV can sample a given number of candidates 
# from a parameter space with a specified distribution. 
#

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_csv_path = 'data/kaggleTitanic/train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Derive 'Deck' from 'Cabin': the deck is the first character of the cabin code.
# The vectorised .str accessor propagates missing cabins as NaN, so no explicit
# null check (and no Python-level lambda) is needed.
df['Deck'] = df['Cabin'].str[0]
# Preview a random 1% of the rows; the fixed seed keeps the displayed sample
# reproducible across kernel restarts.
df.sample(frac=0.01, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
598,599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C,
328,329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1,1,363291,20.525,,S,
508,509,0,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,C 4001,22.525,,S,
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,A
676,677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S,
503,504,0,3,"Laitinen, Miss. Kristina Sofia",female,37.0,0,0,4135,9.5875,,S,
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S,
397,398,0,2,"McKane, Mr. Peter David",male,46.0,0,0,28403,26.0,,S,
195,196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C,B


In [6]:
from sklearn.model_selection import train_test_split

# Separate the target column ('Survived') from the feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)

# Take explicit copies of every part so later edits act on independent objects
# rather than views of the original data (avoids pandas' SettingWithCopyWarning).
Xtrain, Xtest, ytrain, ytest = (part.copy() for part in split_parts)

In [7]:
# set up preprocessing pipeline for numeric data
# - impute missing values (NaN) with the column median
# The strategy is set explicitly so the pipeline matches this description when
# used on its own (SimpleImputer would otherwise default to the mean). The grid
# search further down still overrides it through 'pp__num__si__strategy'.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
numeric_features = ['Age']
numeric_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='median'))])

In [8]:
# set up preprocessing pipeline for categorical data
# - impute missing values with constant 'X'
# - one-hot-encode imputed categorical values
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
categorical_features = ['Pclass', 'Sex', 'Deck']

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the current spelling first and fall
# back to the legacy one so the notebook runs on both old and new releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn: the constructor does not accept `sparse_output`.
    one_hot_encoder = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', one_hot_encoder)])

In [9]:
# Combine the numeric and categorical preprocessing pipelines in one ColumnTransformer.
# Columns not named in either feature list are dropped (remainder='drop'), so only
# the imputed numeric and one-hot-encoded categorical features are kept.
from sklearn.compose import ColumnTransformer
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

In [10]:
# Chain the preprocessing step ('pp') and a logistic-regression classifier ('lr')
# into a single estimator.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression(solver='liblinear')
clf = Pipeline(steps=[
    ('pp', preprocessor),
    ('lr', logistic_regression),
])

In [11]:
# Configure the grid search over the `clf` pipeline.
from sklearn.model_selection import GridSearchCV
# Keys are '<step>__<param>' paths into the pipeline.
param_grid = dict(
    lr__penalty=['l1', 'l2'],                  # logistic regression regularization penalty
    pp__num__si__strategy=['median', 'mean'],  # simple imputer strategy
)
gscv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, return_train_score=False)

In [12]:
# Run the grid search on the training data, then report the results.

gscv.fit(Xtrain, ytrain)

# Print each result attribute preceded by a 100-character rule, and close with
# one final rule.
separator = "-" * 100
for result_attr in ('best_estimator_', 'best_score_', 'best_params_', 'cv_results_'):
    print(separator)
    print(getattr(gscv, result_attr), "\n")
print(separator)

----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                           

In [13]:
# Score the best pipeline found by the grid search on the held-out test data.

best_model = gscv.best_estimator_
ypred = best_model.predict(Xtest)

from sklearn import metrics
# Print accuracy, the confusion matrix and the per-class report, in that order.
for metric_fn in (metrics.accuracy_score,
                  metrics.confusion_matrix,
                  metrics.classification_report):
    print(metric_fn(ytest, ypred))

0.7821229050279329
[[87 19]
 [20 53]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       106
           1       0.74      0.73      0.73        73

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

