In [1]:
##### Pipelines w/ Column Transformers, and Cross-Validation

In [2]:
# https://scikit-learn.org/stable/modules/cross_validation.html

# In the basic approach, called k-fold CV, the training set is split into k smaller sets.
# The following procedure is followed for each of the k “folds”:
#  - A model is trained using k-1 of the folds as training data;
#  - The resulting model is validated on the remaining part of the data 
#     (i.e., it is used as a test set to compute a performance measure such as accuracy).
# The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. 
# This approach can be computationally expensive, but does not waste too much data 
# (as is the case when fixing an arbitrary validation set), which is a major advantage in problems where the 
# number of samples is very small.

In [3]:
import numpy as np
import pandas as pd

In [4]:
# load the Titanic training CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='data/kaggleTitanic/train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# derive 'Deck' from 'Cabin': the deck is the first character of the cabin code
# (e.g. 'C85' -> 'C'). The vectorized .str accessor propagates missing cabins as NaN,
# so no explicit null check or per-row Python lambda is needed, and an empty string
# yields NaN instead of raising IndexError.
df['Deck'] = df['Cabin'].str[0]

# preview a random ~1% of the rows; the fixed seed keeps the preview reproducible on re-run
df.sample(frac=0.01, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S,
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S,
505,506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18.0,1,0,PC 17758,108.9,C65,C,C
827,828,1,2,"Mallet, Master. Andre",male,1.0,0,2,S.C./PARIS 2079,37.0042,,C,
378,379,0,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,,C,
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C,
484,485,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C,B
247,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S,
797,798,1,3,"Osman, Mrs. Mara",female,31.0,0,0,349244,8.6833,,S,


In [6]:
# only retain columns required for analysis: ['Survived', 'Pclass', 'Sex', 'Age', 'Deck']
# df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Deck']]
# df.sample(frac=0.01)

In [7]:
from sklearn.model_selection import train_test_split

# separate the target ('Survived') from the feature columns
X = df.drop(columns=['Survived'])
y = df['Survived']

# hold out 20% of the rows for testing; take an explicit copy of each piece so that later
# edits act on independent data rather than a view (avoids SettingWithCopyWarning)
Xtrain, Xtest, ytrain, ytest = (
    piece.copy()
    for piece in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [8]:
# preprocessing for the numeric columns: fill missing values with the column median
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_features = ['Age']
numeric_transformer = Pipeline(
    steps=[
        ('si', SimpleImputer(strategy='median', missing_values=np.nan)),
    ]
)

In [9]:
# set up preprocessing pipeline for categorical data:
#   1. replace missing values with the placeholder category 'X'
#   2. one-hot encode every category into dense integer indicator columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['Pclass', 'Sex', 'Deck']

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4, so passing `sparse=False` raises TypeError on current releases.
# Pick whichever keyword the installed version accepts so the cell runs on old and new versions.
ohe_dense_kwarg = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    # handle_unknown='ignore': categories not seen during fit do not raise at transform time
    ('ohe', OneHotEncoder(dtype=int, handle_unknown='ignore', **{ohe_dense_kwarg: False}))])

In [10]:
# combine the numeric and categorical preprocessing pipelines into one column transformer
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',  # alternative: remainder='passthrough'
)

# By default (remainder='drop'), only the columns specified in transformers are transformed and
# combined in the output; the non-specified columns are dropped. By specifying remainder='passthrough',
# all remaining columns that were not specified in transformers are automatically passed through,
# and this subset of columns is concatenated with the output of the transformers.

In [11]:
# chain the preprocessing step and the classifier into a single estimator
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline(
    steps=[
        ('pp', preprocessor),
        ('lr', LogisticRegression(solver='liblinear')),
    ]
)

In [12]:
# 5-fold cross-validation on the training split: fit and validate once per fold,
# then report the mean of the per-fold validation scores
from sklearn.model_selection import cross_validate

scores = cross_validate(estimator=clf, X=Xtrain, y=ytrain, cv=5, return_train_score=False)
np.mean(scores['test_score'])

0.8020353136025896