In [1]:
##### Pipelines w/ Column Transformers, and Cross-Validation

In [2]:
# https://scikit-learn.org/stable/modules/cross_validation.html

# In the basic approach, called k-fold CV, the training set is split into k smaller sets.
# The following procedure is followed for each of the k “folds”:
#  - A model is trained using k-1 of the folds as training data;
#  - The resulting model is validated on the remaining part of the data 
#     (i.e., it is used as a test set to compute a performance measure such as accuracy).
# The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. 
# This approach can be computationally expensive, but does not waste too much data 
# (as is the case when fixing an arbitrary validation set), which is a major advantage in problems where the 
# number of samples is very small.

In [3]:
import numpy as np
import pandas as pd

In [4]:
# load the Titanic training CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='data/kaggleTitanic/train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# transform 'Cabin' column to 'Deck'
# - 'Deck' is the first character of the cabin string
# - the vectorized .str accessor leaves missing cabins as NaN, so no per-row
#   lambda / explicit NaN check is needed
df['Deck'] = df['Cabin'].str[0]

# preview a random 1% of the rows; the seed is fixed so the same rows are
# shown on every Restart & Run All
df.sample(frac=0.01, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
248,249,1,1,"Beckwith, Mr. Richard Leonard",male,37.0,1,1,11751,52.5542,D35,S,D
121,122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S,
702,703,0,3,"Barbara, Miss. Saiide",female,18.0,0,1,2691,14.4542,,C,
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S,
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q,
607,608,1,1,"Daniel, Mr. Robert Williams",male,27.0,0,0,113804,30.5,,S,
455,456,1,3,"Jalsevac, Mr. Ivan",male,29.0,0,0,349240,7.8958,,C,
593,594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q,
131,132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20.0,0,0,SOTON/O.Q. 3101307,7.05,,S,


In [6]:
from sklearn.model_selection import train_test_split

# the target is the 'Survived' column; every other column stays in the feature frame
y = df['Survived']
X = df.drop(columns=['Survived'])

# hold out 20% of the rows for testing (random_state fixed for a repeatable split)
# - each piece is copied explicitly to deal with SettingWithCopyWarning and to
#   ensure we are working with a copy of the data and not a view
Xtrain, Xtest, ytrain, ytest = (
    piece.copy()
    for piece in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [7]:
# preprocessing pipeline for the numeric columns
# - single step 'si': replace missing values (NaN) with the column median
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numeric_features = ['Age']
numeric_transformer = Pipeline(
    [
        ('si', SimpleImputer(strategy='median', missing_values=np.nan)),
    ]
)

In [8]:
# set up preprocessing pipeline for categorical data
# - impute missing values with constant 'X'
# - one-hot-encode imputed categorical values (dense integer output;
#   categories unseen during fit are ignored at transform time)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
categorical_features = ['Pclass', 'Sex', 'Deck']

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, where passing it raises TypeError.
# Try the current name first and fall back to the old one so the cell runs
# on both old and new releases.
ohe_options = dict(dtype=int, handle_unknown='ignore')
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, **ohe_options)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, **ohe_options)

categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', one_hot_encoder)])

In [9]:
# column transformer that routes each feature group to its own pipeline
# - 'num': numeric_features     -> numeric_transformer
# - 'cat': categorical_features -> categorical_transformer
# - columns not listed in either group are dropped from the output
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [10]:
# full estimator: preprocessing ('pp') followed by logistic regression ('lr')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

clf = Pipeline(
    [
        ('pp', preprocessor),
        ('lr', LogisticRegression(solver='liblinear')),
    ]
)

In [11]:
# 5-fold cross-validation of the whole pipeline on the training split
# - only validation-fold scores are computed (train scores are switched off)
# - the cell displays the mean of the five validation scores
from sklearn.model_selection import cross_validate

scores = cross_validate(
    estimator=clf,
    X=Xtrain,
    y=ytrain,
    cv=5,
    return_train_score=False,
)
np.mean(scores['test_score'])

0.8020353136025896