In [1]:
##### Pipelines w/ Column Transformers

In [2]:
# https://scikit-learn.org/stable/modules/compose.html#pipeline
#
# Pipeline can be used to chain multiple estimators into one. 
# This is useful as there is often a fixed sequence of steps in processing the data, 
# for example feature selection, normalization and classification. 
#
# Pipeline serves multiple purposes here:
# - Convenience and encapsulation
#    You only have to call fit and predict once on your data to fit a whole sequence of estimators.
# - Joint parameter selection
#   You can grid search over parameters of all estimators in the pipeline at once.
# - Safety
#   Pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, 
#   by ensuring that the same samples are used to train the transformers and predictors.
#
# All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). 
# The last estimator may be any type (transformer, classifier, etc.).
#
# The Pipeline is built using a list of (key, value) pairs, 
# where the key is a string containing the name you want to give this step and value is an estimator object.

In [3]:
# https://scikit-learn.org/stable/modules/compose.html#column-transformer
# 
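# Warning: the compose.ColumnTransformer class is experimental and the API is subject to change.
# (Quoted from the scikit-learn documentation at the time this notebook was written; check the current docs for its status.)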
#
# Many datasets contain features of different types, say text, floats, and dates, where each type of feature requires 
# separate preprocessing or feature extraction steps. Often it is easiest to preprocess data before applying scikit-learn 
# methods, for example using pandas. Processing your data before passing it to scikit-learn might be problematic for 
# one of the following reasons:
# - Incorporating statistics from test data into the preprocessors makes cross-validation scores unreliable 
#   (known as data leakage), for example in the case of scalers or imputing missing values.
# - You may want to include the parameters of the preprocessors in a parameter search.
#
# The ColumnTransformer helps perform different transformations for different columns of the data, 
# within a Pipeline that is safe from data leakage and that can be parametrized. ColumnTransformer works on arrays, 
# sparse matrices, and pandas DataFrames.
#
# To each column, a different transformation can be applied, such as preprocessing or a specific feature extraction method.

In [4]:
# code adapted from: 
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the Kaggle Titanic training set into a DataFrame and preview its first rows.
train_csv = 'data/kaggleTitanic/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Derive 'Deck' from 'Cabin': the deck is the first character of the cabin code (e.g. 'C85' -> 'C').
# The vectorised .str accessor leaves missing cabins as NaN, so no per-row null check is needed,
# and an empty cabin string yields NaN instead of raising IndexError.
df['Deck'] = df['Cabin'].str[0]

# Preview ~1% of the rows; the fixed seed keeps the preview identical across Restart & Run All.
df.sample(frac=0.01, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
687,688,0,3,"Dakic, Mr. Branko",male,19.0,0,0,349228,10.1708,,S,
424,425,0,3,"Rosblom, Mr. Viktor Richard",male,18.0,1,1,370129,20.2125,,S,
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C,
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S,
567,568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,
194,195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44.0,0,0,PC 17610,27.7208,B4,C,B
848,849,0,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,S,
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,D
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S,


In [8]:
from sklearn.model_selection import train_test_split

# Target is the 'Survived' column; every other column stays available as a candidate feature.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
# Each piece is copied explicitly to avoid SettingWithCopyWarning and to make sure
# we are working with independent copies of the data rather than views.
Xtrain, Xtest, ytrain, ytest = (
    part.copy()
    for part in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [9]:
# Preprocessing for numeric columns: fill missing values with the column median.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_features = ['Age']
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
numeric_transformer = Pipeline(steps=[('si', median_imputer)])

# manual sanity check (uncomment to run):
#numeric_transformer.fit_transform(df[numeric_features])

In [10]:
# set up preprocessing pipeline for categorical data:
#   1. replace missing values with the placeholder category 'X'
#   2. one-hot encode into a dense integer array, ignoring categories unseen during fit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
categorical_features = ['Pclass', 'Sex', 'Deck']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`, and 1.4 removed
# the old name. Try the current keyword first and fall back to the legacy one so this cell runs
# on both old and new releases.
try:
    onehot = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:  # older scikit-learn that only knows `sparse`
    onehot = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', onehot)])

#categorical_transformer.fit_transform(df[categorical_features])

In [11]:
# Combine the numeric and categorical preprocessing pipelines into one ColumnTransformer.
from sklearn.compose import ColumnTransformer

# Each entry is (name, transformer, columns the transformer is applied to).
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]

# remainder='drop' (the default) discards every column not listed above.
# Use remainder='passthrough' instead to keep the unlisted columns untouched and
# have them concatenated with the transformers' output.
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

# manual checks (uncomment to run):
# preprocessor.fit_transform(df)
# preprocessor.transformers_[1][1].named_steps['ohe'].get_feature_names(categorical_features)

In [12]:
# Chain preprocessing ('pp') and the classifier ('lr') into a single estimator.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

log_reg = LogisticRegression(solver='liblinear')
clf = Pipeline(steps=[('pp', preprocessor), ('lr', log_reg)])

In [13]:
# fit combined preprocessing and model pipeline on train data
# (only Xtrain/ytrain are passed here; the held-out test split is not used for fitting)
clf.fit(Xtrain, ytrain)

# To inspect the one-hot column names after fitting; the preprocessing step is registered under the key 'pp':
# NOTE(review): recent scikit-learn releases expose this as get_feature_names_out -- confirm against the installed version.
# clf.named_steps['pp'].transformers_[1][1].named_steps['ohe'].get_feature_names(categorical_features)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                                            

In [14]:
# Score the fitted pipeline on the held-out test split.
from sklearn import metrics

ypred = clf.predict(Xtest)

# Print accuracy, then the confusion matrix, then the per-class precision/recall report.
for report in (metrics.accuracy_score,
               metrics.confusion_matrix,
               metrics.classification_report):
    print(report(ytest, ypred))

0.7821229050279329
[[87 19]
 [20 53]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       106
           1       0.74      0.73      0.73        73

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

