# Titanic classification with Scikit-learn pipeline

There will not be many explanations about the dataset or the preprocessing, feature engineering or models. This notebook will try to show how to encode everything into a sklearn pipeline and how the new version (sklearn==1.2.0) can keep the column names of a DataFrame through all the steps.

In [30]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Read input data

Read the csv file with all rows.

In [31]:
df = pd.read_csv('data/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Check the null values.

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Feature engineering

We will discard the `Cabin` column beacuse it has too many null values. We will extract the `Treatment` of the person with the `Name` column, and create a new binary column `AgeEstimated`.

In [33]:
def feature_engineering(df):
    return (
            df
            .assign(
                raw_treat = df.Name.str.extract(r'(\s\w+\.)'),
                Treatment = lambda x: x.raw_treat.str.lstrip().str.replace('.', '', regex=False),
                AgeEstimated = df.Age.astype(str).str.contains('\.5').astype(int),
                Sex = df.Sex.replace({'male': 0, 'female': 1}) # done here because LabelBinarizer does not work with 'set_output'
            )
            .drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin', 'raw_treat'])
        )

feature_engineering(df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Treatment,AgeEstimated
0,0,3,0,22.0,1,0,7.2500,S,Mr,0
1,1,1,1,38.0,1,0,71.2833,C,Mrs,0
2,1,3,1,26.0,0,0,7.9250,S,Miss,0
3,1,1,1,35.0,1,0,53.1000,S,Mrs,0
4,0,3,0,35.0,0,0,8.0500,S,Mr,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,S,Rev,0
887,1,1,1,19.0,0,0,30.0000,S,Miss,0
888,0,3,1,,1,2,23.4500,S,Miss,0
889,1,1,0,26.0,0,0,30.0000,C,Mr,0


And now encode this with a `FunctionTransformer`:

In [34]:
fe_eng = FunctionTransformer(feature_engineering)

fe_eng.fit_transform(df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Treatment,AgeEstimated
0,0,3,0,22.0,1,0,7.2500,S,Mr,0
1,1,1,1,38.0,1,0,71.2833,C,Mrs,0
2,1,3,1,26.0,0,0,7.9250,S,Miss,0
3,1,1,1,35.0,1,0,53.1000,S,Mrs,0
4,0,3,0,35.0,0,0,8.0500,S,Mr,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,S,Rev,0
887,1,1,1,19.0,0,0,30.0000,S,Miss,0
888,0,3,1,,1,2,23.4500,S,Miss,0
889,1,1,0,26.0,0,0,30.0000,C,Mr,0


## Imputer

Impute missing values with different strategies.

In [35]:
imputer = ColumnTransformer(
    [
        (
            'label_imputer',
            SimpleImputer(strategy='most_frequent'),
            ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Treatment', 'AgeEstimated']
        ),
        (
            'num_imputer',
            SimpleImputer(strategy='mean'),
            ['Age', 'Fare']
        )
    ],
    verbose_feature_names_out=False,
    remainder='drop'
).set_output(transform='pandas')

# concatenate both steps
imputer.fit_transform(fe_eng.fit_transform(df)).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pclass        891 non-null    object 
 1   Sex           891 non-null    object 
 2   SibSp         891 non-null    object 
 3   Parch         891 non-null    object 
 4   Embarked      891 non-null    object 
 5   Treatment     891 non-null    object 
 6   AgeEstimated  891 non-null    object 
 7   Age           891 non-null    float64
 8   Fare          891 non-null    float64
dtypes: float64(2), object(7)
memory usage: 62.8+ KB


## Scale and encode

Here we will scale some columns or use them in `OneHotEncoding`.

In [36]:
scale_encode = ColumnTransformer(
    [
        (
            'std_scaler',
            StandardScaler(),
            ['Age', 'Fare']
        ),
        (
            'minmax_scaler',
            MinMaxScaler(),
            ['SibSp', 'Parch']
        ),
        (
            'one_hot',
            OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'),
            ['Pclass', 'Embarked', 'Treatment']
        ),
        (
            'passthrough',
            'passthrough',
            ['Sex', 'AgeEstimated']
        )
    ],
    verbose_feature_names_out=False
).set_output(transform='pandas')

# concatenate three steps
scale_encode.fit_transform(imputer.fit_transform(fe_eng.fit_transform(df)))

Unnamed: 0,Age,Fare,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,...,Treatment_Miss,Treatment_Mlle,Treatment_Mme,Treatment_Mr,Treatment_Mrs,Treatment_Ms,Treatment_Rev,Treatment_Sir,Sex,AgeEstimated
0,-0.592481,-0.502445,0.125,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
1,0.638789,0.786845,0.125,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
2,-0.284663,-0.488854,0.000,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
3,0.407926,0.420730,0.125,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
4,0.407926,-0.486337,0.000,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.207709,-0.386671,0.000,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
887,-0.823344,-0.044381,0.000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
888,0.000000,-0.176263,0.125,0.333333,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
889,-0.284663,-0.044381,0.000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


## Preprocessing pipeline

Now we will combine all this transformations into one preprocessing pipeline.

In [37]:
preproc = Pipeline(
    [
        ('fe_eng', fe_eng),
        ('imputer', imputer),
        ('scale_encode', scale_encode)
    ]
).set_output(transform='pandas')

preproc.fit_transform(df)



Unnamed: 0,Age,Fare,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,...,Treatment_Miss,Treatment_Mlle,Treatment_Mme,Treatment_Mr,Treatment_Mrs,Treatment_Ms,Treatment_Rev,Treatment_Sir,Sex,AgeEstimated
0,-0.592481,-0.502445,0.125,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
1,0.638789,0.786845,0.125,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
2,-0.284663,-0.488854,0.000,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
3,0.407926,0.420730,0.125,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
4,0.407926,-0.486337,0.000,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.207709,-0.386671,0.000,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
887,-0.823344,-0.044381,0.000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
888,0.000000,-0.176263,0.125,0.333333,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
889,-0.284663,-0.044381,0.000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


It looks like it works! You can see all the steps:

In [38]:
preproc

## Model

Just for checking that everything works we will fit a simple `LogisticRegression` model:

In [39]:
model = Pipeline(
    [
        ('preproc', preproc),
        ('cla', LogisticRegression())
    ]
).set_output(transform='pandas')

model



In [40]:
model.fit(df, df.Survived)
model.score(df, df.Survived)

0.8361391694725028

## GridSearchCV

Now that we have a complete pipeline we can fit many models with cross-validation and get the best model with the best parameters. First we need a parameter list:

In [41]:
param_grid = [
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (LogisticRegression(),),
        'cla__C': [0.5, 1.0, 5.0],
        'cla__max_iter': [1000],
        'cla__class_weight': [None, 'balanced']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (KNeighborsClassifier(),),
        'cla__n_neighbors': [3, 5, 7],
        'cla__weights': ['uniform', 'distance']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (MLPClassifier(),),
        'cla__hidden_layer_sizes': [(20,), (25,), (30,)],
        'cla__activation': ['logistic', 'relu'],
        'cla__max_iter': [1500]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (DecisionTreeClassifier(),),
        'cla__criterion': ['gini', 'entropy'],
        'cla__max_depth': [5, 8, 10]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (RandomForestClassifier(),),
        'cla__n_estimators': [50, 100, 150],
        'cla__max_depth': [5, 8, 10]
    }
]   


gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1
)

Fit all models.

In [42]:
gs.fit(df, df.Survived)

Fitting 5 folds for each of 132 candidates, totalling 660 fits


Get results as pandas DataFrame.

In [43]:
result = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').reset_index(drop=True)

result 

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cla,param_cla__C,param_cla__class_weight,param_cla__max_iter,param_preproc__imputer__num_imputer__strategy,param_preproc__scale_encode__minmax_scaler__feature_range,...,param_cla__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.244059,0.023466,0.034375,0.002148,RandomForestClassifier(max_depth=8),,,,median,"(0, 1)",...,100,"{'cla': RandomForestClassifier(max_depth=8), '...",0.861199,0.814104,0.899131,0.874532,0.917165,0.873226,0.035337,1
1,0.228174,0.018472,0.032862,0.001750,RandomForestClassifier(max_depth=8),,,,mean,"(-1, 1)",...,100,"{'cla': RandomForestClassifier(max_depth=8), '...",0.851515,0.817981,0.913369,0.867647,0.913509,0.872804,0.036845,2
2,0.205362,0.006718,0.032877,0.002202,RandomForestClassifier(max_depth=8),,,,mean,"(0, 1)",...,100,"{'cla': RandomForestClassifier(max_depth=8), '...",0.862912,0.811965,0.896257,0.872861,0.920024,0.872804,0.036274,3
3,0.139501,0.016900,0.028059,0.002819,RandomForestClassifier(max_depth=8),,,,mean,"(0, 1)",...,50,"{'cla': RandomForestClassifier(max_depth=8), '...",0.865679,0.822126,0.884225,0.869652,0.921420,0.872620,0.032009,4
4,0.329034,0.015588,0.041890,0.004370,RandomForestClassifier(max_depth=8),,,,mean,"(0, 1)",...,150,"{'cla': RandomForestClassifier(max_depth=8), '...",0.860870,0.814171,0.904412,0.864706,0.917431,0.872318,0.036429,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,0.039572,0.003034,0.020185,0.000716,DecisionTreeClassifier(),,,,median,"(-1, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.766996,0.764505,0.869118,0.723930,0.840779,0.793066,0.053545,128
128,0.039318,0.003723,0.022580,0.002854,DecisionTreeClassifier(),,,,median,"(-1, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.785178,0.778209,0.858021,0.711698,0.771108,0.780843,0.046601,129
129,0.038263,0.003016,0.022315,0.002417,DecisionTreeClassifier(),,,,median,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.787088,0.769652,0.849799,0.700802,0.767451,0.774959,0.047607,130
130,0.035369,0.001717,0.020845,0.001862,DecisionTreeClassifier(),,,,mean,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.764559,0.784024,0.850134,0.708824,0.759407,0.773389,0.045706,131


Check the best params

In [44]:
result.iloc[0]

mean_fit_time                                                                                         0.244059
std_fit_time                                                                                          0.023466
mean_score_time                                                                                       0.034375
std_score_time                                                                                        0.002148
param_cla                                                                  RandomForestClassifier(max_depth=8)
param_cla__C                                                                                               NaN
param_cla__class_weight                                                                                    NaN
param_cla__max_iter                                                                                        NaN
param_preproc__imputer__num_imputer__strategy                                                           median
p

And the best model:

In [45]:
gs.best_estimator_

## Inference

Predict in the test set.

In [46]:
gs.best_estimator_.predict(pd.read_csv('data/test.csv'))

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,