In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

import sklearn
sklearn.set_config(display='diagram')

In [2]:
df = pd.read_csv('../data/titanic.csv',usecols=['Survived','Age','Fare','Sex','Pclass'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [3]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [8]:
col_transf = make_column_transformer(
    (KNNImputer(),['Age']),
    (OrdinalEncoder(),['Sex']),
    remainder='passthrough'
)

col_transf

In [9]:
col_transf.fit(df)

In [11]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [10]:
col_transf.transform(df)

array([[22.        ,  1.        ,  0.        ,  3.        ,  7.25      ],
       [38.        ,  0.        ,  1.        ,  1.        , 71.2833    ],
       [26.        ,  0.        ,  1.        ,  3.        ,  7.925     ],
       ...,
       [29.69911765,  0.        ,  0.        ,  3.        , 23.45      ],
       [26.        ,  1.        ,  1.        ,  1.        , 30.        ],
       [32.        ,  1.        ,  0.        ,  3.        ,  7.75      ]])

In [13]:
pipe = make_pipeline(col_transf,DecisionTreeClassifier(criterion='entropy'))
pipe

In [14]:
x = df.drop('Survived',axis=1)
y = df.Survived

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [18]:
print(x_train.shape,y_train.shape)

(712, 4) (712,)


In [19]:
pipe.fit(x_train,y_train)

In [20]:
y_pred = pipe.predict(x_test)
y_pred

array([0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0])

In [22]:
y_test.values

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1])

In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       105
           1       0.72      0.69      0.70        74

    accuracy                           0.76       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.76      0.76       179



In [25]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('knnimputer', KNNImputer(), ['Age']),
                                   ('ordinalencoder', OrdinalEncoder(), ['Sex'])])),
  ('decisiontreeclassifier', DecisionTreeClassifier(criterion='entropy'))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('knnimputer', KNNImputer(), ['Age']),
                                 ('ordinalencoder', OrdinalEncoder(), ['Sex'])]),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy'),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'passthrough',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('knnimputer', KNNImputer(), ['Age']),
  ('ordinalencoder', OrdinalEncoder(), ['Sex'])],
 'columntransformer__verbose': False,
 'columntransform