In [54]:
import numpy as np
import pandas as pd
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [55]:
# Load only the three columns this notebook works with.
# NOTE(review): the path is absolute and the file name is spelled 'tittanic' —
# confirm both match the file that is actually on disk.
df = pd.read_csv('/content/tittanic_train.csv', usecols=['Age', 'Fare', 'Survived'])

In [56]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [57]:
# Separate the target column from the predictor columns (Age, Fare).
y = df['Survived']
x = df.drop(columns=['Survived'])

In [58]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [59]:
# Preview the training features; the non-sequential index shows the rows were shuffled by the split.
x_train.head()

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


In [60]:
# Replace missing values with the column mean (SimpleImputer's default strategy, spelled out here).
# The imputer learns its statistics from the training split only and is then reused on the test split.
si = SimpleImputer(strategy='mean')
x_train_trf = si.fit_transform(x_train)
x_test_trf = si.transform(x_test)

In [61]:
# Inspect the imputed training data; the imputer returns a plain NumPy array, so column names are gone.
x_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [62]:
# Baseline model: logistic regression on the mean-imputed Age/Fare features,
# without any missing-value flag.
# LogisticRegression and accuracy_score come from the notebook's top import cell.
clf = LogisticRegression()
clf.fit(x_train_trf, y_train)
y_pred = clf.predict(x_test_trf)

# Left as the last expression so the notebook displays the test-set accuracy.
accuracy_score(y_test, y_pred)

0.6145251396648045

In [63]:
# Learn from the training split which columns contain missing values.
# features='missing-only' is MissingIndicator's default, written out for clarity.
mi = MissingIndicator(features='missing-only')
mi.fit(x_train)

In [64]:
# Column indices the fitted indicator produces flags for (index 0 is Age in x_train's column order).
mi.features_

array([0])

In [65]:
# Boolean mask marking missing values in the training split, one column per entry in mi.features_.
x_train_missing = mi.transform(x_train)

In [66]:
# Display the mask.
# NOTE(review): this prints one row per sample; a slice or x_train_missing.sum() would be easier to read.
x_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [67]:
# Apply the indicator fitted on the training split to the test split (no refitting on test data).
x_test_missing = mi.transform(x_test)


In [68]:
# Display the test-split mask.
# NOTE(review): this prints one row per sample; a slice or x_test_missing.sum() would be easier to read.
x_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [69]:
# Attach the missing-Age flag as a new column. assign() returns a new DataFrame, so the
# frame produced by train_test_split is not modified in place (in-place assignment on
# that frame can trigger pandas' SettingWithCopyWarning).
x_train = x_train.assign(Age_NA=x_train_missing)

In [70]:
# Attach the missing-Age flag to the test features. assign() returns a new DataFrame, so the
# frame produced by train_test_split is not modified in place (in-place assignment on
# that frame can trigger pandas' SettingWithCopyWarning).
x_test = x_test.assign(Age_NA=x_test_missing)

In [71]:
# Spot-check five random training rows with the new Age_NA column.
# No random_state is passed, so the rows shown differ on every run.
x_train.sample(5)

Unnamed: 0,Age,Fare,Age_NA
624,21.0,16.1,False
242,29.0,10.5,False
103,33.0,8.6542,False
554,22.0,7.775,False
855,18.0,9.35,False


In [72]:
# Spot-check five random test rows with the new Age_NA column.
# No random_state is passed, so the rows shown differ on every run.
x_test.sample(5)

Unnamed: 0,Age,Fare,Age_NA
663,36.0,7.4958,False
712,48.0,52.0,False
84,17.0,10.5,False
423,28.0,14.4,False
7,2.0,21.075,False


In [73]:
# Mean-impute again, this time on the frames that carry the Age_NA flag column.
# A fresh imputer is fitted on the training split and reused, unchanged, on the test split.
si = SimpleImputer(strategy='mean')
x_train_trf2 = si.fit_transform(x_train)
x_test_trf2 = si.transform(x_test)

In [74]:
# Same model as the baseline, now trained on the imputed features plus the Age_NA flag.
# LogisticRegression and accuracy_score are already imported earlier in the notebook,
# so the repeated imports are dropped.
clf = LogisticRegression()
clf.fit(x_train_trf2, y_train)
y_pred2 = clf.predict(x_test_trf2)

# Left as the last expression so the notebook displays the test-set accuracy.
accuracy_score(y_test, y_pred2)

0.6312849162011173

Another way to get the same result, without using `MissingIndicator` explicitly, is to pass `add_indicator=True` to `SimpleImputer`, as shown below.

In [48]:
# Spot-check five random rows of the raw Age/Fare features (rows with a missing Age may appear).
# The columns are selected explicitly because earlier cells add an Age_NA column to x_train;
# without the selection a top-to-bottom run would not match the two-column view intended here.
x_train[['Age', 'Fare']].sample(5)

Unnamed: 0,Age,Fare
422,29.0,7.875
715,19.0,7.65
223,,7.8958
17,,13.0
388,,7.7292


In [49]:
# add_indicator=True makes the imputer append missing-value indicator column(s)
# to the imputed output, so a separate MissingIndicator step is not needed.
si = SimpleImputer(add_indicator=True)

In [50]:
# Fit on the original Age/Fare columns only. Earlier cells add an Age_NA column to
# x_train/x_test; feeding that through add_indicator=True would produce a redundant
# fourth column instead of the three-column array shown in the recorded output.
# NOTE: this rebinds x_train/x_test to NumPy arrays, so the cell cannot be re-run
# without re-creating the train/test split first.
feature_cols = ['Age', 'Fare']
x_train = si.fit_transform(x_train[feature_cols])
x_test = si.transform(x_test[feature_cols])

In [52]:
# Inspect the result; in the recorded output the columns are imputed Age, Fare,
# and a trailing 0/1 missing-value indicator.
x_train

array([[ 40.        ,  27.7208    ,   0.        ],
       [  4.        ,  16.7       ,   0.        ],
       [ 47.        ,   9.        ,   0.        ],
       ...,
       [ 71.        ,  49.5042    ,   0.        ],
       [ 29.78590426, 221.7792    ,   1.        ],
       [ 29.78590426,  25.925     ,   1.        ]])

In [53]:
# Train and score the same model on the output of SimpleImputer(add_indicator=True).
# LogisticRegression and accuracy_score are already imported earlier in the notebook,
# so the repeated imports are dropped.
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred3 = clf.predict(x_test)

# Left as the last expression so the notebook displays the test-set accuracy.
accuracy_score(y_test, y_pred3)

0.6312849162011173