In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Read the titanic_toy CSV into the working DataFrame.
csv_path = 'datasets/titanic_toy.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Fraction of missing values in each column (isna is an alias of isnull).
df.isna().mean()

In [None]:
# Arbitrary-value imputation demo: fill missing Age / Fare values with fixed
# constants (99 and -1 for Age, 999 and -1 for Fare).
#
# The cells below read these columns from X_train, so the train/test split has
# to exist before the imputed columns are created.
TARGET_COL = "Survived"  # assumes the label column is named 'Survived' — TODO confirm

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Same split parameters as the scikit-learn section further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# .assign returns a new frame instead of writing into the split result, which
# avoids the SettingWithCopyWarning that direct column assignment can raise.
X_train = X_train.assign(
    Age_99=X_train["Age"].fillna(99),
    Age_minus1=X_train["Age"].fillna(-1),
    Fare_999=X_train["Fare"].fillna(999),
    Fare_minus1=X_train["Fare"].fillna(-1),
)

In [None]:
# Print the variance of each original column next to its imputed variants so
# the effect of the constant fill values can be compared.
variance_report = [
    ('Original Age variable variance: ', "Age"),
    ('Age variance after 99 imputation: ', "Age_99"),
    ('Age variance after -1 imputation: ', "Age_minus1"),
    ('Original Fare variable variance: ', "Fare"),
    ('Fare variance after 999 imputation: ', "Fare_999"),
    ('Fare variance after -1 imputation: ', "Fare_minus1"),
]

for caption, column in variance_report:
    print(caption, X_train[column].var())

In [None]:
# Compare the Age density before and after arbitrary-value imputation.
fig, ax = plt.subplots()

age_curves = {
    "Age": "blue",          # original values
    "Age_99": "red",        # missing values replaced with 99
    "Age_minus1": "green",  # missing values replaced with -1
}
for column, color in age_curves.items():
    X_train[column].plot(kind='kde', ax=ax, color=color)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("Age: original vs. arbitrary-value imputation")
ax.set_xlabel("Age")

# Legend entries are taken from the handles/labels already attached to the axes;
# the trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(loc='best');

In [None]:
# Compare the Fare density before and after arbitrary-value imputation.
fig, ax = plt.subplots()

fare_curves = {
    "Fare": "blue",          # original values
    "Fare_999": "red",       # missing values replaced with 999
    "Fare_minus1": "green",  # missing values replaced with -1
}
for column, color in fare_curves.items():
    X_train[column].plot(kind='kde', ax=ax, color=color)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("Fare: original vs. arbitrary-value imputation")
ax.set_xlabel("Fare")

# Legend entries are taken from the handles/labels already attached to the axes;
# the trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(loc='best');

In [None]:
# Pairwise covariance between the columns of X_train
# (original and imputed variants side by side).
covariance_matrix = X_train.cov()
covariance_matrix

In [None]:
# Pairwise correlation between the columns of X_train
# (original and imputed variants side by side).
correlation_matrix = X_train.corr()
correlation_matrix

## Using Sklearn

In [None]:
# Rebuild the feature matrix and target from the raw frame so this section
# starts from a clean split and does not rely on variables left over from
# earlier cells.
TARGET_COL = "Survived"  # assumes the label column is named 'Survived' — TODO confirm
DEMO_COLS = ["Age_99", "Age_minus1", "Fare_999", "Fare_minus1"]

# errors="ignore": the demo columns exist on df only if an earlier cell added them.
X = df.drop(columns=[TARGET_COL, *DEMO_COLS], errors="ignore")
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
# One constant-strategy imputer per fill value: 99 and 999
# (they are attached to Age and Fare in the ColumnTransformer below).
imputer_99, imputer_999 = (
    SimpleImputer(strategy='constant', fill_value=fill)
    for fill in (99, 999)
)

In [None]:
# Route Age and Fare to their own constant-fill imputers; every other column
# is handed through via remainder='passthrough'.
age_step = ('imputer_99', imputer_99, ['Age'])
fare_step = ('imputer_999', imputer_999, ['Fare'])

trf = ColumnTransformer(
    transformers=[age_step, fare_step],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on the training split only.
trf.fit(X=X_train)

In [None]:
# Inspect statistics_ of the fitted 'imputer_99' step (the Age imputer).
fitted_imputer_99 = trf.named_transformers_['imputer_99']
fitted_imputer_99.statistics_

In [None]:
# Inspect statistics_ of the fitted 'imputer_999' step (the Fare imputer).
fitted_imputer_999 = trf.named_transformers_['imputer_999']
fitted_imputer_999.statistics_

In [None]:
# Apply the fitted transformer to both splits. X_train / X_test are rebound to
# the transformer output, so they no longer refer to the original split frames.
X_train, X_test = (trf.transform(split) for split in (X_train, X_test))

In [None]:
# Show only the first rows of the transformed training data instead of dumping
# the whole matrix (row slicing works for both NumPy arrays and DataFrames).
X_train[:5]