In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Read the toy Titanic dataset into a DataFrame; the path is relative to the
# directory the notebook is run from.
DATA_PATH = 'datasets/titanic_toy.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw data (head() defaults to 5 rows) to see
# which columns are available and what their values look like.
df.head()

In [None]:
# Print the column summary (dtype and non-null count per column); columns whose
# non-null count is below the row count contain missing values.
df.info()

In [None]:
# Share of missing values per column: the mean of the boolean null mask is the
# fraction of rows that are null.
missing_share = df.isnull().mean()
missing_share

In [None]:
# Separate the label from the predictors: y is the "Survived" column and X is
# every other column of df.
target_column = "Survived"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Show the (rows, columns) shape of each split as a sanity check on the
# train/test partition created above.
X_train.shape, X_test.shape

In [None]:
# Share of missing values per column within the training split only; these are
# the gaps the imputation below has to fill.
train_missing_share = X_train.isnull().mean()
train_missing_share

In [None]:
# Imputation statistics are computed from the training split only.
# mean()/median() skip NaN by default, so they describe the observed values.
age_train = X_train["Age"]
fare_train = X_train["Fare"]

mean_age, median_age = age_train.mean(), age_train.median()
mean_fare, median_fare = fare_train.mean(), fare_train.median()

In [None]:
# Add mean- and median-imputed copies of Age and Fare next to the originals so
# the effect of each strategy can be compared column by column.
#
# assign() returns a new DataFrame instead of writing into the object returned
# by train_test_split. Column assignment on that object can raise
# SettingWithCopyWarning, and rebinding X_train keeps this cell safe to re-run
# (the imputed columns are simply recomputed from the untouched originals).
X_train = X_train.assign(
    Age_median=X_train['Age'].fillna(median_age),
    Age_mean=X_train['Age'].fillna(mean_age),
    Fare_median=X_train['Fare'].fillna(median_fare),
    Fare_mean=X_train['Fare'].fillna(mean_fare),
)

In [None]:
# Spot-check five random training rows to confirm the imputed columns were
# filled. The seed (same value as the train/test split) makes the displayed
# rows identical on every Restart & Run All.
X_train.sample(5, random_state=2)

In [None]:
# Compare the variance of each feature before and after imputation. var()
# ignores NaN, so the "Original" figure is the variance of the observed values.
for feature in ("Age", "Fare"):
    print(f'Original {feature} variable variance: ', X_train[feature].var())
    print(f'{feature} variance after median imputation: ', X_train[f"{feature}_median"].var())
    print(f'{feature} variance after mean imputation: ', X_train[f"{feature}_mean"].var())

In [None]:
# Overlay the kernel density estimates of Age before and after imputation so
# any distortion of the distribution is visible at a glance.
fig, ax = plt.subplots()

# (column, line colour) pairs, drawn in order: original, median-imputed,
# mean-imputed. Each line's legend label is the column name.
for column, colour in (("Age", "blue"), ("Age_median", "red"), ("Age_mean", "green")):
    X_train[column].plot(kind='kde', ax=ax, color=colour)

# Title and x-axis label so the figure stands alone when the notebook is skimmed.
ax.set_title("Age: density before vs. after imputation")
ax.set_xlabel("Age")

# add legends (trailing ';' suppresses the Legend repr in the cell output)
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best');

In [None]:
# Overlay the kernel density estimates of Fare before and after imputation so
# any distortion of the distribution is visible at a glance.
fig, ax = plt.subplots()

# (column, line colour) pairs, drawn in order: original, median-imputed,
# mean-imputed. Each line's legend label is the column name.
for column, colour in (("Fare", "blue"), ("Fare_median", "red"), ("Fare_mean", "green")):
    X_train[column].plot(kind='kde', ax=ax, color=colour)

# Title and x-axis label so the figure stands alone when the notebook is skimmed.
ax.set_title("Fare: density before vs. after imputation")
ax.set_xlabel("Fare")

# add legends (trailing ';' suppresses the Legend repr in the cell output)
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best');

In [None]:
# Pairwise covariance between the columns of X_train, which at this point holds
# both the original and the imputed Age/Fare columns. Compare each imputed
# column's row with its original to see how imputation changed its relationship
# with the other variables.
X_train.cov()

In [None]:
# Pairwise correlation between the columns of X_train (original and imputed
# Age/Fare columns included). Being scale-free, it is easier to compare across
# variables than the covariance when judging the effect of imputation.
X_train.corr()