In [1]:
import numpy as np 
import pandas as pd 
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
# Read the toy Titanic CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='titanic_toy.csv')
df.head(n=5)

Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [4]:
# Print each column's dtype and non-null count to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Fare      846 non-null    float64
 2   Family    891 non-null    int64  
 3   Survived  891 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [5]:
# Fraction of missing values per column (isna is the same check as isnull).
df.isna().mean()

Age         0.198653
Fare        0.050505
Family      0.000000
Survived    0.000000
dtype: float64

In [6]:
# Separate the target column from the feature columns.
y = df['Survived']
x = df.drop(labels=['Survived'], axis='columns')

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the four splits, in the order (x_train, x_test, y_train, y_test).
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((712, 3), (179, 3), (712,), (179,))

In [9]:
# Fraction of missing values per feature in the training split only.
x_train.isna().mean()

Age       0.196629
Fare      0.050562
Family    0.000000
dtype: float64

In [10]:
# One imputer per strategy: imputer1 fills with the median, imputer2 with the mean.
imputer1, imputer2 = (
    SimpleImputer(strategy='median'),
    SimpleImputer(strategy='mean'),
)

In [11]:
# Route each column to its imputer: median for Age, mean for Fare.
# Columns that are not listed are passed through instead of being dropped.
trf = ColumnTransformer(
    transformers=[
        ('median_imputer', imputer1, ['Age']),
        ('mean_imputer', imputer2, ['Fare']),
    ],
    remainder='passthrough',
)

In [12]:
# Learn the imputation statistics from the training split only.
trf.fit(X=x_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
# Statistic the fitted median imputer learned for Age from the training split.
trf.named_transformers_['median_imputer'].statistics_

array([28.])

In [16]:
# Statistic the fitted mean imputer learned for Fare from the training split.
trf.named_transformers_['mean_imputer'].statistics_

array([32.51778772])

In [17]:
# Apply the already-fitted transformer to both splits (no re-fitting here).
# NOTE(review): this overwrites the split DataFrames with the transformer's
# output, so the cell is not safe to run twice -- trf selects 'Age'/'Fare' by
# name, which presumably requires DataFrame input (confirm). Writing to new
# names, and updating the cells below to match, would avoid this.
x_train, x_test = trf.transform(x_train), trf.transform(x_test)

In [19]:
# Wrap the transformed training data in a DataFrame for display. The column
# names are supplied by hand and must follow the transformer's output order
# (listed transformers first, passed-through remainder last).
pd.DataFrame(data=x_train, columns=['Age', 'Fare', 'Family'])

Unnamed: 0,Age,Fare,Family
0,45.5,28.500000,0.0
1,23.0,13.000000,0.0
2,32.0,7.925000,0.0
3,26.0,7.854200,1.0
4,6.0,31.275000,6.0
...,...,...,...
707,21.0,7.650000,0.0
708,28.0,31.000000,0.0
709,41.0,32.517788,2.0
710,14.0,120.000000,3.0


In [21]:
# Re-check the per-column missing fraction on the transformed training data.
pd.DataFrame(data=x_train, columns=['Age', 'Fare', 'Family']).isna().mean()

Age       0.0
Fare      0.0
Family    0.0
dtype: float64