# Showcase of custom transformers

In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline

# Automatically reload the src module whenever its source code changes
# (%autoreload 1 only reloads modules registered with %aimport)
%load_ext autoreload
%autoreload 1
%aimport src.ImputeByGroup
from src.ImputeByGroup import ImputeNumericalByGroup, ImputeCategoricalByGroup

## Load data

In [2]:
# Read the Titanic train and test CSV fixtures (train first, then test).
X_train, X_test = (
    pd.read_csv("tests/titanic_train.csv"),
    pd.read_csv("tests/titanic_test.csv"),
)
# The target column stays inside X_train: the split below is disabled.
# y_train = X_train.pop("Survived")

In [3]:
# Count the missing values in each column of the training set.
X_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Count the missing values in each column of the test set.
X_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Imputation via groupby, as part of a pipeline

In [5]:
# Two-step imputation pipeline, fitted on the training set:
#   1. fill "Age" using groups defined by "Pclass" and "Embarked"
#   2. fill "Embarked" using groups defined by "Pclass"
# The second step is given copy=False because the data only needs to be
# copied once to protect the original frame.
# NOTE(review): presumably the first step copies by default -- confirm in
# src.ImputeByGroup.
pl = make_pipeline(
    ImputeNumericalByGroup(
        target_col="Age",
        groupby_col=["Pclass", "Embarked"],
        return_df=True,
    ),
    ImputeCategoricalByGroup(
        target_col="Embarked",
        groupby_col=["Pclass"],
        return_df=True,
        copy=False,
    ),
).fit(X_train)

In [6]:
# After the pipeline runs, Age and Embarked should have no missing values left.
train_missing = pl.transform(X_train).isna().sum()
print(train_missing, "\n")

# Same check on the test set, using the pipeline fitted on the training set.
test_missing = pl.transform(X_test).isna().sum()
print(test_missing)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64 

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
