## Leak-tight pipelines in *sklearn*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Binarizer, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.utils.estimator_checks import check_estimator

In [2]:
sklearn.__version__  # '0.20.3'

'0.20.3'

---
Load in the well known *Titanic* data. Put a validation set aside.

In [3]:
# Read the labelled Titanic data (Kaggle's train.csv) from the local `data` directory.
data_trainval = pd.read_csv(os.path.join('data', 'train.csv'))  # the Titanic-data from Kaggle

In [4]:
# Separate the target column from the features.
y_trainval = data_trainval['Survived']
X_trainval = data_trainval.drop('Survived', axis=1)

# Put half of the rows aside for validation. The split is stratified on the
# target and uses a fixed seed, so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.5,
    stratify=y_trainval,
    random_state=42,
)

In [5]:
X_train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# X_train.dtypes
# X_train.isnull().sum()
# X_train.Pclass.value_counts(dropna=False)
# X_train.hist('Age')

---
Assume that, after looking at types, values/categories, NaNs, etc., we've decided to proceed by using the following columns and modifying them as specified:

column | preprocessing steps
--- | ---
Pclass | one-hot encode
Sex | encode to 0,1
SibSp | binarize to 0,1 (1 for values of *SibSp* > 0)
Parch | binarize to 0,1 (1 for values of *Parch* > 0)
Fare | replace missing values with the average or the median; apply $x \rightarrow \log(1+x)$; rescale and shift so that the minimum and maximum take predefined values
Embarked | replace missing values with the most frequent one; one-hot encode

and then to train the *logistic-regression*-based classifier on those preprocessed columns.

Assume also that we'd like to include hyperparameters of the *Fare*-transformer in our *grid-search* over hyperparameters of the logistic regression.

It is worth stressing that - by definition - our model may be trained on the *training dataset* only. It may extract any statistics, categories and anything else from this dataset. But when the trained model is evaluated on the validation data, it is not allowed to change its state.

Using the sklearn out-of-the-box functions and classes we can comfortably construct the specified *transform-classify* production line:

* we chain the transformers (that act on a single column in a nested way) with the *make_pipeline* function (or the *Pipeline* class),
* we glue the various pipelines that act on different columns together using *ColumnTransformer*,
* the resulting complete transformer is then chained with the classifier.

In [7]:
# ?OneHotEncoder
# ?OrdinalEncoder
# ?Binarizer
# ?FunctionTransformer
# ?MinMaxScaler
# ?SimpleImputer
# ?make_pipeline
# ?Pipeline
# ?ColumnTransformer

In [8]:
# Fare: impute -> log(1 + x) -> rescale to a fixed range.
# The steps keep the names that make_pipeline derives from the class names
# ('simpleimputer', 'functiontransformer', 'minmaxscaler').
fare_imputer = SimpleImputer(strategy='median')
fare_log = FunctionTransformer(np.log1p, validate=False)
fare_scaler = MinMaxScaler(feature_range=(0, 1))

fare_transformer = make_pipeline(fare_imputer, fare_log, fare_scaler)

In [9]:
# Each entry is a (name, transformer, columns) triple: an arbitrary name,
# the chosen transformer instance, and the list of columns it is applied to.
pclass_step = ('pclass_ohe', OneHotEncoder(handle_unknown='ignore'), ['Pclass'])
sex_step = ('sex_encode', OrdinalEncoder(categories=[['male', 'female']]), ['Sex'])
family_step = ('bin', Binarizer(threshold=0), ['SibSp', 'Parch'])
fare_step = ('fare_transform', fare_transformer, ['Fare'])

preprocessors = [pclass_step, sex_step, family_step, fare_step]

preprocessing_pipe = ColumnTransformer(transformers=preprocessors)

In [10]:
# Full model: column-wise preprocessing followed by logistic regression.
# `max_iter` is an iteration count, so it is given as an int; recent
# scikit-learn releases validate parameter types and reject the float 1e5.
classifier = Pipeline(steps=[
    ('preproc', preprocessing_pipe),
    ('clf', LogisticRegression(solver='saga', max_iter=100000))
])

In [11]:
# Fit the whole pipeline (preprocessing + classifier) on the training data,
# then score it on that same training data.
classifier.fit(X_train, y_train)
classifier.score(X_train, y_train)

0.7820224719101123

In [12]:
classifier.score(X_val, y_val)

0.7802690582959642

Btw., the names of the transformers that we were forced to come up with have their purpose. Knowing them, we can dig into the transformer. E.g. let's check how the sexes were encoded:

In [13]:
# Ask the fitted encoder which label each integer code stands for.
sex_encoder = preprocessing_pipe.named_transformers_['sex_encode']
{code: sex_encoder.inverse_transform([[code]])[0][0] for code in (0, 1)}

{0: 'male', 1: 'female'}

In [14]:
# Grid of hyperparameters to search through. Keys follow the
# <step>__<substep>__<parameter> naming of the nested pipeline, so the
# Fare-transformer and the classifier are tuned together.
param_grid = {
    # 'preproc__fare_transform__simpleimputer__strategy': ['median', 'mean'],
    'preproc__fare_transform__minmaxscaler__feature_range': [(0.4, 0.6), (0, 1), (-2, 3)],
    'clf__C': [0.01, 0.1, 1, 10],
    # 'clf__penalty': ['l1', 'l2'],
}

# do the grid-search (4-fold cross-validation on the training data only)
classifier_gridcv = GridSearchCV(
    classifier,
    param_grid,
    cv=4,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    iid=False,
)
classifier_gridcv.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preproc', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('pclass_ohe', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=...penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'preproc__fare_transform__minmaxscaler__feature_range': [(0.4, 0.6), (0, 1), (-2, 3)], 'clf__C': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
# The fitted *GridSearchCV* instance contains the `cv_results_` dictionary;
# number its keys so they can be referred to by position.
dict(enumerate(classifier_gridcv.cv_results_.keys()))

{0: 'mean_fit_time',
 1: 'std_fit_time',
 2: 'mean_score_time',
 3: 'std_score_time',
 4: 'param_clf__C',
 5: 'param_preproc__fare_transform__minmaxscaler__feature_range',
 6: 'params',
 7: 'split0_test_score',
 8: 'split1_test_score',
 9: 'split2_test_score',
 10: 'split3_test_score',
 11: 'mean_test_score',
 12: 'std_test_score',
 13: 'rank_test_score',
 14: 'split0_train_score',
 15: 'split1_train_score',
 16: 'split2_train_score',
 17: 'split3_train_score',
 18: 'mean_train_score',
 19: 'std_train_score'}

In [16]:
# Compact view of the search results. Columns are picked and labelled by their
# position among the `cv_results_` keys:
# 4 = C, 5 = feature_range, 11 = mean test score, 18 = mean train score.
choose = [4, 5, 11, 18]
sort_by = 11  # order rows by the mean test score, best first

column_translate = {key: idx for (idx, key) in enumerate(classifier_gridcv.cv_results_.keys())}
cv_table = pd.DataFrame(classifier_gridcv.cv_results_)
cv_selected = cv_table.iloc[:, choose].rename(columns=column_translate)
results = cv_selected.sort_values(sort_by, ascending=False)
results

Unnamed: 0,4,5,11,18
6,1.0,"(0.4, 0.6)",0.782005,0.776786
9,10.0,"(0.4, 0.6)",0.782005,0.776786
7,1.0,"(0, 1)",0.781964,0.779036
8,1.0,"(-2, 3)",0.775248,0.776777
10,10.0,"(0, 1)",0.775248,0.776788
11,10.0,"(-2, 3)",0.775248,0.776035
3,0.1,"(0.4, 0.6)",0.754996,0.767053
5,0.1,"(-2, 3)",0.752744,0.749062
4,0.1,"(0, 1)",0.745966,0.766304
2,0.01,"(-2, 3)",0.705586,0.7071


In [17]:
# The search was created with refit=True, so `best_estimator_` is available:
# take it as the tuned classifier and score it on the training data.
classifier_tuned = classifier_gridcv.best_estimator_
classifier_tuned.score(X_train, y_train)

0.7797752808988764

Notice that the `classifier` (and the `classifier_tuned`) has an internal state that depends on the training-data. The validation data should not modify this state. It is easy to be sure of that: our whole classifier is a single class instance and we never call its `fit` method with any other data than the `X_train` and `y_train`.

In [18]:
classifier_tuned.score(X_val, y_val)

0.7802690582959642

Now it is also easy to remove some of the transformed features if we ever decide to do that, or add new ones. We can also pause here, return in a month, and pick up where we left without hassle.

**Exercise**

We have forgotten to handle the *Embarked* feature. Modify the *preprocessing_pipe* object to fulfill the strategy described in the table. Did it improve the validation score?

---
Proposed solution below

---
---
---
---
---
---
---
---
---
---
---
---
---
---
---
---
---

In [19]:
# Embarked: fill missing values with the most frequent port, then one-hot encode.
embarked_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

# Same preprocessing as before, now with the Embarked pipeline added.
preprocessors = [
    ('pclass_ohe', OneHotEncoder(handle_unknown='ignore'), ['Pclass']),
    ('sex_encode', OrdinalEncoder(categories=[['female', 'male']]), ['Sex']),
    ('bin', Binarizer(threshold=0), ['SibSp', 'Parch']),
    ('embarked_ohe', embarked_transformer, ['Embarked']),
    ('fare_transform', fare_transformer, ['Fare'])
]
preprocessing_pipe = ColumnTransformer(preprocessors)

# `max_iter` is an iteration count, so it is given as an int; recent
# scikit-learn releases validate parameter types and reject the float 1e5.
classifier = Pipeline(steps=[
    ('preproc', preprocessing_pipe),
    ('clf', LogisticRegression(solver='saga', max_iter=100000))
])

param_grid = {}
param_grid['preproc__fare_transform__minmaxscaler__feature_range'] = [(0.4, 0.6), (0, 1), (-2, 3)]
param_grid['clf__C'] = [0.01, 0.1, 1, 10]

# Tune on the training data only, then score the refitted best model on the
# held-out validation data.
classifier_gridcv = GridSearchCV(classifier, param_grid, cv=4, refit=True, return_train_score=True, n_jobs=-1, iid=False)
classifier_gridcv.fit(X_train, y_train)
classifier_gridcv.best_estimator_.score(X_val, y_val)

0.7892376681614349

---
---

## Custom transformers

It's all well and good: *scikit-learn* has a ton of useful classes, obviously. Alas, we may not yet have time to write our own classifiers or regressors but what if we want some preprocessing functionality from say *pandas* rather than pure *scikit-learn*?

We can wrap it in a *transformer class* that complies with *sklearn*'s API. We need to inherit from *BaseEstimator* and *TransformerMixin*, and then we implement `fit` and `transform` methods:

In [20]:
# ?BaseEstimator
# ?TransformerMixin

https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator
https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html#sklearn.base.TransformerMixin

Here's the simplest example: a transformer that does exactly nothing.

In [21]:
class Identity(BaseEstimator, TransformerMixin):
    """No-op transformer: `transform` hands its input back untouched."""

    def __init__(self):
        # There are no hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X` exactly as it was passed in."""
        return X

It can be used in a pipeline. Let's use it to pass the *Age* feature unmodified through our previously constructed preprocessing transformer.

In [22]:
# (name, transformer, columns) triples; the last one passes *Age* through
# unchanged by means of the Identity transformer.
pclass_step = ('pclass_ohe', OneHotEncoder(handle_unknown='ignore'), ['Pclass'])
sex_step = ('sex_encode', OrdinalEncoder(categories=[['female', 'male']]), ['Sex'])
family_step = ('bin', Binarizer(threshold=0), ['SibSp', 'Parch'])
embarked_step = ('embarked_ohe', embarked_transformer, ['Embarked'])
fare_step = ('fare_transform', fare_transformer, ['Fare'])
age_step = ('id', Identity(), ['Age'])

preprocessors = [pclass_step, sex_step, family_step, embarked_step, fare_step, age_step]

preprocessing_pipe = ColumnTransformer(transformers=preprocessors)

In [23]:
X_train_prepr = preprocessing_pipe.fit_transform(X_train)

In [24]:
# come up with reasonable column names for the preprocessed features;
# the names are listed in the same order as the entries of `preprocessors`
pclass_categories = preprocessing_pipe.named_transformers_['pclass_ohe'].categories_[0]
embarked_encoder = preprocessing_pipe.named_transformers_['embarked_ohe'].named_steps['onehotencoder']
embarked_categories = embarked_encoder.categories_[0]

columns_prepr = []
columns_prepr.extend("Class_" + str(class_) for class_ in pclass_categories)
columns_prepr.append('Sex')
columns_prepr.extend(['SibSp_', 'ParCh_'])
columns_prepr.extend("Embarked_" + str(city) for city in embarked_categories)
columns_prepr.append('Fare_transformed')
columns_prepr.append('Age')

X_train_prepr_df = pd.DataFrame(X_train_prepr, columns=columns_prepr)
X_train_prepr_df[:4]

Unnamed: 0,Class_1,Class_2,Class_3,Sex,SibSp_,ParCh_,Embarked_C,Embarked_Q,Embarked_S,Fare_transformed,Age
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.359602,21.0
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.755891,17.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.489446,41.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.495832,8.0


Make the operation of casting the *X_train_prepr* into a pandas dataframe with nice columns also a transformer!

In [25]:
class Polish(BaseEstimator, TransformerMixin):
    """Turn an array of preprocessed features into a DataFrame with named columns.

    Parameters
    ----------
    columns : list of str
        Column labels for the resulting DataFrame, in the order of the
        columns of the array handed to `transform`.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return `X` as a `pandas.DataFrame` labelled with `self.columns`."""
        # A ColumnTransformer may return a scipy sparse matrix when its output
        # is sparse enough (see its `sparse_threshold` parameter). Densify it
        # first so the DataFrame gets one column per feature.
        if hasattr(X, 'toarray'):
            X = X.toarray()
        return pd.DataFrame(X, columns=self.columns)
    
X_train_prepr_df = Polish(columns_prepr).fit_transform(X_train_prepr)
X_train_prepr_df[:4]

Unnamed: 0,Class_1,Class_2,Class_3,Sex,SibSp_,ParCh_,Embarked_C,Embarked_Q,Embarked_S,Fare_transformed,Age
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.359602,21.0
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.755891,17.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.489446,41.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.495832,8.0


---
There are a lot of missing values in the *Age* column. What should we do about it?

In [26]:
X_train_prepr_df.Age.isnull().sum() / len(X_train_prepr_df)

0.20449438202247192

Here's a suggestion that someone has given to me. Group the passengers by other features and see if you can infer the missing *Age* based on which group they belong to. Let's for example group by *Class*, *Sex*, *SibSp* and *ParCh*:

In [27]:
groupby_columns = ['Class_1', 'Class_2', 'Sex', 'SibSp_', 'ParCh_']

# Only rows with a known *Age* take part in the grouping.
age_known_rows = X_train_prepr_df[X_train_prepr_df['Age'].notnull()]
X_train_prepr_grouped = age_known_rows.groupby(groupby_columns)

# see statistics of *Age* in each group
X_train_prepr_grouped_age = X_train_prepr_grouped['Age'].agg(['mean', 'std', 'count'])
X_train_prepr_grouped_age

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,std,count
Class_1,Class_2,Sex,SibSp_,ParCh_,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0.0,0.0,0.0,0.0,23.5,11.638657,25
0.0,0.0,0.0,0.0,1.0,24.777778,15.570628,9
0.0,0.0,0.0,1.0,0.0,26.277778,9.731107,9
0.0,0.0,0.0,1.0,1.0,18.923077,17.332347,13
0.0,0.0,1.0,0.0,0.0,30.159341,11.919843,91
0.0,0.0,1.0,0.0,1.0,3.21,3.945656,2
0.0,0.0,1.0,1.0,0.0,28.7,4.6916,10
0.0,0.0,1.0,1.0,1.0,18.294118,12.824609,17
0.0,1.0,0.0,0.0,0.0,36.045455,9.717136,11
0.0,1.0,0.0,0.0,1.0,18.857143,13.993196,7


Make a dictionary from group-labels to the mean *Age* values

In [28]:
# Map every group label (a tuple of the grouping values) to the group's mean *Age*.
age_by_group = {}
for group_label, group in X_train_prepr_grouped:
    age_by_group[group_label] = group['Age'].mean()
age_by_group

{(0.0, 0.0, 0.0, 0.0, 0.0): 23.5,
 (0.0, 0.0, 0.0, 0.0, 1.0): 24.77777777777778,
 (0.0, 0.0, 0.0, 1.0, 0.0): 26.27777777777778,
 (0.0, 0.0, 0.0, 1.0, 1.0): 18.923076923076923,
 (0.0, 0.0, 1.0, 0.0, 0.0): 30.15934065934066,
 (0.0, 0.0, 1.0, 0.0, 1.0): 3.21,
 (0.0, 0.0, 1.0, 1.0, 0.0): 28.7,
 (0.0, 0.0, 1.0, 1.0, 1.0): 18.294117647058822,
 (0.0, 1.0, 0.0, 0.0, 0.0): 36.04545454545455,
 (0.0, 1.0, 0.0, 0.0, 1.0): 18.857142857142858,
 (0.0, 1.0, 0.0, 1.0, 0.0): 32.333333333333336,
 (0.0, 1.0, 0.0, 1.0, 1.0): 29.307692307692307,
 (0.0, 1.0, 1.0, 0.0, 0.0): 33.96551724137931,
 (0.0, 1.0, 1.0, 0.0, 1.0): 9.943333333333333,
 (0.0, 1.0, 1.0, 1.0, 0.0): 34.95454545454545,
 (0.0, 1.0, 1.0, 1.0, 1.0): 11.9,
 (1.0, 0.0, 0.0, 0.0, 0.0): 38.705882352941174,
 (1.0, 0.0, 0.0, 0.0, 1.0): 32.625,
 (1.0, 0.0, 0.0, 1.0, 0.0): 37.6,
 (1.0, 0.0, 0.0, 1.0, 1.0): 37.0,
 (1.0, 0.0, 1.0, 0.0, 0.0): 44.775862068965516,
 (1.0, 0.0, 1.0, 0.0, 1.0): 34.77777777777778,
 (1.0, 0.0, 1.0, 1.0, 0.0): 37.25,
 (1.0, 0.0, 1

Make a column of group-labels for all passengers.

In [29]:
# One tuple of grouping values per passenger, in row order.
group_rows = X_train_prepr_df[groupby_columns].values
groups = pd.Series(list(map(tuple, group_rows)))
groups[:10]

0    (0.0, 0.0, 1.0, 0.0, 0.0)
1    (1.0, 0.0, 1.0, 0.0, 1.0)
2    (0.0, 0.0, 0.0, 0.0, 1.0)
3    (0.0, 0.0, 0.0, 1.0, 1.0)
4    (1.0, 0.0, 1.0, 1.0, 0.0)
5    (0.0, 0.0, 0.0, 0.0, 0.0)
6    (0.0, 1.0, 1.0, 0.0, 0.0)
7    (0.0, 0.0, 1.0, 0.0, 0.0)
8    (1.0, 0.0, 1.0, 1.0, 0.0)
9    (0.0, 0.0, 0.0, 0.0, 0.0)
dtype: object

Translate group-labels to mean-ages using the dictionary.

In [30]:
# Look up the mean age of each passenger's group in the dictionary.
mean_group_age = pd.Series(
    [age_by_group[label] for label in groups],
    index=groups.index,
)
mean_group_age[:10]

0    30.159341
1    34.777778
2    24.777778
3    18.923077
4    37.250000
5    23.500000
6    33.965517
7    30.159341
8    37.250000
9    23.500000
dtype: float64

Make a new *Age_imputed* column, which is equal to *Age* except that where *Age* had NaNs *Age_imputed* has the mean group values.

In [31]:
# Fill the missing ages with the mean age of each passenger's group and
# rename the column. `fillna` aligns on the index and returns a new Series,
# which avoids the chained assignment `df.Age[mask] = ...`: that form raises
# SettingWithCopyWarning and is not guaranteed to write into the frame.
X_train_prepr_df2 = (
    X_train_prepr_df
    .assign(Age=X_train_prepr_df['Age'].fillna(mean_group_age))
    .rename(columns={'Age': 'Age_imputed'})
)
X_train_prepr_df2[:4]

Unnamed: 0,Class_1,Class_2,Class_3,Sex,SibSp_,ParCh_,Embarked_C,Embarked_Q,Embarked_S,Fare_transformed,Age_imputed
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.359602,21.0
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.755891,17.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.489446,41.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.495832,8.0


In [32]:
X_train_prepr_df2.Age_imputed.isnull().sum()

0

Ok, say we're happy with this and we actually want to transform *Age* into *Age_imputed* within an *sklearn* pipeline. We gotta create a new transformer. Consider which operations performed above manually go into the *fit* method and which into the *transform* method.

In [33]:
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing *Age* values with the mean age of the passenger's group.

    Groups are defined by the values in `groupby_columns`. The group means are
    learned in `fit`; `transform` only looks them up, so data passed to
    `transform` never changes the fitted state.

    Parameters
    ----------
    groupby_columns : list of str (or a single str)
        Columns of the input DataFrame whose value combinations define the
        groups.

    Attributes (set by `fit`)
    -------------------------
    age_by_group : dict
        Maps a tuple of group values to the mean age of that group.
    age_overall : float
        Mean of all known ages; used for rows whose group was not seen
        (with a known age) during `fit`.
    """

    def __init__(self, groupby_columns):
        self.groupby_columns = groupby_columns

    def _group_columns(self):
        """Return the grouping columns as a list, accepting a single name too."""
        if isinstance(self.groupby_columns, str):
            return [self.groupby_columns]
        return list(self.groupby_columns)

    def fit(self, X, y=None):
        """Learn the mean age per group from the rows of `X` with a known *Age*."""
        age_known = X[X['Age'].notnull()]
        group_means = age_known.groupby(self._group_columns())['Age'].mean()
        # With a single grouping column the labels are scalars; wrap them so
        # that keys are always tuples, matching the row tuples built in `transform`.
        self.age_by_group = {
            (label if isinstance(label, tuple) else (label,)): mean_age
            for label, mean_age in group_means.items()
        }
        self.age_overall = age_known['Age'].mean()
        return self

    def transform(self, X):
        """Return a copy of `X` with *Age* imputed and renamed to *Age_imputed*."""
        labels = [tuple(row) for row in X[self._group_columns()].values]
        # Groups that were absent during `fit` (e.g. rare combinations that only
        # occur in validation data) fall back to the overall mean instead of
        # raising KeyError. The Series reuses X's index so that `fillna` lines
        # the values up with the right rows whatever index X carries.
        group_mean_age = pd.Series(
            [self.age_by_group.get(label, self.age_overall) for label in labels],
            index=X.index,
            dtype=float,
        )
        X_ = X.copy()
        X_['Age'] = X_['Age'].fillna(group_mean_age)
        return X_.rename(columns={'Age': 'Age_imputed'})

In [34]:
X_train_prepr_df2 = AgeImputer(groupby_columns).fit_transform(X_train_prepr_df)
X_train_prepr_df2[:4]

Unnamed: 0,Class_1,Class_2,Class_3,Sex,SibSp_,ParCh_,Embarked_C,Embarked_Q,Embarked_S,Fare_transformed,Age_imputed
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.359602,21.0
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.755891,17.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.489446,41.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.495832,8.0


So now our whole pipeline is the following

In [35]:
# Name the preprocessed columns, then impute *Age* group-wise.
age_imputer = make_pipeline(Polish(columns_prepr), AgeImputer(groupby_columns))

# `max_iter` is an iteration count, so it is given as an int; recent
# scikit-learn releases validate parameter types and reject the float 1e5.
classifier = Pipeline(steps=[
    ('preproc', preprocessing_pipe),
    ('age', age_imputer),
    ('clf', LogisticRegression(solver='saga', max_iter=100000))
])

Again, we can include the arguments of the new transformer in a grid-search.

In [36]:
# The grouping columns of the age imputer are searched over together with the
# Fare scaling range and the regularisation strength.
param_grid = {
    'preproc__fare_transform__minmaxscaler__feature_range': [(0.4, 0.6), (0, 1), (-2, 3)],
    'age__ageimputer__groupby_columns': [
        ['Class_1', 'Class_2', 'Sex', 'SibSp_', 'ParCh_'],
        ['SibSp_', 'ParCh_'],
    ],
    'clf__C': [0.01, 0.1, 1, 10],
}

classifier_gridcv = GridSearchCV(
    classifier,
    param_grid,
    cv=4,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    iid=False,
)
classifier_gridcv.fit(X_train, y_train)
classifier_gridcv.best_estimator_.score(X_val, y_val)

0.8094170403587444

**Exercise**

Make a new binary feature which encodes whether the *Cabin* value was a NaN or not. Include it in our *classifier* object and run the fitting and scoring anew.

---
Proposed solution below

---
---
---
---
---
---
---
---
---
---
---
---
---
---
---
---
---

In [37]:
class IsNullBinarizer(BaseEstimator, TransformerMixin):
    """Encode missingness: 1 where a value is null, 0 elsewhere."""

    def __init__(self):
        # There are no hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return an integer indicator of the null entries, shaped like `X`."""
        # `pd.isnull` gives the same result as `X.isnull()` for pandas objects
        # and additionally accepts plain arrays.
        return pd.isnull(X).astype('int')

In [40]:
# Same preprocessing as before, plus a binary "Cabin is missing" feature.
preprocessors = [
    ('pclass_ohe', OneHotEncoder(handle_unknown='ignore'), ['Pclass']),
    ('sex_encode', OrdinalEncoder(categories=[['female', 'male']]), ['Sex']),
    ('bin', Binarizer(threshold=0), ['SibSp', 'Parch']),
    ('embarked_ohe', embarked_transformer, ['Embarked']),
    ('fare_transform', fare_transformer, ['Fare']),
    ('id', Identity(), ['Age']),
    ('cabin_unknown', IsNullBinarizer(), ['Cabin'])
]

# Fit once so the one-hot categories are available for naming the columns.
preprocessing_pipe = ColumnTransformer(preprocessors)
preprocessing_pipe.fit(X_train, y_train)

pclass_categories = preprocessing_pipe.named_transformers_['pclass_ohe'].categories_[0]
embarked_categories = preprocessing_pipe.named_transformers_['embarked_ohe'].named_steps['onehotencoder'].categories_[0]

# Column names, in the same order as the entries of `preprocessors`.
columns_prepr = (
    ["Class_" + str(class_) for class_ in pclass_categories]
    + ['Sex']
    + ['SibSp_', 'ParCh_']
    + ["Embarked_" + str(city) for city in embarked_categories]
    + ['Fare_transformed']
    + ['Age']
    + ['Cabin_unknown']
)

age_imputer = make_pipeline(Polish(columns_prepr), AgeImputer(groupby_columns))

# `max_iter` is an iteration count, so it is given as an int; recent
# scikit-learn releases validate parameter types and reject the float 1e5.
classifier = Pipeline(steps=[
    ('preproc', preprocessing_pipe),
    ('age', age_imputer),
    ('clf', LogisticRegression(solver='saga', max_iter=100000))
])

param_grid = {}
param_grid['preproc__fare_transform__minmaxscaler__feature_range'] = [(0.4, 0.6), (0, 1), (-2, 3)]
param_grid['age__ageimputer__groupby_columns'] = [['Class_1', 'Class_2', 'Sex', 'SibSp_', 'ParCh_'], ['SibSp_', 'ParCh_']]
param_grid['clf__C'] = [0.01, 0.1, 1, 10]

# Tune on the training data only, then score the refitted best model on the
# held-out validation data.
classifier_gridcv = GridSearchCV(classifier, param_grid, cv=4, refit=True, return_train_score=True, n_jobs=-1, iid=False)
classifier_gridcv.fit(X_train, y_train)
classifier_gridcv.best_estimator_.score(X_val, y_val)

0.8183856502242153