# ColumnTransfomer and Pipeline

In [1]:
# Known libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer

# The new tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Set the random state for the whole notebook
np.random.seed(42) # set the random_state for all scikit-learn objects in this notebook
# that use a random_state

# standardise all plot sizes
plt.rcParams['figure.figsize'] = (12,6)

In [2]:
df = pd.read_csv('penguins.csv')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female


In [3]:
# Alternative way of ending up with X
df.loc[:,'bill_length_mm':'sex'].head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,39.1,18.7,181.0,3750.0,male
1,39.5,17.4,186.0,3800.0,female
2,40.3,18.0,195.0,3250.0,female
3,,,,,
4,36.7,19.3,193.0,3450.0,female


In [4]:
df.drop(columns=['species', 'island']).head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,39.1,18.7,181.0,3750.0,male
1,39.5,17.4,186.0,3800.0,female
2,40.3,18.0,195.0,3250.0,female
3,,,,,
4,36.7,19.3,193.0,3450.0,female


In [5]:
# Let's train-test-split
X = df.drop(columns=['species', 'island']) # We take select columns except for species and island as input features
y = df['species'] # species is the variable of interest, the one we want to predict

X_train, X_test, y_train, y_test = train_test_split(X, y)

# ColumnTransformer

"Applies transformers to columns of an array or pandas DataFrame." In other words, the ColumnTransformer transforms columns of a pandas DataFrame (feature engineering).

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

The ColumnTransformer helps us to do all our Feature Engineering in one go.

# Pipeline

"Pipeline of transforms with a final estimator."

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

Pipeline allows us to apply sequential transformations to the same data.

# Example

We want to do the following things:

#### 1) One_Hot_Encode the sex column

- from sklearn.preprocessing import OneHotEncoder 

#### 2) Fill missing values

- ``df.fillna()`` -- pandas way
- we will actually start with SimpleImputer (from sklearn.impute import SimpleImputer) -- scikit-learn way
- IterativeImputer

Three basic mechanisms that describe missing data:

- MCAR: Missing completely at random - SimpleImputer will be all you need
- MAR: Missing at random - Were IterativeImputer or even imputing by mean or median have value
- MNAR: Missing not at random - 

#### 3) Interaction between bill length and bill depth: bill area

- df['new_column'] = df['column1']*df['column2'] -- pandas way
- PolynomialFeatures(interaction_only=True) -- scikit learn way

#### 4) Bin body mass

- KBinsDiscretizer

#### 5) Use all other columns as they are

## 1) One_Hot_Encode the sex column

In [6]:
# OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [7]:
# .fit() in the case of the One Hot Encoder learns what the categories of this column are ('male', 'female')
# and map that to a number
one_hot_encoder.fit(X_train[['sex']])

ValueError: Input contains NaN

We cannot continue because there are null values in the sex column. We first have to deal with the null values.

## 2) Impute Missing Values

### 2.1) Categorical features

Observations seem to be uniformly distributed over the sexees. Let's just use the most frequent observation.

In [8]:
# Create list of categorical features
categorical_features = ['sex']

In [9]:
# Create and fit a simple imputer to show what it does
simple_imputer = SimpleImputer(strategy='most_frequent')
# strategy='most_frequent': inserts the mode
# alternative you could use strategy='constant' in combination with fill_value='...'

In [10]:
# Fit the simple imputer
simple_imputer.fit(X_train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [11]:
# Verify that the original dataset has missing values
X_train.isna().any()

bill_length_mm       True
bill_depth_mm        True
flipper_length_mm    True
body_mass_g          True
sex                  True
dtype: bool

In [12]:
# Check if there are still missing values after transforming the data
pd.DataFrame(simple_imputer.transform(X_train[categorical_features])).isna().any()

0    False
dtype: bool

### 2.2) Numerical values

Ok, so let's try to impute the values of the numerical columns as well

In [13]:
X.columns

Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g',
       'sex'],
      dtype='object')

In [14]:
# Create list of numerical features
numerical_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

In [15]:
# Impute numerical values
simple_imputer_numerical = SimpleImputer(strategy='mean')

In [16]:
# Fit transform the data
simple_imputer_numerical.fit_transform(X_train[numerical_features])

array([[  42.5,   20.7,  197. , 4500. ],
       [  38.8,   17.2,  180. , 3800. ],
       [  43.4,   14.4,  218. , 4600. ],
       ...,
       [  38.6,   17.2,  199. , 3750. ],
       [  47.2,   13.7,  214. , 4925. ],
       [  37.7,   16. ,  183. , 3075. ]])

So far we had to:
    - Instantiate Imputer for categoricals
    - Instantiate Imputer for numerical values
    - `.fit_transform()` imputer for categoricals and assign that to the original columns
    - `.fit_transform()` imputer for numerical values and assign the values to the original columns
    
This still seems ok. However, imagine we want to do a lot more transformations. This becomes tedious and it will become even more tedious if we have to apply a transformation to each column of the test set later on. The ColumnTransformer comes to our rescue! Here we can combine the separate transformations into one "place". Let's see how that works.

In [17]:
# Build a column_transformer
column_transformer = ColumnTransformer([ # the parameter 'transformers' should be a list of transformers (objects with a `.transform()` method)
    # The input has to be a tuple (name, transformer, columns)
    ('categorical_imputer', SimpleImputer(strategy='most_frequent'), categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features)
])

In [18]:
# Fit the column transformer
column_transformer.fit(X_train)

ColumnTransformer(transformers=[('categorical_imputer',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['sex']),
                                ('numerical_imputer', SimpleImputer(),
                                 ['bill_length_mm', 'bill_depth_mm',
                                  'flipper_length_mm', 'body_mass_g'])])

In [19]:
# Transform X_train
X_train_fe = column_transformer.transform(X_train)

In [20]:
# This makes it easy to transform the X_test data in one line
X_test_fe = column_transformer.transform(X_test)

# The same will be possible for unseen data in a future application of this model
# This is what makes the ColumnTransformer so powerful

### 1) Go back to OneHotEncoding

Ok, now let's also one_hot_ecnode the sex column with the ColumnTransformer.

In [21]:
column_transformer = ColumnTransformer([ # the parameter 'transformers' should be a list of transformers (objects with a `.transform()` method)
    # The input has to be a tuple (name, transformer, columns)
    ('categorical_imputer', SimpleImputer(strategy='most_frequent'), categorical_features),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features)
])

In [22]:
# Fit the column transformer
column_transformer.fit(X_train)

ValueError: Input contains NaN

Why does this not work?

ColumnTransformer applies the transformations separately. If you define two transformations for one column, in this case the sex column, ColumnTransformer will use the original column for both transformations. This will not work because the OneHotEncoder cannot handle null values. What can we do about that? --> Pipeline comes to our rescue

### Pipeline

In [23]:
from sklearn.pipeline import Pipeline

In the ColumnTransformer case you have to pass a list of tuples with (name, transformer, columns).

For the pipeline it is a list of tuples with (name, transformer)

In [24]:
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), # (name, transformer)
    # the pipeline will use the result of the previous step as input for the next step
    ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [25]:
categorical_features

['sex']

In [25]:
# Now exchange the individual steps of the ColumnTransformer for sex with the pipeline
column_transformer = ColumnTransformer([ # the parameter 'transformers' should be a list of transformers (objects with a `.transform()` method)
    ('transform_sex', categorical_pipeline, categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features)
])

In [26]:
# Now try to fit the column transformer
column_transformer.fit(X_train)

ColumnTransformer(transformers=[('transform_sex',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['sex']),
                                ('numerical_imputer', SimpleImputer(),
                                 ['bill_length_mm', 'bill_depth_mm',
                                  'flipper_length_mm', 'body_mass_g'])])

In [27]:
X_train.head(1)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
17,42.5,20.7,197.0,4500.0,male


In [28]:
# Transform X_train
pd.DataFrame(column_transformer.transform(X_train),
            columns=['femal', 'male', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'])

Unnamed: 0,femal,male,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,0.0,1.0,42.5,20.7,197.0,4500.0
1,0.0,1.0,38.8,17.2,180.0,3800.0
2,1.0,0.0,43.4,14.4,218.0,4600.0
3,0.0,1.0,39.0,18.7,185.0,3650.0
4,0.0,1.0,45.5,15.0,220.0,5000.0
...,...,...,...,...,...,...
253,1.0,0.0,42.6,13.7,213.0,4950.0
254,0.0,1.0,39.7,18.4,190.0,3900.0
255,1.0,0.0,38.6,17.2,199.0,3750.0
256,1.0,0.0,47.2,13.7,214.0,4925.0


## 3) Interaction between bill_length and bill_depth: bill area

In [30]:
# How would you calculate interaction terms?
# df['bill_area'] = df['bill_length_mm']*df['bill_depth_mm']

In [31]:
# Do the same with PolynomialFeatures
interaction_term = PolynomialFeatures(interaction_only=True, include_bias=False)

In [32]:
# Define interaction columns
interaction_features = ['bill_length_mm', 'bill_depth_mm']

In [33]:
# Create an interaction pipeline
interaction_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('interaction', PolynomialFeatures(interaction_only=True, include_bias=False))
])

In [34]:
# Fit transform X_train
interaction_pipeline.fit(X_train[interaction_features])

Pipeline(steps=[('impute', SimpleImputer()),
                ('interaction',
                 PolynomialFeatures(include_bias=False,
                                    interaction_only=True))])

In [35]:
# Transform the interaction pipeline and inspect the results
pd.DataFrame(interaction_pipeline.transform(X_train[interaction_features]))

Unnamed: 0,0,1,2
0,42.5,20.7,879.75
1,38.8,17.2,667.36
2,43.4,14.4,624.96
3,39.0,18.7,729.30
4,45.5,15.0,682.50
...,...,...,...
253,42.6,13.7,583.62
254,39.7,18.4,730.48
255,38.6,17.2,663.92
256,47.2,13.7,646.64


In [36]:
# Now exchange the individual steps of the ColumnTransformer for sex with the pipeline
column_transformer = ColumnTransformer([ # the parameter 'transformers' should be a list of transformers (objects with a `.transform()` method)
    ('transform_sex', categorical_pipeline, categorical_features),
    ('interaction_pipeline', interaction_pipeline, interaction_features)
])

In [37]:
# Fit it
column_transformer.fit(X_train)

ColumnTransformer(transformers=[('transform_sex',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['sex']),
                                ('interaction_pipeline',
                                 Pipeline(steps=[('impute', SimpleImputer()),
                                                 ('interaction',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True))]),
                                 ['bill_length_mm', 'bill_depth_mm'])])

In [38]:
# Transform it
column_transformer.transform(X_train)

array([[  0.  ,   1.  ,  42.5 ,  20.7 , 879.75],
       [  0.  ,   1.  ,  38.8 ,  17.2 , 667.36],
       [  1.  ,   0.  ,  43.4 ,  14.4 , 624.96],
       ...,
       [  1.  ,   0.  ,  38.6 ,  17.2 , 663.92],
       [  1.  ,   0.  ,  47.2 ,  13.7 , 646.64],
       [  1.  ,   0.  ,  37.7 ,  16.  , 603.2 ]])

## 4) Bin body mass

In [39]:
# Create list of features for binning
binning_features = ['body_mass_g']

In [40]:
# Create a KBinsDiscretizer
binning = KBinsDiscretizer(encode='ordinal')

## Bring everything together in the ColumnTransformer

Here is a 'clean' example of how to bring all that together.

In [41]:
# One pipeline for the column sex
categorical_pipeline = Pipeline([
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')) # handle_unknown ignore will allow to handel unknown values in the test set
])

In [42]:
# One pipeline for the interaction terms
interaction_pipeline = Pipeline([
    ('imputation', SimpleImputer(strategy='mean')),
    ('interaction_term', PolynomialFeatures(interaction_only=True, include_bias=False))
])

In [43]:
# One pipeline for binning
binning_pipeline = Pipeline([
    ('impuation', SimpleImputer(strategy='median')),
    ('binning', KBinsDiscretizer(encode='ordinal'))
])

In [44]:
# A ColumnTransformer to combine all of these
feature_engineering = ColumnTransformer([
    ('categorical_encoding', categorical_pipeline, categorical_features),
    ('interaction_term', interaction_pipeline, interaction_features),
    ('binning', binning_pipeline, binning_features),
    ('impute', SimpleImputer(strategy='mean'), ['flipper_length_mm'])
])

In [45]:
# Fit the ColumnTransformer
feature_engineering.fit(X_train)

ColumnTransformer(transformers=[('categorical_encoding',
                                 Pipeline(steps=[('imputation',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex']),
                                ('interaction_term',
                                 Pipeline(steps=[('imputation',
                                                  SimpleImputer()),
                                                 ('interaction_term',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True))]),
                                 ['bill_length_mm', 'bill_depth_mm']),
                                ('binning',
                           

In [46]:
# Create X_train_fe
X_train_fe = feature_engineering.transform(X_train)

# FunctionTransformer - Write your custom transformer

In [47]:
type(X_train.fillna(0))

pandas.core.frame.DataFrame

In [48]:
X_train.transform()

TypeError: transform() missing 1 required positional argument: 'func'

In [49]:
# We want to fill the missing flipper length values by sex
def fillna_by_sex(X_train):
    median_fl = X_train.groupby('sex')['flipper_length_mm'].transform('median')
    return pd.DataFrame(X_train['flipper_length_mm'].fillna(median_fl))

In [50]:
X_train.isna().sum()

bill_length_mm       1
bill_depth_mm        1
flipper_length_mm    1
body_mass_g          1
sex                  9
dtype: int64

In [51]:
fillna_by_sex(X_train).isna().sum()

flipper_length_mm    1
dtype: int64

In [52]:
from sklearn.preprocessing import FunctionTransformer

In [53]:
# A ColumnTransformer to combine all of these
feature_engineering = ColumnTransformer([
    ('categorical_encoding', categorical_pipeline, categorical_features),
    ('interaction_term', interaction_pipeline, interaction_features),
    ('binning', binning_pipeline, binning_features),
    ('impute_fl', FunctionTransformer(fillna_by_sex), ['flipper_length_mm', 'sex'])
    #('impute', SimpleImputer(strategy='mean'), ['flipper_length_mm'])
])

In [54]:
feature_engineering.fit(X_train)

ColumnTransformer(transformers=[('categorical_encoding',
                                 Pipeline(steps=[('imputation',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex']),
                                ('interaction_term',
                                 Pipeline(steps=[('imputation',
                                                  SimpleImputer()),
                                                 ('interaction_term',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True))]),
                                 ['bill_length_mm', 'bill_depth_mm']),
                                ('binning',
                           

In [55]:
feature_engineering.transform(X_train)

array([[  0.  ,   1.  ,  42.5 , ..., 879.75,   3.  , 197.  ],
       [  0.  ,   1.  ,  38.8 , ..., 667.36,   2.  , 180.  ],
       [  1.  ,   0.  ,  43.4 , ..., 624.96,   3.  , 218.  ],
       ...,
       [  1.  ,   0.  ,  38.6 , ..., 663.92,   1.  , 199.  ],
       [  1.  ,   0.  ,  47.2 , ..., 646.64,   3.  , 214.  ],
       [  1.  ,   0.  ,  37.7 , ..., 603.2 ,   0.  , 183.  ]])

## Apply the same to the training data

The strength of the ColumnTransformer, combined with Pipeline, is that we can just use it to transform the test data without having to especially engineer anything.

In [56]:
X_test_fe = feature_engineering.transform(X_test)

## Fit a model

In [57]:
m = LogisticRegression(max_iter=5000)

In [58]:
m.fit(X_train_fe, y_train)

LogisticRegression(max_iter=5000)

# Pipeline allows you to do one more thing

In [59]:
model = Pipeline([
            ('feature_eingineer', feature_engineering), # feature_engineering is a ColumnTransformer
            ('logistic_regression', LogisticRegression())
])

# This is a Pipeline of a ColumnTransformer and a model, where the ColumnTransformer consists
# of various pipelines

In [60]:
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
model.predict(X_test)