# ColumnTransformer and Pipeline

In [1]:
# Known libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer

# The new tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seed NumPy's global random number generator once for the whole notebook
np.random.seed(42)

# Give every figure in the notebook the same default size
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Read the penguins data set and preview its first five rows
df = pd.read_csv(filepath_or_buffer='penguins.csv')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female


In [3]:
# Target: the penguin species we want to predict
y = df['species']

# Features: everything except the target and the island column
X = df.drop(columns=['species', 'island'])

# Hold out a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# ColumnTransformer

"Applies transformers to columns of an array or pandas DataFrame."

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

The ColumnTransformer helps us to do all our Feature Engineering in one go.

# Pipeline

"Pipeline of transforms with a final estimator."

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

Pipeline allows us to apply sequential transformations to the same data.

# Example

We want to do the following things:

#### 1) One_Hot_Encode the sex column

#### 2) Fill missing values

#### 3) Interaction between bill length and bill depth: bill area

#### 4) Bin body mass

#### 5) Use all other columns as they are

## 1) One_Hot_Encode the sex column

In [4]:
# One-hot encoder that returns a dense array and does not raise on categories
# it has not seen during fit.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; confirm
# the installed version before touching this argument.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [5]:
# .fit() makes the OneHotEncoder learn which categories occur in the column
# ('male', 'female') so that each category can be mapped to its own 0/1 column.
# The raw `sex` column still contains missing values, so this fit is expected
# to fail. Catch the error and print it: the problem stays visible, but
# "Restart Kernel -> Run All" can continue past this cell.
try:
    one_hot_encoder.fit(X_train[['sex']])
except ValueError as error:
    print(f'{type(error).__name__}: {error}')

ValueError: Input contains NaN

We cannot continue because there are null values in the sex column. We first have to deal with the null values.

## 2) Impute Missing Values

### 2.1) String values

Observations seem to be uniformly distributed over the sexes. Let's just use the most frequent observation.

In [6]:
# Columns that hold string categories (used below for imputation and one-hot encoding)
categorical_features = ['sex']

In [7]:
# Create a simple imputer that fills missing values with the most frequent
# value of each column; it is only instantiated here and fitted in the next cell
simple_imputer = SimpleImputer(strategy='most_frequent')

In [8]:
# Fit the simple imputer on the categorical training columns so it learns
# which value to fill in
simple_imputer.fit(X_train[categorical_features])

SimpleImputer(strategy='most_frequent')

In [9]:
# Check which columns of X_train contain missing values. Fitting the imputer
# does not modify X_train, so every column still reports missing values here.
X_train.isna().any()

bill_length_mm       True
bill_depth_mm        True
flipper_length_mm    True
body_mass_g          True
sex                  True
dtype: bool

In [11]:
# Transform the categorical column(s) with the fitted imputer and confirm that
# no missing values are left. Columns are selected with square brackets;
# `X_train(...)` would try to call the DataFrame and raise a TypeError.
pd.DataFrame(simple_imputer.transform(X_train[categorical_features])).isna().any()

### 2.2) Numerical values

Ok, so let's try to impute the values of the numerical columns as well

In [12]:
# Columns that hold numerical measurements
numerical_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [13]:
# Imputer for the numerical columns: fills missing values with the column mean
simple_imputer_numerical = SimpleImputer(strategy='mean')

In [14]:
# Fit the mean imputer on the numerical training columns and return the
# imputed values in one step. Columns are selected with square brackets;
# `X_train(...)` would try to call the DataFrame and raise a TypeError.
simple_imputer_numerical.fit_transform(X_train[numerical_features])

So far we had to:
    - Instantiate Imputer for strings
    - Instantiate Imputer for numerical values
    - `.fit_transform()` imputer for strings and assign that to the original columns
    - `.fit_transform()` imputer for numerical values and assign the values to the original columns
    
This still seems ok. However, imagine we want to do a lot more transformations. This becomes tedious and it will become even more tedious if we have to apply a transformation to each column of the test set later on. The ColumnTransformer comes to our rescue! Here we can combine the separate transformations into one "place". Let's see how that works.

In [None]:
# Build a column_transformer.
# `transformers` is a list of (name, transformer, columns) tuples; every
# transformer is an object with a `.transform()` method and is applied to the
# columns listed in its own tuple.
column_transformer = ColumnTransformer(transformers=[
    ('categorical_imputer', SimpleImputer(strategy='most_frequent'), categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features),
])

In [None]:
# Fit the column transformer: each imputer is fitted on its own columns of X_train
column_transformer.fit(X_train)

In [None]:
# Transform X_train


### 1) Go back to OneHotEncoding

Ok, now let's also one-hot encode the sex column with the ColumnTransformer.

In [None]:
# Naive attempt: add a one-hot encoder for `sex` as a second entry next to its imputer
column_transformer = ColumnTransformer([ # the parameter 'transformers' is a list of (name, transformer, columns) tuples
    # Each transformer is an object with a `.transform()` method
    ('string_imputer', SimpleImputer(strategy='most_frequent'), categorical_features),
    # NOTE: this entry is handed the same original `categorical_features`
    # columns as the imputer above, not the imputer's output
    ('one_hot_encoder', OneHotEncoder(drop='first'), categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features)
])

In [None]:
# Fit the column transformer. This fit is expected to fail because the one-hot
# encoder receives the original `sex` column, which still contains missing
# values. Catch the error and print it: the problem stays visible, but
# "Restart Kernel -> Run All" can continue past this cell.
try:
    column_transformer.fit(X_train)
except ValueError as error:
    print(f'{type(error).__name__}: {error}')

Why does this not work?

ColumnTransformer applies the transformations separately. If you define two transformations for one column, in this case the sex column, ColumnTransformer will use the original column for both transformations. This will not work because the OneHotEncoder cannot handle null values. What can we do about that? --> Pipeline comes to our rescue

### Pipeline

In [None]:
# Chain the two steps for the categorical column: first fill the missing
# values, then one-hot encode the completed column. Inside a Pipeline each
# step receives the output of the previous step.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# The ColumnTransformer cells further down refer to this pipeline as
# `string_column_pipeline`; bind that name as well so they do not raise a
# NameError on a fresh kernel.
string_column_pipeline = categorical_pipeline

In [None]:
# Replace the separate imputer / encoder entries for `sex` with one pipeline.
# `transformers` is a list of (name, transformer, columns) tuples.
column_transformer = ColumnTransformer(transformers=[
    ('transform_sex', string_column_pipeline, categorical_features),
    ('numerical_imputer', SimpleImputer(strategy='mean'), numerical_features),
])

In [None]:
# Now try to fit the column transformer again, this time with the pipeline handling `sex`
column_transformer.fit(X_train)

In [None]:
# Transform X_train

## 3) Interaction between bill_length and bill_depth: bill area

In [None]:
# How would you calculate interaction terms?


In [None]:
# Do the same with PolynomialFeatures:
# interaction_only=True asks for products of distinct columns only (no powers
# of a single column); include_bias=False leaves out the constant column
interaction_term = PolynomialFeatures(interaction_only=True, include_bias=False)

In [None]:
# Define interaction columns: bill length x bill depth gives the "bill area"
# described in this section. A real list is required here; the `...`
# placeholder cannot be used to select columns from X_train.
interaction_features = ['bill_length_mm', 'bill_depth_mm']

In [None]:
# Interaction pipeline: fill missing values with the column mean first, then
# build the interaction terms from the completed columns
interaction_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('interaction', PolynomialFeatures(include_bias=False, interaction_only=True)),
])

In [None]:
# Fit transform X_train


In [None]:
# Fit the interaction pipeline on the interaction columns of the training data only
interaction_pipeline.fit(X_train[interaction_features])

In [None]:
# Transform the interaction pipeline and inspect the results


In [None]:
# Combine the pipeline for `sex` with the interaction pipeline.
# `transformers` is a list of (name, transformer, columns) tuples.
# NOTE(review): columns not listed in any tuple are presumably dropped from
# the output (ColumnTransformer default) - confirm this is intended here.
column_transformer = ColumnTransformer(transformers=[
    ('transform_sex', string_column_pipeline, categorical_features),
    ('interaction_pipeline', interaction_pipeline, interaction_features),
])

In [None]:
# Fit the combined column transformer (sex pipeline + interaction pipeline) on the training data
column_transformer.fit(X_train)

In [None]:
# Transform it


## 4) Bin body mass

In [None]:
# Create list of features for binning: this section bins the body mass.
# A real list is required here; the `...` placeholder cannot be used as a
# column selection in the ColumnTransformer below.
binning_features = ['body_mass_g']

In [None]:
# Create a KBinsDiscretizer; encode='ordinal' returns the bin number of each
# value as a single column instead of one-hot encoding the bins
binning = KBinsDiscretizer(encode='ordinal')

In [None]:
# Fit_transform the discretizer


## Bring everything together in the ColumnTransformer

Here is a 'clean' example of how to bring all that together.

In [None]:
# Pipeline for the `sex` column: impute with the most frequent value, then
# one-hot encode the completed column
string_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Pipeline for the interaction terms: impute with the column mean, then build
# the interaction features
interaction_pipeline = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='mean')),
    ('interaction_term', PolynomialFeatures(include_bias=False, interaction_only=True)),
])

In [None]:
# Pipeline for binning: impute with the column median, then replace each value
# by the ordinal number of its bin
binning_pipeline = Pipeline(steps=[
    ('impuation', SimpleImputer(strategy='median')),
    ('binning', KBinsDiscretizer(encode='ordinal')),
])

In [None]:
# One ColumnTransformer that routes each group of columns to its own
# pipeline; flipper length is only mean-imputed
feature_engineering = ColumnTransformer(transformers=[
    ('string_encoding', string_pipeline, categorical_features),
    ('interaction_term', interaction_pipeline, interaction_features),
    ('binning', binning_pipeline, binning_features),
    ('impute', SimpleImputer(strategy='mean'), ['flipper_length_mm']),
])

In [None]:
# Fit the ColumnTransformer: every pipeline inside it is fitted on its own columns of X_train
feature_engineering.fit(X_train)

In [None]:
# Create X_train_fe: the feature-engineered training data, produced by the
# ColumnTransformer fitted in the previous cell (replaces the `...` placeholder)
X_train_fe = feature_engineering.transform(X_train)

# Maybe show functiontransformer?

## Apply the same to the test data

The strength of the ColumnTransformer, combined with Pipeline, is that we can reuse the fitted transformer to transform the test data, without having to engineer any of the features by hand again.

In [None]:
# Transform the test data with the ColumnTransformer that was fitted on the
# training data. Only .transform() is called, so nothing is re-learned from
# the test set (replaces the `...` placeholder).
X_test_fe = feature_engineering.transform(X_test)

# Pipeline allows you to do one more thing

In [None]:
# Full model in one object: the feature engineering followed by a
# logistic-regression classifier as the final estimator
model = Pipeline(steps=[
    ('feature_eingineer', feature_engineering),
    ('logistic_regression', LogisticRegression()),
])

In [None]:
# Fit the whole pipeline on the raw training data: the feature engineering
# step is fitted first, then the logistic regression on its output
model.fit(X_train, y_train)

In [None]:
# Predict the species for the raw test data; the pipeline applies the fitted
# feature engineering before the classifier
model.predict(X_test)