# Machine Learning Workflow

## Imports

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn import set_config; set_config(display='diagram')

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

## Data

**We are going to predict health insurance charges as a function of various features, using the following dataset.**

In [None]:
# Read the dataset from data.csv (path is relative to the working directory)
data = pd.read_csv('data.csv')
# Preview the first rows
data.head()

In [None]:
# Target is the 'charges' column; every other column is a feature
y = data['charges']
X = data.drop(columns='charges')

# Positional split without shuffling: the first 1100 rows train, the rest test
X_train, X_test = X[:1100], X[1100:]
y_train, y_test = y[:1100], y[1100:]

✏️ Today's challenge:

- impute missing values
- scale numerical features
- encode categorical features
- fine-tune the model and the preprocessing

In one cell 💪

## Pipelines

### **a) Pipeline → → →**

chains together multiple steps in sequence

In [None]:
# Uses Pipeline (sklearn.pipeline), SimpleImputer (sklearn.impute) and
# StandardScaler (sklearn.preprocessing), all brought in by the imports cell.

# Two-step preprocessing for "age": impute first, then scale
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# fit() returns the fitted pipeline, so transform() can be chained onto it
pipe.fit(X_train[['age']]).transform(X_train[['age']])

In [None]:
# Access steps either by position or by name: both lines below return the
# 'scaler' step. Only the last expression of the cell is rendered.
pipe[1]
pipe['scaler']

### **b) Column Transformer ⑂**

Apply specific changes to specific columns in parallel

In [None]:
# Uses OneHotEncoder (sklearn.preprocessing) and ColumnTransformer
# (sklearn.compose), both brought in by the imports cell.

# Numerical branch: fill in missing values, then standardise
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen at fit time are ignored
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Parallelize both branches: each one receives only its own columns
preprocessor = ColumnTransformer(transformers=[
    ('num_tr', num_transformer, ['age', 'bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region']),
])

In [None]:
# Relies on the display mode set in the imports cell:
# from sklearn import set_config; set_config(display='diagram')

# Leaving the estimator as the cell's last expression renders it as an HTML diagram
preprocessor

In [None]:
# Learn the preprocessing on the training split and apply it in one step
X_train_transformed = preprocessor.fit_transform(X_train)

# Raw rows, then transformed rows; display() renders each argument in turn
display(X_train.head(3), pd.DataFrame(X_train_transformed).head(3))

🤔 Where are the column names?

👉 get_feature_names_out()

In [None]:
# Compatibility shim for old scikit-learn releases, where SimpleImputer had no
# get_feature_names_out and Pipeline/ColumnTransformer could not report column names.
# The fallback simply echoes the input feature names seen during fit.
# Guarded with hasattr so that, on releases that ship the method natively,
# the library's own implementation is not overwritten.
if not hasattr(SimpleImputer, 'get_feature_names_out'):
    SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

In [None]:
# Output column names of the fitted ColumnTransformer
preprocessor.get_feature_names_out()

In [None]:
# Same transformed matrix, now labelled with the names reported by the preprocessor
pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_out()).head()

🤔 What happened to the `children` column? What if we want to keep it untouched?

👉 `remainder='passthrough'`

In [None]:
# remainder='passthrough' keeps every column not listed in a branch,
# unchanged, instead of dropping it
preprocessor = ColumnTransformer(
    transformers=[
        ('num_tr', num_transformer, ['age', 'bmi']),
        ('cat_tr', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [None]:
# Refit the preprocessor and transform the training split
X_new = preprocessor.fit_transform(X_train)
# Without explicit column names the frame falls back to integer labels
pd.DataFrame(X_new).head(3)

In [None]:
# Label the transformed columns with the names reported by the preprocessor
pd.DataFrame(X_new, columns=preprocessor.get_feature_names_out()).head(3)

### **c) Custom: Function Transformer →**

- Encapsulates a python function into a scikit transformer (→) object

- Can work with Pipelines (→ →) or with ColumnTransformers (⑂)

In [None]:
# from sklearn.preprocessing import FunctionTransformer

# Create a transformer that rounds data to 2 decimals (for instance!).
# kw_args forwards decimals=2 to np.round on every call. Passing np.round
# itself rather than a lambda keeps the transformer picklable, so a pipeline
# containing it can still be exported with pickle.
rounder = FunctionTransformer(np.round, kw_args={'decimals': 2})

In [None]:
# Numerical branch, now with the rounding step appended after scaling
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder),
])

# Rebuild the column transformer around the extended numerical branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num_tr', num_transformer, ['bmi', 'age']),
        ('cat_tr', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [None]:
# Fit and transform in one go, then preview the first two rows
pd.DataFrame(preprocessor.fit_transform(X_train)).head(2)

### **d) FeatureUnion | |**

- Applies transformers in parallel

- Concatenates the feature matrices output by each transformer

👉 Useful to create whole new features

ex: let's build and add a new bmi_age_ratio feature

In [None]:
# from sklearn.pipeline import FeatureUnion

# Create a custom transformer that divides the "bmi" column by the "age" column
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

# Both members receive the same input; their outputs are concatenated column-wise
union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-8
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new column 9
])
union

In [None]:
# Fit the union on the training split and preview the first transformed row
pd.DataFrame(union.fit_transform(X_train)).head(1)

### **Summary with `make_*` shortcuts ⚡️**

In [None]:
# from sklearn.pipeline import make_pipeline
# from sklearn.pipeline import make_union
# from sklearn.compose import make_column_transformer

# Explicit form: every step is named by hand
Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# Equivalent to the following, except that make_pipeline names the steps
# automatically after their lowercased class names
# (e.g. 'simpleimputer', 'standardscaler')
make_pipeline(SimpleImputer(), StandardScaler())

In [None]:
# Numerical branch: impute, then scale (step names are generated automatically)
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# handle_unknown='ignore' matches the encoder used with ColumnTransformer above:
# a category that was not seen during fit is encoded as all zeros at transform
# time instead of raising an error
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Shortcut for ColumnTransformer: (transformer, columns) pairs, names are generated
preproc_basic = make_column_transformer((num_transformer, ['age', 'bmi']),
                                       (cat_transformer, ['smoker', 'region']),
                                       remainder='passthrough')

# Shortcut for FeatureUnion: append the bmi/age ratio to the basic preprocessing
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

**`make_column_selector` selects features automatically based on their dtype**

In [None]:
# Inspect the column dtypes that make_column_selector will match against
X_train.dtypes

In [None]:
# from sklearn.compose import make_column_selector

# Selectors are callables that pick columns by dtype when the transformer is fitted.
# NOTE(review): only float64 columns are treated as numerical here; any integer
# column matches neither selector — confirm this is intended.
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

**🎉 Complete preprocessing pipeline 🎉**

In [None]:
# Numerical branch: impute, then scale
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# handle_unknown='ignore': a category absent at fit time is encoded as all zeros
# instead of raising when this preprocessor is later applied to unseen rows
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Columns are routed by dtype through the selectors; anything matched by
# neither selector is passed through unchanged
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough')

# Append the engineered bmi/age ratio to the basic preprocessing output
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

## Including Models to Pipelines

- Model objects can be plugged into pipelines
- Pipelines inherit the methods of the last object in the sequence
    - Transformers: ***fit*** and ***transform***
    - Models: ***fit, score, predict***, etc...

### a) Full pipe

In [None]:
# from sklearn.linear_model import Ridge

# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
# handle_unknown='ignore': during cross-validation or on the test split, a
# category that was absent from the rows used for fitting is encoded as all
# zeros instead of raising an error
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route columns by dtype; columns matched by neither selector pass through unchanged
preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough')

# Add Estimator: preprocessing and model are fitted and applied as one object
pipe = make_pipeline(preproc, Ridge())
pipe

In [None]:
# Fit the preprocessing steps and the Ridge model together on the training split
pipe.fit(X_train, y_train)

# Predict for the first two held-out rows
# (not rendered: only the cell's last expression is displayed)
pipe.predict(X_test.iloc[:2])

# Evaluate the fitted pipeline on the held-out split
pipe.score(X_test, y_test)

### b) Cross validate a pipeline

In [None]:
# from sklearn.model_selection import cross_val_score

# Cross validate pipeline: 5 folds on the training split, scored with R²,
# then averaged into a single number
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

### c) Grid search a pipeline

- Check which combination of preprocessing/modelling hyperparameters work best
- It is possible to grid search hyperparameters of any component of the pipeline
- Sklearn syntax: `step_name__transformer_name__hyperparam_name`
- Check the available hyperparameters with `pipe.get_params()`

In [None]:
# Inspect the parameters of every pipeline component to find the ones you want
# to grid search; the keys of the returned dict are the names GridSearchCV expects
pipe.get_params()

In [None]:
# GridSearchCV comes from sklearn.model_selection (see the imports cell)

# Instantiate the grid search over the whole pipeline
grid_search = GridSearchCV(
    estimator=pipe,
    # Keys chain step names with double underscores, so a parameter of any
    # nested component can be reached, however deep it sits
    param_grid={
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10],
    },
    scoring='r2',
    cv=5,
)

# Run the search on the training split, then show the winning combination
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Keep the pipeline that the grid search selected as best
pipe_tuned = grid_search.best_estimator_

## Debug your pipe

In [None]:
# Access the components of a pipeline by name with `named_steps`
pipe_tuned.named_steps.keys()

In [None]:
# Check intermediate steps.
# Use transform(), not fit_transform(): the step is already fitted as part of
# pipe_tuned, and refitting it here would overwrite that fitted state in place
# just to look at an output shape.
pipe_tuned.named_steps["columntransformer"].transform(X_train).shape

## Exporting models/pipelines

In [None]:
# import pickle

# Export pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# Load pipeline from pickle file; the context manager closes the handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: only unpickle files you trust — pickle.load can execute arbitrary code.
with open("pipeline.pkl", "rb") as file:
    my_pipeline = pickle.load(file)

# The reloaded pipeline is used exactly like the original one
my_pipeline.score(X_test, y_test)

## Auto ML

In [None]:
# Optional dependencies for this section — uncomment to install.
# Prefer `%pip` over `!pip` so the install targets the running kernel's
# environment, and pin versions for reproducibility.
#!pip install TPOT
#!pip install torch

In [None]:
# NOTE(review): imported here rather than in the imports cell — presumably so
# the rest of the notebook runs without TPOT installed; confirm.
import os
from tpot import TPOTRegressor

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [None]:
# Instantiate the TPOT regressor (the target, charges, is a continuous value).
# random_state pins the otherwise stochastic search so that re-running the
# notebook yields the same pipeline.
tpot = TPOTRegressor(generations=4, population_size=20, verbosity=2, scoring='r2', n_jobs=-1, cv=2,
                     random_state=42)
# Run the AutoML search on the preprocessed training data
tpot.fit(X_train_preproc, y_train)

# Print the score of the best pipeline on the preprocessed test data
print(tpot.score(X_test_preproc, y_test))

In [None]:
# Export the best TPOT pipeline as a Python file in the current working directory.
# NOTE(review): the file name mentions "iris" although this notebook models
# insurance charges — consider renaming it (here and in the `cat` call below).
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

# Show the generated file (shell escape; `cat` assumes a Unix-like shell)
! cat 'tpot_iris_pipeline.py'