# Machine Learning Workflow

## Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import set_config; set_config(display='diagram')

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

## Data

**We are going to predict the charges of health insurance as a function of various features using the following datatest**

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [29]:
data.shape

(1338, 6)

In [3]:
X = data.drop(columns='charges')
y = data['charges']

X_train = X[:1100]
y_train = y[:1100]
X_test = X[1100:]
y_test = y[1100:]

✏️ Today's challenge:

    impute missing values
    scale numerical features
    encode categorical features
    fine tune model and preprocessing

In one cell 💪

## Pipelines

### **a) Pipeline → → →**

chains together multiple steps in sequence

In [4]:
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler

# Preprocess "age"
pipe = Pipeline([
    ('my_imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

pipe.fit(X_train[['age']])
pipe.transform(X_train[['age']])

array([[-1.45565969],
       [-1.52644066],
       [ 0.        ],
       ...,
       [-1.24331678],
       [ 0.88011225],
       [-1.03097388]])

In [7]:
pipe

In [6]:
# access steps
pipe[1]
pipe['my_imputer']

### **b) Column Transformer ⑂**

Apply specific changes to specific columns in parallel

In [8]:
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer

# Impute then Scale for numerical variables: 
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())])

# Encode categorical variables
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Paralellize "num_transformer" and "One hot encoder"
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region'])])

In [9]:
# from sklearn import set_config; set_config(display='diagram')

# visualizing pipelines in HTML
preprocessor

In [10]:
X_train_transformed = preprocessor.fit_transform(X_train)

display(X_train.head(3))
display(pd.DataFrame(X_train_transformed).head(3))

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.9,0,True,southwest
1,18.0,33.77,1,False,southeast
2,,33.0,3,False,southeast


Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0


🤔 Where are the column names?

👉 get_feature_names_out()

In [11]:
# This should be fixed in Scikit-Learn 1.0.2: all transformers will have this method.
# SimpleImputer does not have a get_feature_names_out, so we need to add it manually.
SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

In [12]:
preprocessor.get_feature_names_out()

array(['num_tr__age', 'num_tr__bmi', 'cat_tr__smoker_False',
       'cat_tr__smoker_True', 'cat_tr__region_northeast',
       'cat_tr__region_northwest', 'cat_tr__region_southeast',
       'cat_tr__region_southwest'], dtype=object)

In [13]:
pd.DataFrame(X_train_transformed, 
             columns=preprocessor.get_feature_names_out()
            ).head()

Unnamed: 0,num_tr__age,num_tr__bmi,cat_tr__smoker_False,cat_tr__smoker_True,cat_tr__region_northeast,cat_tr__region_northwest,cat_tr__region_southeast,cat_tr__region_southwest
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.464726,-1.338815,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.535507,-0.316911,1.0,0.0,0.0,1.0,0.0,0.0


🤔 What happened to children column ? What if we want to keep it untouched?

👉 remainder=passthrough

In [14]:
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['region','smoker'])],
    remainder='passthrough')
preprocessor

In [15]:
X_new = preprocessor.fit_transform(X_train)
pd.DataFrame(X_new).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.45566,-0.479092,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-1.526441,0.492337,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.36491,0.0,0.0,1.0,0.0,1.0,0.0,3.0


In [16]:
pd.DataFrame(X_new, 
             columns=preprocessor.get_feature_names_out()
            ).head(3)

Unnamed: 0,num_tr__age,num_tr__bmi,cat_tr__region_northeast,cat_tr__region_northwest,cat_tr__region_southeast,cat_tr__region_southwest,cat_tr__smoker_False,cat_tr__smoker_True,remainder__children
0,-1.45566,-0.479092,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-1.526441,0.492337,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.36491,0.0,0.0,1.0,0.0,1.0,0.0,3.0


### **c) Custom: Function Transformer →**

- Encapsulates a python function into a scikit transformer (→) object

- Can work with Pipelines (→ →) or with ColumnTransformers (⑂)

In [17]:
# from sklearn.preprocessing import FunctionTransformer

# Create a transformer that compresses data to 2 digits (for instance!)
# rounder = FunctionTransformer(np.round)
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [18]:
# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)])

preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['bmi', 'age']),
    ('cat_tr', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough')
preprocessor

In [19]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.49,-1.53,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class MyCustomTranformer(TransformerMixin, BaseEstimator): 
    # BaseEstimator generates get_params() and set_params() methods that all pipelines require
    # TransformerMixin creates fit_transform() method from fit() and transform()
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # Store here what needs to be stored during .fit(X_train) as instance attributes.
        # Return "self" to allow chaining .fit().transform()
        self.mymean = X.mean()
        self.mystd = X.std()
        return self
    
    def transform(self, X, y=None):
        # Return result as dataframe for integration into ColumnTransformer
        pass

### **d) FeatureUnion | |**

- Applies transformers in paralell

- Concatenate feature matrices outputs of each transformer

👉 Usefull to create whole new features

ex: let's build and add a new bmi_age_ratio feature

In [21]:
# from sklearn.pipeline import FeatureUnion

# Create a custom transformer that multiplies two columns
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-8
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new colums 9
])
union

In [22]:
pd.DataFrame(union.fit_transform(X_train)).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.468421


### **Summary with make_*** shortcuts ⚡️**

In [20]:
# from sklearn.pipeline import make_pipeline
# from sklearn.pipeline import make_union
# from sklearn.compose import make_column_transformer

Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# Equivalent to
make_pipeline(SimpleImputer(), StandardScaler())

In [23]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer((num_transformer, ['age', 'bmi']),
                                       (cat_transformer, ['smoker', 'region']),
                                       remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

**make_column_selector select features automatically based on dtype**

In [24]:
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [25]:
# from sklearn.compose import make_column_selector

num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

**🎉 Complete preprocessing pipeline 🎉**

In [26]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

## Including Models to Pipelines

- Model objects can be plugged into pipelines
- Pipelines inherit the methods of the last object in the sequence
    - Transformers: ***fit*** and ***transform***
    - Models: ***fit, score, predict***, etc...

### a) Full pipe

In [27]:
# from sklearn.linear_model import Ridge

# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough')

# Add Estimator
pipe = make_pipeline(preproc, Ridge())
pipe

In [28]:
# Train pipeline
pipe.fit(X_train,y_train)

# Make predictions
pipe.predict(X_test.iloc[0:2])

# Score model
pipe.score(X_test,y_test)

0.7472459359430912

### b) Cross validate a pipeline

In [30]:
# from sklearn.model_selection import cross_val_score

# Cross validate pipeline
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.7463235584349777

### c) Grid search a pipeline

- Check which combination of preprocessing/modelling hyperparameters work best
- It is possible to grid search hyperparameters of any component of the pipeline
- Sklearn Syntax: step_name__transformer_name__hyperparam_name
- Check available hyperparameters pipe.get_params()

In [31]:
# Inspect all pipe components parameters to find the one you want to gridsearch
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fb4686fd8b0>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fb46b71e490>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                

In [32]:
# from sklearn.model_selection import GridSearchCV

# Instanciate grid search
grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the pipeline, as far back as you want
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]},
    cv=5,
    scoring="r2")

grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [33]:
pipe_tuned = grid_search.best_estimator_

In [34]:
pipe_tuned

## Debug your pipe

In [35]:
# Access component of pipeline with `name_steps`
pipe_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [38]:
# Check intermediate steps
pipe_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

(1100, 9)

## Exporting models/pipelines

In [39]:
# import pickle

# Export pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# Load pipeline from pickle file
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

my_pipeline.score(X_test, y_test)

0.7472449607503218

## Auto ML

In [None]:
#!pip install TPOT
#!pip install torch

In [40]:
preproc_basic

In [41]:
import os
from tpot import TPOTRegressor

X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)



In [42]:
# instanciate TPOTClassifier
tpot = TPOTRegressor(generations=4, population_size=20, verbosity=2, scoring='r2', n_jobs=-1, cv=2)
# process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# print score
print(tpot.score(X_test_preproc, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8484712148914415

Generation 2 - Current best internal CV score: 0.8486540720593998

Generation 3 - Current best internal CV score: 0.8486540720593998

Generation 4 - Current best internal CV score: 0.8496492302216051

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=11, min_samples_split=12, n_estimators=100)
0.8632426171819684


In [43]:
# export TPOT pipeline to a python file
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8496492302216051
exported_pipeline = RandomForestRegressor(bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=11, min_samples_split=12, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


