Comparing the Performance of Pipelines
===

Author: Nathan A. Mahynski

Date: 2023/08/31

Description: How can we determine whether one model or pipeline actually performs better than another?

In [2]:
using_colab = 'google.colab' in str(get_ipython())
if using_colab:
    !pip install git+https://github.com/mahynski/pychemauth@main

try:
    import pychemauth
except:
    raise ImportError("pychemauth not installed")

import matplotlib.pyplot as plt
%matplotlib inline

import watermark
%load_ext watermark

%load_ext autoreload
%autoreload 2

In [6]:
import imblearn.pipeline
import numpy as np
import sklearn.model_selection
from pychemauth.classifier.plsda import PLSDA
from sklearn.model_selection import GridSearchCV

In [None]:
# Report the environment used to run this notebook via the watermark extension loaded above.
%watermark -t -m -v --iversions

<h3>Load the Data</h3>

In [7]:
from sklearn.datasets import load_iris as load_data

# Request the iris data as pandas objects, already split into features (X) and target (y).
X, y = load_data(as_frame=True, return_X_y=True)

In [8]:
# Let's turn the integer class indices into species names.
names = dict(zip(np.arange(3), ['setosa', 'versicolor', 'virginica']))

# Values that are not keys of `names` (e.g. labels already converted on a
# previous run of this cell) pass through unchanged, so re-running the cell
# is safe instead of raising a KeyError.
y = y.apply(lambda label: names.get(label, label))

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. The split is shuffled with a fixed
# seed so it is reproducible, and stratified on the class labels so the train
# and test sets contain the same fraction of each class.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,  # Feature matrix as a plain array
    y.values,  # Class labels (species names) to predict
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

Comparing Several Models
---

It is important to note that:

* PLS-DA defines TEFF differently than the SIMCA authenticators do.
* Some of the models are authenticators, and some models are optimized in a "compliant" fashion while others are "rigorous".

Still, this illustrates the point of being able to compare different models.

In [None]:
from pychemauth.classifier.simca import SIMCA_Authenticator

# Single-step pipeline wrapping a DD-SIMCA authenticator for the 'setosa'
# class, configured with use='compliant'.
compliant_simca = imblearn.pipeline.Pipeline(
    steps=[
        (
            "simca",
            SIMCA_Authenticator(
                n_components=1,
                alpha=0.05,
                scale_x=True,
                style='dd-simca',
                target_class='setosa',
                use='compliant',
            ),
        ),
    ]
)

# Candidate numbers of components to try: 1, 2 and 3.
param_grid = [{'simca__n_components': np.arange(1, 4)}]

# Grid search using a shuffled, seeded 5-fold stratified split.
# The search is only configured in this cell; it is not fitted here.
gs_compliant_simca = GridSearchCV(
    estimator=compliant_simca,
    param_grid=param_grid,
    cv=sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    refit=True,
    error_score=0,
    n_jobs=-1,
)

In [None]:
# Single-step pipeline wrapping a DD-SIMCA authenticator for the 'setosa'
# class, configured with use='rigorous'.
rigorous_simca = imblearn.pipeline.Pipeline(
    steps=[
        (
            "simca",
            SIMCA_Authenticator(
                n_components=1,
                alpha=0.05,
                scale_x=True,
                style='dd-simca',
                target_class='setosa',
                use='rigorous',
            ),
        ),
    ]
)

# Candidate numbers of components to try: 1, 2 and 3.
param_grid = [{
    'simca__n_components': np.arange(1, 4),
}]

# Grid search using a shuffled, seeded 3-fold stratified split.
gs_rigorous_simca = GridSearchCV(
    estimator=rigorous_simca,
    param_grid=param_grid,
    n_jobs=-1,
    cv=sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
    error_score=0,
    refit=True,
)

# Fit the search defined in this cell. The original called `gs.fit`, but `gs`
# is only created in a later cell, so that either failed on a fresh kernel or
# fitted a different search.
_ = gs_rigorous_simca.fit(X_train, y_train)

In [None]:
# Single-step pipeline wrapping a soft PLS-DA model that labels unassigned
# samples 'UNKNOWN' and uses 'TEFF' as its score metric.
pipeline = imblearn.pipeline.Pipeline(
    steps=[
        (
            "plsda",
            PLSDA(
                n_components=5,
                alpha=0.05,
                scale_x=True,
                not_assigned='UNKNOWN',
                style='soft',
                score_metric='TEFF',
            ),
        ),
    ]
)

# Let's optimize the TEFF of the model and allow alpha to vary - this is a "compliant" approach.
param_grid = [
    {
        'plsda__n_components': [3, 4],
        'plsda__alpha': [0.07, 0.05, 0.03, 0.01],
    }
]

# Grid search using a shuffled, seeded 3-fold stratified split.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
    refit=True,
    error_score=0,
    n_jobs=-1,
)

# Run the search on the training split; the return value is discarded to keep the cell output clean.
_ = gs.fit(X_train, y_train)