In [1]:
!pip install --quiet pytorch-lightning==2.1.0 sentencepiece==0.1.99 transformers==4.34.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import joblib
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from torchmetrics.functional.classification import accuracy, auroc, multilabel_f1_score, roc, binary_f1_score
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, multilabel_confusion_matrix

In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [4]:
RANDOM_SEED = 42

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8
pl.seed_everything(RANDOM_SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 42


42

In [5]:
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks/ultimate"

In [6]:
LABEL_COLUMNS = joblib.load(f"{ROOT_PATH}/models/ml_binarizer.pkl").classes_

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
def load_preds(file_path: str):
    """
    Load the predictions for evaulation.
    """
    if os.path.isfile(file_path):
        preds = torch.load(file_path)
        return preds
    else:
        print(f"Invalid File path: {file_path}")


In [8]:
!pip install --quiet nannyml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.5/260.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.6/133.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [9]:
import nannyml as nml
from IPython.display import display

In [10]:
train_preds = load_preds(f"{ROOT_PATH}/results/evaluation/train.pt")
val_preds = load_preds(f"{ROOT_PATH}/results/evaluation/val.pt")
test_preds = load_preds(f"{ROOT_PATH}/results/evaluation/test.pt")

In [11]:
LABEL_COLUMNS

array(['abbreviation', 'aircraft', 'airfare', 'airline', 'airport',
       'capacity', 'cheapest', 'city', 'distance', 'flight', 'flight_no',
       'flight_time', 'ground_fare', 'ground_service', 'meal', 'quantity',
       'restriction'], dtype=object)

# Task:

You have both of the models in production and no labeled data is available to you. How would you compare them? Which metrics would you use for this kind of comparison? For example, you can use metrics based on confidence values or related ones.
---

-  We can use confidence based approach here. [NannyML](https://nannyml.readthedocs.io/en/stable/), provides some intereting methods help monitor our models in production.

- Confidence-based Performance Estimation (CBPE), is one such approach which can be used here to estimate the performance based on a reference data. Here is some relevant detail for us:

    * Estimation of Performance of the monitored model, in production use cases when we do not have the ground truth data. CBPE provides an unbiased estimation of the performance of the monitored model based on the monitored outputs only (when ground truth is unavailalbe).
    * CBPE suffers from concept-drift, and works well with models which are well-caliberated.
    * CBPE will remain accurate under data drift i.e. when distribution of inputs P(X) changes but probability of target given inputs P(Y|X) stays the same (or in other words - if probabilities remain well-calibrated)

## Approach

Since we do not have multiple models and not a live production system, so we will simulate both.

* To simulate two different model, we are going to use different threshold on predicted probability to result in different predictions.
* We will use training data as the `reference` data and validation as the `analysis` data (as the name suggests, we will be estimating the models performance on this data).
    * To simulate the production scenario where we do not have ground truth, we will not be using the `true_labels` in the `validation` data in our analysis

* NannyML library does not provide any method for method for `multilabel` problems, but it provides methods for `binary-classification` problems, so we are going to treat our problem as multiple binary classification problems (which is what multilabel classifcation is in reality).

* In particular we are using `CBPE` estimator and we will be monitoring the `f1` score as it balances the precision and recall.

* Based on the `CBPE` estimator we are going to calculate the number of alerts incidents when the predicted probability is not in the estimated in the confidence threshold range.

In [58]:
LABEL_TO_IDX = {label: idx for idx, label in enumerate(LABEL_COLUMNS)}

In [59]:
LABEL_TO_IDX

{'abbreviation': 0,
 'aircraft': 1,
 'airfare': 2,
 'airline': 3,
 'airport': 4,
 'capacity': 5,
 'cheapest': 6,
 'city': 7,
 'distance': 8,
 'flight': 9,
 'flight_no': 10,
 'flight_time': 11,
 'ground_fare': 12,
 'ground_service': 13,
 'meal': 14,
 'quantity': 15,
 'restriction': 16}

In [60]:
def simulated_evaluation_data_df(label: str,
                                 threshold: float,
                                 ref_preds: torch.Tensor,
                                 analysis_preds: torch.Tensor):
    """
    Function to simulate the data for evaluation in production.
    The data will be used to train a label specific estimator for comparing models based on estimated performance.
    """
    if label in LABEL_TO_IDX:
        idx = LABEL_TO_IDX[label]
        ref_df = pd.DataFrame({"y_pred_proba": train_preds["preds"][:,idx].numpy().tolist(),
                                    "y_pred": np.array(train_preds["preds"][:,idx].numpy() > threshold, dtype=int).tolist() ,
                                    "y_true": train_preds["labels"][:,idx].numpy().tolist()})

        analysis_1_df = pd.DataFrame({"y_pred_proba": val_preds["preds"][:,idx].numpy().tolist(),
                                        "y_pred": np.array(val_preds["preds"][:,idx].numpy() > threshold, dtype=int).tolist()})


        # Generate Gaussian noise with the same shape as predictions (std-dev-0.1)
        noise = torch.normal(0, 0.1, size=val_preds["preds"][:,idx].size())
        # Add the noise to the predictions
        noisy_predictions = val_preds["preds"][:,idx] + noise
        # Clip the values to ensure they are within [0, 1]
        noisy_predictions = torch.clamp(noisy_predictions, 0., 1.)
        analysis_2_df = pd.DataFrame({"y_pred_proba": noisy_predictions.numpy().tolist(),
                                        "y_pred": np.array(val_preds["preds"][:,idx].numpy() > threshold, dtype=int).tolist()})

        return {"ref": ref_df, "analysis_1": analysis_1_df, "analysis_2": analysis_2_df}
    else:
        raise Exception("Unknown Label")

In [95]:
simulated_data = {label: simulated_evaluation_data_df(label, 0.2, train_preds, val_preds) for label in LABEL_COLUMNS}

In [107]:
def train_cbpe_estimator(simulated_data: dict, label: str, plots=False, tabular=False, verbose=True):
    """
    Train the Confidence-based Performance Estimation (CBPE) on reference data
    and perform analysis on the analysis datasets.
    """
    estimator = nml.CBPE(
        y_pred_proba='y_pred_proba',
        y_pred='y_pred',
        y_true='y_true',
        metrics=['f1'],
        problem_type='classification_binary',
    )

    ref = simulated_data[label]["ref"]
    estimator.fit(ref)
    results_1 = estimator.estimate(simulated_data[label]["analysis_1"])
    results_2 = estimator.estimate(simulated_data[label]["analysis_2"])

    if tabular:
        display(results_1.filter(period='analysis').to_df())
        display(results_2.filter(period='analysis').to_df())
    if plots:
        results_1.plot().show()
        results_2.plot().show()


    num_alerts_1 = results_1.filter(period='analysis').to_df()["f1", "alert"].sum()
    num_alerts_2 = results_2.filter(period='analysis').to_df()["f1", "alert"].sum()
    if verbose:
        print("Analysis1: Number of alerts: ", num_alerts_1)
        print("Analysis2: Number of alerts: ", num_alerts_2)

    return {"analysis_1": results_1, "analysis_2": results_2, "num_alerts_1": num_alerts_1, "num_alerts_2": num_alerts_2}

In [97]:
LABEL_COLUMNS

array(['abbreviation', 'aircraft', 'airfare', 'airline', 'airport',
       'capacity', 'cheapest', 'city', 'distance', 'flight', 'flight_no',
       'flight_time', 'ground_fare', 'ground_service', 'meal', 'quantity',
       'restriction'], dtype=object)

## Exploring the results on analysis data

In [98]:
analysis_result_airfare = train_cbpe_estimator(simulated_data, "airfare", tabular=False, plots=True)



Analysis1: Number of alerts:  1
Analysis2: Number of alerts:  10


In [99]:
analysis_result_city = train_cbpe_estimator(simulated_data, "city", tabular=False, plots=True)



Analysis1: Number of alerts:  7
Analysis2: Number of alerts:  8


In [100]:
analysis_result_distance = train_cbpe_estimator(simulated_data, "distance", tabular=False, plots=True)



Analysis1: Number of alerts:  10
Analysis2: Number of alerts:  10


In [101]:
analysis_result_airfare = train_cbpe_estimator(simulated_data, "airfare", tabular=False, plots=True)



Analysis1: Number of alerts:  1
Analysis2: Number of alerts:  10


In [102]:
analysis_result_flight = train_cbpe_estimator(simulated_data, "flight", tabular=False, plots=True)



Analysis1: Number of alerts:  10
Analysis2: Number of alerts:  10


In [108]:
# `Total Number of generated alerts` as a simple metric for the estimation of model in production

import warnings
warnings.filterwarnings('ignore')

# creating report of total number of alerts across all the intent classe
report = []
for label in LABEL_COLUMNS:
    data = train_cbpe_estimator(simulated_data, label, tabular=False, plots=False, verbose=False)
    data["intent"] = label
    report.append({"intent": data["intent"], "alerts_1": data["num_alerts_1"], "alerts_2": data["num_alerts_2"]})

report = pd.DataFrame(report)
report



Unnamed: 0,intent,alerts_1,alerts_2
0,abbreviation,10,10
1,aircraft,10,10
2,airfare,1,10
3,airline,10,10
4,airport,9,9
5,capacity,10,10
6,cheapest,0,0
7,city,7,8
8,distance,10,10
9,flight,10,10


In [110]:
print("Total number of alerts from model 1:",report["alerts_1"].sum())
print("Total number of alerts from model 2:",report["alerts_2"].sum())

Total number of alerts from model 1: 106
Total number of alerts from model 2: 118


## Comments:

* The approach taken here uses model confidence to estimate the thresholds on predicted probabilities and estimates the metric (here `f1`).
* As it can be seen from the above, we can estimate that the `model1` performs better than `model2` in production based on the number of alert incident generated.


Caveats:
* We can obverve from the analysis plots, that thresholds for some classes is estimated to be very high hence resulting in lots of alerts. This indicates the need for caliberation of probabilitis for the reference model. We can also do manual analysis and set constant threshold values as is supported by `nannyml`, which I leave for further exploration.