In [1]:
%reload_kedro

2022-01-07 12:47:24,691 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-01-07 12:47:24,730 - root - INFO - ** Kedro project Kedro Classification
2022-01-07 12:47:24,730 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-01-07 12:47:24,740 - root - INFO - Registered line magic `run_viz`


In [21]:
import numpy as np
import bisect
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted
from pandas import DataFrame
from logging import getLogger
from sklearn.model_selection import train_test_split

class InductiveConformalPredictor():
    """
    Standard Conformal Predictor with uncertainty non-conformity score.
    Args:
        predictor: classifier used in upstream task.

    FROM: https://medium.com/data-from-the-trenches/measuring-models-uncertainty-with-conformal-prediction-f6aa8debb50e
    """

    def __init__(self, predictor):
        self.predictor = predictor
        check_is_fitted(self.predictor, attributes=["classes_"])

        self._le = LabelEncoder()
        self.classes = self._le.fit_transform(predictor.classes_)

    def fit(self, X, y):
        self.calibration_score = self._uncertainty_conformity_score(X)
        self.calibration_class = self._le.transform(y)
        return self

    def _uncertainty_conformity_score(self, data):
        uncertainty_score = 1 - self.predictor.predict_proba(data)
        return uncertainty_score

    def predict_proba(self, X, mondrian=True):
        check_is_fitted(self, attributes=["calibration_score"])

        conformity_score = self._uncertainty_conformity_score(X)
        conformal_pred = np.zeros(conformity_score.shape)

        for c in self.classes:
            if mondrian:
                calibration_filt = self.calibration_score[self.calibration_class == c]
                calib = calibration_filt[:, c]
            else:
                calib = self.calibration_score[range(len(self.calibration_class)), 
                                                          self.calibration_class]

            sorted_calib = np.sort(calib)
            conformal_pred[:, c] = [float(bisect.bisect(sorted_calib, x))/len(calib)
                                    for x in conformity_score[:, c]]

        return conformal_pred

    def predict(self, X, mondrian=True, alpha=0.05):
        _conformal_proba = self.predict_proba(X=X, mondrian=mondrian)
        conformal_pred = (_conformal_proba > alpha).astype(int)

        mlb = MultiLabelBinarizer()
        mlb.fit([self._le.classes_])
        pred = mlb.inverse_transform(conformal_pred)

        return pred




def return_conformity_scores(
    data, params, model
) -> DataFrame:
    cfm = InductiveConformalPredictor(predictor=model)

    X, Y = data[params.get('features')], data[params.get('target')]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.20, random_state=42)

    cfm.fit(X_train, y_train)

    y_test_conf = cfm.predict(X, alpha=0.25)

    data = data.copy()

    data['y_test_conf'] = y_test_conf 
    
    return data

In [22]:
data_test = catalog.load('scored_test')
model = catalog.load('model')

d2 = return_conformity_scores(data_test, context.params, model)

2022-01-07 12:54:34,119 - kedro.io.data_catalog - INFO - Loading data from `scored_test` (CSVDataSet)...
2022-01-07 12:54:34,169 - kedro.io.data_catalog - INFO - Loading data from `model` (PickleDataSet)...


In [23]:
d2.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_44,col_45,col_46,col_47,col_48,col_49,prob,pred,target,y_test_conf
0,1.647943,0.071771,1.836811,-1.595237,0.968375,-0.940628,-0.123829,0.264213,-1.125585,1.161435,...,-1.508175,3.349051,1.376435,-0.516511,4.204935,-0.257704,0.14,0,0,"(0, 1)"
1,-0.855277,5.471114,-3.625721,1.206801,-1.14205,-1.269227,-1.354375,-1.119926,-0.177536,1.043657,...,1.600557,4.61055,0.052992,0.441464,4.39031,0.951909,0.02,0,0,"(0, 1)"
2,0.667976,3.441049,6.38976,0.201343,-0.62656,-0.704253,0.251808,-1.516564,-0.299735,1.963431,...,-3.572313,-4.221592,-0.003879,-3.658716,5.833175,-0.745341,0.0,0,0,"(1,)"
3,3.005024,-5.843523,0.337008,-0.194097,1.077622,-0.354436,1.241825,-0.03421,-1.169102,-0.687605,...,-0.612483,2.126298,-1.198277,0.695897,-2.755106,2.053477,0.01,0,0,"(0, 1)"
4,4.713855,1.969048,-8.680431,-1.891758,0.52236,-0.109662,0.285357,-1.531064,1.225086,0.417965,...,-3.332199,1.316492,-0.338031,-0.156429,-0.346083,0.302137,0.05,0,0,"(0, 1)"


In [24]:
d2.y_test_conf.value_counts()

(0, 1)    3065
(1,)       553
(0,)       132
Name: y_test_conf, dtype: int64

In [25]:
d2.groupby(['y_test_conf'])['prob', 'target'].agg([max, min, 'mean'])

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,prob,prob,prob,target,target,target
Unnamed: 0_level_1,max,min,mean,max,min,mean
y_test_conf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"(0,)",1.0,0.88,0.924091,1,1,1.0
"(0, 1)",0.87,0.01,0.145794,1,0,0.141925
"(1,)",0.0,0.0,0.0,1,0,0.009042


In [26]:
from sklearn.metrics import confusion_matrix
from numpy import bincount

cats = d2.y_test_conf.unique()

for cat in cats:
    d = d2[d2.y_test_conf == cat].copy()
    print("Mean: %.2f" % d.target.mean())
    print("Bincount: %s" % bincount(d.target))
    print()

Mean: 0.14
Bincount: [2630  435]

Mean: 0.01
Bincount: [548   5]

Mean: 1.00
Bincount: [  0 132]



In [27]:
data_test = catalog.load('catboost.scored_test')
model = catalog.load('catboost.model')

d2 = return_conformity_scores(data_test, context.params, model)

2022-01-07 12:54:41,806 - kedro.io.data_catalog - INFO - Loading data from `catboost.scored_test` (CSVDataSet)...
2022-01-07 12:54:41,856 - kedro.io.data_catalog - INFO - Loading data from `catboost.model` (PickleDataSet)...


In [28]:
d2.y_test_conf.value_counts()

(0, 1)    2807
(1,)       801
(0,)       142
Name: y_test_conf, dtype: int64

In [29]:
d2.groupby(['y_test_conf'])['prob', 'target'].agg([max, min, 'mean'])

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,prob,prob,prob,target,target,target
Unnamed: 0_level_1,max,min,mean,max,min,mean
y_test_conf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"(0,)",1.0,0.9999444,0.9999881,1,0,0.992958
"(0, 1)",0.999942,2.702996e-06,0.1271759,1,0,0.150338
"(1,)",3e-06,1.89509e-10,8.627684e-07,1,0,0.011236


In [30]:
from sklearn.metrics import confusion_matrix
from numpy import bincount

cats = d2.y_test_conf.unique()

for cat in cats:
    d = d2[d2.y_test_conf == cat].copy()
    print("Mean: %.2f" % d.target.mean())
    print("Bincount: %s" % bincount(d.target))
    print()

Mean: 0.15
Bincount: [2385  422]

Mean: 0.01
Bincount: [792   9]

Mean: 0.99
Bincount: [  1 141]

