# The importance of good co-ordinates when using MACEst

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd

import seaborn as sns 
from tqdm.autonotebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale

from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss, brier_score_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NeighborhoodComponentsAnalysis

from macest.classification import models as clmod
from macest.classification import plots as clplot




In [3]:
# sns.set_context("talk")
# sns.set_context("poster")
sns.set_context("notebook")
sns.set_style('darkgrid')

In [4]:
cols = ['lettr',
'x-box',
'y-box',
'width',
'high',
'onpix',
'x-bar',
'y-bar',
'x2bar',
'y2bar',
'xybar',
'x2ybr',
'xy2br',
'x-ege',
'xegvy',
'y-ege',
'yegvx']

### We will try to add confidence to a classic ML challenge, classifying images of letters based upon some statistical attributes (https://archive.ics.uci.edu/ml/datasets/letter+recognition)

In [5]:
letters_df = pd.read_csv('/Users/Rhys/Documents/Oracle_placement/Modelling_Uncertainty/experiments/Letter_recognition/letter-recognition.data', header = None, names = cols).sample(frac=1)
letters_df.reset_index(drop=True, inplace=True)

FileNotFoundError: [Errno 2] File /Users/Rhys/Documents/Oracle_placement/Modelling_Uncertainty/experiments/Letter_recognition/letter-recognition.data does not exist: '/Users/Rhys/Documents/Oracle_placement/Modelling_Uncertainty/experiments/Letter_recognition/letter-recognition.data'

In [None]:
y = letters_df['lettr']
X = letters_df.drop('lettr', axis =1)

### In the original feature space we have lots of correlated variables, and the feature importance is unlikely to be even amongst all of them

In [None]:
plt.figure(figsize = (12,10))
sns.heatmap(X.corr(),
            cmap = 'coolwarm',
            annot = True,
            vmin = -1.1, vmax =1.1)

In [None]:
X = X/16

In [None]:
enc = LabelEncoder()
y = enc.fit_transform(y)

In [None]:
X_pp_train, X_conf_train, y_pp_train, y_conf_train  = train_test_split(X, y, test_size=0.66, random_state=0)

X_conf_train, X_cal, y_conf_train, y_cal = train_test_split(X_conf_train, y_conf_train,
                                                            test_size=0.4, random_state=0)
X_cal, X_test, y_cal,  y_test, = train_test_split(X_cal, y_cal, test_size=0.4, random_state=0)

In [None]:
print(X_pp_train.shape[0])
print(X_conf_train.shape[0])
print(X_cal.shape[0])
print(X_test.shape[0])

In [None]:
model = RandomForestClassifier(random_state =0,
                               n_estimators =800,
                               n_jobs =-1)
model.fit(X_pp_train, y_pp_train)

In [None]:
# sns.barplot(X.columns,  model.feature_importances_)

In [None]:
print(model.score(X_pp_train, y_pp_train))
print(model.score(X_conf_train, y_conf_train))
print(model.score(X_cal, y_cal))
print(model.score(X_test, y_test))

### We will use the L2 metric, this is implictly saying that our measure of similarity between data points is the euclidean distance in feature space

In [None]:
param_bounds = clmod.SearchBounds(k_bounds = (3,50))
neighbour_search_params = clmod.HnswGraphArgs(query_args = dict(ef = 1000))
optimiser_args = dict(popsize = 25)

In [None]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train, 
                                      search_method_args = neighbour_search_params)

macest_model.fit(X_cal, y_cal, param_range = param_bounds, optimiser_args= optimiser_args)

## Unseen data

In [None]:
preds = model.predict(X_test)
conf_preds = macest_model.predict_proba(X_test)
rf_conf = model.predict_proba(X_test)
rf_point_prediction_conf = np.amax(rf_conf, axis=1)
macest_point_prediction_conf = macest_model.predict_confidence_of_point_prediction(X_test)

In [None]:
clplot.plot_calibration_curve([rf_point_prediction_conf,
                              macest_point_prediction_conf], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_quantile_spaced_calibration_curve([rf_point_prediction_conf,
                             macest_point_prediction_conf,
                                ], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_calibration_metrics([rf_point_prediction_conf,
                             macest_point_prediction_conf,
                                ], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_forecast_metrics([rf_point_prediction_conf,
                             macest_point_prediction_conf], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

## We see that MACEst performs better than the raw estimates from the random forest however it's still not great, let's try inducing a better co-ordinate system 

### PCA

In [None]:
X_pp_train, X_conf_train, y_pp_train, y_conf_train  = train_test_split(X, y, test_size=0.66, random_state=0)

pca = PCA(n_components=0.95, whiten = True)
pca.fit(X_pp_train)

X_pp_train = pca.transform(X_pp_train)
X_conf_train = pca.transform(X_conf_train)

X_conf_train, X_cal, y_conf_train, y_cal = train_test_split(X_conf_train, y_conf_train,
                                                            test_size=0.4, random_state=0)
X_cal, X_test, y_cal,  y_test, = train_test_split(X_cal, y_cal, test_size=0.4, random_state=0)

In [None]:
model = RandomForestClassifier(random_state =0,
                               n_estimators =800,
                               n_jobs =-1)
model.fit(X_pp_train, y_pp_train)

In [None]:
X_cal.shape

In [None]:
print(model.score(X_pp_train, y_pp_train))
print(model.score(X_conf_train, y_conf_train))
print(model.score(X_cal, y_cal))
print(model.score(X_test, y_test))

### We are now saying points are similar if the euclidean distance between their projection is whitned pca space (similar to mahalanobis) is small

In [None]:
neighbour_search_params = clmod.HnswGraphArgs(init_args = dict(method = 'hnsw',
                                                               space = 'l2'))

In [None]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train, 
                                      search_method_args = neighbour_search_params)

macest_model.fit(X_cal, y_cal, param_range = param_bounds, optimiser_args= optimiser_args)

In [None]:
preds = model.predict(X_test)
conf_preds = macest_model.predict_proba(X_test)
rf_conf = model.predict_proba(X_test)
rf_point_prediction_conf = np.amax(rf_conf, axis=1)
macest_point_prediction_conf = macest_model.predict_confidence_of_point_prediction(X_test)

In [None]:
clplot.plot_calibration_curve([rf_point_prediction_conf,macest_point_prediction_conf], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_quantile_spaced_calibration_curve([rf_point_prediction_conf, macest_point_prediction_conf], 
                                              ['Uncalibrated RF','MACEst' ],
                                              preds, y_test)

In [None]:
clplot.plot_calibration_metrics([rf_point_prediction_conf, macest_point_prediction_conf], 
                                 ['Uncalibrated RF','MACEst' ],
                                 preds, y_test)

In [None]:
clplot.plot_forecast_metrics([rf_point_prediction_conf,
                             macest_point_prediction_conf,], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

## That is better but still not great, can we do better ?

### MACEst works by finding a set of nearest neighbours and then uses the distance to these k neighbours as a proxy for the epistemic uncertainty, because of this paradigm the natural metric to one which induces a good nearest neighbour distance. This method exists and is known as neighbourhood component analysis (https://www.cs.toronto.edu/~hinton/absps/nca.pdf)
#### Let's compare the results if we use this metric

In [None]:
X_pp_train, X_conf_train, y_pp_train, y_conf_train  = train_test_split(X, y, test_size=0.66, random_state=0)

nca = NeighborhoodComponentsAnalysis(n_components = X_pp_train.shape[1] ,
                                     max_iter = 30,
                                     verbose =1)
nca.fit(X_pp_train, y_pp_train)

X_pp_train = nca.transform(X_pp_train)
X_conf_train = nca.transform(X_conf_train)

X_conf_train, X_cal, y_conf_train, y_cal = train_test_split(X_conf_train, y_conf_train,
                                                            test_size=0.4, random_state=0)
X_cal, X_test, y_cal,  y_test, = train_test_split(X_cal, y_cal, test_size=0.3, random_state=0)

In [None]:
model = RandomForestClassifier(random_state =0,
                               n_estimators =800,
                               n_jobs =-1)
model.fit(X_pp_train, y_pp_train)

In [None]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train, 
                                      search_method_args = neighbour_search_params)

macest_model.fit(X_cal, y_cal, param_range = param_bounds, optimiser_args= optimiser_args)

In [None]:
print(model.score(X_pp_train, y_pp_train))
print(model.score(X_conf_train, y_conf_train))
print(model.score(X_cal, y_cal))
print(model.score(X_test, y_test))

In [None]:
preds = model.predict(X_test)
conf_preds = macest_model.predict_proba(X_test)
rf_conf = model.predict_proba(X_test)
rf_point_prediction_conf = np.amax(rf_conf, axis=1)
macest_point_prediction_conf = macest_model.predict_confidence_of_point_prediction(X_test)

In [None]:
clplot.plot_calibration_curve([rf_point_prediction_conf,
                              macest_point_prediction_conf,], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_quantile_spaced_calibration_curve([rf_point_prediction_conf,
                             macest_point_prediction_conf,
                                ], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

In [None]:
clplot.plot_calibration_metrics([rf_point_prediction_conf,
                                 macest_point_prediction_conf], 
                                 ['Uncalibrated RF','MACEst' ],
                                 preds, y_test)

In [None]:
clplot.plot_forecast_metrics([rf_point_prediction_conf,
                              macest_point_prediction_conf,], 
                              ['Uncalibrated RF','MACEst' ],
                              preds, y_test)

## In this metric we see that MACEst works very well