# MACE with Sparse arrays

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from scipy.sparse import csr_matrix 
from scipy.sparse import random as sp_rand

import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib import cm
import seaborn as sns 

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

from macest.classification import models as clmod
from macest.classification import plots as clplot

import nmslib 

In [3]:
sns.set_context("notebook")
sns.set_style('darkgrid')

## Generate random sparse data, we will use the scipy sparse csr matrix

In [4]:
n_rows = 10**3
n_cols = 5 * 10**3

In [5]:
X = csr_matrix(sp_rand(n_rows, n_cols))
y = np.random.randint(0, 2, n_rows)

In [6]:
X

<1000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 50000 stored elements in Compressed Sparse Row format>

In [7]:
type(X) == csr_matrix

True

In [8]:
print(X.dtype)

float64


In [9]:
X_pp_train, X_conf_train, y_pp_train, y_conf_train  = train_test_split(X, y, test_size=0.66, random_state=10)
X_conf_train, X_cal, y_conf_train, y_cal = train_test_split(X_conf_train, y_conf_train,
                                                            test_size=0.5, random_state=0)
X_cal, X_test, y_cal,  y_test, = train_test_split(X_cal, y_cal, test_size=0.5, random_state=0)

In [10]:
model = RandomForestClassifier(random_state =0,
                               n_estimators =800,
                               n_jobs =-1)

model.fit(csr_matrix(X_pp_train), y_pp_train)

### When using sparse matrices we need to tell the Search method to use sparse data, we do this with the space argument and data type argument passed to the graph. We will use the cosine space as a measure of similarity here

In [11]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))
neighbour_search_params = clmod.HnswGraphArgs(query_args = dict(ef = 1100), 
                                              init_args = dict(method="hnsw",
                                                               space="cosinesimil_sparse",
                                                               data_type=nmslib.DataType.SPARSE_VECTOR))

In [12]:
macest_model = clmod.ModelWithConfidence(model,
                                         X_conf_train,
                                         y_conf_train, 
                                         search_method_args=neighbour_search_params)

macest_model.fit(X_cal, y_cal)

In [13]:
preds = model.predict(X_test)
rf_conf_preds = model.predict_proba(X_test)
macest_point_prediction_conf = macest_model.predict_confidence_of_point_prediction(X_test) 

#### Also note here that the rf confidence estimates are over-confident a lot of the time, the labels are random therefore a good confidence estimate should be  ~ 50/50. We see below that MACEst correctly learns this

In [14]:
print(np.max(rf_conf_preds, axis=1))

[0.70375 0.8025  0.6     0.64875 0.635   0.53    0.555   0.69125 0.5925
 0.67625 0.64875 0.76    0.6425  0.63875 0.5525  0.69125 0.635   0.63625
 0.70375 0.68875 0.675   0.585   0.61375 0.61375 0.6875  0.53875 0.685
 0.73625 0.5975  0.72125 0.7225  0.58875 0.63375 0.61125 0.58    0.74875
 0.60375 0.6675  0.605   0.62625 0.7225  0.50875 0.6175  0.69375 0.6675
 0.74125 0.61625 0.63625 0.55625 0.58875 0.725   0.62625 0.59375 0.6325
 0.59125 0.56375 0.70125 0.66875 0.685   0.55625 0.6     0.6325  0.5475
 0.71    0.61625 0.655   0.6025  0.69    0.62625 0.61375 0.76    0.5325
 0.6675  0.585   0.57375 0.64    0.59125 0.62625 0.59    0.6125  0.5175
 0.655   0.54625 0.555   0.61875 0.55875 0.695   0.5025  0.6475  0.65875
 0.66625 0.6125  0.76375 0.6525  0.75375 0.6475  0.61125 0.5575  0.63
 0.65875 0.68    0.61125 0.56125 0.5425  0.60875 0.51125 0.575   0.65625
 0.655   0.6525  0.5975  0.665   0.605   0.61    0.615   0.5775  0.51375
 0.52125 0.64625 0.6725  0.71375 0.5475  0.6425  0.715   0.501

In [15]:
print(macest_point_prediction_conf)

[0.59487106 0.57266399 0.59595903 0.5960263  0.59572118 0.59873666
 0.59260776 0.59362389 0.59737518 0.59412792 0.59454327 0.59755493
 0.59400505 0.58259815 0.594659   0.58351116 0.57113512 0.59473492
 0.59614569 0.58257615 0.59523294 0.596121   0.59598689 0.5834981
 0.59840931 0.42828906 0.59759749 0.59500512 0.59593662 0.59670729
 0.58154355 0.59350248 0.59374335 0.59586743 0.54183203 0.59449364
 0.59388544 0.59448501 0.5714903  0.59832047 0.58119105 0.59527867
 0.59873971 0.59603496 0.57100381 0.58103343 0.58246355 0.58122099
 0.59466362 0.5803304  0.59454239 0.59568834 0.59440366 0.59335104
 0.59440225 0.5938606  0.59620664 0.59346594 0.59334467 0.58194153
 0.59350063 0.58168441 0.59800347 0.59503156 0.59555559 0.57168772
 0.59428156 0.59536565 0.59506325 0.58024393 0.56832245 0.59559276
 0.58025743 0.59668309 0.58103038 0.5673401  0.59512612 0.59596073
 0.59327091 0.5946641  0.59535757 0.5932177  0.5928846  0.59517873
 0.59580844 0.59518757 0.58257852 0.57499098 0.59571429 0.59549

## If we pass a sparse array without telling hnsw what happens ?

In [16]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))

In [17]:
macest_model = clmod.ModelWithConfidence(model,
                                         X_conf_train,
                                         y_conf_train,)

ValueError: Training data type and space are not compatible, your space is l2 and training data type is <class 'scipy.sparse._csr.csr_matrix'>.Sparse metric: False. Sparse data: True. Initialisation args: {'method': 'hnsw', 'space': 'l2'}.

As we did not specify the sparse type, this resulted in an error. As expected.