# MACE with Sparse arrays

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from scipy.sparse import csr_matrix 
from scipy.sparse import random as sp_rand

import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib import cm
import seaborn as sns 

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

from macest.classification import models as clmod
from macest.classification import plots as clplot

import nmslib 

In [3]:
sns.set_context("notebook")
sns.set_style('darkgrid')

## Generate random sparse data, we will use the scipy sparse csr matrix

In [4]:
n_rows = 10**3
n_cols = 5 * 10**3

In [5]:
X = csr_matrix(sp_rand(n_rows, n_cols))
y = np.random.randint(0, 2, n_rows)

In [6]:
X

<1000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 50000 stored elements in Compressed Sparse Row format>

In [9]:
type(X)  == csr_matrix

True

In [10]:
print(X.dtype)

float64


In [11]:
X_pp_train, X_conf_train, y_pp_train, y_conf_train  = train_test_split(X, y, test_size=0.66, random_state=10)
X_conf_train, X_cal, y_conf_train, y_cal = train_test_split(X_conf_train, y_conf_train,
                                                            test_size=0.5, random_state=0)
X_cal, X_test, y_cal,  y_test, = train_test_split(X_cal, y_cal, test_size=0.5, random_state=0)

In [12]:
model = RandomForestClassifier(random_state =0,
                               n_estimators =800,
                               n_jobs =-1)

model.fit(csr_matrix(X_pp_train), y_pp_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

### When using sparse matrices we need to tell the Search method to use sparse data, we do this with the space argument and data type argument passed to the graph. We will use the cosine space as a measure of similarity here

In [13]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))
neighbour_search_params = clmod.HnswGraphArgs(query_args = dict(ef = 1100), 
                                             init_args = dict(method="hnsw",
                                                              space="cosinesimil_sparse",
                                                              data_type=nmslib.DataType.SPARSE_VECTOR) )

In [14]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train, 
                                      search_method_args =neighbour_search_params)

macest_model.fit(X_cal, y_cal)

In [15]:
preds = model.predict(X_test)
rf_conf_preds = model.predict_proba(X_test)
macest_point_prediction_conf = macest_model.predict_confidence_of_point_prediction(X_test) 

#### Also note here that the rf confidence estimates are over-confident a lot of the time, the labels are random therefore a good confidence estimate should be  ~ 50/50. We see below that MACEst correctly learns this

In [16]:
print(np.max(rf_conf_preds, axis=1))

[0.685   0.59875 0.665   0.7025  0.62125 0.5875  0.59    0.68375 0.5425
 0.62375 0.68    0.65375 0.54375 0.66    0.7     0.595   0.57625 0.625
 0.66375 0.69625 0.50375 0.63625 0.6125  0.6125  0.72625 0.64625 0.63125
 0.63625 0.6325  0.665   0.66875 0.7125  0.64875 0.63625 0.58875 0.61375
 0.6125  0.6225  0.5875  0.7375  0.65125 0.605   0.61    0.62375 0.67375
 0.60125 0.69    0.655   0.51375 0.625   0.63375 0.5025  0.66625 0.6775
 0.57125 0.68625 0.60375 0.6075  0.60625 0.6625  0.60375 0.64125 0.62375
 0.60375 0.6925  0.62    0.59375 0.64375 0.6875  0.5475  0.57125 0.635
 0.68125 0.7375  0.70625 0.73375 0.70625 0.60375 0.5375  0.7075  0.59625
 0.60375 0.675   0.72    0.615   0.7425  0.59625 0.56125 0.63125 0.5825
 0.57625 0.6425  0.62375 0.6     0.5875  0.665   0.5775  0.76875 0.65125
 0.655   0.68    0.6325  0.58125 0.595   0.71875 0.615   0.71125 0.695
 0.61125 0.5225  0.65875 0.67125 0.63875 0.525   0.6575  0.70125 0.56875
 0.6625  0.7125  0.7075  0.55625 0.61125 0.705   0.67125 0.5

In [17]:
print(macest_point_prediction_conf)

[0.55306744 0.55547168 0.55171575 0.55184952 0.55726857 0.55160916
 0.55632061 0.55115047 0.5546665  0.549813   0.5485467  0.55157338
 0.5530305  0.55615865 0.550307   0.55038621 0.55539254 0.55609952
 0.55126499 0.55241815 0.55023759 0.55275673 0.55427799 0.5518714
 0.55475444 0.55086169 0.55359554 0.55510819 0.55159835 0.55724951
 0.55686946 0.55377181 0.55281983 0.55756224 0.55491592 0.54902467
 0.55203312 0.55253595 0.55464405 0.54972063 0.55055486 0.55277654
 0.5514939  0.55435009 0.55696357 0.55204932 0.5570146  0.55163793
 0.44554395 0.55342185 0.55440481 0.44619032 0.55135824 0.55606582
 0.55345907 0.55417807 0.55531871 0.55378064 0.55207138 0.55152367
 0.55158936 0.55376203 0.55625489 0.55349402 0.55765677 0.55691775
 0.55484441 0.55278987 0.55521356 0.54901082 0.5562535  0.5541736
 0.55262952 0.55450833 0.5525661  0.55031546 0.55292846 0.5547166
 0.55278165 0.5560574  0.55490919 0.55320966 0.55040712 0.54753708
 0.55190786 0.55178037 0.55424021 0.55209439 0.5524814  0.5527870

## If we pass a sparse array without telling hnsw what happens ?

In [19]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))

In [20]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train,)

ValueError: Training data type and space are not compatible, your space is l2 and training data type is <class 'scipy.sparse.csr.csr_matrix'>

As we did not specify the sparse type, this resulted in an error. As expected.

## Let's mix sparse metrics and sparse data types in the search method args

In [22]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))
# note below that the default data type is not sparse so not passing an argument
# means the graph is expecting a dense vector
neighbour_search_params = clmod.HnswGraphArgs(query_args = dict(ef = 1100), 
                                             init_args = dict(method="hnsw",
                                                              space="cosinesimil",))


In [23]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train,
                                      search_method_args =neighbour_search_params)


ValueError: Training data type and space are not compatible, your space is cosinesimil and training data type is <class 'scipy.sparse.csr.csr_matrix'>

### Vice versa

In [24]:
param_bounds = clmod.SearchBounds(alpha_bounds = (0, 500), k_bounds = (5,15))
neighbour_search_params = clmod.HnswGraphArgs(query_args = dict(ef = 1100), 
                                             init_args = dict(method="hnsw",
                                                              space="cosinesimil",
                                                              data_type=nmslib.DataType.SPARSE_VECTOR) )


In [25]:
macest_model = clmod.ModelWithConfidence(model,
                                      X_conf_train,
                                      y_conf_train,
                                      search_method_args =neighbour_search_params)


ValueError: Data type and space are not compatible, your space is cosinesimil and search data type is data_type nmslib.DataType.SPARSE_VECTOR