Continuing from parts 1 and 2 of this demo series, we focus on using kNN-based methods to estimate the latent factors for new instances in the test set (instead of using majority vote).  

Reference
---------
1. [Make kNN 300 times faster](https://towardsdatascience.com/make-knn-300-times-faster-than-scikit-learns-in-20-lines-5e29d74e76bb)
   - [Faiss: Metric type and distances](https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances)
   - [Summary of methods](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes)

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys

# Colab 
try:
    # Imported only to detect whether the notebook runs on Google Colab.
    import google.colab
    IN_COLAB = True
except ImportError:
    # Package not available => not on Colab. Only ImportError is caught so that
    # unrelated failures (or a keyboard interrupt) are not silently swallowed.
    IN_COLAB = False

# Plotting
import matplotlib.pylab as plt
# %matplotlib inline

from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display

# Progress
from tqdm import tqdm

################################################################
# Configure system environment
# - Please modify input_dir according to your local environment
#
################################################################

# Resolve the directory that holds the project modules and data.
cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB:
    # Run this demo on Google Colab: the project lives on the mounted Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Parameters for data
    input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
else:
    input_dir = cur_dir

# Make the project modules importable. The path is appended at most once, so
# re-running this cell does not keep growing sys.path with duplicate entries.
if input_dir != cur_dir and input_dir not in sys.path:
    sys.path.append(input_dir)
    print(f"> Adding {input_dir} to sys path ...")
    print(sys.path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
> Adding /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble to sys path ...
['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble']


In [2]:
# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras

# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################

# Scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################

# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc 
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################

# Misc
import pprint
import tempfile
from typing import Dict, Text

np.set_printoptions(precision=3, edgeitems=5, suppress=True)

2.8.0


### Generating training data

In [3]:
# %matplotlib inline
import data_pipeline as dp

# Build the (imbalanced) dataset via the project's data pipeline helper
X0, y0 = dp.generate_imbalanced_data(
    class_ratio=0.95,
    verbose=1,
)

> n_classes: 2
[0 1]

> counts:
Counter({0: 4297, 1: 703})



### Choosing base classifiers

In [4]:
# Base (level-0) classifiers, given as (name, estimator) pairs
base_learners = [
    (
        'RF',
        RandomForestClassifier(
            n_estimators=200,
            oob_score=True,
            class_weight="balanced",
            random_state=20,
            ccp_alpha=0.1,
        ),
    ),
    (
        'KNNC',
        # the number of neighbors is set to the number of distinct labels in y0
        KNeighborsClassifier(n_neighbors=len(np.unique(y0)), weights='distance'),
    ),
    ('GNB', GaussianNB()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
    # Currently disabled alternatives:
    #   ('SVC', SVC(kernel='linear', probability=True, class_weight='balanced', break_ties=True)),
    #   ('DT', DecisionTreeClassifier(max_depth=5)),
    #   ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))),
]

In [5]:
import cf_models as cm

# Switch: reuse stored level-1 data (True) or regenerate it from the base learners (False)
tLoadPretrained = True
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################

if not tLoadPretrained:
    # Run the previously selected base predictors (`base_learners`) to produce the level-1 dataset
    R, T, U, L_train, L_test = cm.demo_cf_stacking(
        input_data=(X0, y0),
        input_dir=input_dir,
        n_iter=n_iterations,
        base_learners=base_learners,
        verbose=1,
    )
else:
    R, T, U, L_train, L_test = dp.load_pretrained_level1_data(
        fold_number=fold_number,
        verbose=1,
        data_dir=data_dir,
    )

# Quantities derived from the level-1 data
n_train = R.shape[1]  # number of columns of R
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
# L_test must not be used here (that would be cheating), so test labels are estimated from T
lh = uc.estimateLabels(T, p_th=p_threshold)
L = np.hstack((L_train, lh))
X = np.hstack((R, T))

assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")

2.8.0
[info] list of base classifiers:
['RF' 'KNNC' 'GNB' 'QDA' 'MLPClassifier']

(estimateProbThresholds) policy: fmax
[info] probability thresholds:
[0.497 0.533 0.881 1.    0.51 ]

(estimateProbThresholds) policy: fmax
> shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000)


### Confidence and color matrix

In [6]:
import polarity_model as pm 

# Settings forwarded to the confidence-matrix evaluation below
alpha = 10.0
conf_measure = 'brier'
policy_threshold = 'fmax'

# Only the first four returned matrices are named; any further outputs land in `rest`
Pc, C0, Cw, Cn, *rest = uc.evalConfidenceMatrices(
    X, L,
    alpha=alpha,
    p_threshold=p_threshold,
    conf_measure=conf_measure,
    policy_threshold=policy_threshold,
    # Optional debug/test parameters
    U=U,
    fold_number=fold_number,
    # n_train = n_train,
    is_cascade=True,
    verbose=0,
)
y_colors = pm.verify_colors(Pc)

(make_cn) Using UNWEIGHTED Cw, non-weighted MF to approximate ratings ...


### Fast K nearest neighbors (to be completed)
- scikit-learn's kNN does not scale well to large datasets; we use Facebook's faiss library instead

In [7]:
# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev 
# !pip install faiss

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [8]:
# import utils_sys as usys
try:
    import faiss
except ImportError:
    # faiss is missing from this kernel: install it through the project helper
    # and retry. Only ImportError is handled so other failures stay visible.
    usys.install('faiss')
    import faiss
    

In [9]:
from utilities import normalize
import scipy.sparse as sparse
# from sklearn.preprocessing import normalize

class FaissKNN:
    """k-nearest-neighbor search and majority-vote classification on a flat L2 Faiss index.

    All inputs are in row-vector format, i.e. `X` has shape (n_instances, n_dim).
    A rating matrix kept in column-vector format therefore has to be transposed
    before it is passed to `fit`, `search` or `predict`.

    Parameters
    ----------
    k : int
        Number of neighbors retrieved per query.
    normalize : bool
        If True, `normalize(X, axis=1)` is applied to the training vectors in `fit`
        AND to the query vectors in `search`/`predict`, so both live in the same space.

    Notes
    -----
    `faiss` and the `normalize` function are looked up in the enclosing module when
    the methods run (not when the class is defined).
    NOTE(review): `normalize(X, axis=1)` is assumed to be a stateless per-row
    normalization -- confirm against the helper that is actually in scope.
    """

    def __init__(self, k=5, normalize=True):
        self.index = None
        self.y = None
        self.y_tag = None # other meta data for the label/target such as polarities, colors
        self.k = k
        self.normalize_input = normalize

    def _prepare(self, X):
        """Apply the optional normalization and return a C-contiguous float32 array."""
        if self.normalize_input:
            X = normalize(X, axis=1) # X is in row-vector format
        return np.ascontiguousarray(X, dtype=np.float32)

    def fit(self, X, y):
        """Index the training vectors `X` (n_instances, n_dim) and remember their labels `y`."""
        X = self._prepare(X)
        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X)
        # Stored as an ndarray so that `self.y[indices]` (fancy indexing) works in `predict`
        self.y = np.asarray(y)

    def tag(self, Pc):
        """
        Attach attendant information to the labels.

        `Pc`: A 2D array of attendant information that accompanies the labels `y`
              (sparse input is converted to a dense array). Stored in `self.y_tag`.
        """
        if sparse.issparse(Pc):
            Pc = Pc.toarray()
        self.y_tag = np.asarray(Pc)

    def predict(self, X):
        """Return, for each row of `X`, the most frequent label among its k nearest neighbors.

        Labels must be non-negative integers (requirement of `np.bincount`); on a tie
        the smallest label wins because `np.argmax` returns the first maximum.
        """
        _, indices = self.search(X)
        # shape(indices): (n_instances, k)
        # NOTE(review): Faiss is expected to pad with -1 when fewer than k vectors are
        # indexed, which would silently pick the last label -- confirm k <= n_train.
        votes = self.y[indices] # note: shape(votes)=shape(indices)
        # np.bincount([1, 1, 1, 0, 1, 0, 0, 0, 1, 1])
        # ~> array([4, 6]) because index 0 occurs 4 times, and 1 occurs 6 times
        return np.array([np.argmax(np.bincount(row)) for row in votes])

    def search(self, X):
        """Return `(distances, indices)`, each of shape (n_instances, k), for the queries `X`.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.index is None:
            raise RuntimeError("FaissKNN: call fit() before search()/predict()")
        distances, indices = self.index.search(self._prepare(X), k=self.k)
        return distances, indices
        

In [10]:
from numpy import linalg as LA
from analyzer import is_sparse
from sklearn.preprocessing import normalize

# Normalize each data point (a column of R / T) so that it has unit L2 length
R = normalize(R, axis=0, norm='l2')
T = normalize(T, axis=0, norm='l2')
# Spot-check the norm on 10 randomly drawn test columns
test_points = np.random.choice(T.shape[1], 10)
for t in test_points: 
    # print(f"norm({t})={LA.norm(T[:, t], 2)}") 
    assert np.allclose(1.0, LA.norm(T[:, t], 2))

# `.toarray()` is used instead of `.A` because it is available on both sparse
# matrices and sparse arrays
if is_sparse(Pc): Pc = Pc.toarray()
Pr, Pt = Pc[:,:n_train], Pc[:,n_train:] # color matrix is already in column-vector format
assert Pr.shape == R.shape
y_colors = np.ravel(Pr)

# FaissKNN expects row-vector format, hence the transposes
X_train = R.T
X_test = T.T

fknn = FaissKNN(k=5)
fknn.fit(X_train, L_train) # Note: X_train = np.transpose(R)
# fknn.tag(y_colors)

distances, indices = fknn.search(X_test)
assert len(indices) == X_test.shape[0]

# Colors and labels of the neighbors found for the first test instance
print(Pr.shape)
print("> colors")
print(Pr[:, indices[0]])
print("> labels")
print(fknn.y[indices[0]])

print(f"> D:\n{distances}\n")
print(f"> I:\n{indices}\n")

# print(fknn.y)
# fknn.predict(X_train[:10])

# for i in range(X_test.shape[0]): 
#     if i > 3: break
#     fknn.predict(X_test[i].reshape(1, -1))

(5, 3750)
> colors
[[ 2. -2.  2. -2. -2.]
 [-1.  1. -1.  1.  1.]
 [-1.  1. -1.  1.  1.]
 [-1.  1. -1.  1.  1.]
 [-1.  1. -1.  1.  1.]]
> labels
[1 0 1 0 0]
> D:
[[0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.001 0.001 0.001 0.001 0.001]
 [0.    0.    0.    0.    0.   ]
 ...
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.    0.001 0.001 0.001 0.001]
 [0.    0.    0.    0.    0.   ]]

> I:
[[1955 2259  865 2282   55]
 [2880 3086 3001 2823 3203]
 [3632 3220 3641 2334 1955]
 [1058 1090  275  112 3335]
 [1110 3699 1035 3309 1987]
 ...
 [ 332 1793 2346 2646  486]
 [3710 3175 3162 2486  428]
 [ 744 3618 3030 2934 3011]
 [3187 2754 1487 2814  662]
 [ 664 2153 1434  514  729]]



In [11]:
# Index pairs into X_train whose Euclidean distance is printed for inspection
pairs = [(5, 331), (5, 3130), (5, 1756), (5, 2000)]
for pair_no, (row_a, row_b) in enumerate(pairs):
    vec_a, vec_b = X_train[row_a], X_train[row_b]
    print(f"pair #[{pair_no}]")
    print(vec_a)
    print(vec_b)
    print("~> ", LA.norm(vec_a - vec_b, 2))

pair #[0]
[0.445 0.895 0.022 0.    0.003]
[0.445 0.895 0.009 0.    0.017]
~>  0.019009330904733536
pair #[1]
[0.445 0.895 0.022 0.    0.003]
[0.447 0.894 0.003 0.    0.019]
~>  0.025342701931234855
pair #[2]
[0.445 0.895 0.022 0.    0.003]
[0.446 0.895 0.003 0.    0.014]
~>  0.022526869412195658
pair #[3]
[0.445 0.895 0.022 0.    0.003]
[0.446 0.    0.001 0.893 0.069]
~>  1.2660104792942901
