Continuing from parts 1 and 2 of this demo series, we now focus on kNN-based methods for estimating the latent factors of the test set (instead of using a majority vote).

Reference
---------
1. [Make kNN 300 times faster](https://towardsdatascience.com/make-knn-300-times-faster-than-scikit-learns-in-20-lines-5e29d74e76bb)
   - [Faiss: Metric type and distances](https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances)
   - [Summary of methods](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes)

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import os, sys

# Colab 
try:
    # `google.colab` is imported only to probe whether this runtime provides it.
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; anything else (e.g. KeyboardInterrupt)
    # must propagate instead of being silently swallowed by a bare `except:`.
    IN_COLAB = False

# Plotting
import matplotlib.pylab as plt
# %matplotlib inline

from matplotlib.pyplot import figure
import seaborn as sns
from IPython.display import display

# Progress
from tqdm import tqdm

################################################################
# Configure system environment
# - Please modify input_dir according to your local environment
#
################################################################

# Resolve `input_dir`, the directory holding the project modules and data:
# a Google Drive folder when running on Colab, the current working directory otherwise.
cur_dir = os.getcwd()
project_dir = 'machine_learning_examples/cf_ensemble'
if IN_COLAB:
    # Run this demo on Google Colab: the project lives on Drive, which must be mounted first
    from google.colab import drive
    drive.mount('/content/drive')

    # Parameters for data
    input_dir = f"/content/drive/MyDrive/Colab Notebooks/{project_dir}"
    # /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/data/data-is-life
else:
    input_dir = cur_dir

if input_dir != cur_dir:
    # Register the project directory exactly once; the membership test keeps
    # sys.path free of duplicates, also when this cell is re-run.
    if input_dir not in sys.path:
        sys.path.append(input_dir)
        print(f"> Adding {input_dir} to sys path ...")
    print(sys.path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
> Adding /content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble to sys path ...
['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble', '/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/cf_ensemble']


In [2]:
# Tensorflow
import tensorflow as tf
print(tf.__version__)
# import tensorflow_probability as tfp
# tfd = tfp.distributions
from tensorflow import keras

# from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K
#################################################################

# Scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#################################################################

# CF-ensemble-specific libraries
import utils_stacking as ustk
import utils_classifier as uclf
import utils_sys as usys
import utils_cf as uc 
import scipy.sparse as sparse
from utils_sys import highlight
#################################################################

# Misc
import pprint
import tempfile
from typing import Dict, Text

# Compact array display for the rest of the notebook: 3 digits of precision,
# 5 items shown at each edge of a summarized array, fixed-point instead of scientific notation.
np.set_printoptions(precision=3, edgeitems=5, suppress=True)

2.8.0


### Generating training data

In [3]:
# %matplotlib inline
import data_pipeline as dp

# get the dataset (features X0, labels y0); verbose=1 makes the helper print the class summary
# NOTE(review): `class_ratio=0.95` is requested, yet the class counts recorded for this cell
# (4297 vs. 703) are not a 95:5 split -- confirm how `generate_imbalanced_data` interprets the ratio.
X0, y0 = dp.generate_imbalanced_data(class_ratio=0.95, verbose=1)

> n_classes: 2
[0 1]

> counts:
Counter({0: 4297, 1: 703})



### Choosing base classifiers

In [4]:
# Base (level-0) learners, listed as (name, estimator) pairs.
# Candidates currently switched off, kept here for reference:
#   ('SVC', SVC(kernel='linear', probability=True, class_weight='balanced', break_ties=True))
#   ('DT', DecisionTreeClassifier(max_depth=5))
#   ('GPC', GaussianProcessClassifier(1.0 * RBF(1.0)))
base_learners = [
    (
        'RF',
        RandomForestClassifier(
            n_estimators=200,
            oob_score=True,
            class_weight="balanced",
            random_state=20,
            ccp_alpha=0.1,
        ),
    ),
    (
        'KNNC',
        # neighborhood size = number of distinct labels in y0; closer neighbors weigh more
        KNeighborsClassifier(n_neighbors=len(np.unique(y0)), weights='distance'),
    ),
    ('GNB', GaussianNB()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
]

In [5]:
import cf_models as cm

# Switch: True  -> reuse level-1 data previously saved under `data_dir`
#         False -> regenerate it from (X0, y0) with the `base_learners` chosen above
tLoadPretrained = True 
######################
fold_number = 0
n_iterations = 1
data_dir = os.path.join(input_dir, 'data')
######################

# Either way we obtain the same five objects:
#   R, T            -- rating matrices for the training / test split (one row per base classifier,
#                      one column per data point, per the shapes printed at the end of this cell)
#   U               -- one entry per row of R/T (checked by the assert below); presumably the base-classifier names
#   L_train, L_test -- labels for the two splits
if tLoadPretrained: 
    R, T, U, L_train, L_test = dp.load_pretrained_level1_data(fold_number=fold_number, verbose=1, data_dir=data_dir) 
else: 
    # Use the previously selected base predictors (`base_learners`) to generate the level-1 dataset
    R, T, U, L_train, L_test = cm.demo_cf_stacking(input_data=(X0, y0), 
                                                   input_dir=input_dir, n_iter=n_iterations, base_learners=base_learners, verbose=1)

# Derived quantities
n_train = R.shape[1]  # number of training columns in R
# Probability thresholds estimated from the training split only (policy 'fmax', positive label 1)
p_threshold = uc.estimateProbThresholds(R, L=L_train, pos_label=1, policy='fmax')
lh = uc.estimateLabels(T, p_th=p_threshold) # We cannot use L_test (cheating), but we have to guesstimate
# Stack train and test side by side: true training labels followed by the estimated test labels,
# and R followed by T along the column (data point) axis
L = np.hstack((L_train, lh)) 
X = np.hstack((R, T))

# U must have exactly one entry per row of the stacked rating matrix
assert len(U) == X.shape[0]
print(f"> shape(R):{R.shape} || shape(T): {T.shape} => shape(X): {X.shape}")

2.8.0
[info] list of base classifiers:
['RF' 'KNNC' 'GNB' 'QDA' 'MLPClassifier']

(estimateProbThresholds) policy: fmax
[info] probability thresholds:
[0.497 0.533 0.881 1.    0.714]

(estimateProbThresholds) policy: fmax
> shape(R):(5, 3750) || shape(T): (5, 1250) => shape(X): (5, 5000)


### Confidence and color matrix

In [6]:
# Settings forwarded to uc.evalConfidenceMatrices below
alpha = 10.0 
conf_measure = 'brier'
# Same policy name ('fmax') that was used when `p_threshold` was estimated from R
policy_threshold = 'fmax'

# The helper returns at least four matrices (Pc, C0, Cw, Cn); any further return values land in `rest`.
# NOTE(review): the exact meaning of each matrix is defined in utils_cf -- confirm there.
# X and L are the stacked train+test ratings and labels (test labels in L are estimates, not L_test).
Pc, C0, Cw, Cn, *rest = \
    uc.evalConfidenceMatrices(X, L, alpha=alpha, 
                                    p_threshold=p_threshold, 
                                    conf_measure=conf_measure, policy_threshold=policy_threshold, 
                                    
                                    # Optional debug/test parameters 
                                    U=U, fold_number=fold_number, 
                                    # n_train = n_train, 
                                    is_cascade=True,
                                    verbose=0)

(make_cn) Using UNWEIGHTED Cw, non-weighted MF to approximate ratings ...


### Fast K nearest neighbors (to be completed)
- scikit-learn's kNN does not scale well to large datasets; we use Facebook's Faiss library instead

In [7]:
# install openMP (as a prerequisite prior to installing faiss)
!sudo apt-get install libomp-dev 
# !pip install faiss

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [8]:
# import utils_sys as usys
try:
    import faiss
except ImportError:
    # Only a missing package triggers the on-the-fly install; any other failure
    # (including KeyboardInterrupt) propagates instead of being hidden by a bare `except:`.
    # NOTE(review): the commonly used PyPI distributions are named `faiss-cpu` / `faiss-gpu`;
    # confirm which package `usys.install('faiss')` actually resolves to.
    usys.install('faiss')
    import faiss
    

In [9]:
class FaissKNN:
    """Majority-vote k-nearest-neighbor classifier on top of a flat Faiss L2 index.

    All inputs follow the row-vector convention, i.e. `X` has shape
    (n_instances, n_dim). The rating matrix used in this notebook is in
    column-vector format, so it has to be transposed before it is passed in.

    Attributes
    ----------
    index : faiss.IndexFlatL2 or None
        The Faiss index built by `fit`; None until `fit` has been called.
    y : np.ndarray or None
        Labels of the indexed points, in the order the points were added.
    y_tag : np.ndarray or None
        Optional per-point meta data (e.g. polarities, colors), aligned with `y`.
    k : int
        Number of neighbors requested per query.
    """

    def __init__(self, k=5):
        self.index = None
        self.y = None
        self.y_tag = None  # other meta data for the label/target such as polarities, colors
        self.k = k

    @staticmethod
    def _as_float32_matrix(X):
        """Return `X` as a C-contiguous float32 2-D array (a 1-D input becomes a single row)."""
        return np.ascontiguousarray(np.atleast_2d(X), dtype=np.float32)

    @staticmethod
    def _majority(labels):
        """Return the most frequent label; ties go to the smallest label (np.unique sorts)."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _query(self, X):
        """Run the k-NN search for `X` and return Faiss' (distances, indices) pair."""
        if self.index is None:
            raise RuntimeError("FaissKNN has not been fitted yet; call fit(X, y) first")
        return self.index.search(self._as_float32_matrix(X), k=self.k)

    def fit(self, X, y):
        """Index the training points `X` and remember their labels `y`.

        Parameters
        ----------
        X : array-like, shape (n_instances, n_dim)
            Each x in X is in row-vector format.
            Note: Rating matrix (X), however, is in column-vector format; therefore,
            we need to remember to take the transpose before using it as an input.
        y : array-like, shape (n_instances,)
            One label per row of `X`.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of rows in `X`.
        """
        X = self._as_float32_matrix(X)
        # Stored as an ndarray so that `self.y[indices]` (2-D fancy indexing) also
        # works when the caller hands in a list or a pandas Series.
        y = np.asarray(y)
        if len(y) != X.shape[0]:
            raise ValueError(f"X has {X.shape[0]} rows but y has {len(y)} labels")

        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X)
        self.y = y

    def tag(self, tags):
        """Attach per-point meta data; must be called after `fit`.

        The order of `tags` should be consistent with `self.y`.
        """
        assert len(tags) == len(self.y)
        # ndarray for the same reason as `self.y`: `search` indexes it with a 2-D index array
        self.y_tag = np.asarray(tags)

    def predict(self, X, verbose=True):
        """Predict a label for every row of `X` by majority vote among its neighbors.

        Parameters
        ----------
        X : array-like, shape (n_queries, n_dim)
            Query points; a single 1-D vector is treated as one query.
        verbose : bool, default True
            Print the distances, neighbor indices and votes (the historical
            behavior of this method). Pass False to silence it.

        Returns
        -------
        np.ndarray, shape (n_queries,)
            The winning label per query; ties go to the smallest label.

        Raises
        ------
        ValueError
            If some query has no neighbor at all (e.g. the index is empty).
        """
        distances, indices = self._query(X)

        # Faiss pads the result with index -1 when fewer than k neighbors exist.
        # Those slots must not vote: `self.y[-1]` would silently contribute the LAST label.
        valid = indices >= 0
        if not valid.any(axis=1).all():
            raise ValueError("at least one query has no neighbor; is the index empty?")

        votes = self.y[indices]  # note: shape(votes)=shape(indices)
        if verbose:
            print(f"> distances (n={len(distances)}):\n{distances.shape}\n")
            print(f"{distances}")
            print(f"> indices (n={len(indices)}):\n{indices.shape}\n")
            print(f"{indices}")
            print("#" * 5)
            print(f"{votes}")

        predictions = np.array([self._majority(row[mask]) for row, mask in zip(votes, valid)])
        return predictions

    def search(self, X):
        """Return the raw neighbor search result for `X`.

        Returns
        -------
        (distances, indices) if no tags were attached, otherwise
        (distances, indices, tags_of_neighbors). With IndexFlatL2 the distances
        reported by Faiss are squared Euclidean distances.
        """
        distances, indices = self._query(X)
        if self.y_tag is None:
            return distances, indices
        return distances, indices, self.y_tag[indices]
        

In [10]:
# Dimension check before fitting: training rating matrix and its label vector
print(R.shape)
print(L_train.shape)

# The same labels viewed as a single-column 2-D array (displayed as this cell's result)
L_train.reshape((-1, 1)).shape

(5, 3750)
(3750,)


(3750, 1)

In [11]:
from numpy import linalg as LA
from sklearn.preprocessing import normalize

# L2-normalize every column of R and T (axis=0), i.e. each data point's vector of base-classifier outputs.
# NOTE(review): R and T are rebound to their normalized versions here, so the raw rating
# matrices are no longer available under these names after this cell has run.
R = normalize(R, axis=0, norm='l2')
T = normalize(T, axis=0, norm='l2')
# Spot-check: 10 test columns drawn at random (unseeded, with replacement) must have unit L2 norm
test_points = np.random.choice(T.shape[1], 10)
for t in test_points: 
    # print(f"norm({t})={LA.norm(T[:, t], 2)}") 
    assert np.allclose(1.0, LA.norm(T[:, t], 2))

# FaissKNN expects one row per data point, hence the transposes
X_train = R.T
X_test = T.T

fknn = FaissKNN(k=5)
fknn.fit(X_train, L_train) # Note: X_train = np.transpose(R)

# D, I = fknn.index.search(X_train[:5].astype(np.float32), fknn.k)

# print(f"> D:\n{D}\n")
# print(f"> I:\n{I}\n")
# print(fknn.y)
# Smoke test on the first 10 TRAINING points. These points are part of the index, so each
# one comes back as its own nearest neighbor (distance 0) and its own label takes part in the vote.
fknn.predict(X_train[:10])

# for i in range(X_test.shape[0]): 
#     if i > 3: break
#     fknn.predict(X_test[i].reshape(1, -1))

> distances (n=10):
(10, 5)

[[0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.001 0.001 0.001]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.001]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.001]
 [0.    0.    0.007 0.011 0.012]
 [0.    0.005 0.007 0.009 0.009]
 [0.    0.    0.    0.    0.   ]]
> indices (n=10):
(10, 5)

[[   0  533 1362  341  554]
 [   1 1233 1409  366 3472]
 [   2  938 1495  311 1890]
 [   3  506 3013  371  970]
 [   4 3134  877  740  742]
 [   5 2113 1947  331 2466]
 [   6 1244 3072  924 2043]
 [   7 2922 2602 2793  285]
 [   8 1287 1682 3462 2173]
 [   9 1399 2588 2025 2544]]
#####
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 1 1]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 0 1 0 1]]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [12]:
# Hand-picked (u, v) row pairs of X_train: show both vectors and the Euclidean distance between them
pairs = [(5, 331), (5, 3130), (5, 1756), (5, 2000)]
for i, (u, v) in enumerate(pairs):
    # header and the two vectors, one per line
    print(f"pair #[{i}]", X_train[u], X_train[v], sep="\n")
    print("~> ", LA.norm(X_train[u] - X_train[v], 2))

pair #[0]
[0.445 0.895 0.022 0.    0.019]
[0.445 0.895 0.009 0.    0.011]
~>  0.014936103818309518
pair #[1]
[0.445 0.895 0.022 0.    0.019]
[0.447 0.894 0.003 0.    0.02 ]
~>  0.019858581170235798
pair #[2]
[0.445 0.895 0.022 0.    0.019]
[0.446 0.895 0.003 0.    0.006]
~>  0.023424319121266975
pair #[3]
[0.445 0.895 0.022 0.    0.019]
[0.447 0.    0.001 0.895 0.014]
~>  1.2656260385297557
