In [1]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt

import metrics
import utils

# Prototype Methods

Training data: N points $(x_i, g_i)$ with $g_i$ class label (1-K).  
Prototype methods represent the training data by a set of points in feature space, each associated with a label.  
Prediction is made by assignint to it the class of it's closest prototype and.  
Closest can be for example Euclidian distance, after we standardize the data.  
The challenge is to find how many prototypes and where to put them.

## K-means clustering

K-Means find $R$ clusters and cluster centers in unlabelled data.  
The procedure iterately moves the centers to minimize the total within cluster variance.

Algorithm:
1. Initialize the $R$ clusters randomly (from training set)
2. Repeat until convergence:
    - Assign each training point to the closest centroid
    - The center of each cluster becomes the mean of all its assigned points

In [68]:
from sklearn.datasets import load_iris

X, y = load_iris().data, load_iris().target
X = X - np.mean(X, axis=0)
X = X / np.std(X, axis=0)
print(X.shape)
print(y.shape)


class KMeansClustering:
    
    def __init__(self, R):
        self.R = R
        
    def fit(self, X):
        N, p = X.shape      
        self.means = X[np.random.choice(N, self.R)]
        
        while True:
            old_means = self.means.copy()
            
            #assign each point to the closest cluster
            ctrs = [list() for _ in range(self.R)]
            for x in X:
                ctrs[self.get_closest_ctr_idx(x)].append(x)
                
            
            # compute the new center position of every cluster
            for i in range(self.R):
                if len(ctrs[i]) != 0:
                    self.means[i] = np.mean(np.vstack(ctrs[i]), axis=0)
            
            
            if np.linalg.norm(old_means - self.means) < 1e-6:
                break
        
    
    def get_closest_ctr_idx(self, x):
        min_idx = None
        min_dist = float('inf')
        for i in range(self.R):
            dist = (x - self.means[i]) @ (x - self.means[i])
            if dist < min_dist:
                min_idx = i
                min_dist = dist

        return min_idx
    
    def predict(self, X):
        y = np.empty(len(X))
        for i in range(len(X)):
            y[i] = self.get_closest_ctr_idx(X[i])
        return y
                
mod = KMeansClustering(3)
mod.fit(X)

(150, 4)
(150,)


K-Means can be used for classification. We fit one K-Means for each class of the training set.  
For prediction, we find the closest centroid among all K-Means. The prediction is the class the centroid belongs to

In [76]:
from sklearn.model_selection import train_test_split

class KMeansClassifier:
    
    def __init__(self, R, K):
        self.R = R
        self.K = K
        
    def fit(self, X, y):
        self.kmod = [None] * self.K
        for k in range(self.K):
            self.kmod[k] = KMeansClustering(self.R)
            self.kmod[k].fit(X[y==k])
            
    def predict(self, X):
        y = np.empty(len(X))
        for i in range(len(X)):
            y[i] = self.get_pred(X[i])
        return y
    
    def get_pred(self, x):
        min_dist = float('inf')
        min_k = None
        for k in range(self.K):
            ctl = self.kmod[k].means[self.kmod[k].get_closest_ctr_idx(x)]
            dist = (x - ctl) @ (x - ctl)
            if dist < min_dist:
                min_dist = dist
                min_k = k
        return min_k
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=15)

mod = KMeansClassifier(5, 3)
mod.fit(X_train, y_train)
print('train acc:', np.mean(y_train == mod.predict(X_train)))
print('test acc:', np.mean(y_test == mod.predict(X_test)))

train acc: 0.9416666666666667
test acc: 1.0


## Learning Vector Quantization

Better than K-Means for classification, uses all the data to place the centroids. Training points attract prototypes of the correct class, and repel other prototypes.  

Algorithm page 462

In [56]:
from sklearn.model_selection import train_test_split

class LVQ:
    
    def __init__(self, R, K, lr_begin, lr_end, lr_fact):
        self.R = R
        self.K = K
        self.lr_begin = lr_begin
        self.lr_end = lr_end
        self.lr_fact = lr_fact
        
    def fit(self, X, y):
        N, p = X.shape
        
        self.centers = np.empty((self.R * self.K, p))
        for k in range(self.K):
            Xk = X[y == k]
            p = np.random.choice(len(Xk), self.R)
            self.centers[k*self.R:(k+1)*self.R] = Xk[p]
            
        lr = self.lr_begin
        it = 0
        while lr > self.lr_end:
            
            p = np.random.choice(len(X))
            sx, sy = X[p], y[p]
            ctr = self.get_closest_ctr_idx(sx)
            k = ctr // self.R
            step = lr if k == sy else -lr
            self.centers[ctr] += step * (sx - self.centers[ctr])
            
            it += 1
            lr *= self.lr_fact
        
    def get_closest_ctr_idx(self, x):
        min_idx = None
        min_dist = float('inf')
        for i in range(len(self.centers)):
            dist = (x - self.centers[i]) @ (x - self.centers[i])
            if dist < min_dist:
                min_idx = i
                min_dist = dist

        return min_idx
    
    def predict(self, X):
        y = np.empty(len(X))
        for i in range(len(X)):
            y[i] = self.get_closest_ctr_idx(X[i]) // self.R
        return y
        

        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=15)

mod = LVQ(5, 3, 1e-1, 1e-5, 0.999)
mod.fit(X_train, y_train)
print('train acc:', np.mean(y_train == mod.predict(X_train)))
print('test acc:', np.mean(y_test == mod.predict(X_test)))

train acc: 0.975
test acc: 1.0


## Gaussian Mixtures

Gaussian mixtures can also be thought as a prototype method. Each cluster is a Gaussian with a centroid (mean) and a covariance matrix.  
The two steps of the EM algorithm are similar to K-Means:
- E-step: Each obsevation is assigned a weight to each cluster, according to far they are to each Gaussian.
- M-Step: Each observation contributes to the weighted mean and covariance  for every cluster.  

The Gaussian mixture is referred as a soft clustering method, and K-Means as a hard one.

# k-Nearest-Neighbor

KNN doesn't have any learning procedure, it just stores the whole training set.  
At prediction if find the $k$ closest points of $x$, and returns the average for regression, or the class with the highest number of votes for classification.  
One example is the Euclidian distance with standardized training set.  

with $k=1$, the bias is often low, but the variance is high.

In [100]:
from scipy.stats import mode

class KNNClassifier:
    
    def __init__(self, K):
        self.K = K
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def predict(self, X):
        y = np.empty(len(X))
        for i in range(len(X)):
            y[i] = self.get_pred(X[i])
        return y
    
    def get_pred(self, x):
        dists = []
        for i in range(len(self.X)):
            dists.append(( (X[i] - x) @ (X[i] - x), y[i]))
        dists.sort()
        
        _, vals = zip(*dists[:self.K])
        return mode(np.array(vals))[0][0]
    
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=15)

mod = KNNClassifier(5)
mod.fit(X_train, y_train)
print('train acc:', np.mean(y_train == mod.predict(X_train)))
print('test acc:', np.mean(y_test == mod.predict(X_test)))

train acc: 0.9333333333333333
test acc: 0.9666666666666667


In some problems, training features are invariant under certain natural transformations, and a KNN classifier can exploit it by using a specific metric.  
For example, for image recognition, rotation often doesn't change the class. We wish to remove the effect of rotation when measuring distance between 2 objects of the same class.  
Distance between 2 images: shortest euclidian distance between any rotated version of the 2 images.  
But it's quite expensive, another solution is the tangent distance.  

We compute the invariant tangent line for each training image.  
To classify a new image, we compute it's invariant tangent line and find the closest lines to it in the training set.  
This tangent can be computed by estimating direction vector from small rotations of the image.  

The tangeant distance can also be used to capture other invariance like scaling, translation for example.  

Efficient pattern recognition using a new transformation distance - Simard, P., Cun, Y. L. and Denker, J. (1993) - [PDF](https://pdfs.semanticscholar.org/8314/dda1ec43ce57ff877f8f02ed89acb68ca035.pdf)

# Adaptive Nearest-Neighbor Methods

In high-dimensional spaces, class probabilities might only change in a low dimensional subspace. This can be corrected if we have a metric so that the resulting neighborhoods stretch out in directions for which the feature probabilities doesn't change much.  

Discrimininant Adaptive Nearest-neighbor adapt the metric locally.  
The metric used to compute the distance at the query point $x_0$ is:
$$D(x, x_0) = (x - x_0)^T \Sigma (x - x_0)$$
$$\text{with } \Sigma = W^{-1/2}(W^{-1/2}BW^{-1/2} + \epsilon I)W^{-1/2}$$

with $W$ and $B$ respectively the within and between class covariance matrices computed using only the $M$ nearest neighbors around $x_0$

Global dimension reduction can also be applied.  
One strategy is to project the data into a reduced subspace, and perform KNN into that subspace.