# Statistical Pattern Recognition - Solution 4: Nonparametric methods

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


## $\star$ Part 1: K-nearest neighbors

Load the data from dataset.npz and split it evenly into a training set
and a test set. 

Each sample consists of a point in 2D and a class label in
$\{1,2,3\}$.

For each point in the test set, predict its label by a k-nearest
neighbor classifier “trained” using the training set. 

Compute the **average classification error** using the true labels of the test set. 

Visualize the training and test points with their respective label, as well as the classifier's decision boundary using a **contour plot** (see example at the bottom).

Repeat for different values of k.


In [None]:

# load the data and split it into train and test sets
# NOTE: later cells call plot_k_neighbors(k, data_train, data_test), so this
# cell is expected to leave the names `data_train` and `data_test` defined at
# notebook level. The data comes from dataset.npz and the split is meant to be
# even (half train, half test), as stated in the Part 1 task description.
# START TODO ################
raise NotImplementedError
# END TODO ################

# check that the data and the split shapes are correct
# START TODO ################
# here we check whether the data is equally split
# between test and train sets
raise NotImplementedError
# END TODO ################


In [None]:
# train and plot the k-nearest neighbors classifier
def plot_k_neighbors(k, data_train, data_test, classifier=KNeighborsClassifier):
    """
    Train a k-nearest-neighbors classifier and plot the result.

    The body is still an unimplemented exercise stub, so every call currently
    raises NotImplementedError.

    k: Number of neighbors to use
    data_train: subsection of the dataset that is to be used for training
    data_test: subsection of the dataset that is to be used for testing
    classifier: classifier to use, defaults to sklearn's KNeighborsClassifier.
        The bonus part passes the class SimpleKNeighborsClassifier here, so
        this is presumably a class that gets instantiated inside the
        function -- TODO confirm once the body is implemented.

    For more information on the KNeighborsClassifier see:
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    """
    # START TODO ################
    raise NotImplementedError
    # END TODO ################

# example call with k=2; data_train and data_test must already exist at
# notebook level when this line runs
plot_k_neighbors(2, data_train, data_test)


In [None]:
# test for different values of K
# range(1, 8) yields K = 1, 2, ..., 7 (the upper bound is exclusive)
for k in range(1, 8):
    plot_k_neighbors(k, data_train, data_test)
    # display the figure for this value of K
    plt.show()


## $\star$ Part 2: Data splits and hyperparameters

Study how different dataset splits and values of k affect classification results.

### Part 2.1
Train and display k-nearest-neighbor models for different *equally sized* splits of the set into training and test set. Use fixed k=4.

Is the classification error always the same?

In [None]:
# fix K=4 and check how different random data splits change the result
# (the splits stay equally sized; only the assignment of samples to the
#  training and test set differs from run to run)
# START TODO ################
raise NotImplementedError
# END TODO ################


### Part 2.2
Train and display k-nearest-neighbor models for differently sized train/test splits. Use fixed k=4.

How does the classification error change?

In [None]:
# fix K=4 and check how different training set sizes change the result
# (here the proportion between training and test set is varied while K
#  stays fixed)
# START TODO ################
raise NotImplementedError
# END TODO ################


### Part 2.3
Compute and plot the classification error for different values of k, averaged over multiple *equally sized* splits.

How is the mean error affected by k?

What you should observe is an effect illustrating the so-called bias-variance tradeoff, which will be discussed in more detail in the next class.

In [None]:
def get_classification_errors(data, k, n_trials):
    '''
    Run n_trials experiments with different (equally sized) train-test splits.
    Return a list of classification errors for each run.

    data: the dataset to be split; its exact format is not fixed by this
        function -- presumably whatever the data loading cell produced
        (TODO confirm)
    k: presumably the number of neighbors of the classifier, as in
        plot_k_neighbors (TODO confirm)
    n_trials: number of experiments, i.e. of train-test splits, to run

    The body is still an unimplemented exercise stub: it raises
    NotImplementedError before the return statement is reached.
    '''
    # one entry per trial is meant to be collected here
    errors = []
    # START TODO ################
    raise NotImplementedError
    # END TODO ################
    return errors

ks = range(1, 50)  # values of K to evaluate: 1, 2, ..., 49
n_trials = 100  # presumably the number of splits per K passed to get_classification_errors -- TODO confirm
errors = []  # presumably meant to hold one mean error per K -- TODO confirm

# compute the mean classification error for each value of K and plot it as a function of K
# START TODO ################
raise NotImplementedError
# END TODO ################


## $\star\star\star$ Bonus part: Custom KNN estimator

Implement the same functionality as `KNeighborsClassifier`.

For this, you can create a new class `SimpleKNeighborsClassifier` and implement the methods `__init__`, `fit` and `predict`.

The general idea is to:

* Convert the class labels from {1, 2, 3} to {0, 1, 2}
* Convert the integer classes to [onehot vectors](https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics).
* Compute the distance from all test points to all training points.
* For each test point, sort the training points by ascending distance, only keep the top k datapoints, average the class probabilities of those k points and predict the class with the highest probability.
* Convert the classes back from {0, 1, 2} to {1, 2, 3}

Check if your estimator produces the same results as the estimator from `sklearn`.


### One-hot utility function

In [None]:
def get_onehot_matrix(targets:np.ndarray, num_classes:int) -> np.ndarray:
    """
    Convert class indices into a one-hot matrix.

    targets: class indices of shape (n_datapoints); integer-valued floats
        such as 1. or 3. are accepted as well as plain sequences
    num_classes: number of classes, i.e. number of columns of the result

    Returns a float array of shape (n_datapoints, num_classes) with
    entry[n, cls] = 1 if targets[n] == cls else 0. A target that equals none
    of 0, ..., num_classes - 1 therefore produces an all-zero row.
    """
    # START TODO ################
    # targets shape (n_datapoints)
    # we want a matrix of shape (n_datapoints, num_classes) s. t.
    # entry[n, cls] = 1 if targets[n] == cls else 0
    targets = np.asarray(targets)
    class_ids = np.arange(num_classes)
    # broadcasting (n_datapoints, 1) against (num_classes,) compares every
    # target with every class index at once; this is the definition above
    # written literally, so no integer cast or index bounds check is needed
    return (targets[..., np.newaxis] == class_ids).astype(float)
    # END TODO ################

# test the onehot function with mockup targets
# the targets are floats and only use classes 0, 1 and 3 out of the 5 classes,
# so the columns for classes 2 and 4 should contain no 1
example_targets = np.array([1.,3.,0.])
num_classes = 5
example_onehot = get_onehot_matrix(
    example_targets, num_classes)

# print the input next to the resulting matrix for a visual check
print(f"Given targets {example_targets} and {num_classes} classes,"
      "\nresulting onehot matrix is:\n"
      f"{example_onehot}")


In [None]:
class SimpleKNeighborsClassifier:
    """
    Minimal k-nearest-neighbors classifier offering the same basic interface
    as sklearn's KNeighborsClassifier: constructor argument n_neighbors, fit
    and predict.

    Uses the Euclidean distance and an unweighted vote among the k nearest
    training points. Labels may be arbitrary (e.g. {1, 2, 3}); they are mapped
    to indices {0, ..., n_classes - 1} internally and mapped back on output.
    If several classes receive the same number of votes, the smallest label
    among them is predicted.
    """
    # START TODO ################
    # number of test points processed at once; bounds the size of the
    # temporary (chunk, n_train, n_features) difference array in predict,
    # which matters when a dense meshgrid is classified
    _CHUNK_SIZE = 1024

    def __init__(self, n_neighbors=5):
        """
        n_neighbors: number of nearest training points that vote (at least 1)

        Raises ValueError if n_neighbors is smaller than 1.
        """
        if n_neighbors < 1:
            raise ValueError(f"n_neighbors must be >= 1, got {n_neighbors}")
        self.n_neighbors = n_neighbors
        self.classes_ = None
        self._train_points = None
        self._train_onehot = None

    def fit(self, X, y):
        """
        Store the training data.

        X: training points, shape (n_train, n_features)
        y: class labels, shape (n_train)

        Returns self, so that construction and fitting can be chained.
        Raises ValueError if the shapes do not match or if there are fewer
        training points than n_neighbors.
        """
        points = np.asarray(X, dtype=float)
        labels = np.asarray(y).ravel()
        if points.ndim != 2:
            raise ValueError(
                f"X must have shape (n_train, n_features), got {points.shape}")
        if labels.shape[0] != points.shape[0]:
            raise ValueError(
                f"X and y disagree on the number of samples: "
                f"{points.shape[0]} != {labels.shape[0]}")
        if self.n_neighbors > points.shape[0]:
            raise ValueError(
                f"n_neighbors={self.n_neighbors} exceeds the number of "
                f"training points ({points.shape[0]})")

        # classes_ holds the sorted distinct labels, class_indices the
        # position of every training label within classes_
        self.classes_, class_indices = np.unique(labels, return_inverse=True)
        self._train_points = points
        # one row per training point with a single 1 in its class column
        self._train_onehot = np.eye(len(self.classes_))[class_indices.ravel()]
        return self

    def predict(self, X):
        """
        Predict the class label of every row of X.

        X: test points, shape (n_test, n_features)

        Returns an array of shape (n_test) holding labels as passed to fit.
        Raises RuntimeError if fit has not been called and ValueError if X
        does not have the same number of features as the training data.
        """
        if self._train_points is None:
            raise RuntimeError("fit must be called before predict")
        queries = np.asarray(X, dtype=float)
        if queries.ndim != 2 or queries.shape[1] != self._train_points.shape[1]:
            raise ValueError(
                f"X must have shape (n_test, {self._train_points.shape[1]}), "
                f"got {queries.shape}")

        predicted = np.empty(queries.shape[0], dtype=int)
        for start in range(0, queries.shape[0], self._CHUNK_SIZE):
            stop = start + self._CHUNK_SIZE
            chunk = queries[start:stop]
            # squared Euclidean distances, shape (chunk, n_train); the square
            # root is skipped because it does not change the ordering
            diffs = chunk[:, np.newaxis, :] - self._train_points[np.newaxis, :, :]
            sq_dists = (diffs ** 2).sum(axis=-1)
            # indices of the k closest training points per test point; a
            # stable sort keeps the result deterministic for equal distances
            nearest = np.argsort(sq_dists, axis=1, kind="stable")[:, :self.n_neighbors]
            # averaging the neighbors' one-hot rows gives class probabilities
            class_probs = self._train_onehot[nearest].mean(axis=1)
            predicted[start:stop] = np.argmax(class_probs, axis=1)
        # map the internal class indices back to the original labels
        return self.classes_[predicted]
    # END TODO ################


In [None]:
# run the same plot (k=4, same train/test data) once with the custom
# classifier and once with sklearn's, so the two outputs can be compared
print("---------- Custom implementation:")
plot_k_neighbors(4, data_train, data_test,
                 classifier=SimpleKNeighborsClassifier)
print()
print("---------- Implementation from sklearn:")
plot_k_neighbors(4, data_train, data_test,
                 classifier=KNeighborsClassifier)


## Hints

### Example output for Part 1

![example output](ex4_example_output.jpg)


### Creating contour plots

In [None]:
### Creating contour plots

# custom colormaps: the light one is used for filled contour regions,
# the bold one for the sample points
cmap_light = ListedColormap(['#ff9f2f', '#5fff5f', '#7f7fff'])
cmap_bold = ListedColormap(['#af4f00', '#0faf0f', '#2f2fcf'])

# example data: 500 uniformly drawn 2D points in [-1.5, 1.5),
# with the first coordinate stretched by a factor of 3.14
data = np.random.uniform(low=-1.5, high=1.5, size=(500, 2))
data[:, 0] *= 3.14
x = data[:, 0]
y = data[:, 1]

# the sine curve is the class border: points above it get class 1.0,
# all other points get class 0.0
y_border = np.sin(x)
classes = np.greater(y, y_border).astype(float)

# scatter plot of the samples, colored by their class
plt.figure(figsize=(12, 8))
plt.scatter(x, y, c=classes, cmap=cmap_bold)
plt.grid()
plt.show()


In [None]:
# build a meshgrid that covers the data range plus a small margin
bordersize = .1
x_min = x.min() - bordersize
x_max = x.max() + bordersize
y_min = y.min() - bordersize
y_max = y.max() + bordersize

# grid coordinates along each axis, spaced grid_size apart
grid_size = .02
xrange = np.arange(x_min, x_max, grid_size)
yrange = np.arange(y_min, y_max, grid_size)
print(f"{xrange.shape=}, {yrange.shape=}")

xx, yy = np.meshgrid(xrange, yrange)
print(f"{xx.shape=}, {yy.shape=}")

# xx[i, j] and yy[i, j] now hold the x and y data coordinate of grid cell (i, j)

# assign a class to every grid cell using the same sine border as for the samples
zz = np.greater(yy, np.sin(xx)).astype(float)

# filled contour of the true class regions with the samples drawn on top
plt.figure(figsize=(12, 8))
plt.contourf(xx, yy, zz, cmap=cmap_light)
plt.scatter(x, y, c=classes, cmap=cmap_bold)
plt.grid()
plt.show()


### Handling data shapes

The sklearn predictor expects input of shape `(datapoints, features)`. To input your meshgrid, use `np.reshape` to flatten both `xx` and `yy`, and then use `np.stack` to stack them along the last axis, which gives an array of shape `(datapoints, 2)`. Finally, `reshape` the predictor's output back to your meshgrid shape and you can plot the contour.


### Inplace modification in numpy

Be mindful of whether you are working with *copies* or *views* of your data.

Comparison of copying and inplace modification in numpy:


In [None]:
# Demonstrates the difference between rebinding a name and modifying an
# object in place, first for numpy arrays and then for plain Python ints.
print("---------- Numpy ----------")
a = np.array([5])
b = a      # b refers to the very same array object as a
b = b - 1  # builds a new array and rebinds b; a keeps its value
print(f"Copy:    {a} {b}")

a = np.array([5])
b = a      # again the same array object
b -= 1     # modifies that shared array in place, so a changes as well
print(f"Inplace: {a} {b}")

print("---------- Python ----------")
a = 5
b = a
b = b - 1  # ints are immutable: b is rebound to a new int
print(f"Copy:    {a} {b}")

a = 5
b = a
b -= 1     # for an int this also just rebinds b; a keeps its value
print(f"Inplace: {a} {b}")
