# Code

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

- Importing dataset 
- Splitting it into features and labels

In [2]:
X: np.ndarray = np.genfromtxt("ionosphere.txt", delimiter=",", usecols=np.arange(34))
y: np.ndarray = np.genfromtxt("ionosphere.txt", delimiter=",", usecols=34, dtype='int')

- Splitting data into training set and test set

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [4]:
knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

## Using Euclidean Metric

- The `leaf_size` parameter specifies the number of samples to be used in the Ball Tree or KDTree
- The `metric` parameter specifies the distance metric to use for computing the nearest neighbors
- The `metric_params` parameter specifies additional parameters for the distance metric
- The `n_jobs` parameter specifies the number of jobs to run in parallel for the nearest neighbors search
- The `n_neighbors` parameter specifies the number of neighbors to use for the nearest neighbors search
- The `p` parameter specifies the power parameter for the Minkowski metric. 
- The `weights` parameter specifies the weight function to use for the nearest neighbors search.

In [5]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform') # shows

In [6]:
def my_dist(x, y):
	return np.sum(np.abs(x - y))

knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, metric=my_dist)
knn.fit(X_train, y_train)

In [7]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='my_dist', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform') # shows

In [8]:
print(np.mean(knn.predict(X_test) == y_test))

0.9204545454545454


## Using Manhattan Metric 

- Gives better result

In [9]:
knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, p=1)
knn.fit(X_train, y_train)

In [10]:
print(np.mean(knn.predict(X_test) == y_test))

0.9204545454545454


## Kernel Methods

### Polynomial Kernel

In [11]:
def poly_kernel(x: np.ndarray, y: np.ndarray, d: int) -> float:
	return (1 + np.dot(x, y)) ** d 

In [12]:
def poly_dist(x: np.ndarray, y: np.ndarray, d: int = 2) -> float:
	# return (poly_kernel(x, x, d) + poly_kernel(y, y, d) - 2 * poly_kernel(x, y, d)) ** 0.5
	return (poly_kernel(x, x, d) + poly_kernel(y, y, d) - 2 * poly_kernel(x, y, d)) 

In [13]:
knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, metric=poly_dist)
knn.fit(X_train, y_train)
np.mean(knn.predict(X_test) == y_test)

0.8863636363636364

### RFB Kernel

In [14]:
def rbf_kernel(x: np.ndarray, y: np.ndarray, gamma: float) -> float:
	return np.exp(-gamma * np.sum((x - y) ** 2)) # gamma = 1 / (2 * sigma ** 2)

In [15]:
def rfb_dist(x: np.ndarray, y: np.ndarray, gamma: float = 1) -> float:
	return (rbf_kernel(x, x, gamma) + rbf_kernel(y, y, gamma) - 2 * rbf_kernel(x, y, gamma)) ** 0.5

In [16]:
knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, metric=rfb_dist)
knn.fit(X_train, y_train)
np.mean(knn.predict(X_test) == y_test)

0.8522727272727273

- Computing the accuracy of the chosen value $γ=1$ for the parameter $γ$ using cross validation

In [17]:
from sklearn.model_selection import cross_val_predict

In [18]:
best_score: float = 0

for gamma in [0.01, 0.1, 1, 10, 100]:
	def rfb_dist(x: np.ndarray, y: np.ndarray) -> float:
		return (rbf_kernel(x, x, gamma) + rbf_kernel(y, y, gamma) - 2 * rbf_kernel(x, y, gamma))
	knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, metric=rfb_dist)
	score: float = np.mean(cross_val_predict(knn, X, y, cv=5))
	if score > best_score:
		best_score = score
		best_gamma = gamma

def rbf_dist(x: np.ndarray, y: np.ndarray, gamma: float = 1) -> float:
	return (rbf_kernel(x, x, gamma) + rbf_kernel(y, y, gamma) - 2 * rbf_kernel(x, y, gamma))

knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1, metric=rbf_dist)
knn.fit(X_train, y_train)
test_score: float = knn.score(X_test, y_test)

print("Best CV score: {:.2f}".format(best_score))
print("Best gamma: {:.2f}".format(best_gamma))
print("Test score: {:.2f}".format(test_score))

Best CV score: 0.71
Best gamma: 10.00
Test score: 0.85


### Creating Custom Estimator

In [19]:
class My_Classifier(KNeighborsClassifier):
	def __init__ (self, n_neighbors = 1):
		super().__init__(n_neighbors=n_neighbors)
	
	def fit(self, X, y):
		super().fit(X, y)
		return self
	
	def predict(self, X, y=None):
		return super().predict(X)

	def score(self, X, y):
		return super().score(X, y)

In [20]:
knn: My_Classifier = My_Classifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

0.8522727272727273


- Replacing Euclidean distance with RBF distance

In [21]:
class rbf_Classifier(KNeighborsClassifier):
	def __init__ (self, n_neighbors = 1, gamma = 1):
		def rbf_dist(x: np.ndarray, y: np.ndarray) -> float:
			return (rbf_kernel(x, x, gamma) + rbf_kernel(y, y, gamma) - 2 * rbf_kernel(x, y, gamma))
		super().__init__(n_neighbors=n_neighbors, metric=rbf_dist)
		self.gamma = gamma
		self.n_neighbors = n_neighbors
	
	def fit(self, X, y):
		super().fit(X, y)
		return self
	
	def predict(self, X, y=None):
		return super().predict(X)

	def score(self, X, y):
		return super().score(X, y)

In [22]:
knn: rbf_Classifier = rbf_Classifier(n_neighbors=1, gamma=best_gamma)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

0.6022727272727273


### Uncertainty Estimators

In [23]:
from sklearn.datasets import load_iris

In [24]:
iris: np.ndarray = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

print(knn.score(X_test, y_test))
print(knn.predict(X_test))
print(knn.predict_proba(X_test))

0.9736842105263158
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


# Exercise

## Question 1
Use cross-validation on the training set to choose the best value of `p` for Nearest Neighbour. What is the error of the Nearest Neighbour with this value of p on the test set?

In [25]:
from sklearn.model_selection import cross_val_score

In [26]:
from sklearn.svm import SVC

In [27]:
best_score: float = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
	for C in [0.001, 0.01, 0.1, 1, 10, 100]:
		svm: SVC = SVC(gamma=gamma, C=C)
		score: float = np.mean(cross_val_score(svm, X_train, y_train, cv=5))
		if score > best_score:
			best_score = score
			best_C = C
			best_gamma = gamma

In [28]:
print("Best score: {:.2f}".format(best_score))
print("Best parameters: C = {:.3f}, gamma = {:.3f}".format(best_C, best_gamma))

Best score: 0.97
Best parameters: C = 10.000, gamma = 0.100


## Question 2
Explain how the array in `In[13]` can be obtained from the array in `In[14]`.

- The `predict_proba` method gives the probabilities for each class that the model predicts for the test data
- The `predict` method gives the most likely class for each data point in the test data.

## Question 3
Implement a new method, called `predict_proba`, for the class `rbfClassifier`, which should output probabilities for various labels for the test samples. 
> Hint: Emulate what we did for the method predict in that class.

In [29]:
class rbf_Classifier(KNeighborsClassifier):
	def __init__ (self, n_neighbors = 1, gamma = 1):
		def rbf_dist(x: np.ndarray, y: np.ndarray) -> float:
			return (rbf_kernel(x, x, gamma) + rbf_kernel(y, y, gamma) - 2 * rbf_kernel(x, y, gamma))
		super().__init__(n_neighbors=n_neighbors, metric=rbf_dist)
		self.gamma = gamma
		self.n_neighbors = n_neighbors
	
	def fit(self, X, y):
		super().fit(X, y)
		return self
	
	def predict(self, X, y=None):
		return super().predict(X)

	def score(self, X, y):
		return super().score(X, y)

	# implement predict_proba() method which returns the probability of various labels for the test samples
	def predict_proba(self, X):
		prediction: np.ndarray = self.predict(X) # get the prediction of the test samples
		probability: np.ndarray = np.zeros((X.shape[0], 3)) # create a matrix to store the probability of various labels for the test samples
		# for each test sample and each label, calculate the probability of the label
		for i in range(X.shape[0]):
			for j in range(3): # 3 labels
				probability[i][j] = np.sum(prediction[:i] == j) / (i + 1) # calculate the probability of the label
		return probability

## Question 4
Test your new method `predict_proba` for the class `rbfClassifier`.

In [30]:
knn: rbf_Classifier = rbf_Classifier(n_neighbors=1, gamma=best_gamma)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
print(knn.predict(X_test))
print(knn.predict_proba(X_test))

0.9736842105263158
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
[[0.         0.         0.        ]
 [0.         0.         0.5       ]
 [0.         0.33333333 0.33333333]
 [0.25       0.25       0.25      ]
 [0.2        0.2        0.4       ]
 [0.33333333 0.16666667 0.33333333]
 [0.28571429 0.14285714 0.42857143]
 [0.375      0.125      0.375     ]
 [0.33333333 0.22222222 0.33333333]
 [0.3        0.3        0.3       ]
 [0.27272727 0.36363636 0.27272727]
 [0.25       0.33333333 0.33333333]
 [0.23076923 0.38461538 0.30769231]
 [0.21428571 0.42857143 0.28571429]
 [0.2        0.46666667 0.26666667]
 [0.1875     0.5        0.25      ]
 [0.23529412 0.47058824 0.23529412]
 [0.22222222 0.5        0.22222222]
 [0.21052632 0.52631579 0.21052632]
 [0.25       0.5        0.2       ]
 [0.28571429 0.47619048 0.19047619]
 [0.27272727 0.45454545 0.22727273]
 [0.26086957 0.47826087 0.2173913 ]
 [0.29166667 0.45833333 0.20833333]
 [0.32       0.44       0.2       ]
 [