# for regression & classification

In [1]:
import pandas as pd
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import seaborn as sns 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the iris data set and keep only its first two feature columns, so the
# problem stays 2-D and can be drawn as a decision-boundary plot later on.
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target

# Human-readable class labels; left as the last expression so the notebook
# echoes them.
class_names = iris.target_names
class_names

## KNeighborsClassifier

In [None]:
# Split BEFORE scaling: the scaler's mean/std must be learned from the training
# rows only. Fitting it on the full data set (as was done previously) leaks
# test-set statistics into the model's inputs.
X_float = X.astype('float')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_float, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Later cells plot and cross-validate on the whole data set, so keep a fully
# transformed copy expressed in the same (training-fitted) scale the model uses.
X_scaled = scaler.transform(X_float)

# 5-nearest-neighbours baseline; the final expression echoes the held-out
# accuracy in the notebook.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
def plot_decision_boundaries(X,y,clf,resolution=0.2):
    """Shade the regions a fitted classifier predicts and overlay the samples.

    Parameters
    ----------
    X : 2-D array; only columns 0 and 1 are read (horizontal / vertical axis).
    y : per-sample values used to colour the scatter points.
    clf : object exposing ``predict`` for two-column input.
    resolution : step between neighbouring grid points on both axes.

    Draws on the current matplotlib figure and returns nothing.
    """
    first_feature = X[:, 0]
    second_feature = X[:, 1]

    # The grid extends one unit beyond the data range in every direction.
    axis1_ticks = np.arange(first_feature.min() - 1,
                            first_feature.max() + 1,
                            resolution)
    axis2_ticks = np.arange(second_feature.min() - 1,
                            second_feature.max() + 1,
                            resolution)
    xx1, xx2 = np.meshgrid(axis1_ticks, axis2_ticks)

    # Classify every grid node, then fold the flat predictions back into the
    # grid's shape so they can be drawn as filled contours.
    grid_points = np.column_stack((xx1.ravel(), xx2.ravel()))
    predicted = clf.predict(grid_points).reshape(xx1.shape)
    plt.contourf(xx1, xx2, predicted, alpha=0.4)

    # Samples on top of the shaded regions, coloured by y.
    plt.scatter(first_feature, second_feature, c=y, s=20, edgecolors='k')

    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    
# Draw the regions predicted by the fitted k=5 model over the scaled features.
plot_decision_boundaries(X_scaled, y, clf=knn)
plt.title('KNN (k=5)')
    

In [None]:
# Sweep k from 1 to 74 and record the 3-fold cross-validation accuracy of a
# KNN classifier for each value.
neighbours = list(range(1, 75))
scores = []

for n in neighbours:
    knn = KNeighborsClassifier(n_neighbors=n)
    score = cross_val_score(knn, X_scaled, y, cv=3)
    scores.append(score)

# One mean accuracy per k, averaged over the folds.
scores_mean = [fold_scores.mean() for fold_scores in scores]

display(scores, scores_mean)

# Mean accuracy as a function of k.
plt.plot(neighbours, scores_mean)
plt.title('KNN')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Ex: load MNIST from a local .mat file, split it, and shuffle the training part.
mnist_path = 'mnist-original.mat'

mnist = loadmat(mnist_path)
# NOTE(review): the transpose and the [0] suggest 'data' is stored as
# (features, samples) and 'label' as a single row -- confirm against the file.
X = mnist['data'].T
y = mnist['label'][0]

# split to train & test: the first `train_size` rows train, the rest test
train_size = 60000
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# shuffle the training split only.
# The permutation is sized from and applied to X_train / y_train themselves.
# Indexing the full X / y, as before, was correct only because the training
# split happens to be the leading slice of the data.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]





In [None]:
# Single-step pipeline; the step name 'cls' is the prefix used by the grid keys.
knn_step = ('cls', KNeighborsClassifier())
clf = Pipeline(steps=[knn_step])

# Candidate settings: two weighting schemes x two neighbourhood sizes.
param_grid = {
    'cls__weights': ['uniform','distance'],
    'cls__n_neighbors': [5,8],
}

# Exhaustive search over the grid with 10-fold cross-validation on the
# training split.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# Mean cross-validated score next to the parameter combination that produced it.
results = grid_search.cv_results_
for params, mean_score in zip(results['params'], results['mean_test_score']):
    print(mean_score, params)

In [None]:
# Final evaluation: fit on the training split and report accuracy on the
# held-out test split. Cross-validating inside X_test (as was done before)
# trains on test rows and never uses the training data at all.
# `score` is now a single accuracy value rather than three fold scores.
knn = KNeighborsClassifier(n_neighbors=5,weights='uniform')
knn.fit(X_train,y_train)
score = knn.score(X_test,y_test)
display(score)