# Classification using K Nearest Neighbour (KNN)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# guard the import so the notebook still loads on current releases.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None
%matplotlib inline

## Task 1

In [None]:
# Load the breast cancer data set from a CSV file on the local machine.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere once it is pointed at a local copy of the file.
data_path = '/Users/mahaa/OneDrive/Desktop/UWE/Semester_2/Machine Learning/Week4/Tutorial/data/breast-cancer-wisconsin.data.txt'
df = pd.read_csv(data_path)

## Exploratory data analysis

In [None]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Gather the distinct values of every column into one flat list so that
# unexpected placeholders (non-numeric tokens, for example) are easy to spot.
unique_values = [
    value
    for column_name in df.columns
    for value in df[column_name].unique()
]
print(unique_values)

In [None]:
# Substitute the '?' placeholder with the numeric sentinel -9999.
df = df.replace(to_replace='?', value=-9999)

In [None]:
# Drop the id column so it is not used as a feature.
# `columns=` replaces the positional axis argument (`df.drop(['id'], 1)`),
# which was deprecated in pandas 1.3 and raises a TypeError on pandas 2.x.
df_1 = df.drop(columns=['id'])

## Training and Testing

In [None]:
# Split the data frame into the input (feature) array and the output (label) array.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
x = np.array(df_1.drop(columns=['class']))
y = np.array(df_1['class'])

In [None]:
# Partition the data into training and testing subsets.
# A fixed random_state makes the split identical on every run.
split_arrays = train_test_split(x, y, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

In [None]:
# Fit a KNN classifier that uses a single neighbour.
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Report the accuracy on the training split first, then on the testing split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))

In [None]:
# Classify one new instance with the fitted model.
data = np.array([4,3,3,2,1,2,1,1,2])
# predict() expects a 2-D array (samples x features); -1 lets NumPy infer
# the second dimension from the length of the data.
data_2d = data.reshape(1, -1)
print(data_2d)
prediction = clf.predict(data_2d)
print(prediction)

### Task 1a
Change the number of neighbors and test the model

In [None]:
# Fit KNN with 1 to 10 neighbours using the default distance function
# and report the accuracy of each model on both splits.
for n in range(1, 11):
    clf_1 = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

    train_accuracy = round(clf_1.score(X_train, y_train), 3)
    test_accuracy = round(clf_1.score(X_test, y_test), 3)

    print("The accuracy of KNN with", n, "neighbors in the training data is:", train_accuracy)
    print("The accuracy of KNN with", n, "neighbors in the testing data is:", test_accuracy, "\n")

The model performs better when trained with 5 neighbours.

### Task 1b
Keep the id column among the features and evaluate the model

In [None]:
# Build the input and output arrays again, this time keeping the id column
# among the features.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
x1 = np.array(df.drop(columns=['class']))
y1 = np.array(df['class'])

# Split into training and testing data with the same random_state as the
# other experiments.
X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y1, random_state=0)

# Fit KNN with 1 to 10 neighbours. The range starts at 1 (it was 2) so the
# results line up one-to-one with the other experiments in this notebook.
for n in range(1, 11):
    clf_2 = KNeighborsClassifier(n_neighbors=n)
    clf_2.fit(X_train1, y_train1)

    # Evaluate the model on the training and testing data.
    print("The accuracy of KNN with", n, "neighbors in the training data is:", round(clf_2.score(X_train1, y_train1), 3))
    print("The accuracy of KNN with", n, "neighbors in the testing data is:", round(clf_2.score(X_test1, y_test1), 3), "\n")

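The accuracy of the model decreases significantly after adding the id column to the model. This is because id is not correlated with the output class, so it does not add any valuable information when predicting it. In addition, KNN is distance-based and the features here are not scaled, so a column whose numeric range is much larger than that of the other features (as an id column typically is) can dominate the distance calculation.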

### Task 1c
Use Manhattan distance

In [None]:
# Fit KNN with 1 to 10 neighbours using the Manhattan distance
# and report the accuracy of each model on both splits.
for n in range(1, 11):
    clf_3 = KNeighborsClassifier(n_neighbors=n, metric='manhattan').fit(X_train, y_train)

    train_accuracy = round(clf_3.score(X_train, y_train), 3)
    test_accuracy = round(clf_3.score(X_test, y_test), 3)

    print("The accuracy of KNN with", n, "neighbors in the training data is:", train_accuracy)
    print("The accuracy of KNN with", n, "neighbors in the testing data is:", test_accuracy, "\n")

Euclidean distance still gives the highest model performance.

## Task 2
Plot the training accuracy and the testing accuracy while K ranges from 1 to 10. Then check which K value is better in this case.

In [None]:
# Plot the training and testing accuracy for different values of K.
k_values = range(1, 11)
training_accuracy = []
testing_accuracy = []

for n_neighbors in k_values:
    # Build and fit the model for this K.
    clf_4 = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

    # Record the training accuracy.
    training_accuracy.append(clf_4.score(X_train, y_train))
    # Record the testing accuracy.
    testing_accuracy.append(clf_4.score(X_test, y_test))

print(training_accuracy)
print(testing_accuracy)

# One line per split, drawn in the same order as the lists were printed.
curves = (
    (training_accuracy, "Training Accuracy"),
    (testing_accuracy, "Testing Accuracy"),
)
for scores, curve_label in curves:
    plt.plot(k_values, scores, label=curve_label)
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

This figure shows the training and testing accuracy when choosing different values for K in KNN. It shows that the training accuracy decreases when K increases, while the testing accuracy increases when K increases. The best K value in this model is K = 5. This gives the highest generalisation accuracy and a good training accuracy.

### Task 3
Load the breast cancer data directly from scikit-learn. Do all the steps above again.

In [None]:
# Load scikit-learn's built-in breast cancer data set and split it.
# load_breast_cancer is imported in the notebook's top import cell rather
# than here, so all dependencies are declared in one place.
cancer = load_breast_cancer()

# stratify keeps the proportion of output classes in the train and test
# subsets the same as in the original data set.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)

#### Task 3a
Data exploration and finding missing data

In [None]:
# Put the features into a data frame, append the target as a final column,
# then count the missing values per column.
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(
    target=cancer.target
)
cancer_df.isna().sum()

There is no missing data

#### Task 3b
Train a KNN model and check the accuracy of the model with different values of K

In [None]:
# Fit KNN with 1 to 10 neighbours on the scikit-learn data set
# and report the accuracy of each model on both splits.
for n in range(1, 11):
    clf_5 = KNeighborsClassifier(n_neighbors=n).fit(X_train2, y_train2)

    train_accuracy = round(clf_5.score(X_train2, y_train2), 3)
    test_accuracy = round(clf_5.score(X_test2, y_test2), 3)

    print("The accuracy of KNN with", n, "neighbors in the training data is:", train_accuracy)
    print("The accuracy of KNN with", n, "neighbors in the testing data is:", test_accuracy, "\n")

In [None]:
# Plot the confusion matrix of clf_5 on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
# NOTE(review): clf_5 is whichever model was fitted last in the preceding
# loop (n_neighbors = 10 as written), not necessarily the best-scoring one.
ConfusionMatrixDisplay.from_estimator(clf_5, X_test2, y_test2, cmap=plt.cm.Blues)
plt.show()

#### Task 3c
Calculate F1 on data test split

In [None]:
# Predict the test-split labels with the model, then compute the F1 score
# of those predictions against the true labels.
y_predicted = clf_5.predict(X_test2)
f1_score(y_true=y_test2, y_pred=y_predicted)