# **Poisonous Mushroom Classification using KNN Classification**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

> **1. Data Pre-processing :**

In [2]:
df = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')

In [3]:
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [5]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [6]:
df.tail()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.02,drugX
199,40,F,LOW,NORMAL,11.349,drugX


In [7]:
df['Sex'].unique()

array(['F', 'M'], dtype=object)

In [8]:
df['Sex'] = df['Sex'].replace({'F':1, 'M':2})

In [9]:
df['BP'].unique()

array(['HIGH', 'LOW', 'NORMAL'], dtype=object)

In [10]:
df['BP'] = df['BP'].replace({'HIGH':1, 'LOW':2, 'NORMAL':3})

In [11]:
df['Cholesterol'].unique()

array(['HIGH', 'NORMAL'], dtype=object)

In [12]:
df['Cholesterol'] = df['Cholesterol'].replace({'HIGH':1, 'NORMAL':2})

In [13]:
df['Drug'].unique()

array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

In [14]:
df['Drug'] = df['Drug'].replace({'drugA':1, 'drugB':2, 'drugC':3, 'drugX':4, 'DrugY':5})

In [15]:
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,1,1,1,25.355,5
1,47,2,2,1,13.093,3
2,47,2,2,1,10.114,3
3,28,1,3,1,7.798,4
4,61,1,2,1,18.043,5
...,...,...,...,...,...,...
195,56,1,2,1,11.567,3
196,16,2,2,1,12.006,3
197,52,2,3,1,9.894,4
198,23,2,3,2,14.020,4


In [16]:
X = df.drop('Drug', axis=1)
Y = df['Drug']

In [17]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.20, random_state=42)



> **2. Create and Train KNN Clissifier Model :**

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_clf.fit(x_train, y_train)

> **3. Predict Test Set Results :**

In [20]:
y_pred = knn_clf.predict(x_test)

In [21]:
pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})

Unnamed: 0,Actual,Predicted
95,4,4
15,5,5
30,4,4
158,3,2
128,5,5
115,5,5
69,5,5
170,4,1
174,1,4
45,4,4


> **4. Evaluate Performance of the Model :**

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [23]:
mat = confusion_matrix(y_test, y_pred)
mat

array([[ 3,  0,  0,  3,  0],
       [ 0,  2,  0,  1,  0],
       [ 1,  1,  1,  2,  0],
       [ 1,  3,  0,  7,  0],
       [ 0,  0,  0,  0, 15]])

In [24]:
score = accuracy_score(y_test, y_pred)
score

0.7