In [1]:
# K-Nearest Neighbors (KNN) algorithm
# KNN assumes that similar things exist in close proximity: samples that are alike lie near each other in feature space.

# How to choose the value of K:
# 1 Trial and error: try several values and select the K that yields the best performance
# 2 Rule of thumb: K = sqrt(n), where n = total number of data samples in the dataset
# 3 Use odd values of K to avoid tied votes between the two classes (binary classification)

# When to choose KNN:
# 1 The dataset is properly labeled
# 2 The data is free of noise
# 3 It works very well on small-scale datasets
# 4 KNN is a good fit when you want a model with higher accuracy at the cost of computational resources

In [2]:
# Numerical / plotting stack plus scikit-learn's bundled toy datasets.
import numpy as np
import pandas as pd
import seaborn as sns  # the package that provides sns.pairplot is `seaborn`, not `sea`
import matplotlib.pyplot as plt
from sklearn import datasets

In [3]:
# as_frame=True asks sklearn to return the dataset in table (pandas) form
data = datasets.load_wine(as_frame=True)

# Feature table and integer class targets
x, y = data.data, data.target

# Human-readable class names; shown as this cell's output
names = data.target_names
names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [4]:
# Build one frame for exploration: the feature columns (named after the sklearn
# feature names) plus a readable label column.
# copy=True keeps df independent of x, so adding the label column below can never
# end up in the feature table that is later fed to the model.
df = pd.DataFrame(x, columns=data.feature_names, copy=True)

# Translate the integer targets into the dataset's own class names instead of
# repeating them as hard-coded literals.
class_labels = dict(enumerate(data.target_names.tolist()))
df["wine class"] = data.target.map(class_labels)

In [5]:
# Preview the first five rows to sanity-check the features and the label column
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,wine class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,class_0


In [None]:
# Optional exploratory plot (left disabled): pairwise feature scatter plots coloured by wine class
# sns.pairplot(data = df , hue = "wine class" , palette = "Set2")

In [6]:
# Count missing values per column
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
wine class                      0
dtype: int64

In [7]:
# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.3,
)

In [8]:
# Prepare KNN and compute the square-root rule-of-thumb value for K.
# NOTE(review): this takes the square root of the test-set size, while the
# neighbours are drawn from the training set; len(y_train) may be the intended n - confirm.
import math
from sklearn.neighbors import KNeighborsClassifier

math.sqrt(len(y_test))

7.3484692283495345

In [9]:
# Baseline: fit a 7-neighbour classifier on the training split and predict the test split
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)
pred = knn.predict(x_test)

In [10]:
# Accuracy of the baseline predictions against the true test labels
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=pred)

0.6481481481481481

In [11]:
# To address the low accuracy, standardise the features: KNN is distance-based, so
# large-valued columns (e.g. proline) otherwise dominate the distance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split only ...
x_train = sc.fit_transform(x_train)
# ... and apply those same statistics to the test split. Re-fitting on the test data
# would scale the two splits differently and leak test-set statistics into evaluation.
x_test = sc.transform(x_test)

In [12]:
# Same 7-neighbour model, now fitted and evaluated on the scaled features
knn1 = KNeighborsClassifier(metric="euclidean", n_neighbors=7).fit(x_train, y_train)
pred1 = knn1.predict(x_test)

In [13]:
# Accuracy of the post-scaling predictions against the true test labels
metrics.accuracy_score(y_true=y_test, y_pred=pred1)

0.9814814814814815