In [6]:
# Bir Clasificiation algoritmasidir yani siniflandirir. 
# K sayisi komsu sayisini belirtir.
import numpy as np
import pandas as pd
import seaborn as sns

In [7]:
df = pd.read_csv("iris.csv")

In [8]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [9]:
df.variety.value_counts()

variety
Setosa        50
Versicolor    50
Virginica     50
Name: count, dtype: int64

In [13]:
df.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

In [19]:
df.variety = df.variety.astype('category')

In [23]:
# Dumy benzeri kategorik sekilde siniflara ayiriyoruz. 
# 0 - 1 - 2 gibi 

In [20]:
df.variety = df.variety.cat.codes

In [21]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [24]:
np.random.seed(50)

###  Spitting the Data


In [27]:
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train_index = np.random.choice(range(0,len(mydata)),replace = False, size = train_num)
    train = mydata.iloc[train_index]
    test = mydata[~mydata.index.isin(train_index)]
    return train,test

In [28]:
train,test = splitting(mydata = df, ratio = 0.8)

In [29]:
train

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
88,5.6,3.0,4.1,1.3,1
72,6.3,2.5,4.9,1.5,1
20,5.4,3.4,1.7,0.2,0
16,5.4,3.9,1.3,0.4,0
147,6.5,3.0,5.2,2.0,2
...,...,...,...,...,...
96,5.7,2.9,4.2,1.3,1
76,6.8,2.8,4.8,1.4,1
66,5.6,3.0,4.5,1.5,1
121,5.6,2.8,4.9,2.0,2


In [30]:
test

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
2,4.7,3.2,1.3,0.2,0
6,4.6,3.4,1.4,0.3,0
19,5.1,3.8,1.5,0.3,0
22,4.6,3.6,1.0,0.2,0
26,5.0,3.4,1.6,0.4,0
30,4.8,3.1,1.6,0.2,0
31,5.4,3.4,1.5,0.4,0
33,5.5,4.2,1.4,0.2,0
35,5.0,3.2,1.2,0.2,0
43,5.0,3.5,1.6,0.6,0


In [31]:
test.variety.value_counts()

variety
0    11
2    11
1     8
Name: count, dtype: int64

In [53]:
# calculate the Euclidean distance between two vectors be aware that the last element is for the class
# Oklid uzaklik
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1[:-1] - row2[:-1])**2))

In [33]:
A = test.iloc[12,:]
A

sepal.length    5.9
sepal.width     3.2
petal.length    4.8
petal.width     1.8
variety         1.0
Name: 70, dtype: float64

In [34]:
B = test.iloc[11,:]
B

sepal.length    5.6
sepal.width     2.9
petal.length    3.6
petal.width     1.3
variety         1.0
Name: 64, dtype: float64

In [35]:
euclidean_distance(A, B)

1.3674794331177342

In [36]:
# Locate the most similar neighbors
def get_neighbors(train, test_instance, num_neighbors):
    distances = []
    train_copy=train.copy()
    for j in range(len(train)):
        dist = euclidean_distance(test_instance, train.iloc[j,:])
        distances.append(dist)
    train_copy['distance']=distances
    return train_copy.nsmallest(num_neighbors, ['distance'])

In [37]:
neighbors=get_neighbors(train, A, num_neighbors=5)
neighbors

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,distance
138,6.0,3.0,4.8,1.8,2,0.223607
127,6.1,3.0,4.9,1.8,2,0.3
149,5.9,3.0,5.1,1.8,2,0.360555
56,6.3,3.3,4.7,1.6,1,0.469042
126,6.2,2.8,4.8,1.8,2,0.5


In [38]:
neighbors.variety.value_counts()

variety
2    4
1    1
Name: count, dtype: int64

In [39]:
neighbors.variety.value_counts()

variety
2    4
1    1
Name: count, dtype: int64

In [40]:
# Sınıf Tahmininde bulunma
def predict_classes(train, test, num_neighbors):
    prediction=[]
    for j in range(len(test)):
        neighbors = get_neighbors(train, test.iloc[j,:], num_neighbors)
        predicted_class = neighbors.variety.value_counts().index[0]
        prediction.append(predicted_class)
    return prediction

In [41]:
prediction=predict_classes(train, test, num_neighbors=5)
prediction

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [42]:
list(zip(prediction,test.variety))

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (1, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2),
 (2, 2)]

In [43]:
def compute_accuracy(prediction,test_y):
    return np.mean(prediction==test_y)

In [44]:
compute_accuracy(prediction,test.variety)

0.9333333333333333

In [45]:
prediction=predict_classes(train, test, num_neighbors=1)
compute_accuracy(prediction,test.variety)

0.9

In [46]:
prediction=predict_classes(train, test, num_neighbors=2)
compute_accuracy(prediction,test.variety)

0.9

In [47]:
prediction=predict_classes(train, test, num_neighbors=10)
compute_accuracy(prediction,test.variety)

0.9

In [48]:
prediction=predict_classes(train, test, num_neighbors=75)
compute_accuracy(prediction,test.variety)

0.8666666666666667

In [49]:
prediction=predict_classes(train, test, num_neighbors=85)
compute_accuracy(prediction,test.variety)

0.3

#  WKNN

In [50]:
def get_neighbors(train, test_instance, num_neighbors):
    distances = []
    train_copy=train.copy()
    for j in range(len(train)):
        dist = euclidean_distance(test_instance, train.iloc[j,:])
        distances.append(dist)
    train_copy['distance']=distances
    train_copy['weights']=1/train_copy['distance']
    return train_copy.nsmallest(num_neighbors, ['distance'])

get_neighbors(df, A, 5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,distance,weights
70,5.9,3.2,4.8,1.8,1,0.0,inf
138,6.0,3.0,4.8,1.8,2,0.223607,4.472136
127,6.1,3.0,4.9,1.8,2,0.3,3.333333
149,5.9,3.0,5.1,1.8,2,0.360555,2.773501
85,6.0,3.4,4.5,1.6,1,0.424264,2.357023


In [51]:
def predict_classes(train, test, num_neighbors):
    prediction=[]
    for j in range(len(test)):
        neighbors = get_neighbors(train, test.iloc[j,:], num_neighbors)
        Class_Weight_Sums=[]
        for i in np.unique(neighbors.variety):
            Class_Weight_Sums.append((i,np.sum(neighbors[neighbors.variety==i].weights)))
            print(Class_Weight_Sums)
        predicted_class = max(Class_Weight_Sums, key=lambda x:x[1])[0]
        prediction.append(predicted_class)
    return np.array(prediction)

In [52]:
def Weighted_KNN(train,test,number_neighbors):
    test_y=test.variety
    predictions=predict_classes(train, test, number_neighbors)
    accuracy=compute_accuracy(predictions,test_y)
    return accuracy,predictions
Weighted_KNN(train, test, 85)

[(0, 82.98128508248766)]
[(0, 82.98128508248766), (1, 11.94208179117335)]
[(0, 82.98128508248766), (1, 11.94208179117335), (2, 1.4359238751024155)]
[(0, 72.03640875253396)]
[(0, 72.03640875253396), (1, 11.620300602686278)]
[(0, 72.03640875253396), (1, 11.620300602686278), (2, 1.9315587442384317)]
[(0, 85.32149558639718)]
[(0, 85.32149558639718), (1, 12.442715368062387)]
[(0, 85.32149558639718), (1, 12.442715368062387), (2, 1.5165676194243842)]
[(0, 49.595324128169935)]
[(0, 49.595324128169935), (1, 10.69852921210548)]
[(0, 49.595324128169935), (1, 10.69852921210548), (2, 1.3265067401951443)]
[(0, 87.34565052551085)]
[(0, 87.34565052551085), (1, 13.40933730818676)]
[(0, 87.34565052551085), (1, 13.40933730818676), (2, 1.5862926237486052)]
[(0, 92.31156284381922)]
[(0, 92.31156284381922), (1, 13.180480095837684)]
[(0, 92.31156284381922), (1, 13.180480095837684), (2, 1.545837170250807)]
[(0, 71.97478564620093)]
[(0, 71.97478564620093), (1, 13.56825077968287)]
[(0, 71.97478564620093), (1, 1

(0.9333333333333333,
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
        2, 2, 2, 1, 2, 2, 2, 2], dtype=int8))