# Import Libraries

In [22]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

# Download Dataset

In [23]:
# Install PyDrive, which provides the `pydrive` modules imported in the next cell.
!pip install PyDrive



In [24]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


In [25]:
# Authenticate the current Colab user, then give PyDrive the application-default
# credentials; `drive` is the client object used by the download cell.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [26]:
# Create a handle for the Drive file with this id and write its content to the
# local file 'Diabetes.csv'.
downloaded = drive.CreateFile({'id':"1h1F46PJ5qND0uUHF-5FYHAlKErgWoM2E"})   
downloaded.GetContentFile('Diabetes.csv')        

In [27]:
# Load the downloaded CSV into a DataFrame and display it as the cell output.
df= pd.read_csv('Diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# PreProcessing



In [28]:
# Replace zero values: in these columns a 0 is treated as a missing measurement
# and is imputed with the column mean.
zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero:
  # `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
  df[column] = df[column].replace(0, np.nan)
  # The mean ignores the NaNs just inserted and is truncated to an integer
  # before being used as the fill value.
  mean = int(df[column].mean(skipna=True))
  df[column] = df[column].replace(np.nan, mean)

# Scaling: rescale every feature column with MinMaxScaler ('Outcome' is left as is).
# NOTE(review): the scaler is fitted on the full dataset before the folds are
# built, so rows later used for testing influence the scaling. Consider fitting
# it on each fold's training part only.
scaler = MinMaxScaler()
scalling_column = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
df[scalling_column] = scaler.fit_transform(df[scalling_column])

In [29]:
# Build the five cross-validation folds. Each dfN is a two-element list
# [train, test]; both parts are re-indexed from 0. Every entry of the table
# below is (row slices that form the training part, row slice of the test part).
df1, df2, df3, df4, df5 = (
    [
        pd.concat([df.iloc[rows] for rows in train_rows]).reset_index(drop=True),
        df.iloc[test_rows].reset_index(drop=True),
    ]
    for train_rows, test_rows in (
        ((slice(None, 614),), slice(614, None)),                 # fold 1
        ((slice(None, 461), slice(614, 768)), slice(461, 614)),  # fold 2
        ((slice(None, 307), slice(461, 768)), slice(307, 461)),  # fold 3
        ((slice(None, 154), slice(307, 768)), slice(154, 307)),  # fold 4
        ((slice(154, 768),), slice(None, 154)),                  # fold 5
    )
)



In [30]:
# Sanity check: shapes of the five training parts, a blank separator, then the
# shapes of the five test parts (one shape per line).
print(*(fold[0].shape for fold in (df1, df2, df3, df4, df5)), sep="\n")
print("\n")
print(*(fold[1].shape for fold in (df1, df2, df3, df4, df5)), sep="\n")

(614, 9)
(615, 9)
(614, 9)
(615, 9)
(614, 9)


(154, 9)
(153, 9)
(154, 9)
(153, 9)
(154, 9)


# Metode Perhitungan Jarak

In [31]:
# EUCLIDEAN DISTANCE
def distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinate vectors of equal length (lists or NumPy arrays).

    Returns
    -------
    numpy.float64
        sqrt(sum((x1 - x2) ** 2)).
    """
    diff = np.array(x1) - np.array(x2)
    # Square each coordinate difference *before* summing. Squaring the summed
    # differences instead lets positive and negative offsets cancel out and
    # yields |sum(x1 - x2)|, which is not a distance.
    return np.sqrt(np.sum(diff ** 2))

# MANHATTAN DISTANCE
def distance1(x1, x2):
    """Return the Manhattan (L1) distance: the sum of absolute coordinate differences.

    Both arguments are converted to NumPy arrays first, so plain lists are accepted.
    """
    gap = np.array(x1) - np.array(x2)
    return np.sum(np.abs(gap))


# Perhitungan Akurasi

In [32]:
def score_accuracy(test, pred):
    """Return the percentage of predictions that match the true labels.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame whose 'Outcome' column holds the true labels.
    pred : sequence
        Predicted labels; pred[i] is compared with the i-th true label.

    Returns
    -------
    float
        Number of matching positions divided by the number of rows in
        `test`, multiplied by 100.
    """
    actual = test['Outcome'].values.tolist()
    hits = sum(1 for idx in range(len(pred)) if actual[idx] == pred[idx])
    return (hits / float(len(actual))) * 100.0


# KNN


In [33]:
def knn(k, train, test, metric=None):
    """Classify each test row by majority vote among its k nearest training rows.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be at least 1.
    train : pandas.DataFrame
        Training rows. The "Outcome" column supplies the labels (0 is one
        class, any other value counts as class 1); all remaining columns are
        used as features.
    test : iterable of array-like
        Feature vectors to classify, laid out like the feature columns of
        `train`.
    metric : callable, optional
        ``metric(a, b)`` returning the distance between two feature vectors.
        When omitted, the notebook's ``distance1`` (Manhattan distance) is
        used, which is what this function always used before the parameter
        existed.

    Returns
    -------
    list of int
        One predicted label (0 or 1) per test row. A tied vote predicts 1.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 (no neighbour could vote, and every row
        would silently be predicted as 1).
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    if metric is None:
        # Resolved at call time so the notebook's current distance1 is used.
        metric = distance1

    X = train.drop("Outcome", axis=1).values
    y = train.Outcome.values

    res = []
    for data_test in test:
        # Distance from this test row to every training row.
        jarak = [metric(data_train, data_test) for data_train in X]

        # Indices of the k smallest distances.
        min_indeks = np.argsort(jarak)[:k]

        # Majority vote; class 0 must win strictly, otherwise predict 1.
        votes_for_0 = sum(1 for outcome in y[min_indeks] if outcome == 0)
        votes_for_1 = len(min_indeks) - votes_for_0
        res.append(0 if votes_for_0 > votes_for_1 else 1)

    return res


# Main Program

In [34]:
def Main():
    """Evaluate KNN for k = 1..20 with the five predefined folds and report the best k.

    For every k, the classifier is run on each fold in df1..df5 (training part
    at index 0, test part at index 1), the five accuracies are averaged, and
    the average is printed. After the sweep, the k with the highest average
    accuracy is printed (the smallest such k if several tie). Returns None.
    """
    folds = (df1, df2, df3, df4, df5)
    results = []

    for k in range(1, 21):
        total = 0
        for train_part, test_part in folds:
            features = test_part.drop("Outcome", axis=1).values
            predictions = knn(k, train_part, features)
            total += score_accuracy(test_part, predictions)

        mean_acc = total / 5
        print("Akurasi dengan k = ", k, "adalah = ", mean_acc)
        results.append([k, mean_acc])

    # max() returns the first entry with the highest accuracy.
    best_k, best_acc = max(results, key=lambda entry: entry[1])

    print("\nNilai K terbaik : ", best_k, " Dengan Accuracy : ", best_acc)
    

# Run the sweep over k; the per-k and best accuracies are printed below.
Main()

Akurasi dengan k =  1 adalah =  70.31066972243443
Akurasi dengan k =  2 adalah =  67.44588744588745
Akurasi dengan k =  3 adalah =  73.04812834224599
Akurasi dengan k =  4 adalah =  71.61106866989219
Akurasi dengan k =  5 adalah =  74.47669977081742
Akurasi dengan k =  6 adalah =  72.65257618198794
Akurasi dengan k =  7 adalah =  74.47585094643918
Akurasi dengan k =  8 adalah =  74.08284525931583
Akurasi dengan k =  9 adalah =  74.86546133604956
Akurasi dengan k =  10 adalah =  74.73304473304475
Akurasi dengan k =  11 adalah =  76.16755793226382
Akurasi dengan k =  12 adalah =  74.99108734402851
Akurasi dengan k =  13 adalah =  76.16755793226382
Akurasi dengan k =  14 adalah =  75.90781767252355
Akurasi dengan k =  15 adalah =  76.95102283337577
Akurasi dengan k =  16 adalah =  76.68958492487903
Akurasi dengan k =  17 adalah =  77.3414820473644
Akurasi dengan k =  18 adalah =  77.08004413886766
Akurasi dengan k =  19 adalah =  76.69043374925727
Akurasi dengan k =  20 adalah =  77.46795