In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [37]:
df = pd.read_csv("diabetes.csv")
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
for label in df.columns[:-1]:
  plt.hist(df[df["Outcome"]==1][label], color = "blue", label="Diabetic", density=True, alpha=0.7)
  plt.hist(df[df["Outcome"]==0][label], color = "red", label="Not Diabetic", density=True, alpha=0.7)
  plt.title(label)
  plt.ylabel("Probabilty Density")
  plt.xlabel(label)
  plt.legend()
  plt.show()

In [71]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])

  return bound(*args, **kwds)


In [72]:
def scale_dataset(dataframe, oversample=False):
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [73]:
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [58]:
print(len(train[train["Outcome"]==0]))

301


In [76]:
print(sum(y_train==0))

294


## KNN

In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [103]:
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train, y_train)

In [104]:
y_pred = knn_model.predict(X_test)

In [105]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       101
           1       0.62      0.68      0.65        53

    accuracy                           0.75       154
   macro avg       0.72      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



## Naive Bayes

In [81]:
from sklearn.naive_bayes import GaussianNB

In [82]:
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)

In [83]:
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.77      0.80       101
           1       0.61      0.68      0.64        53

    accuracy                           0.74       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.75      0.74      0.74       154



## Log Regression

In [84]:
from sklearn.linear_model import LogisticRegression

In [85]:
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train, y_train)

In [86]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.76      0.81       101
           1       0.63      0.77      0.69        53

    accuracy                           0.77       154
   macro avg       0.75      0.77      0.75       154
weighted avg       0.78      0.77      0.77       154



## SVM


In [87]:
from sklearn.svm import SVC

In [88]:
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [89]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.74      0.79       101
           1       0.60      0.74      0.66        53

    accuracy                           0.74       154
   macro avg       0.72      0.74      0.73       154
weighted avg       0.76      0.74      0.75       154

