In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import RandomForestClassifier

import warnings
# Install a global "ignore" filter so no warnings are shown in later cells.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(csv_path)
# Bare last expression: render the loaded frame as the cell output.
df

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Distribution of Insulin for each Pregnancies count, one box per count.
sns.boxplot(
    data=df,
    x="Pregnancies",
    y="Insulin",
)


In [None]:
# Pairwise scatter plots of every column, coloured by Outcome; corner=True
# draws only the lower triangle. Trailing ';' hides the returned grid repr.
sns.pairplot(
    df,
    hue="Outcome",
    markers=["o", "s"],
    corner=True,
);


In [None]:
# Distribution of BloodPressure with NaNs removed; ';' hides the Axes repr.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# migrating to histplot/displot when the environment's version allows it.
sns.distplot(df["BloodPressure"].dropna());


In [None]:
# Summary statistics restricted to the rows whose BloodPressure equals 0.
df.loc[df['BloodPressure'] == 0].describe()


In [None]:
# Side-by-side distributions of Glucose, BMI and Insulin, one per subplot.
fig, axs = plt.subplots(ncols=3, figsize=(20, 10))
for ax, column in zip(axs, ("Glucose", "BMI", "Insulin")):
    sns.distplot(df[column], ax=ax)
plt.show()


# DATA PREPARATION (feature/target split, train/test split, scaling)

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=['Outcome']).values
# Target vector: the Outcome column, as a NumPy array.
y = df.Outcome.values

In [None]:
# Display the feature array built from df (all columns except Outcome).
X

In [None]:
# Display the target array taken from the Outcome column.
y

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so the test data does not influence the fit.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# KNN

In [None]:
# 5-nearest-neighbours classifier; Minkowski metric with p=2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
# Fit on the scaled training split (the fitted estimator is the cell output).
classifier.fit(X_train, y_train)

In [None]:
# Predict the test split and print predictions next to the true labels
# (column 0 = predicted, column 1 = actual).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix of the predictions in y_pred against the true test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set error rate (1 - accuracy) for every k from 1 to 29, in order.
test_error_rates = []

for k in range(1, 30):
    # Fresh KNN model with k neighbours, fitted on the training split.
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    y_pred_test = knn_model.predict(X_test)

    test_error = 1 - accuracy_score(y_test, y_pred_test)
    test_error_rates.append(test_error)

In [None]:
# Display the list of test error rates collected for k = 1..29.
test_error_rates

In [None]:
# Test error rate as a function of k (k = 1..29).
plt.figure(figsize=(10, 6), dpi=200)
plt.plot(
    range(1, 30),
    test_error_rates,
    label='Test Error',
)
plt.ylabel('Error Rate')
plt.legend()
plt.xlabel("K Value")

In [None]:
# Display the DataFrame again.
df

In [None]:
# ROC analysis needs continuous scores, not hard class labels: when roc_curve
# is given 0/1 predictions there is only one usable threshold, so the "curve"
# collapses to a single operating point joined to the corners.
# Use the predicted probability of the positive class from the fitted
# classifier instead (assumes Outcome is coded 0/1, so column 1 of
# predict_proba is the positive class).
y_score = classifier.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# AUC must be computed from continuous scores; passing hard 0/1 predictions
# evaluates a single threshold and understates the ranking quality.
# Column 1 of predict_proba is the positive-class probability (assumes
# Outcome is coded 0/1).
auc_score = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
print("AUC Score:", auc_score)

# XGBOOST

In [None]:
# Gradient-boosted tree classifier: 480 boosting rounds, learning rate 0.2,
# trees up to depth 8.
xgb = XGBClassifier(
    n_estimators=480,
    learning_rate=0.2,
    max_depth=8,
)

In [None]:
# Fit the XGBoost model on the training split.
xgb.fit(X_train,y_train)


In [None]:
# Class predictions of the XGBoost model for the test split.
xgb_y_pred=xgb.predict(X_test)

In [None]:
# Score the XGBoost predictions (xgb_y_pred). The previous code scored
# y_pred, which holds another model's predictions, so the number printed
# here was not the XGBoost accuracy.
print("Accuracy:", accuracy_score(y_test, xgb_y_pred) * 100)

# Random Forest

In [None]:
# Random forest of 100 Gini trees grown to full depth.
# random_state is fixed (same seed as the train/test split) so bootstrap
# sampling and feature selection are reproducible across runs; without it
# the reported accuracy changes on every execution.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)
# Fit on the training split (the fitted estimator is the cell output).
classifier.fit(X_train, y_train)

In [None]:
# Predict the test split with the current classifier and report accuracy
# as a percentage.
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)