In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
DATA_PATH = "diabetes_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Structural overview of the freshly loaded frame: dimensions, dtypes, first rows.
for overview in (df.shape, df.dtypes, df.head()):
    print(overview)

In [None]:
# Fully duplicated rows
n_dup = df.duplicated().sum()
print("Duplicated Number:", n_dup)

# Missing values per column: absolute count and percentage (2 decimals)
na_flags = df.isna()
miss_cnt = na_flags.sum()
miss_pct = na_flags.mean().mul(100).round(2)
print("\nNumber of missing value:\n", miss_cnt)
print("\nMissing value proportion:\n", miss_pct)

# Frequency tables for the categorical columns (NaN is counted as its own level)
for label, column in (("Gender", 'gender'), ("Smoking_history", 'smoking_history')):
    print(f"\n{label} distribution:\n", df[column].value_counts(dropna=False))

# Summary statistics for the continuous measurements
for column in ('age', 'bmi', 'HbA1c_level', 'blood_glucose_level'):
    print(f"\n{column}: \n", df[column].describe())


In [None]:
# Step 1: drop exact duplicate rows, then discard records whose gender is
# 'Other', and renumber the index so it is contiguous again.
df = (
    df.drop_duplicates()
      .loc[lambda frame: frame['gender'] != 'Other']
      .reset_index(drop=True)
)

# Step 2: integer-encode the remaining gender labels.
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])

# Step 3: tidy smoking_history before encoding it:
#   - 'No Info' is treated as missing,
#   - 'ever' and 'not current' are merged into 'former',
#   - every missing entry ends up in an explicit 'Unknown' level.
smoking_grouped = (
    df['smoking_history']
    .replace('No Info', np.nan)
    .replace({'ever': 'former', 'not current': 'former'})
    .fillna('Unknown')
)

# Step 4: integer-encode the grouped smoking levels.
smoking_history_encoder = LabelEncoder()
df['smoking_history'] = smoking_history_encoder.fit_transform(smoking_grouped)




In [None]:
# Same structural overview as before, now for the cleaned and encoded frame.
print(df.shape, df.dtypes, df.head(), sep="\n")

In [None]:
numeric_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']


def _count_iqr_outliers(values):
    """Count entries lying outside the 1.5 * IQR fences of ``values``."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    return (values.lt(low_fence) | values.gt(high_fence)).sum()


# One outlier count per continuous column, in the order listed above
outlier_counts = {name: _count_iqr_outliers(df[name]) for name in numeric_cols}

print("Outliner number: ", outlier_counts)
print("Outliner proportion: ", {name: count/len(df)*100 for name, count in outlier_counts.items()})

In [None]:
# Predictor columns fed to the models, in the order the models will see them
feature_cols = [
    'gender',
    'smoking_history',
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]

# Design matrix and target vector
X = df[feature_cols]
y = df['diabetes']


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 hold-out split.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# exact same train/test partition (and therefore the same reported metrics).
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

# n_splits=1 yields exactly one (train, test) index pair, so take it directly
# instead of looping over the generator.
train_idx, test_idx = next(sss.split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Check the proportion: the mean of the target should be (nearly) identical
# in the full data and in both partitions if stratification worked.
print("Oringinal proportion: ", y.mean())
print("Training set proportion: ", y_train.mean())
print("Testing set proportion: ", y_test.mean())

In [None]:
# Report the (rows, columns) of each partition
train_shape, test_shape = X_train.shape, X_test.shape
print("Training set size: ", train_shape, "Testing set size: ", test_shape)

In [None]:

# Build the random-forest classifier.
# - random_state is fixed so the bootstrap samples and feature sub-sampling are
#   repeatable; with None every run trained a different forest and reported
#   different metrics.
# - class_weight makes an error on class 1 cost ten times an error on class 0.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=22,
    min_samples_leaf=8,
    random_state=42,
    n_jobs=-1,  # use all available cores
    class_weight={0: 1, 1: 10},
)

In [None]:
# Train the forest on the training partition only
rf.fit(X=X_train, y=y_train)

In [None]:
# Score the hold-out set: hard labels for the report, and the second
# predict_proba column (presumably the diabetes=1 class) for the AUC.
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1, followed by the ranking quality
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# Confusion matrix drawn as an annotated heatmap (rows: true, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Disabled experiment: the sweep below is wrapped in a triple-quoted string, so
# none of it executes. As written it would loop max_depth over 1..29, run
# 5-fold cross_val_score on the training split, and print the mean score for
# each depth.
# NOTE(review): the sweep uses class_weight='balanced', whereas the forest that
# is actually trained in this notebook uses {0: 1, 1: 10} - confirm which one
# the chosen max_depth was tuned for.
# NOTE(review): no `scoring` argument is passed, so the value printed as
# "accuracy" presumably comes from the estimator's default scorer - confirm.
"""
for depth in range(1, 30):

    rf = RandomForestClassifier(
    n_estimators = 1000,    
    max_depth = depth,    
    min_samples_leaf = 8,  
    random_state = None,
    n_jobs = -1 ,           
    class_weight = 'balanced')
    
    scores = cross_val_score(rf, X_train, y_train, cv=5) 
    print(f" Depth: {depth}, Average accuracy: {scores.mean():.4f}")
"""

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold


# Scale-then-classify pipeline. Because the scaler is a pipeline step, it is
# refit on the training portion of every cross-validation fold.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter grid: odd k from 3 to 13, both weighting schemes, and
# Minkowski p = 1 or 2.
param_grid = {
    'knn__n_neighbors': list(range(3, 14, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
}

# Seeded, shuffled 5-fold stratified CV; candidates are ranked by recall.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='recall',
    cv=skf,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best Parameter:", grid.best_params_)
print("Best recall rate:", grid.best_score_)

# Evaluate the winning pipeline on the untouched test partition
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)
y_proba = best_knn.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# Confusion matrix drawn as an annotated heatmap (rows: true, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()