In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from imblearn.over_sampling import SVMSMOTE

In [3]:
def load_and_clean_data(filepath, split=False):
    df = pd.read_csv(filepath)
    
    # Remove duplicates
    df = df.drop_duplicates()

    scaler = StandardScaler()
    df[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level',]] = scaler.fit_transform(df[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level',]])

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)

    # Convert boolean columns to integers
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)
    

    if split:
        x = df.drop("diabetes", axis=1)
        y = df["diabetes"]

        # 80/10/10 split
        X_temp, X_test, y_temp, y_test = train_test_split(
            x, y, test_size=0.10, stratify=y, random_state=42
        )

        val_ratio = 0.10 / 0.90  
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_ratio, stratify=y_temp, random_state=42
        )

        #over-sampling the training data
        svmsmote = SVMSMOTE(random_state=42)
        X_train , y_train  = svmsmote.fit_resample(X_train, y_train)
        return X_train, X_val, X_test, y_train, y_val, y_test

    return df

In [4]:
diabetes_df = load_and_clean_data("diabetes_prediction_dataset.csv", False)
x_train, x_val, x_test, y_train, y_val, y_test = load_and_clean_data("diabetes_prediction_dataset.csv", True)

In [5]:
xg_classifier = XGBClassifier(random_state = 42, eval_metric='logloss')
params = {
   'n_estimators': [20, 50, 100, 150],
   'max_depth': [2, 6, 8, 12],
   'learning_rate': [0.01, 0.1, 0.05]
}

xg_classifier.fit(x_train, y_train)

grid = GridSearchCV(xg_classifier, params, cv=5, scoring='f1',)
grid.fit(x_val, y_val)
best_model = grid.best_estimator_
y_pred = best_model.predict(x_test)
report = classification_report(y_test, y_pred)

In [6]:
print(report)
print(grid.best_params_)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      8767
           1       0.98      0.68      0.81       848

    accuracy                           0.97      9615
   macro avg       0.98      0.84      0.89      9615
weighted avg       0.97      0.97      0.97      9615

{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 50}
