In [12]:
import kagglehub
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU
from keras.models import load_model
import joblib

In [13]:
# Download the dataset
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
dataset_path = os.path.join(path, "diabetes_binary_health_indicators_BRFSS2015.csv")


In [14]:
# Load dataset
df = pd.read_csv(dataset_path)

In [15]:
# Handle missing values more robustly
if df.isnull().values.any():
    # For binary/categorical features, fill with mode
    binary_cols = [col for col in df.columns if df[col].nunique() == 2]
    for col in binary_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    # For numerical features, fill with median (more robust to outliers)
    numerical_cols = [col for col in df.columns if col not in binary_cols and col != 'Diabetes_012']
    for col in numerical_cols:
        df[col].fillna(df[col].median(), inplace=True)

In [16]:
from imblearn.over_sampling import SMOTE
# Encode categorical features
categorical_columns = df.select_dtypes(include=['object']).columns
if not categorical_columns.empty:
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


# Split features and labels
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']
# Convert the target variable y to integer type for chi2





In [17]:
# Feature selection
selector = SelectKBest(score_func=chi2, k=12)
X_new = selector.fit_transform(X, y)
selected_columns = selector.get_support(indices=True)
important_features = X.columns[selected_columns].tolist()
X_selected = pd.DataFrame(X_new, columns=important_features)



In [18]:

# Normalize the data using Min-Max scaling
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_selected)

# Convert the scaled data back to a pandas DataFrame (optional, but can be helpful)
X_scaled_df = pd.DataFrame(X_scaled, columns=important_features)

print("Data normalized using Min-Max scaling.")
print(X_scaled_df.head())

Data normalized using Min-Max scaling.
   HighBP  HighChol       BMI  Stroke  HeartDiseaseorAttack  PhysActivity  \
0     1.0       1.0  0.325581     0.0                   0.0           0.0   
1     0.0       0.0  0.151163     0.0                   0.0           1.0   
2     1.0       1.0  0.186047     0.0                   0.0           0.0   
3     1.0       0.0  0.174419     0.0                   0.0           1.0   
4     1.0       1.0  0.139535     0.0                   0.0           1.0   

   GenHlth  MentHlth  PhysHlth  DiffWalk       Age    Income  
0     1.00       0.6       0.5       1.0  0.666667  0.285714  
1     0.50       0.0       0.0       0.0  0.500000  0.000000  
2     1.00       1.0       1.0       1.0  0.666667  1.000000  
3     0.25       0.0       0.0       0.0  0.833333  0.714286  
4     0.25       0.1       0.0       0.0  0.833333  0.428571  


In [19]:
# Handle class imbalance
nm = NearMiss(version=1, n_neighbors=10)
X_resampled, y_resampled = nm.fit_resample(X_scaled, y)



In [20]:
# Final train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [21]:
# Train XGBoost with best parameters
model = XGBClassifier(

    colsample_bytree=1,
    gamma=0,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=300,
    reg_alpha=0.1,
    reg_lambda=1,
    subsample=0.8
)

model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nXGBoost Accuracy after tuning: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


XGBoost Accuracy after tuning: 90.78%

Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91      7090
         1.0       0.96      0.85      0.90      7049

    accuracy                           0.91     14139
   macro avg       0.91      0.91      0.91     14139
weighted avg       0.91      0.91      0.91     14139



In [25]:
# After model training, save both model and scaler
import joblib
import pickle
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to save the model
model_folder_path = '/content/drive/My Drive/GP/Diabetes'



# Create the directory if it doesn't exist (optional but recommended)
os.makedirs(model_folder_path, exist_ok=True)

# Save the model to Google Drive
joblib.dump(model, os.path.join(model_folder_path, 'diabetes_model.pkl'))

# Save the scaler to Google Drive
joblib.dump(scaler, os.path.join(model_folder_path, 'diabetes_scaler.pkl'))

# Also save the important features list to Google Drive
with open(os.path.join(model_folder_path, 'diabetes_features.pkl'), 'wb') as f:
    pickle.dump(important_features, f)

print(f"Model, scaler, and features saved successfully to {model_folder_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model, scaler, and features saved successfully to /content/drive/My Drive/GP/Diabetes


In [None]:

import pickle
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to save the model
model_folder_path = '/content/drive/My Drive/GP/Diabetes'
model_file_path = os.path.join(model_folder_path, 'new_xgboost_model_v2.pkl')

# Create the directories if they don't exist
os.makedirs(model_folder_path, exist_ok=True)

# Save the trained model using pickle
with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)

print(f"Model saved successfully to {model_file_path}")

In [None]:

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 0.1, 0.5]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'), # Added eval_metric to suppress warning
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)


Fitting 3 folds for each of 6561 candidates, totalling 19683 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 0.8}
Best accuracy found:  0.9192438951072445
