In [12]:
import sqlite3
import numpy as np
import pandas as pd

In [None]:
# Saving data to a dataframe from a SQLite database
conn = sqlite3.connect("../database/health_indicators.db")
diabetes_health_indicators_df = pd.read_sql("SELECT * FROM diabetes_health_indicators;", conn)
conn.close()

In [14]:
diabetes_health_indicators_df = diabetes_health_indicators_df.drop(['diabetes_stage'], axis=1) # Drop diabetes_stage as it's redundant with diagnosed_diabetes
diabetes_health_indicators_df.head()

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,239,41,160,145,136,236,6.36,8.18,29.6,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,116,55,50,30,93,150,2.0,5.63,23.0,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,213,66,99,36,118,195,5.07,7.51,44.7,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,171,50,79,140,139,253,5.28,9.03,38.2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,210,52,125,160,137,184,12.74,7.2,23.5,1


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables
encoded_df = diabetes_health_indicators_df.copy()
label_encoder = LabelEncoder()
categorical_cols = encoded_df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    encoded_df[col] = label_encoder.fit_transform(encoded_df[col])

# Selecting relevant columns
encoded_df = encoded_df[["hba1c", "glucose_postprandial", "glucose_fasting", "family_history_diabetes", "age", "diagnosed_diabetes"]]

encoded_df.head(10)

Unnamed: 0,hba1c,glucose_postprandial,glucose_fasting,family_history_diabetes,age,diagnosed_diabetes
0,8.18,236,136,0,58,1
1,5.63,150,93,0,48,0
2,7.51,195,118,1,60,1
3,9.03,253,139,0,74,1
4,7.2,184,137,0,46,1
5,6.03,133,100,0,46,0
6,5.24,100,101,0,75,0
7,7.04,189,110,0,62,1
8,6.9,172,116,0,42,1
9,4.99,109,76,0,59,0


In [16]:
X = encoded_df.drop('diagnosed_diabetes', axis=1)
y = encoded_df['diagnosed_diabetes']

In [17]:
from sklearn.model_selection import train_test_split  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
    
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
predictions = model.predict(X_test)
print("\nXGBoostClassifier Validation Accuracy: {:.4f}".format(accuracy_score(y_test, predictions)))
print("\nXGBoostClassifier Classification Report:\n", classification_report(y_test, predictions))


XGBoostClassifier Validation Accuracy: 0.9189

XGBoostClassifier Classification Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000



In [None]:
model.save_model('models/xgboost_model_v01.json')

In [21]:
y_test_array = np.ravel(y_test)  # Converts Series to 1D array

# Ensure X_test and predictions are NumPy arrays
X_test_np = np.array(X_test)
predictions_np = np.array(predictions)

In [22]:
X_test_df = pd.DataFrame(X_test_np, columns=[f"Feature_{i+1}" for i in range(X_test_np.shape[1])])

# Combine X_test, y_test, and predictions
results_df = X_test_df.copy()
results_df["True_Label"] = y_test_array
results_df["Prediction"] = predictions_np

results_df.head(50)

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,True_Label,Prediction
0,5.41,105.0,85.0,0.0,55.0,0,0
1,7.72,198.0,122.0,1.0,82.0,1,1
2,6.11,120.0,105.0,0.0,70.0,0,0
3,6.37,160.0,121.0,1.0,18.0,0,0
4,5.35,112.0,85.0,0.0,55.0,1,0
5,5.97,129.0,96.0,0.0,65.0,0,0
6,6.75,179.0,112.0,0.0,38.0,1,1
7,6.82,179.0,111.0,1.0,24.0,1,1
8,6.96,168.0,100.0,0.0,56.0,1,1
9,7.7,193.0,128.0,1.0,30.0,1,1
