In [52]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [53]:
df = pd.read_csv('diabetes.csv')

In [54]:
df.head()

Unnamed: 0,Target,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Ethnicity,Socioeconomic Factors,Smoking Status,Alcohol Consumption,Glucose Tolerance Test,History of PCOS,Previous Gestational Diabetes,Pregnancy History,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,Steroid-Induced Diabetes,Positive,Negative,No,Present,40,44,38,High,Healthy,124,201,50,168,Low Risk,Medium,Smoker,High,Normal,No,No,Normal,18,36,76,No,No,Positive,3,Normal,56,Ketones Present,2629,No
1,Neonatal Diabetes Mellitus (NDM),Positive,Negative,No,Present,13,1,17,High,Healthy,73,121,24,178,Low Risk,High,Non-Smoker,Moderate,Normal,Yes,No,Normal,8,26,60,Yes,No,Negative,1,Normal,28,Glucose Present,1881,Yes
2,Prediabetic,Positive,Positive,Yes,Present,27,36,24,High,Unhealthy,121,185,36,105,Low Risk,Medium,Smoker,High,Abnormal,Yes,No,Normal,15,56,80,Yes,No,Negative,1,Abnormal,55,Ketones Present,3622,Yes
3,Type 1 Diabetes,Negative,Positive,No,Present,8,7,16,Low,Unhealthy,100,151,29,121,Low Risk,High,Smoker,Moderate,Abnormal,No,Yes,Normal,12,49,89,Yes,No,Positive,2,Abnormal,60,Ketones Present,3542,No
4,Wolfram Syndrome,Negative,Negative,Yes,Present,17,10,17,High,Healthy,103,146,33,289,Low Risk,Low,Smoker,Moderate,Normal,No,Yes,Complications,2,10,41,No,No,Positive,1,Normal,24,Protein Present,1770,No


In [55]:
pd.set_option('display.max_columns', None)


In [56]:
nan_percentage = df.isna().sum() / df.count() * 100

nan_count = df.isna().sum()

nan_table = pd.concat([nan_count, nan_percentage], axis=1)

nan_table.columns = ['Count', 'Percentage']
nan_table

Unnamed: 0,Count,Percentage
Target,0,0.0
Genetic Markers,0,0.0
Autoantibodies,0,0.0
Family History,0,0.0
Environmental Factors,0,0.0
Insulin Levels,0,0.0
Age,0,0.0
BMI,0,0.0
Physical Activity,0,0.0
Dietary Habits,0,0.0


In [57]:
target_mapping = {
  'Cystic Fibrosis-Related Diabetes (CFRD)': 0,
  'Gestational Diabetes': 1,
  'LADA': 2,
  'MODY': 3,
  'Neonatal Diabetes Mellitus (NDM)': 4,
  'Prediabetic': 5,
  'Secondary Diabetes': 6,
  'Steroid-Induced Diabetes': 7,
  'Type 1 Diabetes': 8,
  'Type 2 Diabetes': 9,
  'Type 3c Diabetes (Pancreatogenic Diabetes)': 10,
  'Wolcott-Rallison Syndrome': 11,
  'Wolfram Syndrome': 12
}
genetic_markers_mapping = {'Negative':0,'Positive':1}
autoantibodies_mapping = {'Negative':0,'Positive':1}
family_history_mapping = {'No':0, 'Yes':1}
environmental_factors_mapping = {'Absent':0, 'Present':1}
physical_activity_mapping = {'Low':0,'Moderate':1,'High':2}
dietary_habits_mapping = {'Unhealthy':0, 'Healthy':1}
ethnicity_mapping = {'Low Risk':0, 'High Risk':1}
socioeconomic_factors_mapping = {'Low':0, 'Medium':1,'High':2}
smoking_status_mapping = {'Non-Smoker':0,'Smoker':1}
alcohol_consumption_mapping = {'Low':0,'Moderate':1,'High':2}
glucose_tolerance_test_mapping = {'Normal':0, 'Abnormal': 1}
history_of_pcos_mapping = {'No':0, 'Yes':1}
previous_gestational_diabetes = {'No':0, 'Yes': 1}
pregnancy_history_mapping = {'Normal': 0, 'Complications': 1}
cystic_fibrosis_diagnosis_mapping = {'No': 0, 'Yes': 1}
steroid_use_history_mapping = {'No': 0, 'Yes': 1}
genetic_testing_mapping = {'Negative':0,'Positive':1}
liver_function_tests_mapping = {'Normal':0, 'Abnormal': 1}
urine_test = {'Normal': 0, 'Protein Present': 1, 'Ketones Present': 2, 'Glucose Present': 3}
early_onset_symptoms_mapping = {'No': 0, 'Yes': 1}


In [58]:
df['Target'] = df['Target'].map(target_mapping)
df['Genetic Markers']=df['Genetic Markers'].map(genetic_markers_mapping)
df['Autoantibodies']=df['Autoantibodies'].map(autoantibodies_mapping)
df['Family History']=df['Family History'].map(family_history_mapping)
df['Environmental Factors']=df['Environmental Factors'].map(environmental_factors_mapping)
df['Physical Activity']=df['Physical Activity'].map(physical_activity_mapping)
df['Dietary Habits']=df['Dietary Habits'].map(dietary_habits_mapping)
df['Ethnicity'] = df['Ethnicity'].map(ethnicity_mapping)
df['Socioeconomic Factors'] = df['Socioeconomic Factors'].map(socioeconomic_factors_mapping)
df['Smoking Status'] = df['Smoking Status'].map(smoking_status_mapping)
df['Alcohol Consumption'] = df['Alcohol Consumption'].map(alcohol_consumption_mapping)
df['Glucose Tolerance Test'] = df['Glucose Tolerance Test'].map(glucose_tolerance_test_mapping)
df['History of PCOS'] = df['History of PCOS'].map(history_of_pcos_mapping)
df['Previous Gestational Diabetes'] = df['Previous Gestational Diabetes'].map(previous_gestational_diabetes)
df['Pregnancy History'] = df['Pregnancy History'].map(pregnancy_history_mapping)
df['Cystic Fibrosis Diagnosis'] = df['Cystic Fibrosis Diagnosis'].map(cystic_fibrosis_diagnosis_mapping)
df['Steroid Use History'] = df['Steroid Use History'].map(steroid_use_history_mapping)
df['Genetic Testing'] = df['Genetic Testing'].map(genetic_testing_mapping)
df['Liver Function Tests'] = df['Liver Function Tests'].map(liver_function_tests_mapping)
df['Urine Test'] = df['Urine Test'].map(urine_test)
df['Early Onset Symptoms'] = df['Early Onset Symptoms'].map(early_onset_symptoms_mapping)

In [59]:
df.head()

Unnamed: 0,Target,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Ethnicity,Socioeconomic Factors,Smoking Status,Alcohol Consumption,Glucose Tolerance Test,History of PCOS,Previous Gestational Diabetes,Pregnancy History,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,7,1,0,0,1,40,44,38,2,1,124,201,50,168,0,1,1,2,0,0,0,0,18,36,76,0,0,1,3,0,56,2,2629,0
1,4,1,0,0,1,13,1,17,2,1,73,121,24,178,0,2,0,1,0,1,0,0,8,26,60,1,0,0,1,0,28,3,1881,1
2,5,1,1,1,1,27,36,24,2,0,121,185,36,105,0,1,1,2,1,1,0,0,15,56,80,1,0,0,1,1,55,2,3622,1
3,8,0,1,0,1,8,7,16,0,0,100,151,29,121,0,2,1,1,1,0,1,0,12,49,89,1,0,1,2,1,60,2,3542,0
4,12,0,0,1,1,17,10,17,2,1,103,146,33,289,0,0,1,1,0,0,1,1,2,10,41,0,0,1,1,0,24,1,1770,0


In [60]:
ss = StandardScaler()
df[['Insulin Levels','Age', 'BMI', 'Blood Pressure', 'Cholesterol Levels', 'Waist Circumference', 'Blood Glucose Levels', 'Weight Gain During Pregnancy', 'Pancreatic Health', 'Pulmonary Function', 'Digestive Enzyme Levels','Birth Weight']] = ss.fit_transform(df[['Insulin Levels','Age', 'BMI', 'Blood Pressure', 'Cholesterol Levels', 'Waist Circumference', 'Blood Glucose Levels', 'Weight Gain During Pregnancy', 'Pancreatic Health', 'Pulmonary Function', 'Digestive Enzyme Levels','Birth Weight']])


In [61]:
df.head()

Unnamed: 0,Target,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,Blood Pressure,Cholesterol Levels,Waist Circumference,Blood Glucose Levels,Ethnicity,Socioeconomic Factors,Smoking Status,Alcohol Consumption,Glucose Tolerance Test,History of PCOS,Previous Gestational Diabetes,Pregnancy History,Weight Gain During Pregnancy,Pancreatic Health,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,7,1,0,0,1,1.705261,0.569277,2.197644,2,1,0.634773,0.137716,2.197183,0.151527,0,1,1,2,0,0,0,0,0.259896,-0.578659,0.479322,0,0,1,3,0,0.494018,2,-0.655702,0
1,4,1,0,0,1,-0.798037,-1.474156,-1.294096,2,1,-1.922277,-1.658739,-1.624429,0.359146,0,2,0,1,0,1,0,0,-0.778199,-1.079046,-0.857855,1,0,0,1,0,-0.949955,3,-1.703567,1
2,5,1,1,1,1,0.499969,0.189103,-0.130183,2,0,0.484358,-0.221575,0.139392,-1.156471,0,1,1,2,1,1,0,0,-0.051533,0.422114,0.813616,1,0,0,1,1,0.442447,2,0.735381,1
3,8,0,1,0,1,-1.26161,-1.189026,-1.460369,0,0,-0.568545,-0.985068,-0.889503,-0.824281,0,2,1,1,1,0,1,0,-0.362961,0.071843,1.565777,1,0,1,2,1,0.700299,2,0.62331,0
4,12,0,0,1,1,-0.427178,-1.046461,-1.294096,2,1,-0.41813,-1.097347,-0.301563,2.663714,0,0,1,1,0,0,1,1,-1.401056,-1.879665,-2.445751,0,0,1,1,0,-1.156237,1,-1.859066,0


In [62]:
y = df['Target']
x = df.drop('Target', axis=1)

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, random_state=10)

In [64]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(), 
    'Naive Bayes': GaussianNB(),       
    'MLP Neural Network': MLPClassifier() 
}

In [65]:
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      1028
           1       0.69      0.71      0.70      1034
           2       0.82      0.80      0.81      1079
           3       0.79      0.75      0.77      1121
           4       0.99      1.00      0.99      1095
           5       0.78      0.78      0.78      1103
           6       0.62      0.60      0.61      1122
           7       0.57      0.54      0.55      1059
           8       0.83      0.87      0.85      1100
           9       0.66      0.68      0.67      1062
          10       0.60      0.62      0.61      1058
          11       0.86      0.83      0.85      1094
          12       0.83      0.87      0.85      1045

    accuracy                           0.75     14000
   macro avg       0.75      0.75      0.75     14000
weighted avg       0.75      0.75      0.75     14000

Training Decision Tree...
Decision Tree Accu



AdaBoost Accuracy: 0.23
              precision    recall  f1-score   support

           0       0.10      1.00      0.17      1028
           1       0.00      0.00      0.00      1034
           2       0.00      0.00      0.00      1079
           3       0.00      0.00      0.00      1121
           4       1.00      1.00      1.00      1095
           5       0.00      0.00      0.00      1103
           6       0.00      0.00      0.00      1122
           7       0.00      0.00      0.00      1059
           8       0.00      0.00      0.00      1100
           9       0.00      0.00      0.00      1062
          10       0.00      0.00      0.00      1058
          11       0.51      1.00      0.68      1094
          12       0.00      0.00      0.00      1045

    accuracy                           0.23     14000
   macro avg       0.12      0.23      0.14     14000
weighted avg       0.13      0.23      0.14     14000

Training Naive Bayes...
Naive Bayes Accuracy: 0.82
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MLP Neural Network Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.92      0.87      0.89      1028
           1       0.82      0.87      0.85      1034
           2       0.94      0.86      0.90      1079
           3       0.87      0.85      0.86      1121
           4       1.00      1.00      1.00      1095
           5       0.93      0.96      0.95      1103
           6       0.73      0.72      0.72      1122
           7       0.76      0.73      0.75      1059
           8       0.87      0.92      0.89      1100
           9       0.80      0.73      0.76      1062
          10       0.74      0.84      0.79      1058
          11       0.87      0.90      0.88      1094
          12       0.89      0.87      0.88      1045

    accuracy                           0.86     14000
   macro avg       0.86      0.85      0.85     14000
weighted avg       0.86      0.86      0.85     14000





In [66]:
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.75
Decision Tree Accuracy: 0.86
Random Forest Accuracy: 0.90
SVM Accuracy: 0.80
KNN Accuracy: 0.65
Gradient Boosting Accuracy: 0.90
XGBoost Accuracy: 0.90




AdaBoost Accuracy: 0.23
Naive Bayes Accuracy: 0.82
MLP Neural Network Accuracy: 0.86




In [67]:
results = {}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[name] = rmse
    print(f'{name}: RMSE = {rmse}')

results_df = pd.DataFrame(list(results.items()), columns=['Model', 'RMSE'])
print(results_df.sort_values(by='RMSE'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: RMSE = 1.5856274107476465
Decision Tree: RMSE = 1.1128470566203477
Random Forest: RMSE = 0.9151502608861564
SVM: RMSE = 1.4018355314168838
KNN: RMSE = 2.010667977137378
Gradient Boosting: RMSE = 0.9272463072375723
XGBoost: RMSE = 0.9300921613321061




AdaBoost: RMSE = 5.3356215329478855
Naive Bayes: RMSE = 1.2870231433150576
MLP Neural Network: RMSE = 1.1793157822592362
                 Model      RMSE
2        Random Forest  0.915150
5    Gradient Boosting  0.927246
6              XGBoost  0.930092
1        Decision Tree  1.112847
9   MLP Neural Network  1.179316
8          Naive Bayes  1.287023
3                  SVM  1.401836
0  Logistic Regression  1.585627
4                  KNN  2.010668
7             AdaBoost  5.335622




In [68]:
# Dictionary to store accuracies
accuracies = {}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy

# Create a bar plot for accuracies
plt.figure(figsize=(10, 6))
plt.barh(list(accuracies.keys()), list(accuracies.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Models')
plt.xlim(0, 1)
plt.gca().invert_yaxis() 
plt.show()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
