In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import json
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import os
import optuna
import dagshub


In [3]:
dagshub.init(repo_owner='piyushshukla857', repo_name='diabetic_class', mlflow=True)

In [4]:
mlflow.set_tracking_uri('https://dagshub.com/piyushshukla857/diabetic_class.mlflow')

In [6]:

df = pd.read_csv('../data/external/dataset.csv')

In [7]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [8]:
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [9]:
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [10]:
gender_ohe = pd.get_dummies(df['gender'], prefix='gender')
df = pd.concat([df, gender_ohe], axis=1)
df.drop(columns=['gender'], inplace=True)

In [11]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other
0,80.0,0,1,never,25.19,6.6,140,0,True,False,False
1,54.0,0,0,No Info,27.32,6.6,80,0,True,False,False
2,28.0,0,0,never,27.32,5.7,158,0,False,True,False
3,36.0,0,0,current,23.45,5.0,155,0,True,False,False
4,76.0,1,1,current,20.14,4.8,155,0,False,True,False


In [14]:
smoking_ordinal = OrdinalEncoder(categories=[['never', 'former', 'not current', 'current', 'ever', 'No Info']])
df['smoking_history_encoded'] = smoking_ordinal.fit_transform(df[['smoking_history']])
df.drop(columns=['smoking_history'], inplace=True)

In [15]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_encoded
0,80.0,0,1,25.19,6.6,140,0,True,False,False,0.0
1,54.0,0,0,27.32,6.6,80,0,True,False,False,5.0
2,28.0,0,0,27.32,5.7,158,0,False,True,False,0.0
3,36.0,0,0,23.45,5.0,155,0,True,False,False,3.0
4,76.0,1,1,20.14,4.8,155,0,False,True,False,3.0


In [16]:
df['bmi'].fillna(df['bmi'].median(), inplace=True)
df['HbA1c_level'].fillna(df['HbA1c_level'].median(), inplace=True)
df['blood_glucose_level'].fillna(df['blood_glucose_level'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['HbA1c_level'].fillna(df['HbA1c_level'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [17]:
X = df.drop(columns=['diabetes'])
y = df['diabetes']

In [19]:
mlflow.set_experiment("Diabetes Prediction")

2024/09/07 18:17:40 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/8e5c4d14a7a7403b91d1cbb5a0e0164f', creation_time=1725713262104, experiment_id='2', last_update_time=1725713262104, lifecycle_stage='active', name='Diabetes Prediction', tags={}>

In [22]:
algorithms = {
    'LogisticRegression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [26]:
with mlflow.start_run(run_name="All Experiments ") as parent_run:
    # Loop through algorithms
    for algo_name, algorithm in algorithms.items():
        with mlflow.start_run(run_name=f"{algo_name}", nested=True) as child_run:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Log preprocessing parameters
            mlflow.log_param("algorithm", algo_name)
            mlflow.log_param("test_size", 0.2)
            
            # Model training
            model = algorithm
            model.fit(X_train, y_train)
            
            # Log model parameters
            if algo_name == 'LogisticRegression':
                mlflow.log_param("C", model.C)
            elif algo_name == 'XGBoost':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("learning_rate", model.learning_rate)
            elif algo_name == 'RandomForest':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("max_depth", model.max_depth)
            elif algo_name == 'GradientBoosting':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("learning_rate", model.learning_rate)
                mlflow.log_param("max_depth", model.max_depth)
            
            # Model evaluation
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            f1 = f1_score(y_test, y_pred, zero_division=1)
            
            # Log evaluation metrics
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)
            
            # Log model
            mlflow.sklearn.log_model(model, "model")
            
            # Print the results for verification
            print(f"Algorithm: {algo_name}")
            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Algorithm: LogisticRegression
Accuracy: 0.9571
Precision: 0.8477905073649754
Recall: 0.6065573770491803
F1 Score: 0.7071672354948806


2024/09/07 18:33:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2/runs/e499b8be4176489c8ad5db11347ca493.
2024/09/07 18:33:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2.


Algorithm: XGBoost
Accuracy: 0.9714
Precision: 0.9558587479935795
Recall: 0.6973067915690867
F1 Score: 0.8063642518618822


2024/09/07 18:33:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2/runs/9e2ca1480c544940a067ffdf9a1a8696.
2024/09/07 18:33:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2.


Algorithm: RandomForest
Accuracy: 0.97025
Precision: 0.9441340782122905
Recall: 0.6926229508196722
F1 Score: 0.7990543735224587


2024/09/07 18:33:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2/runs/1dc6c3e3ddea4955821a30f9b05ceeab.
2024/09/07 18:33:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2.


Algorithm: GradientBoosting
Accuracy: 0.9724
Precision: 0.9865319865319865
Recall: 0.6861826697892272
F1 Score: 0.8093922651933702


2024/09/07 18:34:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2/runs/b67b6f2d44244d079e6cebb748795101.
2024/09/07 18:34:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2.
2024/09/07 18:34:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run All Experiments  at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2/runs/c6f32d91897d4182b8023c39aa8309ee.
2024/09/07 18:34:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/piyushshukla857/diabetic_class.mlflow/#/experiments/2.


In [25]:
print(mlflow.get_tracking_uri())

https://dagshub.com/piyushshukla857/diabetic_class.mlflow
