In [8]:
import mlflow
import mlflow.xgboost
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import pandas as pd
import dagshub

In [3]:
dagshub.init(repo_owner='piyushshukla857', repo_name='diabetic_class', mlflow=True)

In [4]:
mlflow.set_tracking_uri('https://dagshub.com/piyushshukla857/diabetic_class.mlflow')

In [5]:
df = pd.read_csv('../data/external/dataset.csv')

In [21]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_encoded
0,80.0,0,1,25.19,6.6,140,0,True,False,False,0.0
1,54.0,0,0,27.32,6.6,80,0,True,False,False,5.0
2,28.0,0,0,27.32,5.7,158,0,False,True,False,0.0
3,36.0,0,0,23.45,5.0,155,0,True,False,False,3.0
4,76.0,1,1,20.14,4.8,155,0,False,True,False,3.0


In [24]:
df['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [6]:
gender_ohe = pd.get_dummies(df['gender'], prefix='gender')
df = pd.concat([df, gender_ohe], axis=1)
df.drop(columns=['gender'], inplace=True)

In [9]:
smoking_ordinal = OrdinalEncoder(categories=[['never', 'former', 'not current', 'current', 'ever', 'No Info']])
df['smoking_history_encoded'] = smoking_ordinal.fit_transform(df[['smoking_history']])
df.drop(columns=['smoking_history'], inplace=True)

In [10]:
df['bmi'].fillna(df['bmi'].median(), inplace=True)
df['HbA1c_level'].fillna(df['HbA1c_level'].median(), inplace=True)
df['blood_glucose_level'].fillna(df['blood_glucose_level'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['HbA1c_level'].fillna(df['HbA1c_level'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [11]:
X = df.drop(columns=['diabetes'])
y = df['diabetes']

In [12]:
mlflow.set_experiment("Hyperparameter for Xgboost")

2024/09/07 18:46:45 INFO mlflow.tracking.fluent: Experiment with name 'Hyperparameter for Xgboost' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/e718eba0c5b04d90ace123090fa5a802', creation_time=1725715006898, experiment_id='3', last_update_time=1725715006898, lifecycle_stage='active', name='Hyperparameter for Xgboost', tags={}>

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
scale_pos_weight = 91500 / 8500

In [30]:
def objective(trial):
    # Define the hyperparameters to tune
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', scale_pos_weight * 0.5, scale_pos_weight * 1.5),
        'objective': 'binary:logistic',
        'use_label_encoder': False
    }

    # Create the model with the parameters suggested by Optuna
    model = XGBClassifier(**params, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate recall (target metric)
    f1 = f1_score(y_test, y_pred)
    
    # Return the recall score to be maximized
    return f1


In [33]:
with mlflow.start_run(run_name="XGBoost with Optuna "):

    # Optimize hyperparameters using Optuna
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)  # Adjust the number of trials as needed

    # Log each trial's parameters and metrics
    for trial in study.trials:
        with mlflow.start_run(run_name=f"Trial {trial.number}", nested=True):
            trial_params = trial.params
            trial_value = trial.value
            
            # Log parameters and metrics for each trial
            mlflow.log_params(trial_params)
            mlflow.log_metric("f1", trial_value)
            
            print(f"Trial {trial.number} - Params: {trial_params}, f1: {trial_value}")

    # Get the best trial
    best_trial = study.best_trial
    print(f"Best trial: {best_trial.params}")

    # Train the model with the best hyperparameters
    best_model = XGBClassifier(**best_trial.params, random_state=42, use_label_encoder=False)
    best_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = best_model.predict(X_test)

    # Calculate and log evaluation metrics
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("recall", recall)  # Primary metric for optimization
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("f1_score", f1)

    # Log hyperparameters
    for param, value in best_trial.params.items():
        mlflow.log_param(param, value)

    # Log the model
    mlflow.xgboost.log_model(best_model, artifact_path="xgboost_model")

    

# Output the final metrics for verification
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

RestException: 403: You have reached the limit of 100 experiments for a private repository in the free plan.
Please upgrade to the Starter plan to increase the limit at https://dagshub.com/pricing?repo_id=52070