In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd

# Load the scikit-learn diabetes dataset and assemble a single frame:
# one column per feature name, plus a trailing 'target' column.
data = load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Plain-text preview: the first rows, then per-column summary statistics.
print(df.head(), df.describe(), sep="\n")

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  
                age           sex           bmi            bp            s1  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean  -2.511817e-19  1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-0

In [3]:
from sklearn.model_selection import train_test_split

# Separate the response column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, y_test.shape)

(353, 10) (89,)


In [4]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.21.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.2 (from mlflow)
  Downloading mlflow_skinny-2.21.2-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.2->mlflow)
  Downloading databricks_sdk-0.49.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.2->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.2->mlflow)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 k

In [8]:
import mlflow
import mlflow.sklearn

# Point MLflow at a file-based store under /content/mlruns, then select the
# experiment that subsequent runs are recorded under. The set_experiment call
# stays last so the cell displays the returned Experiment object.
mlflow.set_tracking_uri(uri="file:/content/mlruns")
mlflow.set_experiment(experiment_name="diabetes_prediction")

2025/03/29 06:01:42 INFO mlflow.tracking.fluent: Experiment with name 'diabetes_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/905714477083410507', creation_time=1743228102194, experiment_id='905714477083410507', last_update_time=1743228102194, lifecycle_stage='active', name='diabetes_prediction', tags={}>

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Base estimator; the seed makes the boosting procedure repeatable.
gbr = GradientBoostingRegressor(random_state=42)

# Candidate values explored by the search (2 x 2 x 2 = 8 combinations).
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
}

# Exhaustive search with 5-fold cross-validation, ranked by negative MSE.
# n_jobs=-1 asks scikit-learn to run the fits in parallel.
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and report the configuration that produced it.
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Score the selected model on the held-out test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Test MSE:", mse)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Test MSE: 2849.6162853192623


In [10]:
# Record the tuning outcome in one MLflow run: the selected hyperparameters,
# the test-set MSE computed earlier, and the fitted estimator itself.
with mlflow.start_run():
    # Hyperparameters chosen by the grid search
    mlflow.log_params(params=grid_search.best_params_)

    # Held-out error, stored under the metric key "mse"
    mlflow.log_metric(key="mse", value=mse)

    # Fitted scikit-learn estimator, stored under the artifact path "model"
    mlflow.sklearn.log_model(sk_model=best_model, artifact_path="model")

    print("Experiment logged to MLflow!")



Experiment logged to MLflow!


In [11]:
import joblib

# Persist the selected estimator to a compressed pickle (compression level 3)
# in the current working directory.
joblib.dump(value=best_model, filename='diabetes_model_gbr.pkl', compress=3)
print("Model saved!")

Model saved!
