In [18]:
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from hyperopt.pyll import scope 

from sklearn.datasets import fetch_california_housing
# Load the California housing data and hold out 20% of the rows for validation.
data = fetch_california_housing()
X = data.data
y = data.target
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Wrap both splits in DMatrix objects; `objective` below reads them as globals.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

# Record runs in a local SQLite-backed MLflow store.
# NOTE(review): the experiment is named "Boston ..." although the data loaded
# above is the California housing set.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Boston House Price Regression")

# Objective function for hyperopt
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Parameters
    ----------
    params : dict
        Booster parameters sampled by hyperopt. They are logged with
        ``mlflow.log_params`` and forwarded unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}``, where ``rmse`` is the root
        mean squared error of the booster on the validation split.

    Notes
    -----
    Reads the module-level ``train`` and ``valid`` DMatrix objects and the
    ``y_val`` labels; each call opens (and closes) its own MLflow run.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Train with early stopping monitored on the validation split.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        # Predict and evaluate performance.
        y_pred = booster.predict(valid)
        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error is deprecated in recent scikit-learn releases
        # and no longer accepted by the newest ones.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)

        # Log the metric.
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Hyperparameter search space; every entry is passed to xgb.train by `objective`.
space = dict(
    # scope.int casts the sampled depth to an integer.
    max_depth=scope.int(hp.quniform('max_depth', 3, 10, 1)),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.0),
    reg_alpha=hp.uniform('reg_alpha', 0, 10),
    reg_lambda=hp.uniform('reg_lambda', 0, 10),
    min_child_weight=hp.uniform('min_child_weight', 1, 10),
    objective='reg:squarederror',
)

# Run 50 TPE-guided evaluations of `objective`, recording each one in `trials`.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

print("Best hyperparameters:", best)


[0]	validation-rmse:1.08604                           
[1]	validation-rmse:1.04452                           
[2]	validation-rmse:0.99941                           
[3]	validation-rmse:0.94575                           
[4]	validation-rmse:0.89985                           
[5]	validation-rmse:0.86046                           
[6]	validation-rmse:0.82472                           
[7]	validation-rmse:0.80129                           
[8]	validation-rmse:0.76922                           
[9]	validation-rmse:0.75132                           
[10]	validation-rmse:0.73595                          
[11]	validation-rmse:0.71034                          
[12]	validation-rmse:0.68882                          
[13]	validation-rmse:0.67859                          
[14]	validation-rmse:0.66757                          
[15]	validation-rmse:0.65213                          
[16]	validation-rmse:0.63925                          
[17]	validation-rmse:0.62884                          
[18]	valid

In [7]:
import pandas as pd

In [24]:
import os
import pickle

import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




# Training the best one
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Boston House Price Regression - New")

# Autologging is disabled: the parameters, the metric and both artifacts are
# logged explicitly inside the run below.
mlflow.xgboost.autolog(disable=True)
with mlflow.start_run():

    # Load the California housing data as a DataFrame and hold out 20% of the
    # rows for validation.
    data_dct = fetch_california_housing()
    X = pd.DataFrame(data_dct.data, columns = data_dct.feature_names)
    y = data_dct.target
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the data (scaler is fitted on the training split only).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Convert to DMatrix
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Hard-coded hyperparameters used for this final training run.
    best = {'colsample_bytree': 0.5635458887594514,
    'learning_rate': 0.06921920192195667,
    'max_depth': 9,
    'min_child_weight': 9.983077851595704,
    'reg_alpha': 5.75023436140963,
    'reg_lambda': 7.637834865645946,
    'subsample': 0.9480385455129612}

    mlflow.log_params(best)

    booster = xgb.train(
        params=best,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in recent scikit-learn releases and
    # no longer accepted by the newest ones.
    rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
    # NOTE: the key is upper-case "RMSE" here; the cell that prints the runs
    # of this experiment reads it under the same spelling.
    mlflow.log_metric("RMSE", rmse)

    # Persist the fitted scaler so inference can reuse the same scaling.
    # Create the target directory first; open(..., "wb") does not create it.
    preprocessor_path = "../models/preprocessor1.b"
    os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
    with open(preprocessor_path, "wb") as f:
        pickle.dump(scaler, f)

    # Attach both the scaler and the booster to the run.
    mlflow.log_artifact(preprocessor_path, artifact_path="Preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="manual_model_mlflow")

[0]	validation-rmse:1.10779
[1]	validation-rmse:1.08875
[2]	validation-rmse:1.06414
[3]	validation-rmse:1.02494
[4]	validation-rmse:0.97942
[5]	validation-rmse:0.93908
[6]	validation-rmse:0.90713
[7]	validation-rmse:0.88423
[8]	validation-rmse:0.85658
[9]	validation-rmse:0.83744
[10]	validation-rmse:0.82035
[11]	validation-rmse:0.79183
[12]	validation-rmse:0.77079
[13]	validation-rmse:0.75680
[14]	validation-rmse:0.74646
[15]	validation-rmse:0.72503
[16]	validation-rmse:0.70855
[17]	validation-rmse:0.69902
[18]	validation-rmse:0.68403
[19]	validation-rmse:0.67157
[20]	validation-rmse:0.65459
[21]	validation-rmse:0.64459
[22]	validation-rmse:0.63544
[23]	validation-rmse:0.62805
[24]	validation-rmse:0.62024
[25]	validation-rmse:0.61505
[26]	validation-rmse:0.60882
[27]	validation-rmse:0.60061
[28]	validation-rmse:0.58824
[29]	validation-rmse:0.58506
[30]	validation-rmse:0.58201
[31]	validation-rmse:0.57829
[32]	validation-rmse:0.57449
[33]	validation-rmse:0.56392
[34]	validation-rmse:0.5



In [25]:
from mlflow.tracking import MlflowClient

# Same SQLite tracking store that the training cells write to.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# Client used by the following cells to query runs and manage the registry.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [26]:
# List the experiments in the tracking store (displayed as the cell output).
client.search_experiments()

[<Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/10', creation_time=1726110389175, experiment_id='10', last_update_time=1726110389175, lifecycle_stage='active', name='Boston House Price Regression - New', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/9', creation_time=1726032883723, experiment_id='9', last_update_time=1726032883723, lifecycle_stage='active', name='Sample Regression New', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/8', creation_time=1726032865476, experiment_id='8', last_update_time=1726032865476, lifecycle_stage='active', name='Sample Regression', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/7', creation_time=1725767717848, experiment_id='7', last_update_time=17257

In [27]:
from mlflow.entities import ViewType

# Fetch the best runs of experiment '10'.
runs = client.search_runs(
    experiment_ids='10',  # Experiment to search.
    filter_string="",  # Empty filter: consider every run.
    run_view_type=ViewType.ACTIVE_ONLY,  # Only active (non-deleted) runs.
    max_results=5,  # Return at most five runs.
    # Order by the key the metric is actually stored under: the training cell
    # logs it as "RMSE" and the next cell reads run.data.metrics['RMSE'], so
    # ordering by "metrics.rmse" would not refer to the logged metric.
    order_by=["metrics.RMSE ASC"]
)

In [29]:
# Print the id and the logged "RMSE" metric of every run returned above.
for run in runs:
    print(f"run id : {run.info.run_id}, rmse : {run.data.metrics['RMSE']}")

run id : 172120f5e7a04f8bbba287d76de17f54, rmse : 0.43167498843750596


In [30]:
# Promote the models to model registry
import mlflow

# Point the fluent API at the same store the client uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Run id copied by hand from the search output above.
run_id = "172120f5e7a04f8bbba287d76de17f54"
# NOTE(review): this URI points at the run's "model" artifact path, while the
# training cell logs the booster with artifact_path="manual_model_mlflow".
# Confirm which path the run really contains before re-running this cell.
model_uri = f"runs:/{run_id}/model"
# Registers the artifact as a version of the "CaliRegressionBest" model.
mlflow.register_model(model_uri=model_uri, name="CaliRegressionBest")

Successfully registered model 'CaliRegressionBest'.
Created version '1' of model 'CaliRegressionBest'.


<ModelVersion: aliases=[], creation_timestamp=1726110653783, current_stage='None', description=None, last_updated_timestamp=1726110653783, name='CaliRegressionBest', run_id='172120f5e7a04f8bbba287d76de17f54', run_link=None, source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/10/172120f5e7a04f8bbba287d76de17f54/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [31]:
# Move version 1 of "CaliRegressionBest" to the "Staging" stage.
# archive_existing_versions=False is passed so other versions are not archived
# as part of this call.
client.transition_model_version_stage(
    name='CaliRegressionBest',
    version=1,
    stage='Staging',
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1726110653783, current_stage='Staging', description=None, last_updated_timestamp=1726110709391, name='CaliRegressionBest', run_id='172120f5e7a04f8bbba287d76de17f54', run_link=None, source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/10/172120f5e7a04f8bbba287d76de17f54/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [74]:
import pickle
import pandas as pd
import numpy as np
import mlflow.pyfunc
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def load_data(data_dct):
    """Split a dataset bundle into a feature DataFrame and its target.

    Parameters
    ----------
    data_dct : object
        Must expose ``data``, ``feature_names`` and ``target`` attributes
        (e.g. the value returned by ``fetch_california_housing()``).

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a DataFrame built from ``data_dct.data`` with
        ``data_dct.feature_names`` as columns; ``y`` is ``data_dct.target``
        returned unchanged.
    """
    features = pd.DataFrame(data=data_dct.data, columns=data_dct.feature_names)
    return features, data_dct.target

def preprocess(df, preprocessor_path):
    """Apply a pickled preprocessor to ``df``.

    Parameters
    ----------
    df : object
        Data handed to the preprocessor's ``transform`` method.
    preprocessor_path : str
        Path of the pickle file holding the fitted preprocessor (the
        notebook stores a StandardScaler this way).

    Returns
    -------
    object
        Whatever ``transform(df)`` of the unpickled object returns.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only pass paths to files
    produced by a trusted source.
    """
    with open(preprocessor_path, "rb") as handle:
        fitted_preprocessor = pickle.load(handle)

    return fitted_preprocessor.transform(df)


def test_model(stage, X_test, y_test, model_name):
    """Score a registered model on the given data.

    Parameters
    ----------
    stage : str
        Registry stage to load, used in the ``models:/<name>/<stage>`` URI.
    X_test : array-like or DataFrame
        Features passed directly to the pyfunc model's ``predict``.
    y_test : array-like
        True targets compared against the predictions.
    model_name : str
        Name of the registered model.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error>}``.
    """
    # Load the model from the MLflow model registry.
    model = mlflow.pyfunc.load_model(f"models:/{model_name}/{stage}")

    # The pyfunc wrapper takes the features as-is; no DMatrix conversion here.
    y_pred = model.predict(X_test)

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in recent scikit-learn releases and
    # no longer accepted by the newest ones.
    rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)
    return {"rmse": rmse}



In [36]:
# NOTE(review): `data_dct` is the complete dataset fetched in the training
# cell, so this "test" set also contains the rows the model was trained on;
# the RMSE computed from it is not a held-out estimate.
X_test, y_test = load_data(data_dct)

In [39]:
# Run whose artifacts are fetched (same id as the registered model's run).
run_id = "172120f5e7a04f8bbba287d76de17f54"

# Download the run's "Preprocessor" artifact directory into the current directory.
client.download_artifacts(run_id=run_id, path='Preprocessor', dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/Preprocessor'

In [40]:
import pickle
# Load the scaler that was downloaded with the run's artifacts.
# pickle.load can execute arbitrary code, so only load files from a trusted run.
with open('Preprocessor/preprocessor1.b', 'rb') as f:
    scaler = pickle.load(f)

In [42]:
# NOTE(review): this overwrites X_test in place, so running the cell twice
# scales the data twice. Re-run the load_data cell before re-running this one.
X_test = scaler.transform(X_test)

In [76]:
%%time
# Score the model currently in the "Staging" stage on the scaled data.
test_model(stage='Staging', X_test=X_test, y_test=y_test, model_name='CaliRegressionBest')

CPU times: user 1.1 s, sys: 19.8 ms, total: 1.12 s
Wall time: 117 ms


{'rmse': 0.29923858973594264}

# Model Registry

In [4]:
from mlflow.tracking import MlflowClient

# Local SQLite tracking store used throughout this section.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# Client used by the following cells to query runs and manage the registry.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [13]:
# List the experiments in the tracking store (displayed as the cell output).
client.search_experiments()

[<Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/9', creation_time=1726032883723, experiment_id='9', last_update_time=1726032883723, lifecycle_stage='active', name='Sample Regression New', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/8', creation_time=1726032865476, experiment_id='8', last_update_time=1726032865476, lifecycle_stage='active', name='Sample Regression', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/7', creation_time=1725767717848, experiment_id='7', last_update_time=1725767717848, lifecycle_stage='active', name='NYC Taxi Learning 01', tags={}>,
 <Experiment: artifact_location='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/6', creation_time=1725685650712, experiment_id='6', last_update_time=1725685650712, lifecyc

In [11]:
# Get or create the experiment and keep its id in `exp_id`.
# create_experiment fails when the name is already taken, so look the
# experiment up first; this keeps the cell re-runnable.
experiment_name = 'Sample Regression New'
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    exp_id = existing_experiment.experiment_id
else:
    exp_id = client.create_experiment(name=experiment_name)

In [26]:
from mlflow.entities import ViewType

# Fetch the best runs of experiment '6'.
runs = client.search_runs(
    experiment_ids='6', # Experiment to search.
    filter_string="metrics.rmse < 0.45", # Keep only runs whose rmse is below 0.45.
    run_view_type=ViewType.ACTIVE_ONLY, # Only active (non-deleted) runs.
    max_results=5, # Return at most five runs.
    order_by=["metrics.rmse ASC"] # Lowest rmse first.
)

In [27]:
# Print the id and the logged "rmse" metric of every run returned above.
for run in runs:
    print(f"run id : {run.info.run_id}, rmse : {run.data.metrics['rmse']}")

run id : 1512a7464c974eb29420bf1f9824e19b, rmse : 0.43167498843750596
run id : fa65fbddd6c244d5b270c4d0854cb492, rmse : 0.4325843420061471
run id : e15c7d7f8fa14f5faae03e3a6e7414e4, rmse : 0.43298243076261905
run id : 612495b0f7124363b9702ad51c7b6406, rmse : 0.43344575497566096
run id : 30cfea00432a411983601654a677bfe1, rmse : 0.4338641589172989


In [30]:
# Promote the models to model registry
import mlflow

# Point the fluent API at the same store the client uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Run id copied by hand from the first row of the search output above.
run_id = "1512a7464c974eb29420bf1f9824e19b"
# URI of the run's "model" artifact path.
model_uri = f"runs:/{run_id}/model"
# Registers the artifact as a version of the "BostonRegression" model.
mlflow.register_model(model_uri=model_uri, name="BostonRegression")

Registered model 'BostonRegression' already exists. Creating a new version of this model...
Created version '3' of model 'BostonRegression'.


<ModelVersion: aliases=[], creation_timestamp=1726033971788, current_stage='None', description=None, last_updated_timestamp=1726033971788, name='BostonRegression', run_id='1512a7464c974eb29420bf1f9824e19b', run_link=None, source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/6/1512a7464c974eb29420bf1f9824e19b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [40]:
# Check the current stage of the model's latest versions.
# NOTE: `regsitry_name` (sic) is reused by the following cells, so the
# spelling is kept as-is here.
regsitry_name = "BostonRegression"
latest_versions = client.get_latest_versions(name=regsitry_name)

# Print the version number and current stage of each returned version.
for version in latest_versions:
    print(f"version : {version.version}, stage : {version.current_stage}")

version : 2, stage : Staging
version : 3, stage : None


In [41]:
# Move version 3 of the registered model to the "Staging" stage.
# archive_existing_versions=False is passed so other versions are not archived
# as part of this call.
client.transition_model_version_stage(
    name=regsitry_name,
    version=3,
    stage='Staging',
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1726033971788, current_stage='Staging', description=None, last_updated_timestamp=1726034331942, name='BostonRegression', run_id='1512a7464c974eb29420bf1f9824e19b', run_link=None, source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/6/1512a7464c974eb29420bf1f9824e19b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [42]:
# Promote version 2 of the registered model to the "Production" stage.
# archive_existing_versions=False is passed so other versions are not archived
# as part of this call.
client.transition_model_version_stage(
    name=regsitry_name,
    version=2,
    stage='Production',
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1726031916425, current_stage='Production', description='', last_updated_timestamp=1726034352697, name='BostonRegression', run_id='48988eeac1c94ea09fd7e4a2096bc6f6', run_link='', source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/6/48988eeac1c94ea09fd7e4a2096bc6f6/artifacts/model', status='READY', status_message=None, tags={'model': 'xgboost2'}, user_id=None, version=2>

In [44]:
# Add a description to version 2 of the registered model.
from datetime import datetime
# NOTE(review): this is the time the cell runs, not the time the stage
# transition happened, so the description is only accurate when this cell is
# executed right after the transition.
date = datetime.today()

client.update_model_version(
    name=regsitry_name,
    version=2,
    description=f"The model version was transitioned to Production Stage on - {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1726031916425, current_stage='Production', description=('The model version was transitioned to Production Stage on - 2024-09-11 '
 '11:32:28.979448'), last_updated_timestamp=1726034548982, name='BostonRegression', run_id='48988eeac1c94ea09fd7e4a2096bc6f6', run_link='', source='/home/nishan/zoomcamp-mlops-learn/02-experiment_tracking/Learning/notebooks/mlruns/6/48988eeac1c94ea09fd7e4a2096bc6f6/artifacts/model', status='READY', status_message=None, tags={'model': 'xgboost2'}, user_id=None, version=2>

In [None]:
# Archiving a model in staging or production
