In [9]:
import mlflow
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import pickle


# Set the tracking URI (optional)
# If you have a dedicated MLflow server, set its URI here.
# Otherwise, MLflow will default to saving data to the local ./mlruns directory.
# mlflow.set_tracking_uri('http://your_mlflow_server:5000')

# Create or get an experiment by name
def create_or_get_experiment(name):
    experiment_id = mlflow.get_experiment_by_name(name)
    if experiment_id is None:
        experiment_id = mlflow.create_experiment(name)
    else:
        experiment_id = experiment_id.experiment_id
    return experiment_id


NUMERICAL_COLS = ["Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight"]

ROOT_PATH = os.getcwd()
DATA_PATH = os.path.join(ROOT_PATH, "../data/abalone.csv")

In [2]:
# utiliatary functions
def read_data(dataset_path: str) -> pd.DataFrame:
    """Read the data at the given path and return a dataframe."""
    df = pd.read_csv(dataset_path)
    df_train = df.sample(frac=0.8, random_state=42)
    df_test = df.drop(df_train.index)
    return df_train, df_test

def save_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


In [3]:
def preprocessing(
        df: pd.DataFrame,
        training: bool = False,
        scaler:StandardScaler = None,
        label_encoder: LabelEncoder = None) -> (pd.DataFrame, StandardScaler, LabelEncoder):
    
    df['age'] = df['Rings']+1.5
    df = df.drop('Rings', axis=1)

    if training:

        label_encoder = LabelEncoder()
        numerical_encoders = StandardScaler()

        df[NUMERICAL_COLS] = numerical_encoders.fit_transform(df[NUMERICAL_COLS])
        df['Sex'] = label_encoder.fit_transform(df['Sex'])

        return df, numerical_encoders, label_encoder
    else:
        df[NUMERICAL_COLS] = scaler.transform(df[NUMERICAL_COLS])
        df["Sex"] = label_encoder.transform(df['Sex'])
        return df, None, None

def extract_X_y(df: pd.DataFrame):
    X = df.drop('age', axis=1)
    y = df['age']
    return X, y

In [8]:
def train_model(X, y,  model=xgb.XGBRegressor())-> None:
    regressor = model
    regressor.fit(X, y)
    return regressor

def predict(model, preprocessed_X, scaler: StandardScaler, label_encoder: LabelEncoder):
    predictions = model.predict(preprocessed_X)
    return predictions

In [10]:
def evaluate_model(model, preprocessed_X, y):
    predictions = model.predict(preprocessed_X)
    metrics = { "RMSE": mean_squared_error(y, predictions), 
                "MAE": mean_absolute_error(y, predictions),
                "R2": r2_score(y, predictions)}
    return metrics

# Experiment with MLFlow runs

## Create new experiment


In [12]:
# Example usage
experiment_name = "abalone-project"
experiment_id = create_or_get_experiment(experiment_name)
print(f"Experiment ID for '{experiment_name}': {experiment_id}")

mlflow.set_experiment(experiment_id=experiment_id)

Experiment ID for 'abalone-project': 829909819513202339


<Experiment: artifact_location='file:///Users/pierr/Library/CloudStorage/OneDrive-Personnel/Documents/POLYTECHNIQUE/4A%20-%20HEC/13%20-%20MLOps/xhec-mlops-project-student/notebooks/mlruns/829909819513202339', creation_time=1698059793173, experiment_id='829909819513202339', last_update_time=1698059793173, lifecycle_stage='active', name='abalone-project', tags={}>

In [14]:
import time

# Start a run

PARAMS_GRID = {
    "max_depths": [5, 7, 9],
    "learning_rates": [0.1, 0.2],
    "n_estimators": [100, 200, 300]
}


for i in PARAMS_GRID["max_depths"]: 
    for j in PARAMS_GRID["learning_rates"]:
        for k in PARAMS_GRID["n_estimators"]:
            
            params = {
            "max_depth": i,
            "learning_rate": j,
            "n_estimators": k}

            with mlflow.start_run() as run:
                run_id = run.info.run_id


                # Set tags for the run
                mlflow.set_tag("Level", "Development")
                mlflow.set_tag("Team", "Data Science")

                # Log parameters
                mlflow.log_param("max_depth", i)
                mlflow.log_param("learning_rate", j)
                mlflow.log_param("n_estimators", k)


                # Load data
                train_df , test_df= read_data(DATA_PATH)

                # MLflow logs
                mlflow.log_param("train_size", len(train_df))
                mlflow.log_param("test_size", len(test_df))
                

                # Preprocess data
                train_df, scaler, label_encoder = preprocessing(train_df, training=True)
                test_df, _, _ = preprocessing(test_df, training=False, scaler=scaler, label_encoder=label_encoder)


                # Extract X and y
                X_train, y_train = extract_X_y(train_df)
                X_test, y_test = extract_X_y(test_df)

                # Train model
                model_to_train = xgb.XGBRegressor(**params)

                start = time.time()
                model = train_model(X_train, y_train, model_to_train)
                delta = time.time() - start
                mlflow.log_metric("train_time", delta)
                
                # Evaluate model
                train_metrics = evaluate_model(model, X_train, y_train)
                mlflow.log_metric("train RMSE", train_metrics["RMSE"])
                mlflow.log_metric("train MAE", train_metrics["MAE"])
                mlflow.log_metric("train R2", train_metrics["R2"])


                # Evaluate model on test set
                test_metrics = evaluate_model(model, X_test, y_test)
                mlflow.log_metric("test RMSE", test_metrics["RMSE"])
                mlflow.log_metric("test MAE", test_metrics["MAE"])
                mlflow.log_metric("test R2", test_metrics["R2"])

                # Log your model
                mlflow.xgboost.log_model(model, "models")


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_catego

In [16]:
chosen_run = "c1c1f539c3514098bb83ef1f77f3563f"
# Register your model as the production model
mlflow.register_model(f"runs:/{run_id}/models", "xgboost")

Successfully registered model 'xgboost'.
2023/10/23 14:01:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost, version 1
Created version '1' of model 'xgboost'.


<ModelVersion: aliases=[], creation_timestamp=1698062484014, current_stage='None', description=None, last_updated_timestamp=1698062484014, name='xgboost', run_id='4b289cb5ecd546cca02f0a1472c62605', run_link=None, source='file:///Users/pierr/Library/CloudStorage/OneDrive-Personnel/Documents/POLYTECHNIQUE/4A%20-%20HEC/13%20-%20MLOps/xhec-mlops-project-student/notebooks/mlruns/829909819513202339/4b289cb5ecd546cca02f0a1472c62605/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [17]:
client = mlflow.MlflowClient()
production_version = 1

client.transition_model_version_stage(
    name="xgboost", version=production_version, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1698062484014, current_stage='Production', description=None, last_updated_timestamp=1698062621664, name='xgboost', run_id='4b289cb5ecd546cca02f0a1472c62605', run_link=None, source='file:///Users/pierr/Library/CloudStorage/OneDrive-Personnel/Documents/POLYTECHNIQUE/4A%20-%20HEC/13%20-%20MLOps/xhec-mlops-project-student/notebooks/mlruns/829909819513202339/4b289cb5ecd546cca02f0a1472c62605/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [18]:
mlflow_experiment_path = f"/mlflow/xgboost"

In [20]:
model_uri = f"models:/{mlflow_experiment_path}/production"
print(model_uri)
model = mlflow.xgboost.load_model(model_uri)

model

models://mlflow/xgboost/production
