# Objective

Preparing 5 different trained models to compare them and to gain experience with the MLflow Model Registry

In [1]:
import mlflow

import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
from data_loader import read_dataframe
from data_loader import encode_data

In [3]:
# Label column the regressors are trained to predict.
target = 'duration'

# Input columns handed to encode_data and reused for the validation transform.
# NOTE(review): 'PU_DO' is presumably a combined pickup/drop-off key built by
# read_dataframe — confirm in data_loader.
features = [
  'PU_DO',
  'trip_distance',
]

## Preparing data

In [4]:
# One single-element list of parquet paths per split (2021-01, 2021-03 and
# 2021-04 files); the lists are star-unpacked into read_dataframe when loaded.
training_data, validation_data, test_data = (
  ["../data/green_tripdata_2021-01.parquet"],
  ["../data/green_tripdata_2021-03.parquet"],
  ["../data/green_tripdata_2021-04.parquet"],
)

In [5]:
# Load the training split first, then the validation split, each by
# star-unpacking its path list into the project's read_dataframe helper.
df_train, df_validation = (
  read_dataframe(*paths) for paths in (training_data, validation_data)
)

# Display (rows, columns) of both frames as a quick sanity check.
tuple(frame.shape for frame in (df_train, df_validation))

((73908, 3), (80372, 3))

In [6]:
# encode_data returns the encoded training matrix, the training labels and the
# encoder object itself.
# NOTE(review): dv is presumably a fitted DictVectorizer-style encoder — confirm
# in data_loader.encode_data.
X_train, y_train, dv = encode_data(df_train, features, target)

# The validation rows are only transformed with the encoder returned above
# (no second fit), going through the same list-of-dicts representation.
validation_records = df_validation[features].to_dict(orient='records')
X_validation = dv.transform(validation_records)
y_validation = df_validation[target]

## MLFlow + Training models

In [7]:
# Point MLflow at the SQLite-backed tracking store and select the experiment
# that every run in this notebook is recorded under.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlflow_model_registry_practice_by_code")

# Turn on autologging for both model flavours used below (scikit-learn first,
# then XGBoost), with dataset logging switched off.
for flavor in (mlflow.sklearn, mlflow.xgboost):
  flavor.autolog(log_datasets=False)

In [8]:
# scikit-learn regressors to compare; the tree-based ones are kept small
# (depth 3, five trees) and every stochastic estimator is seeded with 42.
linear_baseline = LinearRegression()
lasso_model = Lasso(alpha=0.01, random_state=42)
shallow_tree = DecisionTreeRegressor(max_depth=3, random_state=42)
small_forest = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=42)

# Order here is the order in which the runs are created by the training loop.
sklearn_models = [linear_baseline, lasso_model, shallow_tree, small_forest]

In [13]:
# Train every scikit-learn candidate in its own MLflow run and record its
# validation RMSE so the runs can be compared side by side.
for model in sklearn_models:
  # Name the run after the estimator class (e.g. "Lasso"). Taking the class
  # name directly avoids parsing the repr, which depends on its formatting and
  # would raise ValueError if it ever lacked a "(".
  model_name = type(model).__name__
  with mlflow.start_run(run_name=model_name):
    trained_model = model.fit(X_train, y_train)
    validation_predictions = trained_model.predict(X_validation)
    validation_rmse = root_mean_squared_error(
      y_true=y_validation,
      y_pred=validation_predictions,
    )
    # Logged under the plain "rmse" key, the same key the XGBoost run uses.
    mlflow.log_metric("rmse", validation_rmse)

In [None]:
# Parameter dict passed as `params` to xgb.train for the XGBoost run.
boosting_hps = dict(
  # Step size and tree shape.
  learning_rate=0.06795566766046571,
  max_depth=74,
  min_child_weight=1.1034760099449035,
  # L1 / L2 regularisation strengths.
  reg_alpha=0.08418429054929681,
  reg_lambda=0.007240669500118009,
  # Squared-error regression, evaluated with RMSE, fixed seed.
  eval_metric="rmse",
  objective='reg:squarederror',
  seed=42,
)

# Wrap the encoded matrices and their labels in XGBoost's DMatrix container:
# `train` is fitted on, `valid` is the evaluation set watched during boosting.
train, valid = (
  xgb.DMatrix(data=X_train, label=y_train),
  xgb.DMatrix(data=X_validation, label=y_validation),
)

In [12]:
with mlflow.start_run(run_name="XGBoost"):
  # xgb.train fills this dict in place with the per-round metric history,
  # keyed first by the evaluation-set name and then by the metric name.
  validation_rmses = {}
  watchlist = [(valid, "validation")]

  # At most 10 boosting rounds; stop early if the validation metric has not
  # improved for 2 consecutive rounds.
  booster = xgb.train(
    boosting_hps,
    train,
    num_boost_round=10,
    evals=watchlist,
    early_stopping_rounds=2,
    evals_result=validation_rmses,
  )

  # Record the best (lowest) validation RMSE seen across rounds under the same
  # "rmse" key the scikit-learn runs use.
  rmse_history = validation_rmses['validation']['rmse']
  mlflow.log_metric("rmse", min(rmse_history))

[0]	validation-rmse:11.69575
[1]	validation-rmse:11.18721
[2]	validation-rmse:10.72405
[3]	validation-rmse:10.30187
[4]	validation-rmse:9.91824
[5]	validation-rmse:9.56938
[6]	validation-rmse:9.25418
[7]	validation-rmse:8.96847
[8]	validation-rmse:8.70964
[9]	validation-rmse:8.47681


