# Objective

Preparing 5 different trained models in order to compare them and gain experience with the MLflow model registry.

In [None]:
import mlflow

# import xgboost as xgb
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
from data_loader import read_dataframe
from data_loader import encode_data

In [None]:
# Column to predict and the input columns handed to encode_data.
target = "duration"
features = ["PU_DO", "trip_distance"]

## Preparing data

In [None]:
# One single-element list of parquet paths per split (train / validation / test).
training_data = [
    "../data/green_tripdata_2021-01.parquet",
]
validation_data = [
    "../data/green_tripdata_2021-03.parquet",
]
test_data = [
    "../data/green_tripdata_2021-04.parquet",
]

In [None]:
# Load the training split. training_data holds a single parquet path, which is
# unpacked into read_dataframe's positional arguments.
df_train = read_dataframe(*training_data)
# Loading of the validation split is currently disabled:
# df_validation = read_dataframe(*validation_data)

# Shape check, disabled because it needs df_validation:
# df_train.shape, df_validation.shape

In [None]:
# Encode the training frame into model inputs. encode_data returns three values:
# the feature matrix, the target values and `dv`.
# NOTE(review): `dv` is presumably a fitted DictVectorizer-like transformer (the
# disabled line below calls dv.transform) — confirm in data_loader.
X_train, y_train, dv = encode_data(df_train, features, target)
# Validation encoding is currently disabled:
# X_validation, y_validation = dv.transform(df_validation[features].to_dict(orient='records')), df_validation[target]

## MLflow + training models

In [None]:
# Point MLflow at the SQLite backend store and select the experiment that the
# runs of this notebook are grouped under.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "mlflow_model_registry_practice"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

# Turn on automatic logging for scikit-learn estimators.
mlflow.sklearn.autolog()
# mlflow.xgboost.autolog()

In [None]:

# Baseline: ordinary least squares with default settings.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
# L1-regularised linear model; random_state is fixed for reproducibility.
lasso_model = Lasso(alpha=0.01, random_state=42)
lasso_model.fit(X_train, y_train)

In [None]:
# Shallow single tree (depth capped at 3) with a fixed seed.
tree_model = DecisionTreeRegressor(max_depth=3, random_state=42)
tree_model.fit(X_train, y_train)

In [None]:
# Small forest: 5 trees, each capped at depth 3, with a fixed seed.
forest_model = RandomForestRegressor(
    n_estimators=5,
    max_depth=3,
    random_state=42,
)
forest_model.fit(X_train, y_train)

In [None]:
# XGBoost is imported in this cell because the notebook-level import is
# commented out; without it every `xgb.` reference raises NameError.
import xgboost as xgb

# Enable XGBoost autologging so the booster trained from these matrices is
# recorded in MLflow.
mlflow.xgboost.autolog()

# Hyperparameters handed to xgb.train as `params`.
boosting_hps = {
    "learning_rate": 0.06795566766046571,
    "max_depth": 74,
    "min_child_weight": 1.1034760099449035,
    "reg_alpha": 0.08418429054929681,
    "reg_lambda": 0.007240669500118009,
    "objective": "reg:squarederror",
    "seed": 42,
}

# Prepare the validation split here: X_validation / y_validation are not
# defined anywhere else in the notebook. The features are encoded with `dv`,
# the object returned by encode_data for the training frame.
# NOTE(review): assumes `dv` exposes transform() over record dicts — confirm
# against data_loader.encode_data.
df_validation = read_dataframe(*validation_data)
X_validation = dv.transform(df_validation[features].to_dict(orient="records"))
y_validation = df_validation[target]

# Wrap both splits in XGBoost's DMatrix container, labels attached.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_validation, label=y_validation)

In [None]:

# Evaluation set reported during training, under the name "validation".
watchlist = [(valid, "validation")]

# Train for at most 10 boosting rounds, stopping early once the validation
# metric has failed to improve for 2 consecutive rounds.
booster = xgb.train(
    params=boosting_hps,
    dtrain=train,
    evals=watchlist,
    num_boost_round=10,
    early_stopping_rounds=2,
)