Week 2: ML experiment tracking

Data source: the NYC taxi dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error

In [3]:
# Show the pandas / numpy / seaborn versions this notebook is running with.
pd.__version__, np.__version__, sns.__version__

('2.2.3', '2.2.5', '0.13.2')

In [4]:
import mlflow
# Show the installed MLflow version.
mlflow.__version__

'2.22.0'

In [None]:
# Point MLflow at a SQLite database file (mlflow.db) as its tracking store.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Every run started below is recorded under this experiment name.
mlflow.set_experiment('NYC-taxi-experiment')

In [6]:
# Yellow-taxi trip files: January 2023 is the training month,
# February 2023 the validation month.
jan_fn =  '../data/yellow_tripdata_2023-01.parquet'
feb_fn =  '../data/yellow_tripdata_2023-02.parquet'

In [7]:
# embed all the preprocessing in a function
def preprocess_df(fn, categorical_cols, min_duration=1., max_duration=60.):
    """Load a trip parquet file and prepare it for modelling.

    Parameters
    ----------
    fn : str or path-like
        Parquet file to read. It must provide the ``tpep_pickup_datetime``
        and ``tpep_dropoff_datetime`` columns.
    categorical_cols : list of str
        Columns to convert to ``str``.
    min_duration, max_duration : float, optional
        Inclusive bounds, in minutes, on the trip duration. Trips outside
        this range are treated as outliers and dropped. The defaults keep
        trips lasting between 1 and 60 minutes.

    Returns
    -------
    pandas.DataFrame
        The retained trips, with an extra ``duration`` column (minutes) and
        ``categorical_cols`` cast to strings.
    """
    df = pd.read_parquet(fn)
    # create duration (in minutes) feature
    df['duration'] = (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds().div(60.)
    # filter out outliers: keep trips whose duration lies within the
    # (inclusive) bounds; missing durations are dropped as well.
    in_range = df['duration'].between(min_duration, max_duration)
    df = df[in_range]
    # convert categorical columns to string data type
    df[categorical_cols] = df[categorical_cols].astype(str)
    return df

In [8]:
# Pickup and drop-off location IDs are the columns treated as categorical.
cat_cols = ['PULocationID', 'DOLocationID']
# Run the same preprocessing on both months.
january_trips = preprocess_df(jan_fn, cat_cols)
february_trips = preprocess_df(feb_fn, cat_cols)

In [9]:
# Sanity check: (rows, columns) of each preprocessed month.
january_trips.shape, february_trips.shape

((3009173, 20), (2855951, 20))

In [14]:
# Converts per-trip feature dicts into a feature matrix; sparse output is requested.
vectorizer = DictVectorizer(sparse=True)

In [15]:
# One {column: value} dict per January trip, restricted to the categorical columns.
training_feats_dict = january_trips[cat_cols].to_dict(orient='records')
# The vectorizer is fitted on the training month only.
vectorizer.fit(training_feats_dict)

In [17]:
# Training features (encoded location IDs) and target (trip duration in minutes).
X_train = vectorizer.transform(training_feats_dict)
y_train = january_trips['duration'].to_numpy()

In [18]:
# Baseline model: plain linear regression on the encoded location IDs.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [20]:
# In-sample error of the baseline: predictions are scored on the same
# January data the model was fitted on.
rmse = root_mean_squared_error(
    y_train, lr_model.predict(X_train)
)
print(f'RMSE on train = {rmse:.2f}')

RMSE on train = 7.65


In [21]:
# save model
import pickle
# The fitted vectorizer and the model are stored together as one
# (vectorizer, model) tuple, so both can be restored from a single file.
# NOTE: opening the file for writing does not create ../models if it is missing.
with open('../models/linear_reg.pkl', 'wb') as fw:
    pickle.dump((vectorizer, lr_model), fw, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# One {column: value} dict per February trip, built the same way as the
# training dicts.
validation_features_dict = february_trips[cat_cols].to_dict(orient='records')

In [23]:
# Encode the February trips with the already-fitted vectorizer (transform
# only, no refit) and pull out the matching duration targets.
X_valid = vectorizer.transform(validation_features_dict)
y_valid = february_trips['duration'].to_numpy()

In [25]:
# check dimension: the validation matrix must have the same number of
# feature columns as the training matrix for the fitted models to score it.
print(f'Validation set dimension: {X_valid.shape}')
print(f'Does it match training feature vector dimension ? {X_train.shape[1] == X_valid.shape[1]}')

Validation set dimension: (2855951, 515)
Does it match training feature vector dimension ? True


In [26]:
# Out-of-sample error of the baseline: trained on January, scored on February.
valid_rmse = root_mean_squared_error(
    y_valid, lr_model.predict(X_valid)
)
print(f'Validation set RMSE: {valid_rmse:.2f}')

Validation set RMSE: 7.81


In [28]:
# try a different linear model: Lasso with alpha=0.1, scored on the
# February validation set (untracked, for comparison with the baseline)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
root_mean_squared_error(y_valid, lasso_model.predict(X_valid))

8.750574569620587

Important to note that at this stage there is no tracking. 

So if we change anything in the model, we lose the history of what was changed and how it affected the results.

That's where MLflow comes into play.

In [31]:
# Lasso experiment recorded as an MLflow run: tag, data paths,
# hyperparameter and validation RMSE are all logged.
with mlflow.start_run():
    mlflow.set_tag("developer", "mamady")
    # Log the very path variables the data was loaded from, so the record
    # cannot drift from what was actually read.
    mlflow.log_param('train-data-path', jan_fn)
    mlflow.log_param('valid-data-path', feb_fn)

    alpha = 0.001
    mlflow.log_param('alpha', alpha)
    # Build the model from the logged value; a separate literal here would
    # make the tracked `alpha` disagree with the model that was trained.
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)

    # Score on the February validation set and record the result.
    rmse = root_mean_squared_error(
        y_valid, lasso_model.predict(X_valid)
    )
    mlflow.log_metric('rmse', rmse)

Now let's use a slightly more complex model to explore more MLflow features.

In [32]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [33]:
# Wrap the feature matrices and targets in XGBoost's DMatrix containers,
# which xgb.train() and Booster.predict() take as input below.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_valid, label=y_valid)

In [38]:
def objective(params):
    """Train one XGBoost model and report its validation RMSE to hyperopt.

    Each call is recorded as its own MLflow run, tagged ``model=xgboost``,
    with the sampled hyperparameters and the resulting RMSE logged.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
            # Silence the per-round evaluation log (as the later training
            # cells do); otherwise every trial can print up to 1000 lines.
            verbose_eval=False,
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(
            y_valid, y_pred
        )
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [36]:
# Hyperparameter search space for the XGBoost model.
# hp.loguniform bounds are given in log space, i.e. values are drawn from
# [exp(low), exp(high)].
search_space = {
    # integer depth in {50, 60, ..., 100}
    'max_depth': scope.int(hp.quniform('max_depth', 50, 100, 10)),
    # exp(-2) ~ 0.135 ... exp(0) = 1
    'learning_rate': hp.loguniform('learning_rate', -2, 0),
    # exp(-2) ~ 0.135 ... exp(-1) ~ 0.368
    'reg_alpha': hp.loguniform('reg_alpha', -2, -1),
    # exp(-3) ~ 0.050 ... exp(-1) ~ 0.368
    'reg_lambda': hp.loguniform('reg_lambda', -3, -1),
    # exp(-1) ~ 0.368 ... exp(3) ~ 20.1
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of this squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [None]:
# Run the TPE search: 10 trials, each one an MLflow run created by
# `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials(),
    # Seed hyperopt's sampler so a re-run explores the same trials.
    rstate=np.random.default_rng(42),
    verbose=False,
)

In [43]:
# Best sampled hyperparameters; note that max_depth comes back as a float
# and the fixed 'objective' / 'seed' entries are not part of the result.
best_result

{'learning_rate': np.float64(0.24553944703020156),
 'max_depth': np.float64(90.0),
 'min_child_weight': np.float64(5.1467120397278725),
 'reg_alpha': np.float64(0.3175195505655532),
 'reg_lambda': np.float64(0.2810968936145532)}

In [68]:
# Switch XGBoost autologging off; the final run logs its parameters,
# metric and model explicitly.
mlflow.xgboost.autolog(disable=True)
# Model signature inferred from the validation features and targets;
# it is attached to the model when it is logged.
signature = mlflow.models.infer_signature(
    X_valid, y_valid
)

In [None]:
# train the model with best parameters
# fmin reports max_depth as a float; convert it back to an integer
# before handing the parameters to XGBoost.
best_result['max_depth'] = int(best_result['max_depth'])
# NOTE: xgb.train() takes no `signature` argument (passing one raises a
# TypeError); the signature belongs to mlflow.xgboost.log_model().
booster = xgb.train(
    params=best_result,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=False,
)

In [69]:
# Final tracked run: retrain with the best hyperparameters, then log the
# parameters, the validation RMSE, the pickled vectorizer and the model.
with mlflow.start_run():
    mlflow.log_params(best_result)

    booster = xgb.train(
        best_result,
        train,
        num_boost_round=100,
        early_stopping_rounds=50,
        evals=[(valid, 'validation')],
        verbose_eval=False,
    )

    # Validation error of the retrained booster.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_valid, y_pred)
    mlflow.log_metric("rmse", rmse)

    # The fitted vectorizer is needed to encode new trips, so it is stored
    # next to the model as a run artifact.
    preprocessor_path = "../models/preprocessor.pkl"
    with open(preprocessor_path, "wb") as preprocessor_file:
        pickle.dump(vectorizer, preprocessor_file)
    mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

    # Log the booster itself, together with the inferred signature.
    model_info = mlflow.xgboost.log_model(
        booster,
        artifact_path='models_mlflow',
        signature=signature,
    )

  xgb_model.save_model(model_data_path)


In [70]:
# validating the model before production
# Use the URI of the model logged by the run above rather than a
# hard-coded run ID, which only exists in one particular tracking store
# and goes stale as soon as the notebook is re-run.
model_uri = model_info.model_uri
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [71]:
# Load the same logged model again through MLflow's XGBoost flavor.
xgboost_model = mlflow.xgboost.load_model(model_uri)

In [72]:
# Validation RMSE of the reloaded model, scored on the February DMatrix.
root_mean_squared_error(y_valid, xgboost_model.predict(valid))

5.209388520304419