In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_squared_error

import pickle

In [2]:
import mlflow
import os

os.makedirs("./mlruns", exist_ok=True)
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1734313537028, experiment_id='1', last_update_time=1734313537028, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
print("Tracking URI:", mlflow.get_tracking_uri())
print("Artifact URI", mlflow.get_artifact_uri())

Tracking URI: sqlite:///mlflow.db
Artifact URI /workspaces/zoomcamp/02-experiment-tracking/mlruns/1/d82ffca780314ad8b7f160c8d5774a62/artifacts


In [4]:
def read_df(filename: str):
    df = pd.read_parquet(filename)

    df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])
    df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])

    df["duration"] = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"] # our target

    td = df["duration"].iloc[0]
    df["duration"] = df["duration"].apply(lambda td: td.total_seconds() / 60) # turn it into minutes
    
    df = df[(df.duration >= 1) & (df.duration <= 60)] # remove outliers / errors

    # features we are going to use
    categorical = ["PULocationID", "DOLocationID"]
    numerical = ["trip_distance"]

    df[categorical] = df[categorical].astype(str) # ints that will be used as categorial features
    
    return df

In [5]:
df_train = read_df("../data/green_tripdata_2021-01.parquet") # train the model on january
df_val = read_df("../data/green_tripdata_2021-02.parquet")   # validate the model on febuary

In [6]:
round(df_train['duration'].std(),2)

np.float64(11.56)

In [7]:
# creating a better feature on existing columns = better model (reduce mse)
df_train["PU_DO"] = df_train["PULocationID"] + '_' + df_train["DOLocationID"]
df_val["PU_DO"] = df_val["PULocationID"] + '_' + df_val["DOLocationID"]

In [8]:
# transform dicts into feature matrices
dv = DictVectorizer() # performs auto one-hot encoding on categorial features

categorical = ["PU_DO", "PULocationID", "DOLocationID"]
numerical = ["trip_distance"]

train_dict = df_train[categorical + numerical].to_dict(orient="records")
X_train = dv.fit_transform(train_dict) # convert dict to matrix

val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)


In [9]:
# define our target on train and validation sets
target = "duration"
y_train = df_train[target].values
y_val = df_val[target].values

In [13]:
with mlflow.start_run(): # everything inside is associated with the current run
    
    mlflow.set_tag("developer", "vitor")
    
    mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.parquet")
    
    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="mlartifacts") # log the model from a local path

In [44]:
import os

# Check current working directory
print("Current working directory:", os.getcwd())

# Check if /workspaces exists
print("Does /workspaces exist?:", os.path.exists("/workspaces"))

# Check if we're in a container (one way to check)
print("Are we in a container?:", os.path.exists("/.dockerenv"))

Current working directory: /Users/vitor/Documents/GitHub/zoomcamp/02-experiment-tracking
Does /workspaces exist?: False
Are we in a container?: False


In [26]:
# by saving both objects, dv can apply the transformations to future datasets 
# and lr can make the predictions after that

with open("../models/lin_reg.bin", "wb") as f_out:
    pickle.dump((dv, lr), f_out)

In [10]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [11]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [19]:
# There will be a run for each set of parameters, associated with the experiment
# With this, we can compare the results of different runs based on metrics

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params) # log params to mlflow
        booster = xgb.train(
            params=params, # also pass the params to the model
            dtrain=train,  # on the training data
            num_boost_round=1000, # 1000 iterations of the booster
            evals=[(valid, "validation")], # validation set to control the optimization (minimize error on validation set)
            early_stopping_rounds=50 # stop if the error on the validation set does not improve for 50 rounds
        )
        y_pred = booster.predict(valid) # make predictions on the validation set, get the target
        rmse = mean_squared_error(y_val, y_pred) # hyperopt minimizes the error
        mlflow.log_metric("rmse", rmse)
        
    return {"loss": rmse, "status": STATUS_OK}
    


In [20]:
# the range in which hyperopt will search for the best parameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0), # exp(-3), exp(0)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42,
}

# pass the information to the fmin method (will minimize the output of the objective function)
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials())

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: Run with UUID bb15f605811a4c4282e6fdcb417c1f9c is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


Exception: Run with UUID bb15f605811a4c4282e6fdcb417c1f9c is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [17]:
import os
print(os.getcwd())

/Users/vitor/Documents/GitHub/zoomcamp/02-experiment-tracking


In [16]:
if mlflow.active_run() is not None:
    mlflow.end_run()

with mlflow.start_run():

    params = {
        "learning_rate": 0.06739331004838427,
        "max_depth": 25,
        "min_child_weight": 1.519461591196802,
        "objective": "reg:linear",
        "reg_alpha": 0.02979646034040559,
        "reg_lambda": 0.21172810212072343,
        "seed": 42,
    }
    
    mlflow.log_params(params) # can pass a dictionary with the parameters
    
    booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=5,
    evals=[(valid, "validation")],
    early_stopping_rounds=5
    )
    
    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    mlflow.xgboost.log_model(booster, artifact_path="models") # log the model to mlflow




[0]	validation-rmse:11.66075
[1]	validation-rmse:11.15679
[2]	validation-rmse:10.69644
[3]	validation-rmse:10.27704
[4]	validation-rmse:9.89500




OSError: [Errno 30] Read-only file system: '/workspaces'

Model Management

- Model Versioning 