In [1]:
import os
from pathlib import Path

# Name of the folder that holds this notebook; the relative ./data and
# ./models paths used further down are resolved against it.
NOTEBOOK_DIR_NAME = "02-experiment tracking"

launch_dir = Path.cwd()
print(launch_dir)

# Set the working directory to the location of the notebook.
# The check makes the cell safe to re-run: once the kernel is already inside
# the notebook folder, appending the folder name again would point at a
# non-existent nested directory and os.chdir would raise FileNotFoundError.
if launch_dir.name == NOTEBOOK_DIR_NAME:
    notebook_dir = launch_dir
else:
    notebook_dir = launch_dir / NOTEBOOK_DIR_NAME
os.chdir(notebook_dir)

print("Current working directory:", os.getcwd())

/workspaces/mlops-zoomcamp
Current working directory: /workspaces/mlops-zoomcamp/02-experiment tracking


In [2]:
# !pip install numpy
import pandas as pd
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

from sklearn.metrics import root_mean_squared_error

In [3]:
# !pip install mlflow
import mlflow


In [4]:
def read_dataframe(filename):
    """Load a trip file and derive the columns used for modelling.

    Args:
        filename: Path to a ``.csv`` or ``.parquet`` file (str or path-like).
            The file must provide ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns:
        A new DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with three changes applied:
        ``duration`` (trip length in minutes) is added, ``PULocationID`` /
        ``DOLocationID`` are cast to ``str``, and ``PU_DO`` holds the two
        location ids joined by ``"_"``.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.parquet``.
    """
    path = str(filename)

    if path.endswith('.csv'):
        df = pd.read_csv(path)

        # CSV carries timestamps as text; parquet already stores them typed.
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `df`; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised timedelta -> minutes (replaces a per-row Python lambda).
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one (SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    df['PU_DO'] = df.PULocationID + "_" + df.DOLocationID

    return df

In [5]:
# January 2024 trips are the training set, February 2024 the validation set.
train_path = './data/green_tripdata_2024-01.parquet'
val_path = './data/green_tripdata_2024-02.parquet'
df_train, df_val = read_dataframe(train_path), read_dataframe(val_path)

# Row counts of the two frames returned by read_dataframe.
print(len(df_train), len(df_val))

54373 51497


In [6]:
# Columns handed to the DictVectorizer, and the column being predicted.
categorical = ['PU_DO', 'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
target = 'duration'
feature_columns = categorical + numerical

# Baseline: vectorise the record dicts, then fit an ordinary linear regression.
pipeline = make_pipeline(DictVectorizer(), LinearRegression())

# One {column: value} dict per trip, for the training and validation frames.
train_dicts, val_dicts = (
    frame[feature_columns].to_dict(orient='records')
    for frame in (df_train, df_val)
)

# Target arrays aligned with the record lists above.
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [10]:
# Fit the current pipeline on the training records, then predict validation.
y_pred = pipeline.fit(train_dicts, y_train).predict(val_dicts)

baseline_rmse = root_mean_squared_error(y_val, y_pred)
print(f"root mean squared error is: {baseline_rmse}")

root mean squared error is: 5.926355548541141


In [None]:
# Persist the fitted pipeline (vectorizer + regressor) as a single pickle.
# Create the target folder first so a fresh checkout without ./models does
# not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)

In [11]:
alpha = 0.001

# mlflow ui --backend-store-uri sqlite:///mlflow.db

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

with mlflow.start_run():

    mlflow.set_tag("developer", "mohammad")

    mlflow.log_param("train-data-path", "./data/green_tripdata_2024-01.parquet")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2024-02.parquet")

    pipeline = make_pipeline(
        DictVectorizer(),
        Lasso(alpha=alpha)
    )

    mlflow.log_param("alpha", alpha)

    pipeline.fit(train_dicts, y_train)

    y_pred = pipeline.predict(val_dicts)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Attach the model trained in THIS run. The previous version uploaded
    # ./models/lin_reg.bin, which is the LinearRegression pickle written by a
    # different cell (and may be stale or missing), so the artifact did not
    # correspond to the Lasso parameters and metric logged above.
    os.makedirs("models", exist_ok=True)
    lasso_model_path = "./models/lasso_reg.bin"
    with open(lasso_model_path, "wb") as f_out:
        pickle.dump(pipeline, f_out)

    mlflow.log_artifact(local_path=lasso_model_path, artifact_path="models_pickle")

### Adding hyper-parameter tuning

In [None]:
# !pip install xgboost
# !pip install hyperopt

from xgboost import XGBRegressor

import numpy as np
import random 

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

# hyperopt uses a Bayesian approach (TPE) to search for the best hyper-parameters
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
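# Sketch of the native-XGBoost (DMatrix) route, kept for reference only.
# It needs `import xgboost as xgb`, and the validation set must be transformed
# with the vectorizer fitted on the training data (transform, not fit_transform).
# dv = DictVectorizer()
# X_train = dv.fit_transform(train_dicts)
# X_val = dv.transform(val_dicts)

# train = xgb.DMatrix(X_train, label=y_train)
# valid = xgb.DMatrix(X_val, label=y_val)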

In [None]:
def objective(params):
    """Hyperopt objective: train a Lasso pipeline and report validation RMSE.

    Each call opens its own MLflow run, tags it ``model=lasso``, logs every
    entry of ``params`` and the resulting ``rmse`` metric.

    Args:
        params: Mapping sampled from the search space; ``params['alpha']`` is
            used as the Lasso regularisation strength.

    Returns:
        Dict with ``'loss'`` (validation RMSE) and ``'status'`` (``STATUS_OK``),
        the shape hyperopt's ``fmin`` expects.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "lasso")
        mlflow.log_params(params)

        pipeline = make_pipeline(
            DictVectorizer(),
            Lasso(alpha=params['alpha'])
        )
        pipeline.fit(train_dicts, y_train)

        # Predict once; the earlier version ran the same prediction twice.
        y_pred = pipeline.predict(val_dicts)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space for the Lasso objective.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so alpha is
# sampled between e^-5 and e^-1. 'seed' is a constant that gets logged with
# each run and is also used below to seed the search itself.
search_space = {
    'alpha': hp.loguniform('alpha', -5, -1),
    'seed': 42
}

# Without `rstate`, TPE draws from an unseeded generator and every execution
# of this cell explores different alphas; seeding makes the 20 trials
# repeatable. (hyperopt >= 0.2.7 expects a numpy Generator here.)
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
    rstate=np.random.default_rng(search_space['seed'])
)

In [None]:
def objective_xgb(params):
    """Hyperopt objective: train an XGBoost pipeline and report validation RMSE.

    Each call opens its own MLflow run, tags it ``model=xgboost``, logs every
    entry of ``params`` and the resulting ``rmse`` metric.

    Args:
        params: Mapping sampled from the search space. Reads
            ``learning_rate``, ``max_depth``, ``reg_alpha``, ``reg_lambda``,
            ``objective`` and, when present, ``min_child_weight`` and ``seed``
            (``seed`` defaults to 42).

    Returns:
        Dict with ``'loss'`` (validation RMSE) and ``'status'`` (``STATUS_OK``).
    """
    # Set seeds for reproducibility
    seed = params.get('seed', 42)
    np.random.seed(seed)
    random.seed(seed)

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        model = XGBRegressor(
            learning_rate=params['learning_rate'],
            max_depth=int(params['max_depth']),
            # min_child_weight was logged with every run but never given to
            # the model, so the recorded value had no effect on the result.
            # .get() keeps callers that omit it working (None -> library default).
            min_child_weight=params.get('min_child_weight'),
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            random_state=seed,
            verbosity=0,
            objective=params['objective']
        )

        pipeline = make_pipeline(
            DictVectorizer(),
            model
        )

        pipeline.fit(train_dicts, y_train)
        y_pred = pipeline.predict(val_dicts)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space for the XGBoost objective.
# quniform(..., 4, 100, 1) yields whole-number depths (wrapped in scope.int so
# the value arrives as an int); the loguniform entries draw exp(uniform(low, high)).
# 'objective' and 'seed' are constants passed through to every trial.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -6, -1),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

# Seed the TPE sampler as well: the 'seed' entry above only reaches the model,
# so without `rstate` the sequence of sampled configurations changes on every
# run. (hyperopt >= 0.2.7 expects a numpy Generator here.)
best_result = fmin(
    fn=objective_xgb,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
    rstate=np.random.default_rng(search_space['seed'])
)

In [None]:
import mlflow.sklearn
import mlflow.xgboost

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")


# Hyper-parameters chosen for the final XGBoost model; the whole dict is
# logged to MLflow below, so every entry must actually reach the model.
params = {
    'learning_rate':    0.410215639489084,
    'max_depth':    9,
    'min_child_weight':    2.955225015279242,
    'objective':     'reg:squarederror',
    'reg_alpha':    0.07847140786574915,
    'reg_lambda':    0.15098298624710604,
    'seed':    42
}

model = XGBRegressor(
    learning_rate=params['learning_rate'],
    max_depth=int(params['max_depth']),
    # Previously omitted: the run recorded min_child_weight as a parameter
    # while the model silently used the library default.
    min_child_weight=params['min_child_weight'],
    reg_alpha=params['reg_alpha'],
    reg_lambda=params['reg_lambda'],
    random_state=params['seed'],
    verbosity=0,
    objective=params['objective']
)

pipeline = make_pipeline(
    DictVectorizer(),
    model
)

with mlflow.start_run():
    pipeline.fit(train_dicts, y_train)
    y_pred = pipeline.predict(val_dicts)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(rmse)

    mlflow.set_tag("model", "xgboost")
    mlflow.log_params(params)
    mlflow.log_metric("rmse", rmse)

    # Log the entire sklearn pipeline (vectorizer + regressor) so it can be
    # reloaded and applied directly to lists of record dicts.
    mlflow.sklearn.log_model(pipeline, "model")

# Test the logged model

In [None]:
import mlflow.pyfunc

# URI of the "model" artifact stored under a specific MLflow run id.
model_uri = 'runs:/65a7096112cb41b693e557e8008cfa9c/model'

# Reload it through the pyfunc interface and score it on the validation
# records (a list of dicts), then report the RMSE.
model = mlflow.pyfunc.load_model(model_uri)
preds = model.predict(val_dicts)

reloaded_rmse = root_mean_squared_error(y_val, preds)
print(reloaded_rmse)

# print(preds)


# Gradient boosting

In [None]:
import numpy as np
import random 

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
# from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# hyperopt uses a Bayesian approach (TPE) to search for the best hyper-parameters
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
def objective_gb(params):
    """Hyperopt objective for a gradient-boosting pipeline.

    Opens an MLflow run tagged ``model=gradientboosting``, logs ``params``,
    fits DictVectorizer + GradientBoostingRegressor on the training records
    and logs the validation RMSE as ``rmse``.

    Args:
        params: Mapping providing ``learning_rate``, ``max_depth``,
            ``min_samples_split``, ``random_state`` and ``loss``.

    Returns:
        Dict with ``'loss'`` (validation RMSE) and ``'status'`` (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "gradientboosting")
        mlflow.log_params(params)

        regressor = GradientBoostingRegressor(
            loss=params['loss'],
            learning_rate=params['learning_rate'],
            max_depth=int(params['max_depth']),
            min_samples_split=params['min_samples_split'],
            random_state=params['random_state'],
            verbose=0,
        )
        gb_pipeline = make_pipeline(DictVectorizer(), regressor)

        # fit() returns the pipeline itself, so predict can be chained.
        val_predictions = gb_pipeline.fit(train_dicts, y_train).predict(val_dicts)
        rmse = root_mean_squared_error(y_val, val_predictions)
        mlflow.log_metric("rmse", rmse)

    outcome = {'loss': rmse, 'status': STATUS_OK}
    return outcome

In [22]:
# Search space for the gradient-boosting objective.
# The two quniform entries yield whole numbers (cast to int via scope.int);
# learning_rate is drawn as exp(uniform(-7, -1)). 'loss' and 'random_state'
# are constants passed through to every trial.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 4, 50, 1)),
    'learning_rate': hp.loguniform('learning_rate', -7, -1),
    'loss': 'squared_error',
    'random_state': 42
}

# 'random_state' above only seeds the regressor. Seed the TPE sampler too so
# the same 20 configurations are explored on every run of this cell.
# (hyperopt >= 0.2.7 expects a numpy Generator here.)
best_result = fmin(
    fn=objective_gb,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
    rstate=np.random.default_rng(search_space['random_state'])
)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]





  5%|▌         | 1/20 [01:44<33:06, 104.56s/trial, best loss: 5.383652413454907]





 10%|█         | 2/20 [02:06<16:42, 55.68s/trial, best loss: 5.106544191898413] 





 15%|█▌        | 3/20 [04:23<26:25, 93.24s/trial, best loss: 5.106544191898413]





 20%|██        | 4/20 [08:40<42:00, 157.52s/trial, best loss: 5.106544191898413]





 25%|██▌       | 5/20 [12:20<45:05, 180.38s/trial, best loss: 5.106544191898413]





 30%|███       | 6/20 [16:18<46:37, 199.85s/trial, best loss: 5.106544191898413]





 35%|███▌      | 7/20 [20:06<45:17, 209.06s/trial, best loss: 5.106544191898413]





 40%|████      | 8/20 [21:10<32:34, 162.88s/trial, best loss: 5.106544191898413]





 45%|████▌     | 9/20 [24:09<30:47, 167.98s/trial, best loss: 5.106544191898413]





 50%|█████     | 10/20 [24:42<21:01, 126.15s/trial, best loss: 5.106544191898413]





 55%|█████▌    | 11/20 [26:38<18:27, 123.01s/trial, best loss: 5.106544191898413]





 60%|██████    | 12/20 [28:09<15:06, 113.25s/trial, best loss: 5.106544191898413]





 65%|██████▌   | 13/20 [30:52<14:59, 128.51s/trial, best loss: 5.106544191898413]





 70%|███████   | 14/20 [32:49<12:29, 124.88s/trial, best loss: 5.106544191898413]





 75%|███████▌  | 15/20 [36:05<12:11, 146.26s/trial, best loss: 5.106544191898413]





 80%|████████  | 16/20 [36:49<07:42, 115.63s/trial, best loss: 5.106544191898413]





 85%|████████▌ | 17/20 [38:23<05:27, 109.24s/trial, best loss: 5.106544191898413]





 90%|█████████ | 18/20 [42:30<05:00, 150.49s/trial, best loss: 5.106544191898413]





 95%|█████████▌| 19/20 [43:30<02:03, 123.46s/trial, best loss: 5.106544191898413]





100%|██████████| 20/20 [46:43<00:00, 140.17s/trial, best loss: 5.106544191898413]


In [9]:
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingRegressor

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")


# Hyper-parameters for the final gradient-boosting model. random_state is
# included (same value the search trials used) so that refitting is
# deterministic and the seed is recorded with the run; it was previously
# missing, leaving the final model unseeded.
best_params = {
    'learning_rate':    0.05563655633030401,
    "max_depth": 13,
    "min_samples_split": 17,
    "loss": "squared_error",
    "random_state": 42,
}

model = GradientBoostingRegressor(
    learning_rate=best_params['learning_rate'],
    max_depth=int(best_params['max_depth']),
    min_samples_split=best_params['min_samples_split'],
    random_state=best_params['random_state'],
    verbose=0,
    loss=best_params['loss']
)

pipeline = make_pipeline(
    DictVectorizer(),
    model
)

with mlflow.start_run():
    pipeline.fit(train_dicts, y_train)
    y_pred = pipeline.predict(val_dicts)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(rmse)

    # Same tag value as the search trials in objective_gb ("gradientboosting"),
    # so filtering runs by tags.model returns the tuning runs and this one.
    mlflow.set_tag("model", "gradientboosting")
    mlflow.log_params(best_params)
    mlflow.log_metric("rmse", rmse)

    # Log the entire sklearn pipeline (vectorizer + regressor).
    mlflow.sklearn.log_model(pipeline, "model")



5.104456270121915


