In [1]:
!python -V

Python 3.11.7


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [6]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [7]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.svm import LinearSVR


In [8]:
mlflow.set_tracking_uri("sqlite:///mlflow2.db")
mlflow.set_experiment("nyc-taxi-experiment_1")

<Experiment: artifact_location='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1', creation_time=1747606063871, experiment_id='1', last_update_time=1747606063871, lifecycle_stage='active', name='nyc-taxi-experiment_1', tags={}>

In [9]:
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
df_val['duration'] = (df_val['lpep_dropoff_datetime'] - df_val['lpep_pickup_datetime']).dt.total_seconds() / 60
df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

In [10]:
#df = pd.read_parquet('./data/green_tripdata_2021-01.parquet')

df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')

df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

df = df[(df.duration >= 1) & (df.duration <= 60)]

df["weekday"] = df.lpep_pickup_datetime.dt.weekday
df["hour"] =  df.lpep_pickup_datetime.dt.hour

categorical = ['PULocationID', 'DOLocationID',  'weekday', 'hour']
numerical = ['trip_distance']

df[categorical] = df[categorical].astype(str)

In [11]:
train_dicts = df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print("Feature matrix dimensions:", X_train.shape)

Feature matrix dimensions: (73908, 538)


In [13]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred)

95.25045983499173

In [12]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [15]:
#df_train = read_dataframe('./data/green_tripdata_2023-01.parquet')
#df_val = read_dataframe('./data/green_tripdata_2023-02.parquet')

In [13]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
df_train.to_parquet('data/green_tripdata_2021-01.parquet')
df_val.to_parquet('data/green_tripdata_2021-02.parquet')

In [19]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [20]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [16]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [21]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [22]:
mlflow.sklearn.autolog()
mlflow.xgboost.autolog()

In [23]:

models = {
   # "ExtraTrees": ExtraTreesRegressor(n_estimators=100, random_state=42),
   # "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
   # "LinearSVR": LinearSVR(max_iter=10000),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
   # "XGBoost": XGBRegressor(n_estimators=100, use_label_encoder=False, eval_metric='rmse', random_state=42)
}

In [24]:
with mlflow.start_run(run_name="XGBoost") as run:
    params = {
        'eta': 0.3,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'seed': 42
    }

    xgb_model = xgb.train(params, train, evals=[(valid, "validation")], num_boost_round=100, early_stopping_rounds=10)
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("learning_rate", 0.3)

    y_pred = xgb_model.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

[0]	validation-rmse:12.05041
[1]	validation-rmse:11.95388
[2]	validation-rmse:9.82269
[3]	validation-rmse:8.53012
[4]	validation-rmse:7.78619
[5]	validation-rmse:7.36522
[6]	validation-rmse:7.12856
[7]	validation-rmse:7.12091
[8]	validation-rmse:7.11523
[9]	validation-rmse:6.98196
[10]	validation-rmse:6.90427
[11]	validation-rmse:6.85351
[12]	validation-rmse:6.82661
[13]	validation-rmse:6.80889
[14]	validation-rmse:6.79244
[15]	validation-rmse:6.78886
[16]	validation-rmse:6.77945
[17]	validation-rmse:6.77694
[18]	validation-rmse:6.77509
[19]	validation-rmse:6.77301
[20]	validation-rmse:6.76131
[21]	validation-rmse:6.75672
[22]	validation-rmse:6.75472
[23]	validation-rmse:6.75326
[24]	validation-rmse:6.74962
[25]	validation-rmse:6.74529
[26]	validation-rmse:6.74199
[27]	validation-rmse:6.74001
[28]	validation-rmse:6.73761
[29]	validation-rmse:6.73568
[30]	validation-rmse:6.73402
[31]	validation-rmse:6.73033
[32]	validation-rmse:6.72827
[33]	validation-rmse:6.72639
[34]	validation-rmse:6



In [26]:
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name) as run:
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)


In [34]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow2.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.search_experiments()

[<Experiment: artifact_location='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1', creation_time=1747606063871, experiment_id='1', last_update_time=1747606063871, lifecycle_stage='active', name='nyc-taxi-experiment_1', tags={}>,
 <Experiment: artifact_location='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/0', creation_time=1747606063865, experiment_id='0', last_update_time=1747606063865, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
latest_mv = client.get_latest_versions(model_name, stages=[stage])[0]
client.set_registered_model_alias(model_name, alias, latest_mv.version)

In [44]:
run_id = "b258a40490de4039a36089cd36829cd9"
model_name = "nyc-taxi-regressor"
mlflow.register_model(model_uri=f"runs:/{run_id}/model", name=model_name)

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1747655627860, current_stage='None', description=None, last_updated_timestamp=1747655627860, name='nyc-taxi-regressor', run_id='b258a40490de4039a36089cd36829cd9', run_link=None, source='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1/b258a40490de4039a36089cd36829cd9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [49]:
latest_version = client.get_latest_versions(model_name)
for version in latest_version:
    print(f"Version: {version.version}, Stage: {version.current_stage}")

Version: 2, Stage: None
Version: 3, Stage: Production


  latest_version = client.get_latest_versions(model_name)


In [48]:
client.transition_model_version_stage(
    name=model_name,
    version=latest_version[0].version,
    stage="Production",
    archive_existing_versions=True
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1747655627860, current_stage='Production', description=None, last_updated_timestamp=1747656088040, name='nyc-taxi-regressor', run_id='b258a40490de4039a36089cd36829cd9', run_link=None, source='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1/b258a40490de4039a36089cd36829cd9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [50]:
client.update_model_version(name= model_name, version=latest_version[0].version, description="Model was transitioned to production")

<ModelVersion: aliases=[], creation_timestamp=1747642463174, current_stage='None', description='Model was transitioned to production', last_updated_timestamp=1747656749186, name='nyc-taxi-regressor', run_id='b258a40490de4039a36089cd36829cd9', run_link='', source='file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1/b258a40490de4039a36089cd36829cd9/artifacts/model', status='READY', status_message=None, tags={'model': 'GBregressor'}, user_id=None, version=2>

In [43]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="attribute.status = 'FINISHED'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    #print(run)
    #print(f"run_id: {run.info.run_id}" )
    print(f"run_id: {run.info.run_id},  rmse: {run.data.metrics['rmse']}" )

run_id: b87c54aa20fc4682a8e98f2956d7d4d4,  rmse: 6.644020665421058
run_id: 999eddd205e04b4da29b642149fa7883,  rmse: 6.644020665421058
run_id: b258a40490de4039a36089cd36829cd9,  rmse: 6.742303328497426
run_id: 52d6de674e0543839c3b2ca46724c9ac,  rmse: 6.742303328497426


In [51]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)   

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    dicts = df[categorical + numerical].to_dict(orient='records')
    X = dv.transform(dicts)
    return X	


def test_model(name, stage, model, X, y):
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{name}/{stage}"
    )
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return rmse

In [53]:
df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-03.parquet')
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


MlflowException: The following failures occurred while downloading one or more artifacts from file:///c:/Users/mikes/Documents/STUDY/mlops-zoomcamp/mlops_zoomcamp_hw/02-experiment-tracking/mlruns/1/b258a40490de4039a36089cd36829cd9/artifacts:
##### File preprocessor #####
[Errno 2] No such file or directory: 'C:\\Users\\mikes\\Documents\\STUDY\\mlops-zoomcamp\\mlops_zoomcamp_hw\\02-experiment-tracking\\mlruns\\1\\b258a40490de4039a36089cd36829cd9\\artifacts\\preprocessor'

In [54]:
import pickle
with open("models/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [55]:
X_test = preprocess(df, dv)

In [56]:
target = 'duration'
y_test = df[target].values

In [58]:
%time test_model(name = model_name, stage = "Production", model = "xgboost-regressor", X = X_test, y = y_test)

CPU times: total: 14.9 s
Wall time: 19.5 s


np.float64(6.659623830022514)

In [59]:
%time test_model(name = model_name, stage = "Staging", model = "xgboost-regressor", X = X_test, y = y_test)

MlflowException: No versions of model with name 'nyc-taxi-regressor' and stage 'Staging' found

%time test_model(name = model_name, stage = "Production", model = "xgboost-regressor", X = X_test, y = y_test)

In [None]:
from mlflow import MlflowClient

# Initialize an MLflow Client
client = MlflowClient()


def assign_alias_to_stage(model_name, stage, alias):
    """
    Assign an alias to the latest version of a registered model within a specified stage.

    :param model_name: The name of the registered model.
    :param stage: The stage of the model version for which the alias is to be assigned. Can be
                "Production", "Staging", "Archived", or "None".
    :param alias: The alias to assign to the model version.
    :return: None
    """
    latest_mv = client.get_latest_versions(model_name, stages=[stage])[0]
    client.set_registered_model_alias(model_name, alias, latest_mv.version)