In [1]:
!python -V

Python 3.11.6


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/mpierrau/Documents/code/courses/mlops-zoomcamp-2024/mlops-zoomcamp-2024-projects/homework/02-experiment-tracking/mlruns/1', creation_time=1715937454052, experiment_id='1', last_update_time=1715937454052, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [8]:
df_train = read_dataframe('../data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('../data/yellow_tripdata_2023-02.parquet')

In [12]:
len(df_train), len(df_val)

(3009173, 2855951)

In [9]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [10]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [11]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [12]:
mlflow.sklearn.autolog(disable=False)

In [13]:
with mlflow.start_run():

    mlflow.set_tag("developer", "magnus")
    mlflow.set_tag("project-type", "experimenting")

    mlflow.log_param("train-data-path", "./data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "./data/yellow_tripdata_2023-02.parquet")

    alpha = 0.001
    mlflow.log_param("alpha", alpha)
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    
    with open('models/lin_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")



In [16]:
import xgboost as xgb

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [18]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [19]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=20,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [17]:
del X_train, X_val, y_train
del train_dicts , val_dicts
del df_train , df_val

In [19]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:7.95039                                                     
[1]	validation-rmse:6.65935                                                     
[2]	validation-rmse:5.90901                                                     
[3]	validation-rmse:5.48805                                                     
[4]	validation-rmse:5.25403                                                     
[5]	validation-rmse:5.12334                                                     
[6]	validation-rmse:5.04762                                                     
[7]	validation-rmse:5.00054                                                     
[8]	validation-rmse:4.97382                                                     
[9]	validation-rmse:4.95523                                                     
[10]	validation-rmse:4.94233                                                    
[11]	validation-rmse:4.92994                                                    
[12]	validation-rmse:4.92184




[0]	validation-rmse:8.02152                                                     
[1]	validation-rmse:6.76795                                                     
[2]	validation-rmse:6.03716                                                     
[3]	validation-rmse:5.62627                                                     
[4]	validation-rmse:5.39882                                                     
[5]	validation-rmse:5.27230                                                     
[6]	validation-rmse:5.19996                                                     
[7]	validation-rmse:5.15678                                                     
[8]	validation-rmse:5.13142                                                     
[9]	validation-rmse:5.11370                                                     
[10]	validation-rmse:5.10220                                                    
[11]	validation-rmse:5.09236                                                    
[12]	validation-rmse:5.08628




[0]	validation-rmse:9.69075                                                     
[1]	validation-rmse:9.33832                                                     
[2]	validation-rmse:9.00916                                                     
[3]	validation-rmse:8.70207                                                     
[4]	validation-rmse:8.41578                                                     
[5]	validation-rmse:8.14922                                                     
[6]	validation-rmse:7.90111                                                     
[7]	validation-rmse:7.67075                                                     
[8]	validation-rmse:7.45678                                                     
[9]	validation-rmse:7.25836                                                     
[10]	validation-rmse:7.07474                                                    
[11]	validation-rmse:6.90468                                                    
[12]	validation-rmse:6.74766




[0]	validation-rmse:9.48504                                                     
[1]	validation-rmse:8.96039                                                     
[2]	validation-rmse:8.48901                                                     
[3]	validation-rmse:8.06645                                                     
[4]	validation-rmse:7.68857                                                     
[5]	validation-rmse:7.35152                                                     
[6]	validation-rmse:7.05185                                                     
[7]	validation-rmse:6.78538                                                     
[8]	validation-rmse:6.55014                                                     
[9]	validation-rmse:6.34167                                                     
[10]	validation-rmse:6.15870                                                    
[11]	validation-rmse:5.99776                                                    
[12]	validation-rmse:5.85627




[0]	validation-rmse:7.90956                                                     
[1]	validation-rmse:6.57897                                                     
  8%|▉          | 4/50 [03:44<44:25, 57.95s/trial, best loss: 4.897897431572136]

In [20]:
mlflow.xgboost.autolog(disable=False)

In [24]:
with mlflow.start_run():
    
    #train = xgb.DMatrix(X_train, label=y_train)
    #valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.autolog()
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=10,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

2024/05/20 10:53:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


[0]	validation-rmse:9.37133
[1]	validation-rmse:8.75894
[2]	validation-rmse:8.22242
[3]	validation-rmse:7.75402
[4]	validation-rmse:7.34642
[5]	validation-rmse:6.99342
[6]	validation-rmse:6.68849
[7]	validation-rmse:6.42668
[8]	validation-rmse:6.20193
[9]	validation-rmse:6.01053




FileNotFoundError: [Errno 2] No such file or directory: 'models/preprocessor.b'

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        



In [17]:
loaded_model = mlflow.pyfunc.load_model("runs:/56cf295f0eb8461c9d5a48a1ce6ab8f3/model")

In [19]:
sklearn_model = mlflow.sklearn.load_model("runs:/56cf295f0eb8461c9d5a48a1ce6ab8f3/model")

<bound method LinearModel.predict of LinearRegression()>

In [22]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [26]:
client.create_experiment("my-new-experiment")

'2'

In [28]:
client.delete_experiment("2")

In [35]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids="1",
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [36]:
for run in runs:
    print(f"{run.info.run_id=}, rmse: {run.data.metrics['rmse']:.4f}")    

run.info.run_id='769ce8d6b1834dd0bfe914d90d579b8f', rmse: 4.8395
run.info.run_id='f47d106024ee4dab84ad4fc48c25d5b9', rmse: 4.8979
run.info.run_id='0b40a2c5cf3d414c8683417d3a684fca', rmse: 5.0656
run.info.run_id='aa7dfc14fbb645209c337b8bc1fca2e8', rmse: 5.2114
run.info.run_id='fb5a6193a72c484192233ab61c9d9a9b', rmse: 5.2115


In [37]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [39]:
client.get_run("769ce8d6b1834dd0bfe914d90d579b8f")

<Run: data=<RunData: metrics={'best_iteration': 99.0,
 'rmse': 4.839452010459607,
 'stopped_iteration': 99.0,
 'validation-rmse': 4.839452005423746}, params={'custom_metric': 'None',
 'early_stopping_rounds': '10',
 'learning_rate': '0.09585355369315604',
 'max_depth': '30',
 'maximize': 'None',
 'min_child_weight': '1.060597050922164',
 'num_boost_round': '100',
 'objective': 'reg:linear',
 'reg_alpha': '0.018060244040060163',
 'reg_lambda': '0.011658731377413597',
 'seed': '42',
 'verbose_eval': 'True'}, tags={'mlflow.log-model.history': '[{"run_id": "769ce8d6b1834dd0bfe914d90d579b8f", '
                             '"artifact_path": "model", "utc_time_created": '
                             '"2024-05-17 12:03:34.088258", "flavors": '
                             '{"python_function": {"loader_module": '
                             '"mlflow.xgboost", "python_version": "3.11.6", '
                             '"data": "model.xgb", "env": {"conda": '
                             '"con

In [40]:
mlflow.register_model(model_uri = "769ce8d6b1834dd0bfe914d90d579b8f", name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '4' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1716196842677, current_stage='None', description=None, last_updated_timestamp=1716196842677, name='nyc-taxi-regressor', run_id=None, run_link=None, source='769ce8d6b1834dd0bfe914d90d579b8f', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [54]:
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

  latest_versions = client.get_latest_versions(name=model_name)


In [61]:
client.update_model_version("nyc-taxi-regressor", "4", description="testing a manual model description update") # only updates description

<ModelVersion: aliases=[], creation_timestamp=1716196842677, current_stage='None', description='testing a manual model description update', last_updated_timestamp=1716198068752, name='nyc-taxi-regressor', run_id=None, run_link=None, source='769ce8d6b1834dd0bfe914d90d579b8f', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [63]:
client.set_tag("56cf295f0eb8461c9d5a48a1ce6ab8f3", key="staging_status", value="testing")

In [64]:
client.download_artifacts("56cf295f0eb8461c9d5a48a1ce6ab8f3", path="model", dst_path=".")

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

'/home/mpierrau/Documents/code/courses/mlops-zoomcamp-2024/mlops-zoomcamp-2024-projects/homework/02-experiment-tracking/model'

In [66]:
client.transition_model_version_stage(
    name="nyc-taxi-regressor",
    version=4,
    stage="production",
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1716196842677, current_stage='Production', description='testing a manual model description update', last_updated_timestamp=1716204331143, name='nyc-taxi-regressor', run_id=None, run_link=None, source='769ce8d6b1834dd0bfe914d90d579b8f', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [75]:
#import mlflow.version


print(mlflow.VERSION)

2.12.2


In [None]:
client.set_model