In [3]:
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from mlflow  import MlflowClient
# Tracking backend: a SQLite database file addressed by a relative path.
mlflow_tracking_uri = "sqlite:///mlflow.db"
# Client object used by the experiment and registry calls in this notebook.
client = MlflowClient(tracking_uri=mlflow_tracking_uri)

In [45]:
# Print each experiment returned by the default search as a plain dict.
for experiment in client.search_experiments():
    print(dict(experiment))

{'artifact_location': '/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/4', 'creation_time': 1743171826030, 'experiment_id': '4', 'last_update_time': 1743171826030, 'lifecycle_stage': 'active', 'name': 'test-experiment', 'tags': {}}
{'artifact_location': '/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2', 'creation_time': 1742825619994, 'experiment_id': '2', 'last_update_time': 1742825619994, 'lifecycle_stage': 'active', 'name': 'nyc-taxi-experiment', 'tags': {}}
{'artifact_location': 'mlflow-artifacts:/0', 'creation_time': 1742652750276, 'experiment_id': '0', 'last_update_time': 1742652750276, 'lifecycle_stage': 'active', 'name': 'Default', 'tags': {}}


In [4]:
# create_experiment raises when the name is already taken, which makes this
# cell fail on every re-run. Look the experiment up first; either way the
# cell evaluates to the experiment id.
existing_experiment = client.get_experiment_by_name("test-experiment")
(
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment("test-experiment")
)

'4'

In [17]:
from mlflow.entities.view_type import ViewType
# Up to five active runs of experiment "2" whose logged rmse is below 6.4.
# experiment_ids is documented as a list of ids, so pass a list explicitly
# rather than relying on a bare string being accepted.
runs = client.search_runs(
    experiment_ids=["2"],
    filter_string="metrics.rmse <6.4",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
)

In [18]:
# Report the id and the logged rmse metric of every run in `runs`.
for matched_run in runs:
    run_id = matched_run.info.run_id
    rmse = matched_run.data.metrics['rmse']
    print(f"runs id: {run_id}, rmse: {rmse}")

runs id: d32c149b25e64a6fa742e515303413a1, rmse: 6.327263733391266
runs id: e52d64c57a854c83b29bb398265fb3de, rmse: 6.327263733391266
runs id: 1f279ca8e69046288524daba3d3ecea5, rmse: 6.327263733391266
runs id: a78800da8f93444bb312e7d19bf293b8, rmse: 6.359498438162637
runs id: 707232543b814e74aca48be6c822f9af, rmse: 6.367620917584296


In [2]:
import mlflow
# Point the module-level mlflow API at the SQLite store and select the
# experiment; the cell's value is whatever set_experiment returns.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1742825619994, experiment_id='2', last_update_time=1742825619994, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [47]:
# Show the registered models returned by a default (unfiltered) registry search.
client.search_registered_models()

[<RegisteredModel: aliases={'challenger': 1, 'champion': 2}, creation_timestamp=1743158645816, description='', last_updated_timestamp=1743174472331, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1743174472331, current_stage='None', description=None, last_updated_timestamp=1743174472331, name='nyc_taxi_data_regressor', run_id='84df48e758eb4d63a5dc178c7356a22b', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2/84df48e758eb4d63a5dc178c7356a22b/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=3>], name='nyc_taxi_data_regressor', tags={}>]

In [25]:
# Register the model stored under the run's "models_mlflow" artifact path.
# NOTE: when the registered model already exists, this call adds a new version.
mod_uri = "runs:/84df48e758eb4d63a5dc178c7356a22b/models_mlflow"
mlflow.register_model(model_uri=mod_uri, name="nyc_taxi_data_regressor")

Registered model 'nyc_taxi_data_regressor' already exists. Creating a new version of this model...
Created version '3' of model 'nyc_taxi_data_regressor'.


<ModelVersion: aliases=[], creation_timestamp=1743174472331, current_stage='None', description=None, last_updated_timestamp=1743174472331, name='nyc_taxi_data_regressor', run_id='84df48e758eb4d63a5dc178c7356a22b', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2/84df48e758eb4d63a5dc178c7356a22b/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [78]:
# Versions of the registered model, sorted by ascending version number.
latest_versions = client.search_model_versions(
    order_by=["version_number ASC"],
    filter_string="name='nyc_taxi_data_regressor'",
)

In [79]:
# Print each model version's number together with its tags.
for model_version in latest_versions:
    number, tags = model_version.version, model_version.tags
    print(f"version id:{number}, tags:{tags}")

version id:1, tags:{'model': 'xgb1'}
version id:2, tags:{'model': 'xgb2'}
version id:3, tags:{'validation_status': 'pending'}


In [68]:
# Tag version 3 of the registered model with validation_status=pending.
client.set_model_version_tag(
    name="nyc_taxi_data_regressor",
    version=3,
    key="validation_status",
    value="pending",
)

In [65]:
# Attach the "competitor" alias to version 3 of the registered model.
client.set_registered_model_alias(
    name="nyc_taxi_data_regressor",
    alias="competitor",
    version=3,
)

In [80]:
# Store a free-text description on version 3 of the registered model.
client.update_model_version(
    description="this is the latest version created",
    version="3",
    name="nyc_taxi_data_regressor",
)

<ModelVersion: aliases=['competitor'], creation_timestamp=1743174472331, current_stage='None', description='this is the latest version created', last_updated_timestamp=1743177961609, name='nyc_taxi_data_regressor', run_id='84df48e758eb4d63a5dc178c7356a22b', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2/84df48e758eb4d63a5dc178c7356a22b/artifacts/models_mlflow', status='READY', status_message=None, tags={'validation_status': 'pending'}, user_id=None, version=3>

In [23]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    pick-up / drop-off location ids to strings.

    Args:
        filename: path of a parquet file containing the columns
            ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID``.

    Returns:
        A new, independent DataFrame with the filtered rows.
    """
    df = pd.read_parquet(filename)
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    # Vectorised timedelta -> minutes instead of a Python call per row.
    df['duration'] = trip_time.dt.total_seconds() / 60
    # .copy() detaches the filtered rows, so the column assignment below
    # writes to an independent frame (no SettingWithCopy ambiguity).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df
def preprocess(df,dv):
    """Turn a trip DataFrame into the feature matrix produced by ``dv``.

    Builds the combined ``PU_DO`` feature (pick-up id + "_" + drop-off id),
    converts the ``PU_DO`` and ``trip_distance`` columns into one dict per
    row, and passes those records to ``dv.transform``.

    The input frame is not modified: the extra column is added to a new
    frame rather than written into the caller's ``df``.

    Args:
        df: DataFrame with string ``PULocationID`` / ``DOLocationID`` columns
            and a ``trip_distance`` column.
        dv: object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the row records.
    """
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    # assign() returns a new frame, leaving the caller's df untouched.
    features = df.assign(PU_DO=df['PULocationID'] + "_" + df['DOLocationID'])
    records = features[categorical + numerical].to_dict(orient="records")
    return dv.transform(records)
def test_model(X_test, y_test, alias,name="nyc_taxi_data_regressor"):
    """Score the registered model behind ``alias`` on the given data.

    Loads ``models:/{name}@{alias}`` through ``mlflow.pyfunc``, predicts on
    ``X_test`` and compares the predictions with ``y_test``.

    Args:
        X_test: feature matrix handed to the model's ``predict``.
        y_test: true target values.
        alias: registry alias that selects the model version.
        name: registered model name.

    Returns:
        A dict with the single key ``"rmse"``.
    """
    model_uri = f"models:/{name}@{alias}"
    loaded_model = mlflow.pyfunc.load_model(model_uri)
    predictions = loaded_model.predict(X_test)
    score = root_mean_squared_error(y_test, predictions)
    return {"rmse": score}
    

In [5]:
# Load and filter the March 2021 trip file used as the evaluation set.
df = read_dataframe(filename="./data/green_tripdata_2021-03.parquet")

In [6]:
# Show the working directory that this notebook's relative paths resolve against.
!pwd

/workspaces/mlops-zoomcamp/02-experiment-tracking


In [7]:
# Download the run's "preprocessor" artifact into the current directory.
client.download_artifacts(
    run_id="20470a7bb98947a7934e62baa723dd80",
    path="preprocessor",
    dst_path=".",
)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp/02-experiment-tracking/preprocessor'

In [9]:
import pickle
# Restore the preprocessor object that is passed to preprocess() as `dv`.
# NOTE(review): unpickling runs arbitrary code - only load artifacts you trust.
with open("./preprocessor/preprocessor.b", mode="rb") as fin:
    dv = pickle.loads(fin.read())

In [10]:
# Feature matrix for the evaluation set.
X_test = preprocess(df=df, dv=dv)

In [13]:
# Display the feature matrix returned by preprocess().
X_test

<80372x13221 sparse matrix of type '<class 'numpy.float64'>'
	with 153356 stored elements in Compressed Sparse Row format>

In [11]:
# Target values: the trip durations (in minutes) computed by read_dataframe.
y_test = df.duration.values

In [12]:
# Number of target values; compare with the row count of X_test.
print(y_test.shape[0])

80372


In [26]:
# Score the model behind each registry alias on the same evaluation data.
for alias in ("champion", "challenger", "competitor"):
    print(test_model(X_test, y_test, alias))

{'rmse': 6.268277779785903}
{'rmse': 6.268277779785903}
{'rmse': 6.308751518920858}


In [27]:
# Move the "champion" alias to version 3 of the registered model.
client.set_registered_model_alias(
    name="nyc_taxi_data_regressor",
    alias="champion",
    version=3,
)