# Get started with Metrics Tracking and Monitoring

This notebook demonstrates how to use Evidently to:
* Generate a model performance report and calculate associated metrics.
* Log model metrics to MLFlow.
* Store the model in MLFlow as an artifact.
* Store the model performance report in MLFlow as an artifact.

In [12]:
%load_ext autoreload
%autoreload 2

import joblib
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import pandas as pd
from pathlib import Path
from sklearn import ensemble, model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from config import MLFLOW_TRACKING_URI

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

More information about the dataset can be found in UCI machine learning repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [13]:
# Download original dataset with: python src/pipelines/load_data.py 

raw_data = pd.read_csv("../data/raw_data.csv")
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


## Define column mapping

In [14]:
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', ]

In [15]:
sample_data = raw_data.set_index('dteday').loc['2011-01-01 00:00:00':'2011-01-28 23:00:00'].reset_index()

print(sample_data.shape)

(594, 17)


In [16]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    sample_data[numerical_features + categorical_features],
    sample_data[target],
    test_size=0.3
)

print(X_train.shape)
print(X_test.shape)

(415, 10)
(179, 10)


## Train a  Model

In [17]:
model = ensemble.RandomForestRegressor(random_state = 0, n_estimators = 50)
model.fit(X_train, y_train) 

model_path = Path('../models/model.joblib')
joblib.dump(model, model_path)

['../models/model.joblib']

In [18]:
model

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

preds = model.predict(X_test)

me = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)

print(me, mae)

270.38246033519556 10.709162011173184


## Build the model validation report

In [20]:
report = X_test.copy()
report['target'] = y_test
report['prediction'] = preds

report.head()

Unnamed: 0,temp,atemp,hum,windspeed,mnth,hr,weekday,season,holiday,workingday,target,prediction
128,0.2,0.2121,0.37,0.1642,1,15,5,1,0,1,72,67.4
14,0.34,0.3333,0.81,0.1642,1,15,0,1,0,0,74,67.38
569,0.18,0.2121,0.74,0.0896,1,22,4,1,0,1,27,36.12
77,0.22,0.197,0.37,0.3284,1,10,3,1,0,1,57,59.02
494,0.14,0.1061,0.26,0.3881,1,17,0,1,0,0,55,72.62


# Metrics Tracking with MLflow

In [21]:
# Set up MLFlow 
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Set up MLFlow Client
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Set experiment name
mlflow.set_experiment('Train Model')


Client tracking uri: http://localhost:5000


<Experiment: artifact_location='/Users/mnrozhkov/dev/mlrepa/modules/mlflow/mlflow-1-metrics-tracking/mlruns/919251558402389511', creation_time=1695110016155, experiment_id='919251558402389511', last_update_time=1695110016155, lifecycle_stage='active', name='Train Model', tags={}>

## Discover models and their stages

In [22]:
with mlflow.start_run() as run: 

    # Log params 
    mlflow.log_param('model', 'RandomForest') 
    mlflow.log_params({'random_state': 0, 'n_estimators': 50})

    # Log metrics
    mlflow.log_metric('me', round(me, 3))
    mlflow.log_metric('mae', round(mae, 3))

    # Log the sklearn model and register as version 1
    mlflow.log_artifact("../data/raw_data.csv")