# Monitoring setup for Bicycle Sharing Demand Prediction

This notebook shows how you can use Evidently to:
* calculate performance and data drift for the model, performed as batch checks
* log model quality & data drift using MLflow Tracking
* explore the results

More examples are available on GitHub: https://github.com/evidentlyai/evidently/tree/main/examples

Evidently docs: https://docs.evidentlyai.com/

Join our Discord: https://discord.com/invite/xZjKRaNp8b

In [1]:
%load_ext autoreload
%autoreload 2

import datetime
import joblib
import pandas as pd
import mlflow
import numpy as np
import requests
import zipfile
import io
import json

from mlflow.tracking import MlflowClient
from sklearn import datasets, ensemble, model_selection
from scipy.stats import anderson_ksamp

from evidently.metrics import RegressionQualityMetric, RegressionErrorPlot, RegressionErrorDistribution
from evidently.metric_preset import DataDriftPreset, RegressionPreset
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report

from src.reports import (
    build_model_monitoring_report,
    get_model_monitoring_metrics
)

  # NOTE(review): four dangling `@numba.jit()` decorator lines were removed here.
  # They decorated no function (a syntax error) and `numba` is never imported
  # in this notebook.


In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

## Bicycle Demand Data

More information about the dataset can be found in UCI machine learning repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [3]:
# Alternative source: download the original archive from the UCI repository.
# content = requests.get("https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip").content
# with zipfile.ZipFile(io.BytesIO(content)) as arc:
#     raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])


# Read the data saved in the previous step (train_model.ipynb); the first CSV
# column becomes the DataFrame index.
raw_data_path = '../data/raw_data.csv'
raw_data = pd.read_csv(raw_data_path, index_col=0)

In [4]:
# Preview the first rows to check the index and columns that were loaded
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


## Load the Model (from MLflow)

In [5]:
# The tracking-server address comes from the project's config module.
# NOTE(review): this import is repeated in the "Model Quality Evaluation (Prod)"
# cell further down; notebook imports normally live in the first cell.
from config import MLFLOW_TRACKING_URI


# Configure the tracking URI, then create a client (it reports its URI below)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Update MLFLOW_RUN_ID to your value
# (presumably the training run from train_model.ipynb that logged
# 'model.joblib' -- the ID below only exists on the author's tracking server)
MLFLOW_RUN_ID = 'f23e2040ace24e20a937e9c116644292'

# Download the 'model.joblib' artifact of that run.
# NOTE: despite its name, `model` holds the local path of the downloaded file
# (see the string shown in the cell output), not the estimator itself; the
# estimator is loaded from this path in the next cell.
model = mlflow.artifacts.download_artifacts(
    run_id=MLFLOW_RUN_ID, 
    artifact_path='model.joblib'
)

model 

Client tracking uri: http://localhost:5000


'/var/folders/ff/tyq6m7sn6gj89qfmv0lxbgc00000gn/T/tmpgz6svsxm/model.joblib'

In [6]:
# Deserialize the estimator from the downloaded artifact path (`model`).
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading, so only load artifacts from a tracking server you trust.
regressor = joblib.load(model)
regressor

# Get Predictions

In [7]:
# Column names used for scoring and for the Evidently reports
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday']  # 'weathersit' is excluded

# Tell Evidently the role of each column in one constructor call.
# No datetime column is mapped ('dteday' is left unused).
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [8]:
# Reference window: (start, end) timestamps of the rows used as the baseline
train_dates = ('2011-01-02 00:00:00', '2011-03-06 23:00:00')

# Consecutive one-week scoring windows, each given as (start, end) timestamps
prediction_batches = [
    ('2011-03-07 00:00:00', '2011-03-13 23:00:00'),
    ('2011-03-14 00:00:00', '2011-03-20 23:00:00'),
    ('2011-03-21 00:00:00', '2011-03-27 23:00:00'),
]

In [9]:
# Slice the reference (training) period and take an explicit copy. Adding the
# prediction column to a bare .loc slice of raw_data is an assignment on a
# possible view, which pandas reports as SettingWithCopyWarning (hidden in this
# notebook by the global warnings filter).
reference_data = raw_data.loc[train_dates[0]:train_dates[1]].copy()

# Score the reference rows using numerical features followed by categorical ones
reference_prediction = regressor.predict(
    reference_data[numerical_features + categorical_features]
)
reference_data['prediction'] = reference_prediction
print(reference_data.shape)

(1456, 18)


# Monitor Model

### Week 1

In [10]:
# Week 1 batch. The slice is copied so that the prediction column added in the
# next cell is written to an independent frame instead of a slice of raw_data
# (which would trigger pandas' SettingWithCopyWarning).
current_dates = prediction_batches[0]
current_data = raw_data.loc[current_dates[0]:current_dates[1]].copy()

print(current_data.shape)
# current_data.head()

(163, 17)


In [11]:
# Score the week 1 batch: numerical features first, then categorical ones
feature_columns = numerical_features + categorical_features
current_prediction = regressor.predict(current_data[feature_columns])

# Keep the predictions next to the actuals for the monitoring report
current_data['prediction'] = current_prediction

print(current_data.shape)

(163, 18)


In [12]:
# Build the monitoring report for week 1 against the reference period.
# Both frames are passed with a fresh positional index (reset_index(drop=True)).
model_report = build_model_monitoring_report(
    reference_data=reference_data.reset_index(drop=True),
    current_data=current_data.reset_index(drop=True),
    column_mapping=column_mapping,
)

# Extract the summary metrics from the report; the cell output shows a dict
# with 'me', 'mae', 'rmse' and 'mape' keys.
# NOTE(review): the recorded 'rmse' (~1249) is far larger than the 'mae' (~23)
# and looks like a squared error (sqrt(1249) ~ 35) -- verify
# get_model_monitoring_metrics in src/reports before relying on this value.
model_metrics = get_model_monitoring_metrics(model_report)
model_metrics

{'me': 2.201717791411043,
 'mae': 22.862576687116565,
 'rmse': 1249.1163680981597,
 'mape': 76.95577073743617}

### Week 2

In [13]:
# Week 2 batch. Copy the slice before adding the prediction column: assigning
# a column onto a bare .loc slice of raw_data writes to a possible view and
# triggers pandas' SettingWithCopyWarning (hidden by the global warnings filter).
current_dates = prediction_batches[1]
current_data = raw_data.loc[current_dates[0]:current_dates[1]].copy()

# Score the batch: numerical features first, then categorical ones
current_prediction = regressor.predict(current_data[numerical_features + categorical_features])
current_data['prediction'] = current_prediction

print(current_dates)
print(current_data.shape)
# current_data.head()

('2011-03-14 00:00:00', '2011-03-20 23:00:00')
(164, 18)


In [14]:
# Hand both frames to the report builder with a fresh positional index
reference_frame = reference_data.reset_index(drop=True)
current_frame = current_data.reset_index(drop=True)

# Build the week 2 monitoring report against the reference period
model_report = build_model_monitoring_report(
    reference_data=reference_frame,
    current_data=current_frame,
    column_mapping=column_mapping,
)

# Extract the summary metrics from the report and display them
model_metrics = get_model_monitoring_metrics(model_report)
model_metrics

{'me': -19.882926829268293,
 'mae': 28.782926829268295,
 'rmse': 2131.593790243902,
 'mape': 36.33232800509782}

# Model Quality Evaluation (Prod)

In [15]:
# NOTE(review): this import and the tracking-URI setup repeat the "Load the
# Model" cell; they are kept so this section can be run on its own.
from config import MLFLOW_TRACKING_URI


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Get or create the monitoring experiment by name
EXPERIMENT_NAME = "Monitor Model 3"
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

if experiment is None:
    # Not found on the tracking server: create it and report the new ID
    experiment_id = client.create_experiment(EXPERIMENT_NAME)
    print("Experiment ID: {}".format(experiment_id))
else:
    # Found: reuse it and show its details
    experiment_id = experiment.experiment_id
    print("Experiment ID: {}".format(experiment.experiment_id))
    print("Experiment Name: {}".format(experiment.name))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

# Make the experiment active for the fluent mlflow API. As the last expression
# of the cell, the returned Experiment object is also displayed.
mlflow.set_experiment(EXPERIMENT_NAME)

Client tracking uri: http://localhost:5000
Experiment ID: 899982032781375836


<Experiment: artifact_location='mlflow-artifacts:/899982032781375836', creation_time=1688658187463, experiment_id='899982032781375836', last_update_time=1688658187463, lifecycle_stage='active', name='Monitor Model 3', tags={}>

In [17]:
REPORTS_DIR = '../reports'

# One MLflow run per weekly batch, named after the batch end timestamp
for current_dates in prediction_batches:
    batch_begin, batch_end = current_dates

    with mlflow.start_run(run_name=batch_end) as run:

        # Show newly created run metadata info
        print("Experiment id: {}".format(run.info.experiment_id))
        print("Run id: {}".format(run.info.run_id))
        print("Run name: {}".format(run.info.run_name))

        # Log parameters
        mlflow.log_param("begin", batch_begin)
        mlflow.log_param("end", batch_end)

        # Copy the slice before adding the prediction column: assigning onto a
        # bare .loc slice of raw_data writes to a possible view and triggers
        # pandas' SettingWithCopyWarning (hidden by the global warnings filter).
        current_data = raw_data.loc[batch_begin:batch_end].copy()
        current_prediction = regressor.predict(current_data[numerical_features + categorical_features])
        current_data['prediction'] = current_prediction

        # Get and Log metrics
        # NOTE(review): the 'rmse' values shown earlier in this notebook
        # (~1249, ~2132) are far larger than the matching MAE and look like
        # squared errors -- verify get_model_monitoring_metrics before relying
        # on the logged 'rmse'.
        model_report = build_model_monitoring_report(
            reference_data=reference_data.reset_index(drop=True),
            current_data=current_data.reset_index(drop=True),
            column_mapping=column_mapping,
        )
        model_metrics = get_model_monitoring_metrics(model_report)
        mlflow.log_metrics(model_metrics)

        # Log Monitoring Report
        # The local HTML file uses the same path for every batch, so it is
        # overwritten on each iteration; each run receives its own copy as an
        # artifact.
        monitoring_report_path = f"{REPORTS_DIR}/model_monitoring_report.html"
        model_report.save_html(monitoring_report_path)
        mlflow.log_artifact(monitoring_report_path)

        print(run.info)

Experiment id: 899982032781375836
Run id: c078334ae9464c3ba56f47b06a624a6b
Run name: 2011-03-13 23:00:00
<RunInfo: artifact_uri='mlflow-artifacts:/899982032781375836/c078334ae9464c3ba56f47b06a624a6b/artifacts', end_time=None, experiment_id='899982032781375836', lifecycle_stage='active', run_id='c078334ae9464c3ba56f47b06a624a6b', run_name='2011-03-13 23:00:00', run_uuid='c078334ae9464c3ba56f47b06a624a6b', start_time=1688658195527, status='RUNNING', user_id='mnrozhkov'>
Experiment id: 899982032781375836
Run id: 2bc39c52b3ee4c1792e4b3c0e08ada1e
Run name: 2011-03-20 23:00:00
<RunInfo: artifact_uri='mlflow-artifacts:/899982032781375836/2bc39c52b3ee4c1792e4b3c0e08ada1e/artifacts', end_time=None, experiment_id='899982032781375836', lifecycle_stage='active', run_id='2bc39c52b3ee4c1792e4b3c0e08ada1e', run_name='2011-03-20 23:00:00', run_uuid='2bc39c52b3ee4c1792e4b3c0e08ada1e', start_time=1688658195885, status='RUNNING', user_id='mnrozhkov'>
Experiment id: 899982032781375836
Run id: 707c8c0a3712