# Using MLFlow and Evidently to Evaluate Data Drift

In this example, we will explore the MLflow integration with Evidently.

This notebook shows how you can use the Evidently and MLflow to:
* calculate data drift for the model, performed as batch checks 
* log data drift using MLflow Tracking
* explore the result using MLflow UI


In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import json
import pandas as pd
import requests
import zipfile
import io

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

More information about the dataset can be found in Kaggle Playground Competition: https://www.kaggle.com/c/bike-sharing-demand/data?select=train.csv

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [3]:
#load data
content = requests.get("https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip").content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
    raw_data = pd.read_csv(arc.open("day.csv"), header=0, sep=',', parse_dates=['dteday'])

In [4]:
#observe data structure
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [5]:
#set column mapping for Evidently Profile
data_columns = ColumnMapping()
data_columns.datetime = 'dteday'
data_columns.numerical_features = ['weathersit', 'temp', 'atemp', 'hum', 'windspeed']
data_columns.categorical_features = ['holiday', 'workingday']

In [6]:
#evaluate data drift with Evidently Profile
def eval_drift(reference, production, column_mapping):
    """
    Returns a list with pairs (feature_name, drift_score)
    Drift Score depends on the selected statistical test or distance and the threshold
    """    
    data_drift_report = Report(metrics=[DataDriftPreset()])
    data_drift_report.run(reference_data=reference, current_data=production, column_mapping=column_mapping)
    report = data_drift_report.as_dict()

    drifts = []

    for feature in column_mapping.numerical_features + column_mapping.categorical_features:
        drifts.append((feature, report["metrics"][1]["result"]["drift_by_columns"][feature]["drift_score"]))

    return drifts

In [7]:
#set reference dates
reference_dates = ('2011-01-01 00:00:00','2011-01-28 23:00:00')

#set experiment batches dates
experiment_batches = [
    ('2011-01-01 00:00:00','2011-01-29 23:00:00'),
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-07 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),  
]

In [15]:
from config import MLFLOW_TRACKING_URI

MLFLOW_TRACKING_URI

'.'

In [17]:
#log into MLflow
client = MlflowClient(MLFLOW_TRACKING_URI)

#set experiment
mlflow.set_experiment('Data Drift Evaluation with Evidently')

# USER_NAME = "evindently"
# EXPERIMENT_NAME = "data_drift"
# mlflow.set_experiment(f"{USER_NAME}/{EXPERIMENT_NAME}")

#start new run
for date in experiment_batches:
    with mlflow.start_run() as run: #inside brackets run_name='test'
        
        # Log parameters
        mlflow.log_param("begin", date[0])
        mlflow.log_param("end", date[1])

        # Log metrics
        metrics = eval_drift(raw_data.loc[raw_data.dteday.between(reference_dates[0], reference_dates[1])], 
                             raw_data.loc[raw_data.dteday.between(date[0], date[1])], 
                             column_mapping=data_columns)
        for feature in metrics:
            mlflow.log_metric(feature[0], round(feature[1], 3))

        print(run.info)

2023/06/29 17:01:35 INFO mlflow.tracking.fluent: Experiment with name 'NEW URI' does not exist. Creating a new experiment.


<RunInfo: artifact_uri='file:///Users/mnrozhkov/dev/mlrepa/com/evidently/evidently-mlflow/notebooks/mlruns/397959499604508878/684e26f0643b473ebd150907ac4a4168/artifacts', end_time=None, experiment_id='397959499604508878', lifecycle_stage='active', run_id='684e26f0643b473ebd150907ac4a4168', run_name='welcoming-ape-683', run_uuid='684e26f0643b473ebd150907ac4a4168', start_time=1688050896004, status='RUNNING', user_id='mnrozhkov'>
<RunInfo: artifact_uri='file:///Users/mnrozhkov/dev/mlrepa/com/evidently/evidently-mlflow/notebooks/mlruns/397959499604508878/8f7eb6540eaa488c93d21276334b97ac/artifacts', end_time=None, experiment_id='397959499604508878', lifecycle_stage='active', run_id='8f7eb6540eaa488c93d21276334b97ac', run_name='nebulous-bat-583', run_uuid='8f7eb6540eaa488c93d21276334b97ac', start_time=1688050896219, status='RUNNING', user_id='mnrozhkov'>
<RunInfo: artifact_uri='file:///Users/mnrozhkov/dev/mlrepa/com/evidently/evidently-mlflow/notebooks/mlruns/397959499604508878/3cfc62cf025b4

In [11]:
client.tracking_uri

'file:///Users/mnrozhkov/dev/mlrepa/com/evidently/evidently-mlflow/notebooks/mlruns'

In [13]:
client._registry_uri

'file:///Users/mnrozhkov/dev/mlrepa/com/evidently/evidently-mlflow/notebooks/mlruns'