In [None]:
# Automatically reload our project Python modules when we run the notebook.
# Mode 2 of the autoreload extension re-imports all modules (except those
# excluded via %aimport) before each cell is executed, so edits to project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

For more information, have a look at the Evidently repository: https://github.com/evidentlyai/evidently

In [None]:
import pandas as pd

from evidently import Report
from evidently.presets import DataDriftPreset

# Load the complete taxi-ride dataset from the local parquet file.
raw_data = pd.read_parquet("../data/taxi_ride_data.parquet")

# Use a representative day (2025-01-15) as the reference dataset: keep only
# the rides whose pickup date falls on that day.
reference_day = pd.to_datetime('2025-01-15').date()
pickup_dates = raw_data['tpep_pickup_datetime'].dt.date
reference_data = raw_data[pickup_dates == reference_day]

# Report made of the data drift preset, with tests included.
drift_metrics = [DataDriftPreset()]
report = Report(drift_metrics, include_tests=True)


In [None]:
# Drift check for the first of January 2025: select that day's rides by
# pickup date and compare them against the reference dataset.
first_of_january = pd.to_datetime('2025-01-01').date()
picked_up_on_first = raw_data['tpep_pickup_datetime'].dt.date == first_of_january
current_data = raw_data[picked_up_on_first]
results = report.run(current_data=current_data, reference_data=reference_data)
# Last expression of the cell: display the run result.
results

In [None]:
# Drift check for 2025-01-14, one day prior to the reference dataset.
# The same timestamp is used to select the rides and to label the run.
day_before_reference = pd.Timestamp("2025-01-14")
ride_dates = raw_data['tpep_pickup_datetime'].dt.date
current_data = raw_data[ride_dates == day_before_reference.date()]
results = report.run(
    reference_data=reference_data,
    current_data=current_data,
    timestamp=day_before_reference,
)
# Last expression of the cell: display the run result.
results

In [13]:
# Generate a day with data drift: a new vendor (ID 99) arrives whose rides are
# copies of vendor 2's rides with 30x the trip distance. Only VendorID and
# trip_distance are changed, so the new vendor effectively rides 30 times
# faster than all others.
new_vendor = raw_data[raw_data['VendorID'] == 2].copy()
new_vendor['VendorID'] = 99
new_vendor['trip_distance'] = new_vendor['trip_distance'] * 30

# pd.concat returns a new frame and leaves raw_data untouched, so no defensive
# raw_data.copy() is needed before appending the synthetic vendor.
drift_day = pd.to_datetime('2025-01-17').date()
drifted_data = pd.concat([raw_data, new_vendor], ignore_index=True)
# Keep only the rides (original and synthetic) picked up on the drift day.
drifted_data = drifted_data[drifted_data['tpep_pickup_datetime'].dt.date == drift_day]
# Write that day's rides to a parquet file in the work directory.
drifted_data.to_parquet('../work/2025-01-17.taxi-rides.parquet')

In [None]:
import os

from evidently.ui.workspace import CloudWorkspace

# Identifier of the Evidently Cloud project the run is added to.
EVIDENTLY_PROJECT_ID = '0196786a-8a19-7825-bc64-e21665fc68b3'

# Compare the synthetic drifted day against the reference dataset.
results = report.run(reference_data=reference_data, current_data=drifted_data)

# Read the API token from the environment instead of hard-coding it, so the
# secret never has to be pasted into (and committed with) the notebook.
api_token = os.environ.get("EVIDENTLY_API_KEY")
if not api_token:
    raise RuntimeError(
        "Set the EVIDENTLY_API_KEY environment variable to your Evidently Cloud API token"
    )

ws = CloudWorkspace(
    token=api_token,
    url="https://app.evidently.cloud")
# NOTE(review): include_data=False presumably uploads only the computed run
# without the underlying rows; confirm against the Evidently documentation.
ws.add_run(EVIDENTLY_PROJECT_ID, results, include_data=False)