# Baseline model for batch monitoring example


In [1]:
import requests
import datetime
import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.metrics import (
    ValueDrift,
    DriftedColumnsCount,
    MissingValueCount,
    QuantileValue,
    DuplicatedRowCount,
)

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
! mkdir data

In [2]:
# (file name, local directory) pairs for the parquet files this notebook needs.
files = [
    ("green_tripdata_2024-03.parquet", "./data"),
    ("green_tripdata_2022-01.parquet", "./data"),
]

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # stream=True avoids holding the whole file in memory; the `with` block
    # releases the connection even if writing fails part-way.
    with requests.get(url, stream=True) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as if it were parquet.
        resp.raise_for_status()
        # Content-Length may be absent; tqdm then shows a bar without a total.
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks (one Python iteration per byte);
            # read in 8 KiB chunks and advance the bar by the bytes actually written.
            for chunk in resp.iter_content(chunk_size=8192):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:03<00:00, 406842.11it/s, save to ./data/green_tripdata_2024-03.parquet]
green_tripdata_2022-01.parquet: 100%|██████████| 1254291/1254291 [00:03<00:00, 412060.43it/s, save to ./data/green_tripdata_2022-01.parquet]


In [9]:
def prepare_data(df):
    """Add the trip-duration target and drop outlier rows.

    Args:
        df: Trip records with ``lpep_pickup_datetime`` and
            ``lpep_dropoff_datetime`` datetime columns and a
            ``passenger_count`` column.

    Returns:
        A new DataFrame, independent of ``df``, with an extra ``duration_min``
        float column (drop-off minus pick-up, in minutes). Only rows with
        0 <= duration_min <= 60 and 0 < passenger_count <= 8 are kept; the
        original index labels of those rows are preserved. ``df`` itself is
        not modified.
    """
    # create target (vectorised; no per-row Python lambda)
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df = df.assign(duration_min=trip_length.dt.total_seconds() / 60)
    # filter out outliers; comparisons against NaN are False, so such rows are dropped
    keep = (
        df.duration_min.between(0, 60)
        & (df.passenger_count > 0)
        & (df.passenger_count <= 8)
    )
    # .copy() so callers can add columns (e.g. "prediction") without
    # SettingWithCopyWarning on a filtered view.
    return df.loc[keep].copy()

In [10]:
# Read the two monthly extracts in order (train first, then validation) and
# pass each through the same cleaning step.
train_data, val_data = (
    prepare_data(pd.read_parquet(parquet_path))
    for parquet_path in (
        "data/green_tripdata_2022-01.parquet",
        "data/green_tripdata_2024-03.parquet",
    )
)

In [11]:
# Column roles shared by the model cells and the Evidently data definition.
target = "duration_min"  # trip duration in minutes, created by prepare_data
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [None]:
# Fit the baseline regressor on the training frame; the categorical ID columns
# are passed in as-is next to the numerical features.
model = LinearRegression().fit(
    train_data[num_features + cat_features],
    train_data[target],
)
# Keep the in-sample predictions on the frame: Evidently reads the
# "prediction" column from the reference data later on.
train_preds = model.predict(train_data[num_features + cat_features])
train_data["prediction"] = train_preds

In [15]:
# Score the validation frame with the fitted model, using the same feature
# columns (and column order) as in training, and store the result on the frame.
val_preds = model.predict(val_data[num_features + cat_features])
val_data["prediction"] = val_preds

In [16]:
# Mean absolute error: training frame first, validation frame second, one value per line.
print(
    *(
        mean_absolute_error(frame.duration_min, frame.prediction)
        for frame in (train_data, val_data)
    ),
    sep="\n",
)

3.9329290119804226
4.267088174846758


# Dump model and reference data


In [19]:
! mkdir models

In [17]:
# Persist the fitted model with joblib's dump. The file is opened in binary
# mode ("wb"), and the models/ directory must already exist.
with open("models/lin_reg.bin", "wb") as f_out:
    dump(model, f_out)

In [18]:
# Save the validation frame, including its duration_min and prediction columns,
# as the reference dataset file.
val_data.to_parquet("data/reference.parquet")

# Evidently Report


In [None]:
# Declare column types for Evidently; the model output ("prediction") is
# tracked as one more numerical column next to the numerical features.
data_definition = DataDefinition(
    numerical_columns=[*num_features, "prediction"],
    categorical_columns=cat_features,
)
# Wrap both pandas frames as Evidently datasets that share this definition.
train_dataset = Dataset.from_pandas(train_data, data_definition=data_definition)
val_dataset = Dataset.from_pandas(val_data, data_definition=data_definition)

In [37]:
# Metrics computed on every report run.
# NOTE: later cells read results by position (result["metrics"][1], [2], [3]),
# presumably in the same order as this list — keep the order stable or update
# those indices together with it.
report = Report(
    metrics=[
        ValueDrift(column="prediction"),  # position 0
        DriftedColumnsCount(),  # position 1
        MissingValueCount(column="prediction"),  # position 2
        QuantileValue(column="fare_amount", quantile=0.5),  # position 3; quantile=0.5 is the median
        DuplicatedRowCount(),  # position 4
    ]
)

In [None]:
# Build a calendar date and print it; a date prints in ISO form (YYYY-MM-DD).
date = datetime.date(2024, 1, 1)
print(date)

2024-01-01


In [58]:
# Select a single day of validation trips by pick-up time.
# inclusive="left" keeps the lower bound and excludes the upper one,
# i.e. 2024-03-01 <= pickup < 2024-03-02.
day_val_dataset = val_data.loc[
    val_data.lpep_pickup_datetime.between("2024-03-01", "2024-03-02", inclusive="left")
]

In [59]:
# Bare last expression: display the one-day slice selected above.
day_val_dataset

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration_min,prediction
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.8,...,3.06,0.0,,1.0,18.36,1.0,1.0,0.00,15.333333,13.675769
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.7,...,0.00,0.0,,1.0,20.20,2.0,1.0,0.00,12.900000,15.944903
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.3,...,3.50,0.0,,1.0,32.05,1.0,1.0,2.75,19.083333,20.215881
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.00,22.5,...,0.00,0.0,,1.0,24.00,1.0,1.0,0.00,21.750000,18.134422
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.6,...,1.00,0.0,,1.0,12.10,1.0,1.0,0.00,6.666667,10.502977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2040,2,2024-03-01 23:40:15,2024-03-01 23:50:28,N,1.0,130,56,1.0,4.70,20.5,...,0.00,0.0,,1.0,23.00,2.0,1.0,0.00,10.216667,17.393500
2041,2,2024-03-01 23:30:59,2024-03-01 23:31:02,N,5.0,265,264,2.0,0.00,39.0,...,0.00,0.0,,1.0,40.00,1.0,2.0,0.00,0.050000,28.342438
2046,2,2024-03-01 23:59:03,2024-03-02 00:08:22,N,1.0,159,116,1.0,1.61,11.4,...,0.00,0.0,,1.0,13.90,2.0,1.0,0.00,9.316667,11.608263
2051,2,2024-03-01 19:47:21,2024-03-01 20:02:35,N,1.0,82,82,1.0,2.11,14.9,...,3.48,0.0,,1.0,20.88,1.0,1.0,0.00,15.233333,15.093740


In [60]:
# Run the report once per calendar day of March 2024, each day compared
# against the training dataset as reference.
date = datetime.date(2024, 3, 1)
quantiles = []  # one fare_amount quantile result per day
while date.month == 3:  # ends once `date` rolls over to April 1
    start = date.strftime("%Y-%m-%d")
    # Advance before computing `end` so that `end` is the following day;
    # the statement order here is deliberate.
    date += datetime.timedelta(days=1)
    end = date.strftime("%Y-%m-%d")
    # inclusive="left": start <= pickup < end
    day_val_dataset = val_data.loc[
        val_data.lpep_pickup_datetime.between(start, end, inclusive="left")
    ]
    print(day_val_dataset.shape)
    evidently_day_val_dataset = Dataset.from_pandas(day_val_dataset, data_definition)
    snapshot = report.run(reference_data=train_dataset, current_data=evidently_day_val_dataset)
    result = snapshot.dict()
    # 3 is the position of QuantileValue in the report's metrics list
    # (presumably the result keeps that order — confirm if the list changes).
    quantiles.append(result["metrics"][3]["value"])
# After the loop, `result` holds the output of the last day processed;
# the cells below read from it.



(1994, 22)




(1509, 22)




(1385, 22)




(1820, 22)




(1867, 22)




(2174, 22)




(2012, 22)




(1964, 22)




(1655, 22)




(1337, 22)




(1742, 22)




(1792, 22)




(1961, 22)




(1974, 22)




(1906, 22)




(1580, 22)




(1349, 22)




(1775, 22)




(1859, 22)




(1938, 22)




(2010, 22)




(1830, 22)




(1289, 22)




(1296, 22)




(1702, 22)




(1833, 22)




(1938, 22)




(2018, 22)




(1734, 22)




(1483, 22)




(1399, 22)


In [62]:
# Largest of the daily fare_amount quantile values collected in `quantiles`.
max(quantiles)

np.float64(14.2)

In [63]:
# number of drifted columns, read from `result`, which holds the last day
# processed by the loop (metric at position 1 of the report)
result["metrics"][1]["value"]["count"]

6.0

In [64]:
# number of missing values in "prediction" for the last day processed.
# This reads the "count" field, not a share.
result["metrics"][2]["value"]["count"]

0.0

# Evidently Dashboard


In [None]:
from evidently.presets import DataDriftPreset, DataSummaryPreset

from evidently.ui.workspace import Workspace
from evidently.sdk.panels import *
from evidently.legacy.renderers.html_widgets import WidgetSize

In [None]:
# Open the Evidently workspace named "workspace" — presumably a local
# directory relative to where the notebook runs; confirm against the Evidently docs.
ws = Workspace("workspace")

In [None]:
# Create a project in the workspace, set its description, then persist the change.
# NOTE(review): re-running this cell presumably creates another project with
# the same name — confirm before running it repeatedly.
project = ws.create_project("NYC Taxi Data Quality Project")
project.description = "My project description"
project.save()

In [None]:
# Data-summary report for a single day of the validation data.
regular_report = Report(
    metrics=[DataSummaryPreset()],
)

# val_data is loaded from the March 2024 file (green_tripdata_2024-03), so the
# day window has to fall inside that month; a January 2022 window would select
# essentially no rows and produce a meaningless summary.
data = Dataset.from_pandas(
    val_data.loc[
        val_data.lpep_pickup_datetime.between("2024-03-28", "2024-03-29", inclusive="left")
    ],
    data_definition=data_definition,
)

# Stamp the run with the day it describes so the dashboard time axis is correct.
regular_snapshot = regular_report.run(current_data=data, timestamp=datetime.datetime(2024, 3, 28))

regular_snapshot

In [None]:
# Attach the snapshot produced above to the project in the workspace.
ws.add_run(project.id, regular_snapshot)

note: To view a report please run "evidently ui" command in a separate tab in your terminal.


In [None]:
# configure the dashboard
project.dashboard.add_panel(text_panel(title="NYC taxi data dashboard"))

project.dashboard.add_panel(
    bar_plot_panel(
        title="Inference Count",
        values=[
            PanelMetric(
                metric="RowCount",
                legend="count",
            ),
        ],
        size="half",
    ),
)

project.dashboard.add_panel(
    line_plot_panel(
        title="Number of Missing Values",
        values=[
            PanelMetric(metric="DatasetMissingValueCount", legend="count"),
        ],
        size="half",
    ),
)

project.save()

To view a dashboard please run "evidently ui" command in a separate tab in your terminal.


In [None]:
# Data-summary report for the next day of the validation data.
regular_report = Report(
    metrics=[DataSummaryPreset()],
)

# val_data is loaded from the March 2024 file (green_tripdata_2024-03), so the
# day window has to fall inside that month; a January 2022 window would select
# essentially no rows and produce a meaningless summary.
data = Dataset.from_pandas(
    val_data.loc[
        val_data.lpep_pickup_datetime.between("2024-03-29", "2024-03-30", inclusive="left")
    ],
    data_definition=data_definition,
)

# Stamp the run with the day it describes so the dashboard time axis is correct.
regular_run = regular_report.run(current_data=data, timestamp=datetime.datetime(2024, 3, 29))

regular_run

In [None]:
# Attach the second day's run to the same project in the workspace.
ws.add_run(project.id, regular_run)