# Baseline model for batch monitoring example

In [1]:
import datetime
import os

import pandas as pd
import requests
from tqdm import tqdm

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metrics import ColumnCorrelationsMetric, ColumnQuantileMetric

In [2]:
# (file name, target directory) pairs to fetch from the NYC TLC trip-data CDN.
files = [('green_tripdata_2022-02.parquet', './data'),
         ('green_tripdata_2022-01.parquet', './data'),
         ('green_tripdata_2024-03.parquet', './data')]

# Stream in 1 MiB chunks; the requests default of 1 byte per chunk makes the
# loop (and the progress bar) iterate once per byte of a multi-megabyte file.
DOWNLOAD_CHUNK_SIZE = 1024 * 1024

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # Make sure the target directory exists before opening the output file.
    os.makedirs(path, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly instead of saving an HTTP error page as a .parquet file.
        resp.raise_for_status()

        # Content-Length may be absent; tqdm then shows a bar without a total.
        content_length = resp.headers.get("Content-Length")
        total_bytes = int(content_length) if content_length else None

        with open(save_path, "wb") as handle:
            with tqdm(desc=f"{file}",
                      postfix=f"save to {save_path}",
                      total=total_bytes,
                      unit="B",
                      unit_scale=True) as progress:
                for chunk in resp.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    handle.write(chunk)
                    progress.update(len(chunk))

In [3]:
# Load the March 2024 trips (the "current" batch) and display the row count.
mar_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')
mar_data.shape[0]

57457

# Evidently Report

In [4]:
# Column roles used when configuring the monitoring report.
target = "duration_min"
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [5]:
# Tell Evidently which columns play which role. No target column is passed;
# only the 'prediction' column and the feature lists are mapped.
column_mapping = ColumnMapping(
    numerical_features=num_features,
    categorical_features=cat_features,
    prediction='prediction',
    target=None,
)

# Q2

`ColumnCorrelationsMetric`

In [6]:
# Report combining drift and missing-value checks with two extra metrics
# computed on the fare_amount column (correlations and the 0.5 quantile).
report = Report(
    metrics=[
        ColumnDriftMetric(column_name='prediction'),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
        ColumnCorrelationsMetric(column_name="fare_amount"),
        ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
    ]
)

In [7]:

def process_data(df):
    """Add trip-duration columns to a trip DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two additional float columns:
        ``duration_min`` (drop-off minus pick-up, in minutes) and
        ``prediction`` (the same values as ``duration_min``).
    """
    # Work on a copy so the caller's frame is never modified in place.
    df = df.copy()
    trip_duration = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    # Vectorised conversion instead of a per-row Python lambda.
    df["duration_min"] = trip_duration.dt.total_seconds() / 60
    df["prediction"] = df["duration_min"]
    return df

# January 2022 is the reference batch; March (loaded earlier) is the current
# batch. Both get the derived duration/prediction columns.
jan_data = process_data(pd.read_parquet('data/green_tripdata_2022-01.parquet'))
mar_data = process_data(mar_data)

In [8]:
# Compare the current batch (March) against the reference batch (January).
report.run(
    reference_data=jan_data,
    current_data=mar_data,
    column_mapping=column_mapping,
)



In [None]:
# Display the computed report in the notebook output (mode='inline').
report.show(mode='inline')

# Q3

Maximum daily median (quantile 0.5) of `fare_amount`: 14.2

In [10]:
# Midnight at the start of the monitored month; daily windows are offsets
# from this instant.
begin = datetime.datetime(year=2024, month=3, day=1)

# Single-metric report: the 0.5 quantile of fare_amount.
report = Report(
    metrics=[ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)]
)

def calculate_metrics(data, i):
    """Run the module-level ``report`` on one day of ``data``.

    The day is the half-open window ``[begin + i days, begin + i + 1 days)``
    applied to ``data.lpep_pickup_datetime``. Relies on the notebook globals
    ``begin``, ``report`` and ``column_mapping``.

    Parameters
    ----------
    data : pandas.DataFrame
        Trips with an ``lpep_pickup_datetime`` column.
    i : int
        Zero-based day offset from ``begin``.

    Returns
    -------
    dict
        ``{"median": value}`` where ``value`` is the current-data value of
        the first metric in the report.
    """
    day_start = begin + datetime.timedelta(days=i)
    day_end = begin + datetime.timedelta(days=i + 1)

    pickup = data.lpep_pickup_datetime
    in_window = (pickup >= day_start) & (pickup < day_end)
    current_data = data[in_window]

    report.run(
        reference_data=None,
        current_data=current_data,
        column_mapping=column_mapping,
    )

    first_metric = report.as_dict()["metrics"][0]
    return {"median": first_metric["result"]["current"]["value"]}

# Evaluate every calendar day of the month that `begin` falls in. A
# hard-coded 29 would silently skip the last days of a 30- or 31-day month.
days_in_month = pd.Timestamp(begin).days_in_month
metrics = [calculate_metrics(mar_data, day) for day in range(days_in_month)]

# One row per day; the cell output is the maximum daily value per column.
q50 = pd.DataFrame(metrics)
q50.max()

median    14.2
dtype: float64

# Q4

`project_folder/dashboards`