# Baseline model for batch monitoring example

In [70]:
import os
import uuid
from dataclasses import dataclass
from datetime import datetime
from random import random

import pandas as pd
import psycopg
import pytz
import requests
from tqdm import tqdm

from evidently import Dataset, Report
from evidently.metrics import QuantileValue, MeanValue
from evidently.metrics.group_by import GroupBy

In [7]:
# (file name, target directory) pairs to fetch from the trip-data bucket.
files = [('green_tripdata_2024-03.parquet', './data')]

# Stream in sizeable chunks; the iter_content() default of 1 byte per chunk
# makes the loop run once per byte of the file.
chunk_size = 1024 * 1024

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"

    # A fresh checkout may not have the target directory yet.
    os.makedirs(path, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly instead of saving an HTTP error page as a .parquet file.
        resp.raise_for_status()

        # Content-Length is optional; without it tqdm just shows a running count.
        content_length = resp.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None

        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total,
            unit="B",
            unit_scale=True,
        ) as progress:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                handle.write(chunk)
                # Advance by bytes written so the bar matches Content-Length.
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:10<00:00, 127172.18it/s, save to ./data/green_tripdata_2024-03.parquet]


In [30]:
# Load the March 2024 green-taxi trips from the local parquet file into a DataFrame.
data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [31]:
# Inspect the column dtypes of the loaded frame.
data.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
dtype: object

## Q1. Prepare the dataset

In [32]:
# (rows, columns) of the loaded frame — the Q1 answer.
data.shape

(57457, 20)

## Q2. Metric

Q: What metric did you choose?

I chose `MeanValue` from `evidently.metrics`.


## Q3. Monitoring

In [80]:
@dataclass
class EvaluationResult:
    """Metric values computed for one daily group of trips."""

    # Group key the metrics were computed for (day of month of the pickup).
    day: int
    # Mean metric value for that group.
    mean: float
    # Quantile metric value for that group.
    quantile: float

In [74]:
results = []

# Keep only trips picked up in March 2024. Grouping by day-of-month alone would
# fold any stray records from other months or years into the same-numbered day,
# which is later stored as a 2024-03 date.
period_start = pd.Timestamp(2024, 3, 1)
period_end = pd.Timestamp(2024, 4, 1)
pickup = data['lpep_pickup_datetime']
march_data = data[(pickup >= period_start) & (pickup < period_end)]

for day, group in tqdm(march_data.groupby(by=march_data['lpep_pickup_datetime'].dt.day)):
  # Wrap the daily slice once and pass that Dataset to the report, rather than
  # building it and then handing over the raw frame.
  dataset = Dataset.from_pandas(group)
  report = Report([
    MeanValue(column="fare_amount"),
    # Quantile level spelled out (the median) instead of left implicit.
    QuantileValue(column="fare_amount", quantile=0.5),
  ])
  result = report.run(dataset).dict()["metrics"]

  # Relies on metrics being listed in declaration order: [0] mean, [1] quantile.
  results.append(EvaluationResult(day=day, mean=result[0]["value"], quantile=result[1]["value"]))


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid v

In [77]:
# Day with the largest quantile value across all daily results.
max(results, key=lambda res: res.quantile)

evaluations(day=3, mean=np.float64(18.562301369863015), quantile=np.float64(14.2))

## Q4. Dashboard

In [90]:
# DDL for the results table: drops any existing `evals` table and recreates it,
# so every run starts from an empty table.
create_table_statement = """
drop table if exists evals;
create table evals(
	day timestamp,
	mean FLOAT,
	quantile FLOAT
);
"""

In [92]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook; the defaults target the local development database.
db_params = {
    "host": os.environ.get("POSTGRES_HOST", "localhost"),
    "port": os.environ.get("POSTGRES_PORT", "5432"),
    "dbname": os.environ.get("POSTGRES_DB", "monitoring"),
    "user": os.environ.get("POSTGRES_USER", "admin"),
    "password": os.environ.get("POSTGRES_PASSWORD", "admin"),
}

# One row per evaluated day. Values are cast to plain Python types so the
# driver never has to adapt numpy scalars.
rows = [
    (datetime(year=2024, month=3, day=int(res.day)), float(res.mean), float(res.quantile))
    for res in results
]

with psycopg.connect(autocommit=True, **db_params) as conn:
    # Recreate the table, then load all daily rows in a single batch.
    conn.execute(create_table_statement)
    with conn.cursor() as cur:
        cur.executemany(
            "insert into evals(day, mean, quantile) values (%s, %s, %s)",
            rows,
        )

Q4 solution: project_folder/dashboards
![homework solution graphs.png](../assets/homework%20solution%20graphs.png)