In [1]:
import datetime

import pandas as pd
import requests
from evidently import ColumnMapping
from evidently.metrics import (
    ColumnDriftMetric,
    ColumnQuantileMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
)
from evidently.report import Report
from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from tqdm import tqdm

In [2]:
# Location of the green-taxi trip records for 2024-03 (parquet file).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
TRIP_DATA_PATH = '/workspaces/course-mlops-zoomcamp/05-monitoring/data/green_tripdata_2024-03.parquet'

# Load the whole trip table into a DataFrame.
df = pd.read_parquet(TRIP_DATA_PATH)

In [3]:
# (rows, columns) of the loaded trip table.
df.shape

(57457, 20)

In [4]:
# Preview the first five rows of the trip table.
df.head(n=5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.8,1.0,0.5,3.06,0.0,,1.0,18.36,1.0,1.0,0.0
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.7,1.0,0.5,0.0,0.0,,1.0,20.2,2.0,1.0,0.0
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.3,1.0,0.5,3.5,0.0,,1.0,32.05,1.0,1.0,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.0,22.5,0.0,1.5,0.0,0.0,,1.0,24.0,1.0,1.0,0.0
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.6,1.0,0.5,1.0,0.0,,1.0,12.1,1.0,1.0,0.0


In [7]:
# Median (quantile=0.5) metrics for the two monitored columns.
# `ColumnQuantileMetric` is imported from `evidently.metrics` in the notebook's
# first cell, so this cell also works on a fresh kernel.
# Per the original author's note, `ColumnAggMetric` does not exist, so the
# median is expressed as the 0.5 quantile.
metrics = [
    ColumnQuantileMetric(column_name=column, quantile=0.5)
    for column in ("fare_amount", "trip_distance")
]

In [18]:
# Report the median (0.5 quantile) of `fare_amount`.
# `Report` and `ColumnQuantileMetric` are imported in the notebook's first cell,
# so the cell-local re-imports were dropped.
# No mean metric is included: per the original author's note, `ColumnMeanMetric`
# is not available in `evidently.metrics`.
report = Report(metrics=[
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
])

# The same frame is passed as both reference and current data.
report.run(reference_data=df, current_data=df)

In [19]:
# Report the median (0.5 quantile) of `trip_distance`.
# `Report` and `ColumnQuantileMetric` are imported in the notebook's first cell,
# so the cell-local re-imports were dropped.
# No mean metric is included: per the original author's note, `ColumnMeanMetric`
# is not available in `evidently.metrics`.
report = Report(metrics=[
    ColumnQuantileMetric(column_name="trip_distance", quantile=0.5),
])

# The same frame is passed as both reference and current data.
report.run(reference_data=df, current_data=df)

In [8]:
# Coerce the pickup timestamp column to datetime, overwriting it in `df`.
pickup_col = 'lpep_pickup_datetime'
df[pickup_col] = pd.to_datetime(df[pickup_col])

In [9]:
# Calendar date of each pickup, used as the grouping key.
pickup_date = df['lpep_pickup_datetime'].dt.date

# Median fare_amount for every pickup date present in the data.
daily_medians = df.groupby(pickup_date)['fare_amount'].median()

In [11]:
# Show the per-day medians as the cell's result rather than printing them.
# NOTE: the recorded output includes a few pickup dates outside March 2024
# (e.g. 2008-12-31 and 2024-04-01).
daily_medians

lpep_pickup_datetime
2008-12-31     0.00
2024-02-23    20.00
2024-02-24    75.00
2024-02-25    13.05
2024-02-29    34.00
2024-03-01    13.50
2024-03-02    13.50
2024-03-03    14.20
2024-03-04    12.80
2024-03-05    13.50
2024-03-06    12.80
2024-03-07    13.50
2024-03-08    13.50
2024-03-09    13.50
2024-03-10    14.20
2024-03-11    12.80
2024-03-12    13.50
2024-03-13    13.50
2024-03-14    14.20
2024-03-15    13.50
2024-03-16    14.20
2024-03-17    13.50
2024-03-18    13.50
2024-03-19    13.50
2024-03-20    12.80
2024-03-21    13.50
2024-03-22    13.50
2024-03-23    12.80
2024-03-24    14.20
2024-03-25    13.50
2024-03-26    13.50
2024-03-27    13.50
2024-03-28    13.50
2024-03-29    13.50
2024-03-30    14.20
2024-03-31    13.50
2024-04-01     5.80
Name: fare_amount, dtype: float64


In [15]:
# Make sure the pickup column holds datetimes before comparing against dates
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

# Keep only trips picked up in March 2024 (start inclusive, end exclusive)
in_march = (
    (df['lpep_pickup_datetime'] >= '2024-03-01')
    & (df['lpep_pickup_datetime'] < '2024-04-01')
)
march_df = df.loc[in_march]

# Median fare_amount per pickup date (median == quantile 0.5)
pickup_day = march_df['lpep_pickup_datetime'].dt.date
daily_medians = march_df.groupby(pickup_day)['fare_amount'].median()

# Largest of the daily medians
max_daily_median = daily_medians.max()
print("Highest daily median fare_amount in March 2024:", max_daily_median)

Highest daily median fare_amount in March 2024: 14.2
