In [2]:
import requests #loads data from the internet
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

## Download files

In [9]:
files = [('green_tripdata_2024-01.parquet', './data'),('green_tripdata_2024-02.parquet', './data'),('green_tripdata_2024-03.parquet', './data'),('green_tripdata_2024-04.parquet', './data')]
print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2024-01.parquet: 100%|██████████| 1362284/1362284 [00:06<00:00, 197360.56it/s, save to ./data/green_tripdata_2024-01.parquet]
green_tripdata_2024-02.parquet: 100%|██████████| 1283805/1283805 [00:06<00:00, 208542.47it/s, save to ./data/green_tripdata_2024-02.parquet]
green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:06<00:00, 200201.93it/s, save to ./data/green_tripdata_2024-03.parquet]
green_tripdata_2024-04.parquet: 100%|██████████| 1346502/1346502 [00:06<00:00, 199283.47it/s, save to ./data/green_tripdata_2024-04.parquet]


## Create a pandas dataframes

In [12]:
for i, file in enumerate(files):
    filepath = file[1] + '/' + file[0]
    t_df = pd.read_parquet(filepath)
    if i==0:
        df = t_df
    else:
        df = pd.concat([df,t_df])

display(df.head())
display(df.tail())

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.8,1.0,0.5,3.61,0.0,,1.0,21.66,1.0,1.0,2.75
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.3,1.0,0.5,7.11,0.0,,1.0,42.66,1.0,1.0,2.75
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,N,1.0,74,262,1.0,3.08,19.8,1.0,0.5,3.0,0.0,,1.0,28.05,1.0,1.0,2.75
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,N,1.0,74,116,1.0,2.4,14.2,1.0,1.5,0.0,0.0,,1.0,16.7,2.0,1.0,0.0
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,N,1.0,74,243,1.0,5.14,22.6,1.0,0.5,6.28,0.0,,1.0,31.38,1.0,1.0,0.0


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
56466,2,2024-04-30 18:31:00,2024-04-30 18:44:00,,,74,263,,2.34,20.59,0.0,0.0,4.77,0.0,,1.0,29.11,,,
56467,2,2024-04-30 18:33:00,2024-04-30 18:43:00,,,168,263,,2.51,18.08,0.0,0.0,4.27,0.0,,1.0,26.1,,,
56468,1,2024-04-30 19:42:43,2024-04-30 19:53:55,,,75,263,,1.5,12.1,2.5,0.5,0.0,0.0,,1.0,18.85,,,
56469,2,2024-04-30 19:04:00,2024-04-30 19:16:00,,,193,233,,3.37,20.37,0.0,0.0,6.03,0.0,,1.0,30.15,,,
56470,2,2024-04-30 20:21:00,2024-04-30 20:28:00,,,33,25,,0.94,12.08,0.0,0.0,1.89,0.0,,1.0,14.97,,,


In [19]:
#Create duration
df['duration'] = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.seconds / 60

## Filters
#Remove too long and too short a ride
df = df[(df['duration']>0) & (df['duration']<=60)]
#Remove weird passenger numbers
df = df[(df['passenger_count']>= 1) & (df['passenger_count']<=6)]
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.8,...,0.5,3.61,0.00,,1.0,21.66,1.0,1.0,2.75,11.500000
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.3,...,0.5,7.11,0.00,,1.0,42.66,1.0,1.0,2.75,20.866667
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,N,1.0,74,262,1.0,3.08,19.8,...,0.5,3.00,0.00,,1.0,28.05,1.0,1.0,2.75,19.033333
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,N,1.0,74,116,1.0,2.40,14.2,...,1.5,0.00,0.00,,1.0,16.70,2.0,1.0,0.00,11.866667
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,N,1.0,74,243,1.0,5.14,22.6,...,0.5,6.28,0.00,,1.0,31.38,1.0,1.0,0.00,10.983333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54478,2,2024-04-30 20:53:29,2024-04-30 21:07:07,N,1.0,130,205,2.0,3.21,16.3,...,0.5,3.76,0.00,,1.0,22.56,1.0,1.0,0.00,13.633333
54479,2,2024-04-30 23:44:57,2024-04-30 23:55:06,N,1.0,130,197,1.0,2.14,12.8,...,0.5,0.00,0.00,,1.0,15.30,2.0,1.0,0.00,10.150000
54480,2,2024-04-30 23:18:02,2024-04-30 23:24:33,N,1.0,210,150,1.0,1.05,8.6,...,0.5,2.22,0.00,,1.0,13.32,1.0,1.0,0.00,6.516667
54481,1,2024-04-30 23:52:22,2024-05-01 00:05:13,N,1.0,75,229,1.0,2.60,15.6,...,1.5,2.09,0.00,,1.0,22.94,1.0,1.0,2.75,12.850000


In [20]:
#Split data frames
reference_month = df[df['lpep_pickup_datetime'].dt.month == 1]
df_eval = df[df['lpep_pickup_datetime'].dt.month != 1]

#Save reference as parquet
reference_month.to_parquet('data/reference_month.parquet')
df_eval.to_parquet('./data/eval_data.parquet')


## Model training

In [21]:
#Train a simple model
target = "duration"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount","tolls_amount"]
cat_features = ["PULocationID", "DOLocationID"]

# Simple training
train_data = reference_month
val_data = df_eval

#model
model = LinearRegression()
model.fit(train_data[num_features + cat_features], train_data[target])

In [24]:
reference_month['pred_duration'] = model.predict(train_data[num_features + cat_features])
df_eval['pred_duration'] = model.predict(val_data[num_features + cat_features])

df_eval.head()


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,pred_duration
88,2,2023-12-31 14:38:47,2023-12-31 14:46:45,N,1.0,25,65,1.0,0.95,8.6,...,2.78,0.0,,1.0,13.88,1.0,1.0,0.0,7.966667,9.525786
90,2,2023-12-31 23:38:54,2023-12-31 23:50:09,N,1.0,74,235,1.0,4.85,21.2,...,0.0,0.0,,1.0,23.7,1.0,1.0,0.0,11.25,15.018855
0,2,2024-02-01 00:15:53,2024-02-01 00:24:20,N,1.0,75,161,1.0,2.77,13.5,...,3.75,0.0,,1.0,22.5,1.0,1.0,2.75,8.45,12.84278
2,2,2024-02-01 00:30:29,2024-02-01 00:35:32,N,1.0,75,238,1.0,1.03,7.9,...,2.6,0.0,,1.0,13.0,1.0,1.0,0.0,5.05,9.531133
4,2,2024-02-01 00:31:14,2024-02-01 00:31:16,N,5.0,95,264,1.0,0.0,50.0,...,1.0,0.0,,1.0,52.0,1.0,2.0,0.0,0.033333,28.190618


## Create report

In [33]:
import datetime

from evidently.metrics import ColumnQuantileMetric
from evidently.metrics import DatasetSummaryMetric
from evidently.metrics import DatasetCorrelationsMetric
from evidently.metrics import ColumnSummaryMetric

#Create dashboard
from evidently.metric_preset import DataDriftPreset, DataQualityPreset
from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

#Create the workspace
ws = Workspace("DriftWorkspace") #makes a folder in the local directory

#Create project
project = ws.create_project("NYC taxi data drift")
project.description = "Looking at the data drift when predicting the ride duration of a taxi trip in green NYC taxis"
project.save()

column_mapping = ColumnMapping(
    target='duration', #Whether we want to see the 'duration' value
    prediction='pred_duration',
    numerical_features=num_features,
    categorical_features=cat_features
)

report = Report(metrics=[
    ColumnDriftMetric(column_name='pred_duration'), #whether the prediction is drifting
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    DatasetSummaryMetric(),
    DatasetCorrelationsMetric(),
]
)
#Compare day by day to the reference month
timestamp = datetime.datetime(year=2024, month=1, day=31)
day = datetime.timedelta(days=1)
for i in range(35):
    timestamp = timestamp + day
    t_df = df_eval[
        (df_eval['lpep_pickup_datetime'].dt.year == timestamp.year) &
        (df_eval['lpep_pickup_datetime'].dt.month == timestamp.month) &
        (df_eval['lpep_pickup_datetime'].dt.day == timestamp.day)
        ]
    #Run the report
    report.run(reference_data=reference_month, current_data=t_df, column_mapping=column_mapping)

    #add to workspace
    ws.add_report(project.id, report)

In [32]:
# configure dashboard
project.dashboard.add_panel (
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="Summary of Data Drift"
    )
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title='Inference Count',
        values = [
            PanelValue(
                metric_id = "ColumnDriftMetric", 
                field_path = "current.number_of_rows", 
                legend="count"
                ),
            ],
        plot_type = PlotType.BAR,
        size = WidgetSize.HALF,
    ),
)

project_3.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title='fare_amount 50th quartile',
        values = [
            PanelValue(
                metric_id = "ColumnQuantileMetric",
                field_path = "current.value",
                legend="count"
            ),
        ],
        plot_type = PlotType.LINE,
        size = WidgetSize.HALF,
    ),
)

project_3.save()
    

2024

In [34]:
project.id

UUID('f7bf392f-f465-4bcf-8827-2fd26c090608')