# Baseline model for NYC taxi prediction

In [7]:
import os
import requests
import datetime

import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import joblib

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import DatasetDriftMetric, ColumnQuantileMetric

from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error

In [4]:
# Download the March 2024 Green Taxi data
#
# The parquet file is streamed to disk in large chunks. Iterating with the
# default `iter_content()` chunk size of 1 byte makes the loop run once per
# byte, which is needlessly slow.

year = 2024
month = 3
taxi_type = "green"

print('Downloading files...')

url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet"
print(url)

os.makedirs("data", exist_ok=True)

save_path = f"data/{taxi_type}-{year:04d}-{month:02d}.parquet"

# The context manager releases the connection even if the download fails.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file.
    response.raise_for_status()

    # Content-Length is optional; without it tqdm just shows a running counter.
    content_length = response.headers.get('Content-Length')
    total_bytes = int(content_length) if content_length is not None else None

    with open(save_path, "wb") as f_out, tqdm.tqdm(
        desc=f"{save_path}",
        postfix=f"save to {save_path}",
        total=total_bytes,
        unit="B",
        unit_scale=True,
    ) as progress:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f_out.write(chunk)
            # Advance by bytes written so the bar matches `total_bytes`.
            progress.update(len(chunk))

Downloading files...
https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet


data/green-2024-03.parquet: 100%|██████████| 1372372/1372372 [00:11<00:00, 119882.45it/s, save to data/green-2024-03.parquet]


In [10]:
# Load the March 2024 Green Taxi parquet file and report how many rows it has.
march_data = pd.read_parquet('data/green-2024-03.parquet')
n_rows = len(march_data)
print(f"Total rows in March 2024 dataset: {n_rows}.")

Total rows in March 2024 dataset: 57457.


In [5]:
# Column roles passed to every Evidently report below: `fare_amount` is the
# target and no prediction column is declared.
column_mapping = ColumnMapping(
    target='fare_amount',
    prediction=None,
)

In [8]:
# Build a list of (day, median fare_amount) pairs, one entry per calendar day
# of March 2024 that has at least one trip. The median is computed by an
# Evidently ColumnQuantileMetric (quantile=0.5) run on that day's rows only.
daily_quantiles = []

# The pickup timestamps do not change inside the loop, so look them up once.
pickup_times = march_data['lpep_pickup_datetime']

for day in pd.date_range(start="2024-03-01", end="2024-03-31", freq="D"):
    # Half-open window [day, next_day): a pickup at midnight belongs to the
    # day that starts at that instant.
    next_day = day + pd.Timedelta(days=1)
    in_window = (pickup_times >= day) & (pickup_times < next_day)
    day_data = march_data[in_window]

    # Days without any trips contribute nothing to the result list.
    if not day_data.empty:
        median_metric = ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)
        report = Report(metrics=[median_metric])
        report.run(
            reference_data=None,
            current_data=day_data,
            column_mapping=column_mapping,
        )

        result = report.as_dict()

        # Assumes the quantile value is stored under
        # result['metrics'][0]['result']['current']['value']
        metric_result = result['metrics'][0]['result']
        quantile_value = metric_result['current']['value']
        daily_quantiles.append((day, quantile_value))

In [9]:
# Select the (day, median) pair with the largest median. `max` returns the
# first maximal element, so ties resolve to the earliest entry in the list.
top_entry = max(daily_quantiles, key=lambda pair: pair[1])
max_day, max_value = top_entry
print(f"Maximum median fare_amount in March 2024 was on {max_day.date()} with a value of {max_value}")

Maximum median fare_amount in March 2024 was on 2024-03-03 with a value of 14.2
