In [None]:
'''Initial data processing and summarization.'''
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load and Filter Data
# We're only interested in trips to LaGuardia Airport,
# so we'll create a new dataset for just those rows.
# This reduces the dataset from ~450MB/19M rows to only ~10MB/420k rows.
# ---
# Trip data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Title: "April 2024 High Volume For-Hire Vehicle Trip Records"
# Dataset URL: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-04.parquet
# ---

# File paths
input_file = 'fhvhv_tripdata_2024-04.parquet'
output_file = 'filtered.parquet'
borough_file = 'taxi_boroughs.csv' # output of parse_taxi_zones.py

# Taxi Zone number used as the drop-off filter (the LaGuardia Airport trips
# this notebook is about).
LAGUARDIA_ZONE_ID = 138

# Select features relevant for trip cost. The sales_tax, airport_fee, and bcf (Black Car Fund)
# add to total cost, but are constant for each ride.
# They are excluded because they carry little independent, additive information.
columns_to_keep = [
    'pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID',
    'trip_miles', 'trip_time', 'base_passenger_fare', 'tolls', 'congestion_surcharge'
]

# Read the Parquet file.
# Only the columns we actually use are loaded: the source file is large, and
# Parquet is columnar, so skipping the unused columns saves both time and memory.
raw = pd.read_parquet(input_file, columns=columns_to_keep)

# Keep only trips dropped off at LaGuardia and add the total of the variable costs.
# .assign() builds a new frame, so we never write into a slice of `raw`.
filtered_data = (
    raw.loc[raw['DOLocationID'] == LAGUARDIA_ZONE_ID, columns_to_keep]
    .assign(
        total_cost=lambda trips: (
            trips['base_passenger_fare'] +
            trips['tolls'] +
            trips['congestion_surcharge']
        )
    )
)

# Read borough data
boroughs = pd.read_csv(borough_file)

# Merge boroughs into dataset on matching Taxi Zone number.
# merge() defaults to an inner join, so trips whose PULocationID has no matching
# OBJECTID in the borough file are dropped here.
filtered_data = (
    filtered_data
    .merge(boroughs[['OBJECTID', 'borough']], left_on='PULocationID', right_on='OBJECTID')
    .drop(columns=['OBJECTID'])
)

# Save to new working dataset
filtered_data.to_parquet(output_file, index=False)

In [None]:
# Read and summarize our working dataset
data = pd.read_parquet(output_file)
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print(data.describe())

In [None]:
# Clean Data
# dropna() / drop_duplicates() did not change the row count when tried,
# so they stay commented out purely as a record of work.
# data.dropna(inplace=True)
# data.drop_duplicates(inplace=True)

# Transform Data
# Bin datetimes by hour of day.
# Guarded so the cell can be re-run safely: once a column already holds hour
# numbers, pd.to_datetime would read them as epoch offsets and every hour
# would silently collapse to 0.
for time_col in ('pickup_datetime', 'dropoff_datetime'):
    if not pd.api.types.is_numeric_dtype(data[time_col]):
        data[time_col] = pd.to_datetime(data[time_col]).dt.hour

# Replace taxi zone numbers and borough names with T/F dummy variables.
# Only columns that are still present get encoded, so re-running the cell is a
# no-op instead of a KeyError on the already-replaced columns.
dummy_cols = [
    col for col in ('PULocationID', 'DOLocationID', 'borough')
    if col in data.columns
]
if dummy_cols:
    data = pd.get_dummies(data, columns=dummy_cols)

# Sanity Check
print(data.head())
# info() prints its own report and returns None; print(data.info()) would add "None".
data.info()

In [None]:
# TODO Visualize Data
# Histogram of total_cost across all trips in the working dataset.
# Notable: very, very long tail
fare_fig, fare_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['total_cost'], bins=50, kde=True, ax=fare_ax)
fare_ax.set(title='Distribution of Fares', xlabel='Total Cost', ylabel='Count')
plt.show()

# Average fare by pickup hour (pickup_datetime holds the hour of day here).
# Notable: greater variance at night
hour_fig, hour_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='pickup_datetime', y='total_cost', data=data, ax=hour_ax)
hour_ax.set(title='Average Fare by Pickup Hour', xlabel='Pickup Hour', ylabel='Total Cost')
# Turn the hour tick labels sideways.
plt.setp(hour_ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Fare distribution by borough (horizontal boxen plot: cost on x, borough on y).
# Staten Island more tightly clustered, likely due to more uniform distances
# Uses filtered_data rather than data because the transform step replaces the
# 'borough' column in data with dummy columns.
sns.catplot(
    x="total_cost",
    y="borough",
    order=['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
    kind="boxen",
    data=filtered_data,
)
plt.title('Average Fare by Borough')
# Labels follow the plotted variables: total_cost is on the x-axis and borough
# on the y-axis (they were previously swapped).
plt.xlabel('Total Cost')
plt.ylabel('Borough')
plt.show()

In [None]:
# TODO: Train the models
#rf = RandomForestRegressor(random_state=1)
#rf.fit(X_train, y_train)

# Save the model
# joblib.dump(rf, 'model.pkl')