In [None]:
'''Initial data processing and summarization.'''
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Load and Filter Data
# We're only interested in trips to LaGuardia Airport,
# so we'll create a new dataset for just those rows.
# This reduces the dataset from ~450MB/19M rows to only ~10MB/420k rows.
# ---
# Trip data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Title: "April 2024 High Volume For-Hire Vehicle Trip Records"
# Dataset URL: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-04.parquet
# ---

# File paths
input_file = 'fhvhv_tripdata_2024-04.parquet'
output_file = 'filtered.parquet'
borough_file = 'taxi_boroughs.csv' # output of parse_taxi_zones.py

# Select features relevant for trip cost.
# The sales_tax, airport_fee, and bcf (Black Car Fund)
# add to total cost, but are a constant for each ride.
# They are excluded based on lack of independent additive information.
columns_to_keep = ['pickup_datetime', 'PULocationID', 'DOLocationID', 'trip_miles', 'trip_time']

# Variable fare components that are summed into `total_cost` below.
cost_columns = ['base_passenger_fare', 'tolls', 'congestion_surcharge']

# Read the Parquet file.
# Only the columns this cell actually uses are loaded; Parquet is columnar,
# so the unused columns are never read into memory.
raw = pd.read_parquet(input_file, columns=columns_to_keep + cost_columns)

# Select only trips to LGA – Taxi Zone 138
lga_trips = raw.loc[raw['DOLocationID'] == 138]

# Keep the feature columns, drop the (now constant) drop-off zone, and add
# the total of variable costs. The sum is computed on the LGA rows only
# instead of on every row of `raw`. Plain `+` is used (not `.sum(axis=1)`)
# so a missing component still yields a missing total.
filtered_data = (
    lga_trips[columns_to_keep]
    .drop(columns=['DOLocationID'])
    .assign(
        total_cost=(
            lga_trips['base_passenger_fare'] +
            lga_trips['tolls'] +
            lga_trips['congestion_surcharge']
        )
    )
)

# Read borough data
boroughs = pd.read_csv(borough_file)

# Merge boroughs into dataset on matching Taxi Zone number.
# This is pandas' default inner join, so any trip whose pickup zone has no
# row in the borough file is dropped from the working dataset.
filtered_data = filtered_data.merge(
    boroughs[['OBJECTID', 'borough']],
    left_on='PULocationID',
    right_on='OBJECTID',
).drop(columns=['OBJECTID'])

# Save to new working dataset for graphs and models to follow
filtered_data.to_parquet(output_file, index=False)

In [None]:
# Read and Summarize Working Dataset
data = pd.read_parquet(output_file)
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
data.info()
print(data.describe())

In [None]:
# Clean Data
# dropna() and drop_duplicates() were both tried here and did not change the
# row count, so no cleaning step is applied.
# Noted for posterity / record of work.

# Transform Data
# `data` is rebuilt from the saved working dataset on every run instead of
# being mutated in place. Re-running an in-place version would feed the
# already-converted integer hours back through to_datetime (turning every
# value into hour 0) and then fail in get_dummies because the encoded
# columns no longer exist.
data = (
    pd.read_parquet(output_file)
    # Bin datetimes by hour of day
    .assign(pickup_datetime=lambda trips: pd.to_datetime(trips['pickup_datetime']).dt.hour)
    # Replace taxi zone numbers and borough names with T/F dummy variables
    .pipe(pd.get_dummies, columns=['PULocationID', 'borough'])
)

# Sanity Check
print(data.head())
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

In [None]:
# Visualize Data

# TODO: Figure out how to add tick marks to this first one
# Distribution of total cost
# Notable: dip at ~$15-25 range, possible impact of tolls on similar-distance trips
# Notable: very, very long tail
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['total_cost'], bins=90, kde=True, ax=ax)
ax.set_title('Distribution of Fares')
ax.set_xlabel('Total Cost')
ax.set_ylabel('Count')
plt.show()

# Average cost by pickup hour
# (`pickup_datetime` holds the hour of day after the transform step.)
# Notable: greater variance at night
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='pickup_datetime', y='total_cost', data=data, ax=ax)
ax.set_title('Average Cost by Pickup Hour')
ax.set_xlabel('Pickup Hour')
ax.set_ylabel('Total Cost')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Cost distribution by borough
# Notable: Staten Island has highest base cost due to greatest distance
# Notable: Queens has lowest base cost due to shortest distance
# `data` no longer has a `borough` column (it was one-hot encoded in the
# transform step), so this plot reads from `filtered_data` instead.
sns.catplot(
    x="total_cost",
    y="borough",
    order=['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
    kind="boxen",
    data=filtered_data,
)
# The plot is horizontal: cost runs along x and boroughs along y, so the
# labels must follow that orientation. A boxen plot shows the distribution
# of cost per borough, not an average.
plt.title('Total Cost by Borough')
plt.xlabel('Total Cost')
plt.ylabel('Borough')
plt.show()

In [None]:
# Train Two Models

# Fares are predicted at two levels of granularity: borough and
# neighborhood (Taxi Zone). The borough model is the simple baseline; the
# neighborhood model is compared against it to see whether the extra
# features improve predictive power.


def _columns_with_prefix(frame, prefix):
    """Return the column labels of `frame` that start with `prefix`, in column order."""
    return list(filter(lambda label: label.startswith(prefix), frame.columns))


# Each model must see only its own location encoding, so collect the dummy
# columns belonging to each granularity.
borough_columns = _columns_with_prefix(data, 'borough')
neighborhood_columns = _columns_with_prefix(data, 'PULocationID')

# Borough data: everything except the neighborhood dummies.
borough_data = data.drop(columns=neighborhood_columns)
# Neighborhood data: everything except the borough dummies.
neighborhood_data = data.drop(columns=borough_columns)

In [None]:
# Train and Evaluate Borough Model

# Separate the target (total_cost) from the predictor columns
y_borough = borough_data['total_cost']
X_borough = borough_data.drop('total_cost', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(
    X_train_borough,
    X_test_borough,
    y_train_borough,
    y_test_borough,
) = train_test_split(
    X_borough,
    y_borough,
    test_size=0.2,
    random_state=38,
)

# Show the first training rows to verify the split
print(X_train_borough.head())

# Fit a random forest on the training rows (fit() returns the fitted estimator)
rf_borough = RandomForestRegressor(random_state=38).fit(X_train_borough, y_train_borough)

# Score the held-out rows with each metric in turn
y_pred_borough = rf_borough.predict(X_test_borough)
mae_borough, mse_borough, r2_borough = (
    metric(y_test_borough, y_pred_borough)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Performance summary
print(f"Borough Model - MAE: {mae_borough}, MSE: {mse_borough}, R²: {r2_borough}")

In [None]:
# Train and Evaluate Neighborhood Model

# Separate the target (total_cost) from the predictor columns
y_neighborhood = neighborhood_data['total_cost']
X_neighborhood = neighborhood_data.drop('total_cost', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(
    X_train_neighborhood,
    X_test_neighborhood,
    y_train_neighborhood,
    y_test_neighborhood,
) = train_test_split(
    X_neighborhood,
    y_neighborhood,
    test_size=0.2,
    random_state=38,
)

# Show the first training rows to verify the split
print(X_train_neighborhood.head())

# Fit a random forest on the training rows (fit() returns the fitted estimator)
rf_neighborhood = RandomForestRegressor(random_state=38).fit(
    X_train_neighborhood, y_train_neighborhood
)

# Score the held-out rows with each metric in turn
y_pred_neighborhood = rf_neighborhood.predict(X_test_neighborhood)
mae_neighborhood, mse_neighborhood, r2_neighborhood = (
    metric(y_test_neighborhood, y_pred_neighborhood)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Performance summary
print(f"Neighborhood Model - MAE: {mae_neighborhood}, MSE: {mse_neighborhood}, R²: {r2_neighborhood}")

In [None]:
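# Save the model (currently disabled).
# Note: no variable named `rf` exists in this notebook; the fitted models are
# `rf_borough` and `rf_neighborhood`, so pick the one to persist.
# joblib.dump(rf_neighborhood, 'model.pkl')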