# 2025 Test Data Preprocessing

This notebook preprocesses 2025 taxi data (Jan-June) for out-of-sample testing of our service consistency models trained on 2024 data.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Initialize Spark session
spark = SparkSession.builder \
    .appName("TLC_2025_Test_Data_Preprocessing") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/15 14:30:20 WARN Utils: Your hostname, Jordans-MBP.local, resolves to a loopback address: 127.0.0.1; using 10.0.9.9 instead (on interface en0)
25/08/15 14:30:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/15 14:30:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load 2025 Raw Taxi Data

In [3]:
# Load 2025 taxi data (Jan-June)
months_2025 = ['2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06']

# Load and union all yellow taxi data for 2025
yellow_dataframes_2025 = []
for month in months_2025:
    try:
        df = spark.read.parquet(f'../data/raw/taxi_2025/yellow_tripdata_{month}.parquet')
        df = df.withColumn("data_month", lit(month))
        yellow_dataframes_2025.append(df)
        print(f"Loaded {month}: {df.count():,} records")
    except Exception as e:
        print(f"Error loading {month}: {str(e)}")

yellow_2025 = yellow_dataframes_2025[0]
for df in yellow_dataframes_2025[1:]:
    yellow_2025 = yellow_2025.union(df)

print(f"Total Yellow Taxi 2025 Records: {yellow_2025.count():,}")

# Load and union all green taxi data for 2025
green_dataframes_2025 = []
for month in months_2025:
    try:
        df = spark.read.parquet(f'../data/raw/taxi_2025/green_tripdata_{month}.parquet')
        df = df.withColumn("data_month", lit(month))
        green_dataframes_2025.append(df)
        print(f"Loaded {month}: {df.count():,} records")
    except Exception as e:
        print(f"Error loading {month}: {str(e)}")

green_2025 = green_dataframes_2025[0]
for df in green_dataframes_2025[1:]:
    green_2025 = green_2025.union(df)

print(f"Total Green Taxi 2025 Records: {green_2025.count():,}")

                                                                                

Loaded 2025-01: 3,475,226 records
Loaded 2025-02: 3,577,543 records
Loaded 2025-03: 4,145,257 records
Loaded 2025-04: 3,970,553 records
Loaded 2025-05: 4,591,845 records
Loaded 2025-06: 4,322,960 records
Total Yellow Taxi 2025 Records: 24,083,384
Loaded 2025-01: 48,326 records
Loaded 2025-02: 46,621 records
Loaded 2025-03: 51,539 records
Loaded 2025-04: 52,132 records
Loaded 2025-05: 55,399 records
Loaded 2025-06: 49,390 records
Total Green Taxi 2025 Records: 303,407


## Apply Same Data Cleaning as 2024 Training Data

In [4]:
# Apply identical cleaning logic from training data
def clean_taxi_data(df, pickup_col, dropoff_col, datetime_col):
    return df.filter(
        (col("PULocationID") >= 1) & (col("PULocationID") <= 263) &
        (col("DOLocationID") >= 1) & (col("DOLocationID") <= 263) &
        col("PULocationID").isNotNull() & col("DOLocationID").isNotNull() &
        (col("trip_distance") >= 0) & (col("trip_distance") <= 100) &
        col(datetime_col).isNotNull()
    )

# Clean 2025 datasets
yellow_2025_clean = clean_taxi_data(yellow_2025, "PULocationID", "DOLocationID", "tpep_pickup_datetime")
green_2025_clean = clean_taxi_data(green_2025, "PULocationID", "DOLocationID", "lpep_pickup_datetime")

yellow_2025_clean_count = yellow_2025_clean.count()
green_2025_clean_count = green_2025_clean.count()

print(f"After cleaning 2025 data:")
print(f"  Yellow: {yellow_2025_clean_count:,} ({yellow_2025_clean_count/yellow_2025.count()*100:.1f}%)")
print(f"  Green: {green_2025_clean_count:,} ({green_2025_clean_count/green_2025.count()*100:.1f}%)")

# Handle passenger count nulls (same as training data)
yellow_median = yellow_2025_clean.select(expr("percentile_approx(passenger_count, 0.5)")).collect()[0][0]
green_median = green_2025_clean.select(expr("percentile_approx(passenger_count, 0.5)")).collect()[0][0]

yellow_median = int(yellow_median) if yellow_median else 1
green_median = int(green_median) if green_median else 1

yellow_2025_clean = yellow_2025_clean.fillna({"passenger_count": yellow_median})
green_2025_clean = green_2025_clean.fillna({"passenger_count": green_median})

print(f"Imputed passenger count nulls: Yellow={yellow_median}, Green={green_median}")

                                                                                

After cleaning 2025 data:
  Yellow: 23,903,476 (99.3%)
  Green: 298,835 (98.5%)


                                                                                

Imputed passenger count nulls: Yellow=1, Green=1


## Aggregate to Daily Zone Level (Same as Training)

In [6]:
# Aggregate yellow taxi to daily zone level
yellow_2025_daily = yellow_2025_clean.select(
    col("PULocationID").alias("LocationID"),
    date_format("tpep_pickup_datetime", "yyyy-MM-dd").alias("date"),
    col("trip_distance"),
    col("passenger_count")
).groupBy("LocationID", "date").agg(
    count("*").alias("trip_count"),
    avg("trip_distance").alias("avg_trip_distance"),
    sum("passenger_count").alias("total_passengers")
).withColumn("taxi_type", lit("yellow"))

# Aggregate green taxi to daily zone level  
green_2025_daily = green_2025_clean.select(
    col("PULocationID").alias("LocationID"),
    date_format("lpep_pickup_datetime", "yyyy-MM-dd").alias("date"),
    col("trip_distance"), 
    col("passenger_count")
).groupBy("LocationID", "date").agg(
    count("*").alias("trip_count"),
    avg("trip_distance").alias("avg_trip_distance"),
    sum("passenger_count").alias("total_passengers")
).withColumn("taxi_type", lit("green"))

# Combine yellow and green daily data
combined_2025_daily = yellow_2025_daily.union(green_2025_daily)

# Aggregate both taxi types to single daily zone level
daily_zone_2025 = combined_2025_daily.groupBy("LocationID", "date").agg(
    sum("trip_count").alias("daily_trips"),
    avg("avg_trip_distance").alias("avg_distance"),
    sum("total_passengers").alias("daily_passengers")
).withColumn("date", to_date(col("date")))

print(f"2025 daily zone aggregation: {daily_zone_2025.count():,} zone-day records")

# Check aggregation results
date_range_2025 = daily_zone_2025.select(min("date"), max("date")).collect()[0]
zone_coverage_2025 = daily_zone_2025.select(count_distinct("LocationID")).collect()[0][0]
print(f"2025 date range: {date_range_2025[0]} to {date_range_2025[1]}")
print(f"2025 zones with activity: {zone_coverage_2025}")

                                                                                

2025 daily zone aggregation: 43,433 zone-day records




2025 date range: 2007-12-05 to 2025-07-01
2025 zones with activity: 260


                                                                                

## Add Temporal Features (Same as Training)

In [11]:
from pyspark.sql.functions import year, month, dayofweek, dayofmonth, weekofyear
# Add identical temporal features as training data
daily_2025_with_temporal = daily_zone_2025.withColumn("year", year("date")) \
    .withColumn("month", month("date")) \
    .withColumn("day_of_week", dayofweek("date")) \
    .withColumn("day_of_month", dayofmonth("date")) \
    .withColumn("week_of_year", weekofyear("date")) \
    .withColumn("is_weekend", when(col("day_of_week").isin([1, 7]), 1).otherwise(0))

print("Added temporal features for 2025 data")

Added temporal features for 2025 data


In [12]:
daily_2025_with_temporal.describe().show()

25/08/15 14:36:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------+------------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|       LocationID|       daily_trips|     avg_distance|  daily_passengers|               year|             month|      day_of_week|      day_of_month|      week_of_year|        is_weekend|
+-------+-----------------+------------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|            43433|             43433|            43433|             43433|              43433|             43433|            43433|             43433|             43433|             43433|
|   mean|133.1910528860544|   557.23323279534|5.038081063216802| 684.3585983008312| 2024.9983652982755| 3.534432344070177|4.004466649782423|15.633757741809223|13.804595584002946|0.2885824142932793|
| stddev|7

                                                                                

Removing years outside of 2025

In [13]:
filtered_daily = daily_2025_with_temporal.filter(col("year") == 2025)

print(f"before filtering: {daily_2025_with_temporal.count():,} records, after filtering: {filtered_daily.count():,} records")




before filtering: 43,433 records, after filtering: 43,409 records


                                                                                

## Load and Integrate External Data

In [16]:
# Load preprocessed external data (same as training)
weather_df = spark.read.parquet("../data/processed/weather_data.parquet")
census_df = spark.read.parquet("../data/processed/census_data.parquet")
zones_df = spark.read.parquet("../data/processed/taxi_zones.parquet")
# Filter weather data for 2025 period
weather_2025 = weather_df.filter(
    (col("date") >= "2025-01-01") & (col("date") <= "2025-06-30")
)

print(f"2025 weather data: {weather_2025.count()} days")

# Join with weather data
daily_2025_with_weather = filtered_daily.join(weather_2025, "date", "left")

# Join with census data
daily_2025_with_census = daily_2025_with_weather.join(census_df, "LocationID", "left")

# Join with zone information
final_2025_data = daily_2025_with_census.join(
    zones_df.select("LocationID", "Zone", "Borough", "service_zone"), 
    "LocationID", "left"
)

print(f"Final 2025 integrated dataset: {final_2025_data.count():,} records")

# Check integration success
integration_stats_2025 = final_2025_data.select(
    sum(col("temperature_avg").isNull().cast("int")).alias("missing_weather"),
    sum(col("Median_Income").isNull().cast("int")).alias("missing_census"),
    sum(col("Zone").isNull().cast("int")).alias("missing_zone_info")
).collect()[0]

print(f"2025 integration quality:")
print(f"  Missing weather: {integration_stats_2025['missing_weather']}")
print(f"  Missing census: {integration_stats_2025['missing_census']}")
print(f"  Missing zone info: {integration_stats_2025['missing_zone_info']}")

2025 weather data: 181 days


                                                                                

Final 2025 integrated dataset: 114,380 records




2025 integration quality:
  Missing weather: 6
  Missing census: 334
  Missing zone info: 0


                                                                                

In [17]:
final_2025_data.describe().show()

[Stage 229:>                                                        (0 + 1) / 1]

+-------+-----------------+------------------+------------------+------------------+------+------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+---------------------+------------------+--------------------+-------------+------------+
|summary|       LocationID|       daily_trips|      avg_distance|  daily_passengers|  year|             month|      day_of_week|      day_of_month|      week_of_year|         is_weekend|   temperature_avg|  precipitation_mm|           snow_mm|    Median_Income|Percent_No_Vehicle|No_Vehicle_Households|  Total_Households|                Zone|      Borough|service_zone|
+-------+-----------------+------------------+------------------+------------------+------+------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+------------------+-----------------

                                                                                

In [18]:
clean_final_2025 = final_2025_data.dropna()

In [19]:
clean_final_2025.describe().show()

[Stage 246:>                                                        (0 + 1) / 1]

+-------+------------------+-----------------+------------------+------------------+------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+---------------------+----------------+--------------------+-------------+------------+
|summary|        LocationID|      daily_trips|      avg_distance|  daily_passengers|  year|             month|       day_of_week|      day_of_month|     week_of_year|         is_weekend|   temperature_avg|  precipitation_mm|           snow_mm|     Median_Income|Percent_No_Vehicle|No_Vehicle_Households|Total_Households|                Zone|      Borough|service_zone|
+-------+------------------+-----------------+------------------+------------------+------+------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+

                                                                                

In [23]:
# Create identical features as training data
clean_final_2025 = clean_final_2025 \
    .withColumn("vehicle_ownership_rate", 1 - col("Percent_No_Vehicle") / 100) \
    .withColumn("log_income", log(col("Median_Income") + 1)) \
    .withColumn("has_precipitation", when(col("precipitation_mm") > 0, 1).otherwise(0)) \
    .withColumn("has_snow", when(col("snow_mm") > 0, 1).otherwise(0))

In [24]:
clean_final_2025.write.mode("overwrite").parquet("../data/processed/taxi_2025_final.parquet")

                                                                                