# External Data Analysis

This notebook processes external datasets needed for our service consistency analysis:
- Weather data (temperature, precipitation)
- Census data (income, vehicle ownership by taxi zone)
- NYC taxi zone lookup table

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import time

# Initialize Spark session
spark = SparkSession.builder \
    .appName("TLC_External_Data_Collection") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

print(f"Spark session initialized for external data collection")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/13 09:50:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session initialized for external data collection


## Load External Datasets

In [9]:
# Load taxi zone lookup
zones_df = spark.read.option("header", "true").option("inferSchema", "true").csv("../data/raw/taxi_zone_lookup.csv")

# Load weather data  
weather_df = spark.read.option("header", "true").option("inferSchema", "true").csv("../data/external/weather_2024_2025.csv")

# Load census data
census_df = spark.read.option("header", "true").option("inferSchema", "true").csv("../data/external/census_acs_puma_2022.csv")

# Load zone to PUMA mapping
zone_to_puma_df = spark.read.option("header", "true").option("inferSchema", "true").csv("../data/processed/zone_to_puma.csv")

# Convert date column if needed
if weather_df.schema['date'].dataType == StringType():
    weather_df = weather_df.withColumn("date", to_date(col("date")))

## Map Census Data to Taxi Zones

In [11]:
# Join zone mapping with census data
census_by_zone = zone_to_puma_df.join(census_df, "PUMA", "inner") \
    .select("LocationID", "Median_Income", "Percent_No_Vehicle", 
            "No_Vehicle_Households", "Total_Households")

print(f"Census data mapped to {census_by_zone.count()} taxi zones")

# Show sample mapping
census_by_zone.show(5)

Census data mapped to 668 taxi zones
+----------+-------------+------------------+---------------------+----------------+
|LocationID|Median_Income|Percent_No_Vehicle|No_Vehicle_Households|Total_Households|
+----------+-------------+------------------+---------------------+----------------+
|       255|        54111| 83.31544554186422|                61326|           73607|
|       232|        54111| 83.31544554186422|                61326|           73607|
|       224|        54111| 83.31544554186422|                61326|           73607|
|       148|        54111| 83.31544554186422|                61326|           73607|
|       144|        54111| 83.31544554186422|                61326|           73607|
+----------+-------------+------------------+---------------------+----------------+
only showing top 5 rows


## Data Quality Check

In [13]:
census_by_zone.describe().show()

25/08/13 10:15:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------+-----------------+------------------+---------------------+------------------+
|summary|       LocationID|    Median_Income|Percent_No_Vehicle|No_Vehicle_Households|  Total_Households|
+-------+-----------------+-----------------+------------------+---------------------+------------------+
|  count|              668|              668|               668|                  668|               668|
|   mean|128.6062874251497|85302.76796407186| 54.20341977059021|   35060.600299401194| 62548.42964071856|
| stddev|74.62947407819937|36256.94055579313|20.975839005132915|    19858.22952736317|17715.281947665884|
|    min|                2|            32428| 7.929810116811609|                 4623|             37549|
|    max|              263|           171480| 84.54381014398379|                83928|            108958|
+-------+-----------------+-----------------+------------------+---------------------+------------------+



In [15]:
# Test zone-census join
zone_census_join = zones_df.join(census_by_zone, "LocationID", "inner")
join_success = zone_census_join.count()

print(f"Zone-Census integration: {join_success}/{zones_df.count()} zones successfully joined")

# Sample integrated data
zone_census_join.show(5)

Zone-Census integration: 668/265 zones successfully joined
+----------+---------+--------------------+------------+-------------+------------------+---------------------+----------------+
|LocationID|  Borough|                Zone|service_zone|Median_Income|Percent_No_Vehicle|No_Vehicle_Households|Total_Households|
+----------+---------+--------------------+------------+-------------+------------------+---------------------+----------------+
|       255| Brooklyn|Williamsburg (Nor...|   Boro Zone|        54111| 83.31544554186422|                61326|           73607|
|       232|Manhattan|Two Bridges/Sewar...| Yellow Zone|        54111| 83.31544554186422|                61326|           73607|
|       224|Manhattan|Stuy Town/Peter C...| Yellow Zone|        54111| 83.31544554186422|                61326|           73607|
|       148|Manhattan|     Lower East Side| Yellow Zone|        54111| 83.31544554186422|                61326|           73607|
|       144|Manhattan| Little Italy/No

## Weather Data Quality Check

In [29]:
weather_df.filter(col("snow_mm").isNull()).show(5)

+----------+---------------+----------------+-------+
|      date|temperature_avg|precipitation_mm|snow_mm|
+----------+---------------+----------------+-------+
|2025-04-26|           18.0|            20.2|   NULL|
+----------+---------------+----------------+-------+



In [32]:
weather_df.filter(col("date")== datetime(2025, 4, 27)).show(5)
weather_df.filter(col("date")== datetime(2025, 4, 25)).show(5)

+----------+---------------+----------------+-------+
|      date|temperature_avg|precipitation_mm|snow_mm|
+----------+---------------+----------------+-------+
|2025-04-27|           13.2|             0.0|    0.0|
+----------+---------------+----------------+-------+

+----------+---------------+----------------+-------+
|      date|temperature_avg|precipitation_mm|snow_mm|
+----------+---------------+----------------+-------+
|2025-04-25|           18.3|             0.0|    0.0|
+----------+---------------+----------------+-------+



In [34]:
weather_df = weather_df.withColumn("snow_mm", when(col("snow_mm").isNull(), 0).otherwise(col("snow_mm")))

In [35]:
weather_df.describe().show()

+-------+------------------+------------------+------------------+
|summary|   temperature_avg|  precipitation_mm|           snow_mm|
+-------+------------------+------------------+------------------+
|  count|               363|               363|               363|
|   mean|11.165840220385673|3.4815426997245176|0.2837465564738292|
| stddev| 8.511223727144994| 7.843195019297988|1.1391744911225399|
|    min|              -8.2|               0.0|               0.0|
|    max|              33.2|              65.2|              10.0|
+-------+------------------+------------------+------------------+



In [36]:
zones_df.write.mode("overwrite").parquet("../data/processed/taxi_zones.parquet")
weather_df.write.mode("overwrite").parquet("../data/processed/weather_data.parquet")
census_by_zone.write.mode("overwrite").parquet("../data/processed/census_data.parquet")

25/08/13 21:54:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 992327 ms exceeds timeout 120000 ms
25/08/13 21:54:02 WARN SparkContext: Killing executors is not supported by current scheduler.
25/08/13 22:11:58 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$