In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, avg, lead, sum as spark_sum
from pyspark.sql.window import Window
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pyspark.sql.types import StringType


In [22]:
# Create the Spark session for this notebook, or reuse one that is already active
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .getOrCreate()
)



In [23]:
# Read the sample trip records; the first CSV line supplies the column names
taxi_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("input/Sample NYC Data.csv")
)

In [24]:
# Read the borough boundary features into a GeoDataFrame
geojson_path = "input/nyc-boroughs.geojson"
geojson_data = gpd.read_file(filename=geojson_path)

In [25]:
# Build a borough-name -> geometry lookup table.
#
# A boundary file can hold several features with the same `borough` value
# (e.g. one per island or land parcel). Assigning them one at a time into a
# dict keeps only the LAST feature seen for each name, so most of the borough's
# area would never match a point. Dissolving first merges every feature that
# shares a name into a single (Multi)Polygon; when each borough has exactly one
# feature the result is that same geometry.
# NOTE(review): the GeoJSON itself is not visible from this notebook — confirm
# that it does contain repeated borough names.
boroughs_merged = geojson_data.dissolve(by="borough")
borough_polygons = dict(zip(boroughs_merged.index, boroughs_merged.geometry))

In [26]:
# Wrap the lookup table in a broadcast variable so the UDF reads it
# through `borough_broadcast.value`
borough_broadcast = spark.sparkContext.broadcast(value=borough_polygons)

In [27]:
def get_borough(lon, lat):
    """Return the name of the borough whose geometry contains a coordinate.

    The lookup table is read from ``borough_broadcast.value`` (a mapping of
    borough name to geometry).

    Args:
        lon: Longitude; anything ``float()`` accepts (the CSV columns arrive
            as strings).
        lat: Latitude; anything ``float()`` accepts.

    Returns:
        The first borough name whose geometry contains the point, or
        ``"Unknown"`` when the coordinates are missing / not numeric or no
        geometry contains the point.
    """
    # Only the conversion is guarded: missing or malformed coordinates are
    # expected in raw data. Anything else (e.g. a broken geometry) is allowed
    # to propagate so it shows up as a failed task instead of silently
    # labelling rows "Unknown".
    try:
        point = Point(float(lon), float(lat))
    except (TypeError, ValueError):
        return "Unknown"

    # No per-row print() here: this function runs once per pickup and once per
    # drop-off for every trip, and the output would go to the Python worker's
    # stdout rather than the notebook.
    for borough, polygon in borough_broadcast.value.items():
        if polygon.contains(point):
            return borough

    return "Unknown"

# Expose get_borough to Spark under the SQL name "to_borough"; the returned
# handle is what the DataFrame API calls below use
to_borough_udf = spark.udf.register(
    name="to_borough",
    f=get_borough,
    returnType=StringType(),
)

In [28]:
# Label both ends of every trip with a borough name, then preview a few rows
taxi_df = (
    taxi_df
    .withColumn("pickup_borough", to_borough_udf(col("pickup_longitude"), col("pickup_latitude")))
    .withColumn("dropoff_borough", to_borough_udf(col("dropoff_longitude"), col("dropoff_latitude")))
)
taxi_df.show(5)

+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|pickup_datetime|dropoff_datetime|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|pickup_borough|dropoff_borough|
+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|        1|                 N| 01-01-13 15:11|  01-01-13 15:18|              4|      -73.978165|      40.757977|       -73.989838|       40.751171|       Unknown|        Unknown|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...|      CMT|        1|             

In [29]:
# Threshold shared by the duration filter and the idle-time rule: 4 hours, in seconds
four_hours_in_seconds = 4 * 60 * 60

# Parse both timestamps to Unix seconds, derive the trip duration, and keep
# only trips whose duration is strictly positive and at most four hours
taxi_df = (
    taxi_df
    .withColumn("pickup_unix", unix_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
    .withColumn("dropoff_unix", unix_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))
    .withColumn("duration", col("dropoff_unix") - col("pickup_unix"))
    .filter(col("duration") > 0)
    .filter(col("duration") <= four_hours_in_seconds)
)

# Order each taxi's trips by pickup time so that lag() yields the previous trip
window_spec = Window.partitionBy("medallion").orderBy("pickup_unix")

# Drop-off time (Unix seconds) of the previous trip; null for a taxi's first trip
taxi_df = taxi_df.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec))

# Seconds between the previous drop-off and this pickup (null when there is no
# previous trip)
idle_gap = col("pickup_unix") - col("prev_dropoff_unix")

# Count the gap as idle time only when it lies in [0, four_hours_in_seconds].
# - A null gap (first trip) fails the test and falls through to 0.
# - A gap above the threshold contributes 0, as before.
# - A NEGATIVE gap (pickup recorded before the previous drop-off, i.e.
#   overlapping records) now also contributes 0. Previously it passed the
#   "<= threshold" test and was summed as negative idle time, which inflates
#   the utilization rate.
taxi_df = taxi_df.withColumn(
    "idle_time",
    when(idle_gap.between(0, four_hours_in_seconds), idle_gap).otherwise(0),
)

# Per taxi: total seconds spent on trips, total idle seconds, and the share of
# (trip + idle) time that was spent on trips
utilization_df = (
    taxi_df
    .groupBy("medallion")
    .agg(
        spark_sum("duration").alias("total_trip_time"),
        spark_sum("idle_time").alias("total_idle_time"),
    )
    .withColumn(
        "utilization_rate",
        col("total_trip_time") / (col("total_trip_time") + col("total_idle_time")),
    )
)


# Order each taxi's trips by drop-off time so that lead() yields the following trip
taxi_window = Window.partitionBy("medallion").orderBy("dropoff_unix")

# Pickup time of the same taxi's next trip; null for its last trip
taxi_df = taxi_df.withColumn("next_pickup_unix", lead("pickup_unix").over(taxi_window))

# Seconds from this drop-off to the next pickup. The comparison evaluates to
# null when there is no next trip, and a when() with no otherwise() yields
# null for unmatched rows, so missing and negative gaps both end up null.
taxi_df = taxi_df.withColumn(
    "time_to_next_fare",
    when(
        col("next_pickup_unix") >= col("dropoff_unix"),
        col("next_pickup_unix") - col("dropoff_unix"),
    ),
)

# Average wait for the next fare, grouped by the borough where the trip ended;
# rows without a usable wait time are dropped before grouping
next_fare_df = (
    taxi_df
    .where(col("time_to_next_fare").isNotNull())
    .groupBy("dropoff_borough")
    .agg(avg("time_to_next_fare").alias("avg_time_to_next_fare"))
)


In [30]:

# Trips that start and end in the same borough, counted per borough
same_borough_df = taxi_df.where(col("pickup_borough") == col("dropoff_borough"))
same_borough_count = (
    same_borough_df
    .groupBy("pickup_borough")
    .agg(count("medallion").alias("same_borough_trips"))
)


In [31]:
# Trips that end in a different borough than they started in, counted per
# (pickup, drop-off) pair
diff_borough_df = taxi_df.where(col("pickup_borough") != col("dropoff_borough"))
diff_borough_count = (
    diff_borough_df
    .groupBy("pickup_borough", "dropoff_borough")
    .agg(count("medallion").alias("cross_borough_trips"))
)

In [32]:
# Print each result table in turn
for result_df in (utilization_df, next_fare_df, same_borough_count, diff_borough_count):
    result_df.show()


+--------------------+---------------+---------------+-------------------+
|           medallion|total_trip_time|total_idle_time|   utilization_rate|
+--------------------+---------------+---------------+-------------------+
|000318C2E3E638158...|          13920|          17400| 0.4444444444444444|
|002E3B405B6ABEA23...|          10260|          16140| 0.3886363636363636|
|0030AD2648D81EE87...|           1980|            720| 0.7333333333333333|
|0036961468659D0BF...|          11700|          19740|0.37213740458015265|
|0038EF45118925A51...|          10920|          15120|0.41935483870967744|
|0053334C798EC6C8E...|           7920|          22440| 0.2608695652173913|
|005DED7D6E6C45441...|          11460|          11760| 0.4935400516795866|
|005F00B38F46E2100...|          18600|          42180| 0.3060217176702863|
|00790C7BAD30B7A9E...|          12360|          25320|0.32802547770700635|
|0094A03FFE6BAFBE0...|          10680|           5400|  0.664179104477612|
|009D3CCA83486B03F...|   