# Checkpoint 1

## Installation of dependencies

In [1]:
!pip install -r requirements.txt

Collecting shapely==2.0.7 (from -r requirements.txt (line 1))
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (6.8 kB)
Collecting delta-spark==3.3.0 (from -r requirements.txt (line 2))
  Using cached delta_spark-3.3.0-py3-none-any.whl.metadata (2.0 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.3->delta-spark==3.3.0->-r requirements.txt (line 2))
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (2.4 MB)
Using cached delta_spark-3.3.0-py3-none-any.whl (21 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, shapely, delta-spark
Successfully installed delta-spark-3.3.0 py4j-0.10.9.7 shapely-2.0.7


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, avg, lead, sum as spark_sum
from pyspark.sql.window import Window
import json
from shapely.geometry import Point, Polygon, shape
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DoubleType
)


In [3]:
# Initialize Spark Session
spark = SparkSession.builder.appName("NYC Taxi Analysis").getOrCreate()



## Load taxi rides dataset

In [4]:
# Define schema for the data
schema = StructType(
    [
        StructField("medallion", StringType()),
        StructField("hack_license", StringType()),
        StructField("vendor_id", StringType()),
        StructField("rate_code", StringType()),
        StructField("store_and_fwd_flag", StringType()),
        StructField("pickup_datetime", StringType()),
        StructField(
            "dropoff_datetime", StringType()
        ),
        StructField("passenger_count", IntegerType()),
        StructField("pickup_longitude", DoubleType()),
        StructField("pickup_latitude", DoubleType()),
        StructField("dropoff_longitude", DoubleType()),
        StructField("dropoff_latitude", DoubleType()),
    ]
)


In [5]:
sample_path = "input/sample.csv"

taxi_df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .option("dateFormat", "dd-MM-yy HH:mm")  # XXX: why is this not working?
    .csv(sample_path)
)


In [6]:
## Checking the columns we have
taxi_df.columns

['medallion',
 'hack_license',
 'vendor_id',
 'rate_code',
 'store_and_fwd_flag',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

In [7]:
# Load GeoJSON Data
borough_data = None
with open("./input/nyc-boroughs.geojson", "r") as file:
    borough_data = json.load(file)

# getting dictionary of boroughs for fast lookup
borough_polygons = {}
for feature in borough_data["features"]:
    borough_name = feature["properties"]["borough"]
    borough_geometry = shape(feature["geometry"])
    borough_polygons[borough_name] = borough_geometry


In [8]:
# check raw shapes in the dictionary
borough_polygons

{'Staten Island': <POLYGON ((-74.082 40.648, -74.081 40.649, -74.081 40.648, -74.08 40.648, -7...>,
 'Queens': <POLYGON ((-73.891 40.776, -73.891 40.777, -73.891 40.777, -73.891 40.777, -...>,
 'Brooklyn': <POLYGON ((-73.931 40.595, -73.932 40.594, -73.932 40.595, -73.931 40.595))>,
 'Manhattan': <POLYGON ((-73.907 40.876, -73.908 40.873, -73.908 40.873, -73.909 40.872, -...>,
 'Bronx': <POLYGON ((-73.804 40.813, -73.804 40.814, -73.804 40.814, -73.804 40.814, -...>}

In [9]:
# Broadcast GeoJSON Data to Spark
borough_broadcast = spark.sparkContext.broadcast(borough_polygons)

In [10]:
def get_borough(lon, lat):
    try:
        point = Point(lon, lat)

        # Debugging: Print first few calls
        # print(f"Checking: lon={lon}, lat={lat}")

        for borough, polygon in borough_broadcast.value.items():
            if polygon.contains(point):
                # print(f"Matched: {lon}, {lat} -> {borough}")
                return borough

    except Exception as e:
        print(f"Error processing ({lon}, {lat}): {e}")  # Print error details

    return "Unknown"

# Register the function as a Spark UDF again
to_borough_udf = spark.udf.register("to_borough", get_borough, StringType())

In [11]:
# Assign Borough Names to Pickup and Drop-off Locations
taxi_df = taxi_df.withColumn("pickup_borough", to_borough_udf(col("pickup_longitude"), col("pickup_latitude")))
taxi_df = taxi_df.withColumn("dropoff_borough", to_borough_udf(col("dropoff_longitude"), col("dropoff_latitude")))
taxi_df.show(5)

+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|pickup_datetime|dropoff_datetime|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|pickup_borough|dropoff_borough|
+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+--------------+---------------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|        1|                 N| 01-01-13 15:11|  01-01-13 15:18|              4|      -73.978165|      40.757977|       -73.989838|       40.751171|       Unknown|        Unknown|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...|      CMT|        1|             

In [12]:
# Define the max idle time threshold (4 hours in seconds)
four_hours_in_seconds = 4 * 60 * 60

# Convert pickup and dropoff datetime to Unix timestamps
taxi_df = taxi_df.withColumn("pickup_unix", unix_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
taxi_df = taxi_df.withColumn("dropoff_unix", unix_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))

# Compute trip duration in seconds
taxi_df = taxi_df.withColumn("duration", col("dropoff_unix") - col("pickup_unix"))

# Filter out incorrect durations (negative durations or trips longer than 4 hours)
taxi_df = taxi_df.filter((col("duration") > 0) & (col("duration") <= four_hours_in_seconds))

# Define window specification to order trips per taxi
window_spec = Window.partitionBy("medallion").orderBy("pickup_unix")

# Get the previous trip's dropoff time (Unix timestamp)
taxi_df = taxi_df.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec))

# Compute idle time only when a previous trip exists
taxi_df = taxi_df.withColumn(
    "idle_time",
    when(col("prev_dropoff_unix").isNotNull(),
         when((col("pickup_unix") - col("prev_dropoff_unix")) <= four_hours_in_seconds,
              col("pickup_unix") - col("prev_dropoff_unix")
         ).otherwise(0)
    ).otherwise(0)
)

# Group by taxi to calculate total trip time and total idle time
utilization_df = taxi_df.groupBy("medallion").agg(
    spark_sum("duration").alias("total_trip_time"),
    spark_sum("idle_time").alias("total_idle_time")
)

# Calculate utilization rate
utilization_df = utilization_df.withColumn(
    "utilization_rate",
    col("total_trip_time") / (col("total_trip_time") + col("total_idle_time"))
)


taxi_window = Window.partitionBy("medallion").orderBy("dropoff_unix")

# Get the next trip's pickup time within the same taxi
taxi_df = taxi_df.withColumn("next_pickup_unix", lead("pickup_unix").over(taxi_window))

# Calculate time to next fare, ensuring non-negative values
taxi_df = taxi_df.withColumn(
    "time_to_next_fare",
    when(
        (col("next_pickup_unix").isNotNull()) & (col("next_pickup_unix") >= col("dropoff_unix")),
        col("next_pickup_unix") - col("dropoff_unix")
    ).otherwise(None)  # Ignore invalid (negative) idle times
)

# Now, calculate the average time to next fare per borough
next_fare_df = taxi_df \
    .filter(col("time_to_next_fare").isNotNull()) \
    .groupBy("dropoff_borough") \
    .agg(avg("time_to_next_fare").alias("avg_time_to_next_fare"))


In [13]:

# Count Trips Within the Same Borough
same_borough_df = taxi_df.filter(col("pickup_borough") == col("dropoff_borough"))
same_borough_count = same_borough_df.groupBy("pickup_borough").agg(count("medallion").alias("same_borough_trips"))


In [14]:
# Count Trips Between Different Boroughs
diff_borough_df = taxi_df.filter(col("pickup_borough") != col("dropoff_borough"))
diff_borough_count = diff_borough_df.groupBy("pickup_borough", "dropoff_borough").agg(count("medallion").alias("cross_borough_trips"))

## Results

In [15]:
# Show Results
utilization_df.show()
next_fare_df.show()
same_borough_count.show()
diff_borough_count.show()


+--------------------+---------------+---------------+-------------------+
|           medallion|total_trip_time|total_idle_time|   utilization_rate|
+--------------------+---------------+---------------+-------------------+
|000318C2E3E638158...|          13920|          17400| 0.4444444444444444|
|002E3B405B6ABEA23...|          10260|          16140| 0.3886363636363636|
|0030AD2648D81EE87...|           1980|            720| 0.7333333333333333|
|0036961468659D0BF...|          11700|          19740|0.37213740458015265|
|0038EF45118925A51...|          10920|          15120|0.41935483870967744|
|0053334C798EC6C8E...|           7920|          22440| 0.2608695652173913|
|005DED7D6E6C45441...|          11460|          11760| 0.4935400516795866|
|005F00B38F46E2100...|          18600|          42180| 0.3060217176702863|
|00790C7BAD30B7A9E...|          12360|          25320|0.32802547770700635|
|0094A03FFE6BAFBE0...|          10680|           5400|  0.664179104477612|
|009D3CCA83486B03F...|   

## Basic EDA

In [16]:
utilization_df.summary().show()

+-------+--------------------+-----------------+------------------+-------------------+
|summary|           medallion|  total_trip_time|   total_idle_time|   utilization_rate|
+-------+--------------------+-----------------+------------------+-------------------+
|  count|                6435|             6435|              6435|               6435|
|   mean|                NULL|10111.48717948718|16852.195804195806|0.45085909572146127|
| stddev|                NULL|5049.823057369299|11327.890227052827|  0.202482145349854|
|    min|000318C2E3E638158...|               60|                 0| 0.0639269406392694|
|    25%|                NULL|             6480|              8520|0.31956521739130433|
|    50%|                NULL|            10860|             15780| 0.4051724137931034|
|    75%|                NULL|            13920|             23520| 0.5104166666666666|
|    max|FFFECF75AB6CC4FF9...|            25020|             55080|                1.0|
+-------+--------------------+--

In [17]:
df_with_hours_and_minutes = utilization_df.withColumn('trip_time_hours', col('total_trip_time') / 3600) \
                  .withColumn('idle_time_hours', col('total_idle_time') / 3600) \
                  .withColumn('idle_time_minutes', col('total_idle_time') / 60) \
                  .withColumn('trip_time_minutes', col('total_trip_time') / 60)

In [18]:
[field.dataType.simpleString() for field in df_with_hours_and_minutes.schema.fields]

['string',
 'bigint',
 'bigint',
 'double',
 'double',
 'double',
 'double',
 'double']

In [19]:
numeric_cols = [field.name for field in df_with_hours_and_minutes.schema.fields 
                if field.dataType.simpleString() in ['bigint', 'double', 'int', 'long', 'float']]

# Create a summary DataFrame
summary = df_with_hours_and_minutes.select(numeric_cols).summary(
    "count", "mean", "stddev", "min", "25%", "50%", "75%", "max"
).toPandas()

# Set the summary column as index for better display
summary.set_index('summary', inplace=True)

# Display in Jupyter notebook
display(summary)

Unnamed: 0_level_0,total_trip_time,total_idle_time,utilization_rate,trip_time_hours,idle_time_hours,idle_time_minutes,trip_time_minutes
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,10111.48717948718,16852.195804195806,0.4508590957214612,2.8087464387464376,4.6811655011655,280.8699300699301,168.52478632478633
stddev,5049.823057369299,11327.890227052829,0.202482145349854,1.4027286270470278,3.146636174181344,188.7981704508804,84.16371762282165
min,60.0,0.0,0.0639269406392694,0.0166666666666666,0.0,0.0,1.0
25%,6480.0,8520.0,0.3195652173913043,1.8,2.3666666666666667,142.0,108.0
50%,10860.0,15780.0,0.4051724137931034,3.0166666666666666,4.383333333333334,263.0,181.0
75%,13920.0,23520.0,0.5104166666666666,3.8666666666666663,6.533333333333333,392.0,232.0
max,25020.0,55080.0,1.0,6.95,15.3,918.0,417.0
