# Checkpoint 1

## Installation of missing dependencies

In [1]:
# Import Java gateway classes to access JVM
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, avg, lead, sum as spark_sum
from pyspark.sql.window import Window
import json
from shapely.geometry import Point, Polygon, shape
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DoubleType
)


In [3]:
import os
import sys

# Feel free to skip this cell, JAVA_HOME was a mess in the UT HPC

# Set Java home explicitly if not already set
if not os.environ.get('JAVA_HOME'):
    # Replace with actual path from the above commands
    os.environ['JAVA_HOME'] = '/gpfs/software/soft/rocket/linux-centos7-x86_64/gcc-9.2.0/openjdk-1.8.0_265-b01-atpvba2g5asxiqs7xvgiygtmxohys7zp'


In [4]:
spark = SparkSession.builder.appName("NYC Taxi Analysis") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.executor.cores", "8") \
    .config("spark.sql.shuffle.partitions", "500") \
    .config("spark.default.parallelism", "500") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.sql.files.maxPartitionBytes", "256m") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/10 21:29:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load taxi rides dataset

In [5]:
# Define schema for the data
schema = StructType(
    [
        StructField("medallion", StringType()),
        StructField("pickup_datetime", StringType()),
        StructField(
            "dropoff_datetime", StringType()
        ),
        StructField("pickup_longitude", DoubleType()),
        StructField("pickup_latitude", DoubleType()),
        StructField("dropoff_longitude", DoubleType()),
        StructField("dropoff_latitude", DoubleType()),
    ]
)


In [6]:
parquet_path = "input/taxi_data.parquet"
taxi_df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .parquet(parquet_path)
    .repartition(500)
)


In [7]:
## Checking the columns we have
taxi_df.columns

['medallion',
 'pickup_datetime',
 'dropoff_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

In [8]:
taxi_df.count()

                                                                                

173179759

In [8]:
# Load GeoJSON Data
borough_data = None
with open("./input/nyc-boroughs.geojson", "r") as file:
    borough_data = json.load(file)

# getting dictionary of boroughs for fast lookup
borough_polygons = {}
for feature in borough_data["features"]:
    borough_name = feature["properties"]["borough"]
    borough_geometry = shape(feature["geometry"])
    borough_polygons[borough_name] = borough_geometry


In [9]:
# check raw shapes in the dictionary
borough_polygons

{'Staten Island': <POLYGON ((-74.082 40.648, -74.081 40.649, -74.081 40.648, -74.08 40.648, -7...>,
 'Queens': <POLYGON ((-73.891 40.776, -73.891 40.777, -73.891 40.777, -73.891 40.777, -...>,
 'Brooklyn': <POLYGON ((-73.931 40.595, -73.932 40.594, -73.932 40.595, -73.931 40.595))>,
 'Manhattan': <POLYGON ((-73.907 40.876, -73.908 40.873, -73.908 40.873, -73.909 40.872, -...>,
 'Bronx': <POLYGON ((-73.804 40.813, -73.804 40.814, -73.804 40.814, -73.804 40.814, -...>}

In [10]:
# Broadcast GeoJSON Data to Spark
borough_broadcast = spark.sparkContext.broadcast(borough_polygons)

In [11]:
def get_borough(lon, lat):
    try:
        # basic check / catch is slower
        if lon is None or lat is None:
            return "Unknown"
        
        point = Point(lon, lat)

        for borough, polygon in borough_broadcast.value.items():
            if polygon.contains(point): # match found
                return borough

    except Exception as e:
        print(f"Error processing ({lon}, {lat}): {e}")  # Print error details

    return "Unknown"

# Register the function as a Spark UDF again
to_borough_udf = spark.udf.register("to_borough", get_borough, StringType())

In [12]:
# Checkpoint 1: Borough assignment
taxi_df_with_boroughs = taxi_df.withColumn("pickup_borough", 
                                           to_borough_udf(col("pickup_longitude"), col("pickup_latitude")))
taxi_df_with_boroughs = taxi_df_with_boroughs.withColumn("dropoff_borough", 
                                                        to_borough_udf(col("dropoff_longitude"), col("dropoff_latitude")))

In [16]:
taxi_df_with_boroughs.head(10)

ConnectionRefusedError: [Errno 111] Connection refused

In [13]:
# Checkpoint 2: Time computations
taxi_df_with_time = taxi_df_with_boroughs
# 1) Convert timestamps to unix
taxi_df_with_time = taxi_df_with_time.withColumn(
    "pickup_unix",
    unix_timestamp(col("pickup_datetime"), "yyyy-MM-dd HH:mm:ss")
)
taxi_df_with_time = taxi_df_with_time.withColumn(
    "dropoff_unix",
    unix_timestamp(col("dropoff_datetime"), "yyyy-MM-dd HH:mm:ss")
)

# 2) Add duration column
taxi_df_with_time = taxi_df_with_time.withColumn(
    "duration",
    col("dropoff_unix") - col("pickup_unix")
)

# 3) Then filter using duration
four_hours_in_seconds = 4 * 60 * 60
taxi_df_with_time = taxi_df_with_time.filter(
    (col("duration") > 0) & (col("duration") <= four_hours_in_seconds)
)

In [14]:
# Define window specification to order trips per taxi
window_spec = Window.partitionBy("medallion").orderBy("pickup_unix")

# Get the previous trip's dropoff time (Unix timestamp)
taxi_df = taxi_df_with_time.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec))

# Compute idle time only when a previous trip exists
taxi_df = taxi_df.withColumn(
    "idle_time",
    when(col("prev_dropoff_unix").isNotNull(),
         when((col("pickup_unix") - col("prev_dropoff_unix")) <= four_hours_in_seconds,
              col("pickup_unix") - col("prev_dropoff_unix")
         ).otherwise(0)
    ).otherwise(0)
)



In [15]:
# Group by taxi to calculate total trip time and total idle time
utilization_df = taxi_df.groupBy("medallion").agg(
    spark_sum("duration").alias("total_trip_time"),
    spark_sum("idle_time").alias("total_idle_time")
)


# Calculate utilization rate
utilization_df = utilization_df.withColumn(
    "utilization_rate",
    col("total_trip_time") / (col("total_trip_time") + col("total_idle_time"))
)

In [None]:
utilization_df.head()

In [16]:
taxi_window = Window.partitionBy("medallion").orderBy("dropoff_unix")

# Get the next trip's pickup time within the same taxi
taxi_df = taxi_df.withColumn("next_pickup_unix", lead("pickup_unix").over(taxi_window))

# Calculate time to next fare, ensuring non-negative values
taxi_df = taxi_df.withColumn(
    "time_to_next_fare",
    when(
        (col("next_pickup_unix").isNotNull()) & (col("next_pickup_unix") >= col("dropoff_unix")),
        col("next_pickup_unix") - col("dropoff_unix")
    ).otherwise(None)  # Ignore invalid (negative) idle times
)

# Now, calculate the average time to next fare per borough
next_fare_df = taxi_df \
    .filter(col("time_to_next_fare").isNotNull()) \
    .groupBy("dropoff_borough") \
    .agg(avg("time_to_next_fare").alias("avg_time_to_next_fare"))

In [None]:
utilization_df.count()

In [17]:

# Count Trips Within the Same Borough
same_borough_df = taxi_df.filter(col("pickup_borough") == col("dropoff_borough"))
same_borough_count = same_borough_df.groupBy("pickup_borough").agg(count("medallion").alias("same_borough_trips"))


In [18]:
# Count Trips Between Different Boroughs
diff_borough_df = taxi_df.filter(col("pickup_borough") != col("dropoff_borough"))
diff_borough_count = diff_borough_df.groupBy("pickup_borough", "dropoff_borough").agg(count("medallion").alias("cross_borough_trips"))

## Results

### Query 1

In [19]:
# First print the explain to see the strategy taken
utilization_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [medallion#0, total_trip_time#116L, total_idle_time#118L, (cast(total_trip_time#116L as double) / cast((total_trip_time#116L + total_idle_time#118L) as double)) AS utilization_rate#122]
   +- HashAggregate(keys=[medallion#0], functions=[sum(duration#59L), sum(idle_time#86L)])
      +- HashAggregate(keys=[medallion#0], functions=[partial_sum(duration#59L), partial_sum(idle_time#86L)])
         +- Project [medallion#0, duration#59L, CASE WHEN isnotnull(prev_dropoff_unix#72L) THEN CASE WHEN ((pickup_unix#36L - prev_dropoff_unix#72L) <= 14400) THEN (pickup_unix#36L - prev_dropoff_unix#72L) ELSE 0 END ELSE 0 END AS idle_time#86L]
            +- Window [lag(dropoff_unix#47L, -1, null) windowspecdefinition(medallion#0, pickup_unix#36L ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS prev_dropoff_unix#72L], [medallion#0], [pickup_unix#36L ASC NULLS FIRST]
               +- Sort [medallion#0 ASC NULLS FIRST, pickup_un

In [21]:
utilization_df.head()

                                                                                

Row(medallion='06EAD4C8D98202F1E2D7057F2899CFE5', total_trip_time=11571113, total_idle_time=14932563, utilization_rate=0.4365852118023175)

In [22]:
utilization_df.summary().show()

25/03/10 19:35:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+--------------------+------------------+--------------------+--------------------+
|summary|           medallion|   total_trip_time|     total_idle_time|    utilization_rate|
+-------+--------------------+------------------+--------------------+--------------------+
|  count|               13950|             13950|               13950|               13950|
|   mean|                NULL|  9369729.37655914|1.0493901201433692E7| 0.48591915121191526|
| stddev|                NULL|2934460.2045294577|  3768215.3724788963| 0.08895252777518448|
|    min|00005007A9F30E289...|                 1|                   0|0.007591491799756499|
|    25%|                NULL|           8435580|             8870100|  0.4483242113031179|
|    50%|                NULL|          10126844|            11188662| 0.47429782186649366|
|    75%|                NULL|          11290790|            13151771|  0.5039663927829022|
|    max|FFFECF75AB6CC4FF9...|          14555220|            18421293|          

In [23]:
# Show Results
next_fare_df.show()
same_borough_count.show()
diff_borough_count.show()

                                                                                

+---------------+---------------------+
|dropoff_borough|avg_time_to_next_fare|
+---------------+---------------------+
|         Queens|    5452.720278444093|
|        Unknown|   1495.1725602356166|
|      Manhattan|     5703.44469598965|
|  Staten Island|         7951.6190625|
+---------------+---------------------+



                                                                                

+--------------+------------------+
|pickup_borough|same_borough_trips|
+--------------+------------------+
|        Queens|           2248833|
|       Unknown|         157850253|
| Staten Island|              3179|
|     Manhattan|                57|
+--------------+------------------+



ERROR:root:KeyboardInterrupt while sending command.            (359 + 24) / 500]
Traceback (most recent call last):
  File "/gpfs/helios/home/fidankarimova/myenv/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/gpfs/helios/home/fidankarimova/myenv/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/gpfs/space/software/jupyterhub/python3.9-rhel9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [20]:
diff_borough_count.show()

                                                                                

+--------------+---------------+-------------------+
|pickup_borough|dropoff_borough|cross_borough_trips|
+--------------+---------------+-------------------+
|       Unknown|  Staten Island|              24341|
|     Manhattan|        Unknown|               1010|
|        Queens|      Manhattan|                731|
|       Unknown|      Manhattan|               5402|
|        Queens|        Unknown|            6324787|
|        Queens|  Staten Island|               7710|
|       Unknown|         Queens|            6195547|
| Staten Island|        Unknown|                650|
| Staten Island|         Queens|                 46|
|     Manhattan|         Queens|                 15|
+--------------+---------------+-------------------+



In [21]:
utilization_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [medallion#0, total_trip_time#116L, total_idle_time#118L, (cast(total_trip_time#116L as double) / cast((total_trip_time#116L + total_idle_time#118L) as double)) AS utilization_rate#122]
   +- HashAggregate(keys=[medallion#0], functions=[sum(duration#59L), sum(idle_time#86L)])
      +- HashAggregate(keys=[medallion#0], functions=[partial_sum(duration#59L), partial_sum(idle_time#86L)])
         +- Project [medallion#0, duration#59L, CASE WHEN isnotnull(prev_dropoff_unix#72L) THEN CASE WHEN ((pickup_unix#36L - prev_dropoff_unix#72L) <= 14400) THEN (pickup_unix#36L - prev_dropoff_unix#72L) ELSE 0 END ELSE 0 END AS idle_time#86L]
            +- Window [lag(dropoff_unix#47L, -1, null) windowspecdefinition(medallion#0, pickup_unix#36L ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS prev_dropoff_unix#72L], [medallion#0], [pickup_unix#36L ASC NULLS FIRST]
               +- Sort [medallion#0 ASC NULLS FIRST, pickup_un

## Basic EDA

In [22]:
utilization_df.summary().show()

25/03/10 22:17:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 15:>                                                         (0 + 1) / 1]

+-------+--------------------+------------------+--------------------+--------------------+
|summary|           medallion|   total_trip_time|     total_idle_time|    utilization_rate|
+-------+--------------------+------------------+--------------------+--------------------+
|  count|               13950|             13950|               13950|               13950|
|   mean|                NULL|  9369729.37655914|1.0493901201433692E7| 0.48591915121191526|
| stddev|                NULL|2934460.2045294577|  3768215.3724788963| 0.08895252777518448|
|    min|00005007A9F30E289...|                 1|                   0|0.007591491799756499|
|    25%|                NULL|           8435580|             8870100|  0.4483242113031179|
|    50%|                NULL|          10126844|            11188662| 0.47429782186649366|
|    75%|                NULL|          11290790|            13151771|  0.5039663927829022|
|    max|FFFECF75AB6CC4FF9...|          14555220|            18421293|          

                                                                                

In [23]:
df_with_hours_and_minutes = utilization_df.withColumn('trip_time_hours', col('total_trip_time') / 3600) \
                  .withColumn('idle_time_hours', col('total_idle_time') / 3600) \
                  .withColumn('idle_time_minutes', col('total_idle_time') / 60) \
                  .withColumn('trip_time_minutes', col('total_trip_time') / 60)

In [24]:
[field.dataType.simpleString() for field in df_with_hours_and_minutes.schema.fields]

['string',
 'bigint',
 'bigint',
 'double',
 'double',
 'double',
 'double',
 'double']

In [26]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [27]:
numeric_cols = [field.name for field in df_with_hours_and_minutes.schema.fields 
                if field.dataType.simpleString() in ['bigint', 'double', 'int', 'long', 'float']]

# Create a summary DataFrame
summary = df_with_hours_and_minutes.select(numeric_cols).summary(
    "count", "mean", "stddev", "min", "25%", "50%", "75%", "max"
).toPandas()

# Set the summary column as index for better display
summary.set_index('summary', inplace=True)

# Display in Jupyter notebook
display(summary)

                                                                                

Unnamed: 0_level_0,total_trip_time,total_idle_time,utilization_rate,trip_time_hours,idle_time_hours,idle_time_minutes,trip_time_minutes
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
count,13950.0,13950.0,13950.0,13950.0,13950.0,13950.0,13950.0
mean,9369729.37655914,10493901.201433692,0.4859191512119152,2602.7026045997586,2914.972555953806,174898.353357228,156162.15627598573
stddev,2934460.2045294577,3768215.372478896,0.0889525277751844,815.1278345915168,1046.7264923552495,62803.589541315,48907.67007549101
min,1.0,0.0,0.0075914917997564,0.0002777777777777778,0.0,0.0,0.0166666666666666
25%,8435580.0,8870100.0,0.4483242113031179,2343.2166666666667,2463.9166666666665,147835.0,140593.0
50%,10126844.0,11188662.0,0.4742978218664936,2813.012222222222,3107.9616666666666,186477.7,168780.73333333334
75%,11290790.0,13151771.0,0.5039663927829022,3136.330555555556,3653.2697222222223,219196.1833333333,188179.8333333333
max,14555220.0,18421293.0,1.0,4043.116666666667,5117.025833333333,307021.55,242587.0
