# Checkpoint 1

## Installation of missing dependencies

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, count, avg, lead, sum as spark_sum
from pyspark.sql.window import Window
import json
from shapely.geometry import Point, Polygon, shape
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DoubleType
)


In [4]:
# Initialize Spark Session
spark = SparkSession.builder.appName("NYC Taxi Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.sql.shuffle.partitions", "100") \
    .config("spark.default.parallelism", "20") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

spark.sparkContext.setCheckpointDir("checkpoints")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/09 20:57:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/09 20:57:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Load taxi rides dataset

In [5]:
# Define schema for the data
schema = StructType(
    [
        StructField("medallion", StringType()),
        StructField("pickup_datetime", StringType()),
        StructField(
            "dropoff_datetime", StringType()
        ),
        StructField("pickup_longitude", DoubleType()),
        StructField("pickup_latitude", DoubleType()),
        StructField("dropoff_longitude", DoubleType()),
        StructField("dropoff_latitude", DoubleType()),
    ]
)


In [6]:
parquet_path = "input/prod/taxi_data.parquet"
taxi_df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .parquet(parquet_path)
)


In [7]:
## Checking the columns we have
taxi_df.columns

['medallion',
 'pickup_datetime',
 'dropoff_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

In [8]:
taxi_df.count()

                                                                                

173179759

In [9]:
taxi_df = taxi_df.repartition(200, "medallion")

In [10]:
# Load GeoJSON Data
borough_data = None
with open("./input/nyc-boroughs.geojson", "r") as file:
    borough_data = json.load(file)

# getting dictionary of boroughs for fast lookup
borough_polygons = {}
for feature in borough_data["features"]:
    borough_name = feature["properties"]["borough"]
    borough_geometry = shape(feature["geometry"])
    borough_polygons[borough_name] = borough_geometry


In [11]:
# check raw shapes in the dictionary
borough_polygons

{'Staten Island': <POLYGON ((-74.082 40.648, -74.081 40.649, -74.081 40.648, -74.08 40.648, -7...>,
 'Queens': <POLYGON ((-73.891 40.776, -73.891 40.777, -73.891 40.777, -73.891 40.777, -...>,
 'Brooklyn': <POLYGON ((-73.931 40.595, -73.932 40.594, -73.932 40.595, -73.931 40.595))>,
 'Manhattan': <POLYGON ((-73.907 40.876, -73.908 40.873, -73.908 40.873, -73.909 40.872, -...>,
 'Bronx': <POLYGON ((-73.804 40.813, -73.804 40.814, -73.804 40.814, -73.804 40.814, -...>}

In [12]:
# Broadcast GeoJSON Data to Spark
borough_broadcast = spark.sparkContext.broadcast(borough_polygons)

In [13]:
def get_borough(lon, lat):
    try:
        # basic check / catch is slower
        if lon is None or lat is None:
            return "Unknown"
        
        point = Point(lon, lat)

        for borough, polygon in borough_broadcast.value.items():
            if polygon.contains(point): # match found
                return borough

    except Exception as e:
        print(f"Error processing ({lon}, {lat}): {e}")  # Print error details

    return "Unknown"

# Register the function as a Spark UDF again
to_borough_udf = spark.udf.register("to_borough", get_borough, StringType())

In [14]:
# Checkpoint # 1

# Assign Borough Names to Pickup and Drop-off Locations
taxi_df = taxi_df.withColumn("pickup_borough", to_borough_udf(col("pickup_longitude"), col("pickup_latitude")))
taxi_df = taxi_df.withColumn("dropoff_borough", to_borough_udf(col("dropoff_longitude"), col("dropoff_latitude")))

In [16]:
# checkpoint 2

# Define the max idle time threshold (4 hours in seconds)
four_hours_in_seconds = 4 * 60 * 60
# Convert pickup and dropoff datetime to Unix timestamps
taxi_df = taxi_df.withColumn("pickup_unix", unix_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
taxi_df = taxi_df.withColumn("dropoff_unix", unix_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))

# Compute trip duration in seconds
taxi_df = taxi_df.withColumn("duration", col("dropoff_unix") - col("pickup_unix"))

# Filter out incorrect durations (negative durations or trips longer than 4 hours)
taxi_df = taxi_df.filter((col("duration") > 0) & (col("duration") <= four_hours_in_seconds))

In [None]:
summary_taxis = taxi_df.describe()
summary_taxis.show()

In [None]:
# Define window specification to order trips per taxi
window_spec = Window.partitionBy("medallion").orderBy("pickup_unix")

# Get the previous trip's dropoff time (Unix timestamp)
taxi_df = taxi_df.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec))

# Compute idle time only when a previous trip exists
taxi_df = taxi_df.withColumn(
    "idle_time",
    when(col("prev_dropoff_unix").isNotNull(),
         when((col("pickup_unix") - col("prev_dropoff_unix")) <= four_hours_in_seconds,
              col("pickup_unix") - col("prev_dropoff_unix")
         ).otherwise(0)
    ).otherwise(0)
)



In [None]:
# Group by taxi to calculate total trip time and total idle time
utilization_df = taxi_df.groupBy("medallion").agg(
    spark_sum("duration").alias("total_trip_time"),
    spark_sum("idle_time").alias("total_idle_time")
)


# Calculate utilization rate
utilization_df = utilization_df.withColumn(
    "utilization_rate",
    col("total_trip_time") / (col("total_trip_time") + col("total_idle_time"))
)

In [None]:
utilization_df.head()

In [None]:
taxi_window = Window.partitionBy("medallion").orderBy("dropoff_unix")

# Get the next trip's pickup time within the same taxi
taxi_df = taxi_df.withColumn("next_pickup_unix", lead("pickup_unix").over(taxi_window))

# Calculate time to next fare, ensuring non-negative values
taxi_df = taxi_df.withColumn(
    "time_to_next_fare",
    when(
        (col("next_pickup_unix").isNotNull()) & (col("next_pickup_unix") >= col("dropoff_unix")),
        col("next_pickup_unix") - col("dropoff_unix")
    ).otherwise(None)  # Ignore invalid (negative) idle times
)

# Now, calculate the average time to next fare per borough
next_fare_df = taxi_df \
    .filter(col("time_to_next_fare").isNotNull()) \
    .groupBy("dropoff_borough") \
    .agg(avg("time_to_next_fare").alias("avg_time_to_next_fare"))

In [None]:
utilization_df.count()

In [None]:

# Count Trips Within the Same Borough
same_borough_df = taxi_df.filter(col("pickup_borough") == col("dropoff_borough"))
same_borough_count = same_borough_df.groupBy("pickup_borough").agg(count("medallion").alias("same_borough_trips"))


In [None]:
# Count Trips Between Different Boroughs
diff_borough_df = taxi_df.filter(col("pickup_borough") != col("dropoff_borough"))
diff_borough_count = diff_borough_df.groupBy("pickup_borough", "dropoff_borough").agg(count("medallion").alias("cross_borough_trips"))

## Results

### Query 1

In [None]:
# First print the explain to see the strategy taken
utilization_df.explain()

In [None]:
utilization_df.head()

In [None]:
utilization_df.summary().show()

In [None]:
# Show Results
next_fare_df.show()
same_borough_count.show()
diff_borough_count.show()


In [None]:
utilization_df.explain()

## Basic EDA

In [None]:
utilization_df.summary().show()

In [None]:
df_with_hours_and_minutes = utilization_df.withColumn('trip_time_hours', col('total_trip_time') / 3600) \
                  .withColumn('idle_time_hours', col('total_idle_time') / 3600) \
                  .withColumn('idle_time_minutes', col('total_idle_time') / 60) \
                  .withColumn('trip_time_minutes', col('total_trip_time') / 60)

In [None]:
[field.dataType.simpleString() for field in df_with_hours_and_minutes.schema.fields]

In [None]:
numeric_cols = [field.name for field in df_with_hours_and_minutes.schema.fields 
                if field.dataType.simpleString() in ['bigint', 'double', 'int', 'long', 'float']]

# Create a summary DataFrame
summary = df_with_hours_and_minutes.select(numeric_cols).summary(
    "count", "mean", "stddev", "min", "25%", "50%", "75%", "max"
).toPandas()

# Set the summary column as index for better display
summary.set_index('summary', inplace=True)

# Display in Jupyter notebook
display(summary)