In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, window, unix_timestamp
import time

In [2]:
# Initialize Spark Session (for Jupyter Notebook)
spark = SparkSession.builder.appName("FrequentRoutes").getOrCreate()

In [3]:
df = spark.read.parquet("Data/cleaned_data.parquet/cleaned_data.parquet")

In [4]:
# Show sample data
df.show(5)

+---------+--------------------+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|kafka_key|     kafka_timestamp|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+---------+--------------------+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|     NULL|2025-03-28 17:45:...|6C8C5507F1928059F...|10AC7E695DB02A51B...|2013-05-07 18:15:53|2013-05-07 18:22:48| 

In [5]:
# Filter out null values and invalid trips
df = df.filter(
    (col("pickup_longitude").isNotNull()) & (col("pickup_latitude").isNotNull()) &
    (col("dropoff_longitude").isNotNull()) & (col("dropoff_latitude").isNotNull()) &
    (col("trip_time_in_secs") > 0) & (col("trip_distance") > 0)
)

# Convert dropoff_datetime to a timestamp
df = df.withColumn("dropoff_time", unix_timestamp(col("dropoff_datetime")))

# Show cleaned data
df.show(5)


+---------+--------------------+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+------------+
|kafka_key|     kafka_timestamp|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|dropoff_time|
+---------+--------------------+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+------------+
|     NULL|2025-03-28 17:45:...|6C8C5507F1928059F...|10AC7E695DB02A51B...|20

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Function to compute grid cell ID
def get_cell_id(lat, lon):
    base_lat, base_lon = 41.474937, -74.913585  # NYC reference point
    cell_size = 0.0045  # Approx. 500m in degrees

    cell_x = int((lon - base_lon) / cell_size) + 1
    cell_y = int((base_lat - lat) / cell_size) + 1
    return f"{cell_x}.{cell_y}"

In [7]:
# Register UDF
grid_udf = udf(get_cell_id, StringType())

# Add columns for start and end grid cells
df = df.withColumn("start_cell", grid_udf(col("pickup_latitude"), col("pickup_longitude")))
df = df.withColumn("end_cell", grid_udf(col("dropoff_latitude"), col("dropoff_longitude")))

# Show updated dataframe with grid cells
df.select("start_cell", "end_cell").show(5)

+----------+--------+
|start_cell|end_cell|
+----------+--------+
|   210.158| 212.155|
|   202.170| 205.169|
|   207.164| 207.167|
|   210.160| 205.161|
|   205.166| 208.161|
+----------+--------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import col, count, max, window, expr

In [9]:
# Convert dropoff_time to timestamp
from pyspark.sql.functions import col, count, window, from_unixtime
df = df.withColumn("dropoff_time", from_unixtime(col("dropoff_time")).cast("timestamp"))

# Step 1: Find the time window with the most rides
windowed_counts = df.groupBy(window(col("dropoff_time"), "30 minutes")).agg(count("*").alias("num_rides"))
# Get the window with the highest number of rides
max_rides_window = windowed_counts.orderBy(col("num_rides").desc()).limit(1).collect()[0][0]

# Extract start and end of that time window
window_start, window_end = max_rides_window["start"], max_rides_window["end"]
# Step 2: Filter rides that happened in this peak 30-minute window
peak_time_df = df.filter((col("dropoff_time") >= window_start) & (col("dropoff_time") < window_end))

# Step 3: Group by start_cell and end_cell, counting the number of rides
routes = peak_time_df.groupBy(col("start_cell"), col("end_cell")).agg(count("*").alias("num_rides"))

In [10]:
# Step 4: Filter out routes where start_cell == end_cell
filtered_routes = routes.filter(col("start_cell") != col("end_cell"))

# Step 5: Get top 10 most frequent routes
top_routes = filtered_routes.orderBy(col("num_rides").desc()).limit(10)


In [11]:
# Show results
top_routes.show()

+----------+--------+---------+
|start_cell|end_cell|num_rides|
+----------+--------+---------+
|   206.165| 208.163|        3|
|   206.165| 209.162|        3|
|   207.162| 208.166|        3|
|   206.165| 205.162|        2|
|   204.168| 204.164|        2|
|   206.168| 202.171|        2|
|   204.168| 206.162|        2|
|   205.161| 207.164|        2|
|   206.168| 207.165|        2|
|   205.161| 207.157|        2|
+----------+--------+---------+



In [12]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

def query1_part2_batch(df):
    """Batch implementation of Part 2 requirements"""
    
    # 1. Prepare data with timestamps
    df = df.withColumn("dropoff_ts", col("dropoff_datetime").cast("timestamp"))
    
    # 2. Create time reference points every 1 minute (adjustable)
    min_time = df.select(min("dropoff_ts")).first()[0]
    max_time = df.select(max("dropoff_ts")).first()[0]
    
    # Generate time points at 1-minute intervals
    time_points = spark.range(
        int(min_time.timestamp()),
        int(max_time.timestamp()) + 60,
        60  # 1-minute intervals
    ).select(
        from_unixtime("id").alias("processing_time")
    )
    
    # 3. Join trips to time points (30-minute windows)
    windowed = df.join(
        time_points,
        (col("dropoff_ts") >= col("processing_time") - expr("INTERVAL 30 MINUTES")) & 
        (col("dropoff_ts") <= col("processing_time"))
    )
    
    # 4. Count routes per time window
    route_counts = windowed.groupBy(
        "processing_time", "start_cell", "end_cell"
    ).agg(
        count("*").alias("num_rides")
    )
    
    # 5. Rank routes within each window
    window_spec = Window.partitionBy("processing_time").orderBy(col("num_rides").desc())
    ranked_routes = route_counts.withColumn("rank", rank().over(window_spec)) \
                              .filter(col("rank") <= 10)
    
    # 6. Pivot to get the required output format
    pivot_df = ranked_routes.groupBy("processing_time").pivot("rank", range(1,11)).agg(
        first("start_cell").alias("start_cell"),
        first("end_cell").alias("end_cell")
    )
    
    # 7. Format final output
    output_cols = ["processing_time"]
    for i in range(1,11):
        output_cols.extend([
            col(f"{i}_start_cell").alias(f"start_cell_id_{i}"),
            col(f"{i}_end_cell").alias(f"end_cell_id_{i}")
        ])
    
    result = pivot_df.select(*output_cols).withColumn("delay", lit(0))
    
    return result



In [13]:
# Run the batch processing
part2_results = query1_part2_batch(df)


In [None]:
part2_results.show(truncate=False)