In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, window, unix_timestamp, max
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import math
import time

from delta import *
from delta.tables import *
from pyspark.sql.functions import col, to_json, struct, lit, current_timestamp, expr, when, from_json, window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    TimestampType,
    IntegerType,
)
import pandas as pd
import os
import uuid
import json

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Leading columns of every output record: the timestamps of the triggering
# trip followed by the measured processing delay.
schema_fields = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("delay", DoubleType()),
    )
]

# One (start, end) pair of cell-id columns per rank 1..10 of the top routes
for i in range(1, 11):
    schema_fields.extend([
        StructField(f"start_cell_id_{i}", StringType(), True),
        StructField(f"end_cell_id_{i}", StringType(), True),
    ])

# Final schema used when writing top-10 updates
output_schema = StructType(schema_fields)


In [2]:
# Create a consistent warehouse directory path. It is resolved to an absolute
# path once, so every table/checkpoint path derived from it keeps pointing at
# the same location even if the kernel's working directory changes later.
WAREHOUSE_DIR = os.path.abspath("./spark-warehouse")

# Create the schema definition for NYC taxi data
def create_raw_taxi_schema():
    """
    Build the full NYC taxi trip schema (DEBS Grand Challenge 2015 layout).

    Every column is nullable. The columns, in order, cover:
    - identifiers: medallion, hack_license
    - trip times: pickup_datetime, dropoff_datetime, trip_time_in_secs
    - trip distance: trip_distance
    - coordinates: pickup/dropoff longitude and latitude
    - payment: payment_type, fare_amount, surcharge, mta_tax, tip_amount,
      tolls_amount, total_amount

    Returns:
        StructType describing one raw taxi trip record.
    """
    columns = [
        # Taxi and driver identifiers (md5sums)
        ("medallion", StringType()),
        ("hack_license", StringType()),
        # Pickup / dropoff times and trip duration in seconds
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("trip_time_in_secs", IntegerType()),
        # Trip distance in miles
        ("trip_distance", DoubleType()),
        # Pickup coordinates
        ("pickup_longitude", DoubleType()),
        ("pickup_latitude", DoubleType()),
        # Dropoff coordinates
        ("dropoff_longitude", DoubleType()),
        ("dropoff_latitude", DoubleType()),
        # Payment method (credit card or cash) and amounts in dollars
        ("payment_type", StringType()),
        ("fare_amount", DoubleType()),
        ("surcharge", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
        # Total amount paid (calculated field, may be present)
        ("total_amount", DoubleType()),
    ]
    return StructType([StructField(name, dtype, True) for name, dtype in columns])

In [3]:
# Initialize Spark Session
def create_spark_session(app_name="FrequentRoutes"):
    """
    Build (or fetch) a local Spark session with Delta Lake and Hive support.

    The session runs on ``local[*]``, uses UTC as the SQL session time zone,
    stores tables under ``WAREHOUSE_DIR`` and carries the memory/timeout
    settings listed below. Log level is lowered to WARN and the effective
    warehouse directory and catalog implementation are printed for debugging.

    Args:
        app_name: Spark application name.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    settings = {
        # session behaviour
        "spark.sql.session.timeZone": "UTC",
        # timeouts / heartbeats
        "spark.network.timeout": "800s",
        "spark.sql.broadcastTimeout": "1000s",
        "spark.executor.heartbeatInterval": "60s",
        # delta + catalog
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.sql.warehouse.dir": WAREHOUSE_DIR,
        "spark.sql.catalogImplementation": "hive",
        # memory
        "spark.driver.memory": "48g",
        "spark.executor.memory": "36g",
        "spark.memory.offHeap.enabled": "true",
        "spark.memory.offHeap.size": "22g",
        "spark.driver.maxResultSize": "22g",
        "spark.memory.fraction": "0.8",
        # parallelism
        "spark.sql.shuffle.partitions": "200",
        "spark.default.parallelism": "200",
        # debugging
        "spark.sql.debug.maxToStringFields": 100,
    }

    builder = SparkSession.builder.appName(app_name).master("local[*]")
    for key, value in settings.items():
        builder = builder.config(key, value)
    builder = builder.enableHiveSupport()

    # let the delta pip helper add the Delta packages before the session starts
    session = configure_spark_with_delta_pip(builder).getOrCreate()

    # do not flood logs
    session.sparkContext.setLogLevel("WARN")

    # Print configs for debugging
    print(f"Warehouse directory: {session.conf.get('spark.sql.warehouse.dir')}")
    print(f"Catalog implementation: {session.conf.get('spark.sql.catalogImplementation')}")

    return session

In [4]:
# Initialize the Spark session. The module-level `spark` handle is what the
# streaming query and the Delta reads further down in this notebook use.
spark = create_spark_session()

:: loading settings :: url = jar:file:/gpfs/helios/home/andressebastian1/dev/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /gpfs/helios/home/andressebastian1/.ivy2/cache
The jars for the packages stored in: /gpfs/helios/home/andressebastian1/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f423c86e-d41c-452d-9869-17d5e1230b3a;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 128ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	------------------------------------------------------------

Warehouse directory: file:/gpfs/helios/home/andressebastian1/big-data-management/spark-warehouse
Catalog implementation: hive


In [5]:
# Step 1: Define table names, storage paths and checkpoint locations
CLEAN_TABLE_NAME = "clean_taxi_data"
FREQ_TABLE_NAME = "freq_taxi_data"
FREQ_TABLE_NAME_2 = "freq_taxi_data_part_2"

# Delta table locations inside the warehouse directory
CLEAN_OUTPUT_PATH = os.path.join(WAREHOUSE_DIR, CLEAN_TABLE_NAME)
FREQ_OUTPUT_PATH = os.path.join(WAREHOUSE_DIR, FREQ_TABLE_NAME)
FREQ_OUTPUT_PATH_2 = os.path.join(WAREHOUSE_DIR, FREQ_TABLE_NAME_2)

# Each streaming query needs its own checkpoint directory
CHECKPOINT_DIR = os.path.join(WAREHOUSE_DIR, "streaming/checkpoints")

CLEAN_CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "clean_taxi_data")
FREQ_CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "frequent_routes")
# Defined once with its effective value. Previously this name was first bound
# to the same "frequent_routes" directory as FREQ_CHECKPOINT_PATH and then
# silently rebound to "frequent_routes_2" by a chained assignment.
FREQ_ROUTES_CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "frequent_routes_2")

# Inputs/outputs of the part-2 streaming query
SOURCE_DELTA_TABLE = CLEAN_OUTPUT_PATH
OUTPUT_DELTA_TABLE = FREQ_OUTPUT_PATH_2
CHECKPOINT_LOCATION = FREQ_ROUTES_CHECKPOINT_PATH



In [6]:
# Grid geometry used to map GPS coordinates onto route cells
ORIGIN_LAT = 41.474937   # latitude assigned to cell 1.1
ORIGIN_LON = -74.913585  # longitude assigned to cell 1.1
CELL_SIZE = 500          # edge length of one grid cell, in meters
MAX_CELL = 300           # highest cell index per axis (300 cells x 500 m = 150 km)

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import time
import math

def query1_part2_streaming(window_duration="30 minutes", slide_duration=None):
    """
    Streaming implementation for Query 1 Part 2 - detect and output changes
    in the top 10 most frequent routes of a 30-minute window.

    The clean taxi Delta table is read as a stream, pickup and dropoff
    coordinates are mapped to grid cells, and rides are counted per
    (start_cell, end_cell) route and event-time window on ``dropoff_datetime``.
    Each micro-batch is passed to a ``foreachBatch`` callback that appends one
    row to ``OUTPUT_DELTA_TABLE`` whenever the set of top 10 routes of the
    tracked window differs from the set written last time.

    Output columns (see ``output_schema``): pickup_datetime, dropoff_datetime,
    delay, start_cell_id_1, end_cell_id_1, ..., start_cell_id_10, end_cell_id_10.
    ``delay`` is the number of seconds between the ingestion timestamp of the
    triggering route and the moment the output row is built.

    Args:
        window_duration: Length of the event-time window.
        slide_duration: Optional slide interval (e.g. ``"5 minutes"``).
            ``None`` keeps the original behaviour: non-overlapping (tumbling)
            windows of ``window_duration``.

    Returns:
        The started streaming query handle.
    """

    # UDF that builds a cell ID in the format "x.y"
    @F.udf(returnType=StringType())
    def get_cell_id(x, y):
        if x is None or y is None:
            return None
        return f"{x}.{y}"

    # UDFs converting coordinates to 1-based grid indices.
    # NOTE(review): the DEBS 2015 task description appears to define the origin
    # coordinate as the *centre* of cell 1.1, while this code treats it as the
    # cell's corner - confirm whether a half-cell offset is wanted.
    @F.udf(returnType=IntegerType())
    def lat_to_cell_y(lat):
        # Convert latitude to cell Y coordinate (counted southwards)
        if lat is None:
            return None
        meters_south = (ORIGIN_LAT - lat) * 111000  # Approx conversion to meters
        # floor() instead of int(): int() truncates toward zero, which mapped
        # points up to one cell NORTH of the origin into row 1 instead of
        # rejecting them as outside the grid.
        cell_y = math.floor(meters_south / CELL_SIZE) + 1
        return cell_y if 1 <= cell_y <= MAX_CELL else None

    @F.udf(returnType=IntegerType())
    def lon_to_cell_x(lon, lat):
        # Convert longitude to cell X coordinate (counted eastwards)
        if lon is None or lat is None:
            return None
        # Account for longitude distance variation with latitude
        meters_east = (lon - ORIGIN_LON) * 111000 * math.cos(math.radians(lat))
        # floor() for the same reason as above (points just WEST of the origin)
        cell_x = math.floor(meters_east / CELL_SIZE) + 1
        return cell_x if 1 <= cell_x <= MAX_CELL else None

    df = spark.readStream.format("delta").load(SOURCE_DELTA_TABLE)

    # Stamp every record with the time it was read; used for the delay metric
    df = df.withColumn("ingestion_time", F.current_timestamp())

    # Ensure timestamp columns are properly formatted
    clean_df = df.withColumn(
        "pickup_datetime", F.to_timestamp("pickup_datetime")
    ).withColumn(
        "dropoff_datetime", F.to_timestamp("dropoff_datetime")
    )

    # Apply the UDFs to convert coordinates to grid cells
    routes_df = clean_df.withColumn(
        "start_cell_x", lon_to_cell_x(F.col("pickup_longitude"), F.col("pickup_latitude"))
    ).withColumn(
        "start_cell_y", lat_to_cell_y(F.col("pickup_latitude"))
    ).withColumn(
        "end_cell_x", lon_to_cell_x(F.col("dropoff_longitude"), F.col("dropoff_latitude"))
    ).withColumn(
        "end_cell_y", lat_to_cell_y(F.col("dropoff_latitude"))
    )

    # Filter outliers - the UDFs return NULL for anything outside (1,1)..(300,300)
    valid_routes_df = routes_df.filter(
        (F.col("start_cell_x").isNotNull()) &
        (F.col("start_cell_y").isNotNull()) &
        (F.col("end_cell_x").isNotNull()) &
        (F.col("end_cell_y").isNotNull())
    )

    # Create route identifiers in X.Y format
    cell_routes_df = valid_routes_df.withColumn(
        "start_cell", get_cell_id(F.col("start_cell_x"), F.col("start_cell_y"))
    ).withColumn(
        "end_cell", get_cell_id(F.col("end_cell_x"), F.col("end_cell_y"))
    )

    # Event-time window on dropoff_datetime. Without a slide interval this is a
    # tumbling window (the default, as before); pass slide_duration to get
    # overlapping windows that produce more frequent updates.
    if slide_duration is None:
        time_window = F.window("dropoff_datetime", window_duration)
    else:
        time_window = F.window("dropoff_datetime", window_duration, slide_duration)

    # NOTE(review): the query runs in "complete" output mode below; per the
    # Structured Streaming guide the watermark is not used to drop aggregation
    # state in that mode, so state grows with the number of windows - confirm
    # this is acceptable for the input size.
    windowed_routes = cell_routes_df.withWatermark("dropoff_datetime", "30 minutes") \
        .groupBy(
            time_window,
            "start_cell",
            "end_cell"
        ) \
        .agg(
            F.count("*").alias("Number_of_Rides"),
            F.max("pickup_datetime").alias("latest_pickup"),
            F.max("dropoff_datetime").alias("latest_dropoff"),
            F.max("ingestion_time").alias("latest_ingestion")
        )

    # State shared between micro-batches
    previous_top_routes = None  # top routes at the time of the last output
    latest_window = None        # end timestamp of the window being tracked

    def detect_changes_and_output(batch_df, batch_id):
        """
        Process each micro-batch to detect changes in top 10 routes
        and output updates when changes occur.

        batch_id is required by the foreachBatch signature and is not used.
        """
        nonlocal previous_top_routes
        nonlocal latest_window

        if batch_df.isEmpty():
            return

        # Pick the window to track. This happens only once: the window end at
        # the 75th percentile of the window ends present in the first non-empty
        # batch is chosen and then kept for the lifetime of the query.
        # NOTE(review): because the window is never advanced, later windows are
        # ignored; taking the max window end per batch would follow the most
        # recent window instead - confirm which behaviour is intended.
        if latest_window is None:
            window_ends = sorted(
                row["end"]
                for row in batch_df.select("window.end").distinct().collect()
            )

            if not window_ends:
                return

            # Index of the 75th percentile, clamped to the last element
            percentile_idx = min(int(len(window_ends) * 0.75), len(window_ends) - 1)
            latest_window = window_ends[percentile_idx]
            print(latest_window)

        # Filter to data from the tracked window
        window_data = batch_df.filter(F.col("window.end") == latest_window)

        # Top 10 routes by ride count. The cell ids act as a tie-break so that
        # routes with equal counts are picked deterministically; otherwise an
        # arbitrary choice among tied routes can look like a "change" between
        # two batches even though nothing changed.
        top_routes_df = window_data.orderBy(
            F.col("Number_of_Rides").desc(),
            F.col("start_cell"),
            F.col("end_cell")
        ).limit(10)

        # Tuples of (start, end, rides, latest_pickup, latest_dropoff, latest_ingestion)
        current_top_routes = [
            (
                row["start_cell"],
                row["end_cell"],
                row["Number_of_Rides"],
                row["latest_pickup"],
                row["latest_dropoff"],
                row["latest_ingestion"]
            )
            for row in top_routes_df.collect()
        ]

        # A change means the SET of routes differs; reordering within the
        # top 10 does not trigger an output. The first batch always outputs.
        if previous_top_routes is not None:
            current_route_set = set((r[0], r[1]) for r in current_top_routes)
            previous_route_set = set((r[0], r[1]) for r in previous_top_routes)
            if current_route_set == previous_route_set:
                return

        # Find the route with the most recent dropoff; its timestamps are
        # reported as the event that triggered the update.
        latest_event = None
        for route in current_top_routes:
            if route[4] is not None:  # latest_dropoff
                if latest_event is None or route[4] > latest_event[4]:
                    latest_event = route

        if latest_event:
            pickup_time = latest_event[3]     # latest_pickup
            dropoff_time = latest_event[4]    # latest_dropoff
            ingestion_time = latest_event[5]  # latest_ingestion

            # Delay between ingestion time and current processing time, in seconds
            output_time = time.time()
            delay = output_time - ingestion_time.timestamp()

            output_data = {
                "pickup_datetime": pickup_time,
                "dropoff_datetime": dropoff_time,
                "delay": delay
            }

            # Order by frequency, using the same tie-break as the Spark query
            sorted_routes = sorted(
                current_top_routes,
                key=lambda r: (-r[2], r[0], r[1])
            )

            # Add up to 10 routes to the output, padding with NULLs
            for i in range(1, 11):
                if i <= len(sorted_routes):
                    route = sorted_routes[i-1]
                    output_data[f"start_cell_id_{i}"] = route[0]
                    output_data[f"end_cell_id_{i}"] = route[1]
                else:
                    output_data[f"start_cell_id_{i}"] = None
                    output_data[f"end_cell_id_{i}"] = None

            # Create output DataFrame and append it to the Delta table
            output_df = spark.createDataFrame([output_data], schema=output_schema)
            output_df.write.format("delta").mode("append").save(OUTPUT_DELTA_TABLE)

        # Update the reference for the next comparison
        previous_top_routes = current_top_routes

    # Set up the streaming query with foreachBatch; "complete" mode hands the
    # callback the full aggregation result on every trigger.
    query = windowed_routes.writeStream \
        .outputMode("complete") \
        .foreachBatch(detect_changes_and_output) \
        .option("checkpointLocation", CHECKPOINT_LOCATION) \
        .start()

    return query

In [8]:
# Start the streaming query; the returned handle is kept so the stream can be inspected later
query_part_2 = query1_part2_streaming()

25/03/30 20:50:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


2013-10-01 22:30:00


In [12]:
# Show the current status of the streaming query (bare expression so the notebook displays it)
query_part_2.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

                                                                                

In [13]:
# Load the part-2 output Delta table as a static (non-streaming) DataFrame
batch_df = spark.read.format("delta").load(FREQ_OUTPUT_PATH_2)


In [14]:
# Peek at up to 10 top-route update rows written so far
batch_df.head(10)

[Row(pickup_datetime=datetime.datetime(2013, 10, 1, 21, 41), dropoff_datetime=datetime.datetime(2013, 10, 1, 22, 27, 27), delay=76.60892987251282, start_cell_id_1='158.152', end_cell_id_1='157.155', start_cell_id_2='160.156', end_cell_id_2='159.158', start_cell_id_3='158.152', end_cell_id_3='161.156', start_cell_id_4='160.157', end_cell_id_4='156.159', start_cell_id_5='162.155', end_cell_id_5='162.156', start_cell_id_6='152.169', end_cell_id_6='157.156', start_cell_id_7='158.161', end_cell_id_7='160.160', start_cell_id_8='154.162', end_cell_id_8='154.167', start_cell_id_9='159.158', end_cell_id_9='162.155', start_cell_id_10='157.158', end_cell_id_10='155.158')]

In [15]:
# Collect the output table to the driver as a pandas DataFrame.
# NOTE(review): the timestamps displayed from this frame are 3 hours earlier
# than the ones shown by head() above; presumably toPandas() renders them in
# the session time zone (UTC) while Row objects use the driver's local time
# zone - confirm before comparing the two outputs.
pandas_df = batch_df.toPandas()

In [16]:
# Display the collected output rows
pandas_df

Unnamed: 0,pickup_datetime,dropoff_datetime,delay,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,...,start_cell_id_6,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10
0,2013-10-01 18:41:00,2013-10-01 19:27:27,76.60893,158.152,157.155,160.156,159.158,158.152,161.156,160.157,...,152.169,157.156,158.161,160.16,154.162,154.167,159.158,162.155,157.158,155.158
