In [3]:
#######################################
# Part 0: Imports and Spark Session Setup
#######################################
import math, time, datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, udf, expr, when, lit, array, percentile_approx, desc
)
from pyspark.sql.types import (
    StructType, StructField, StringType, TimestampType,
    DoubleType, IntegerType, ArrayType
)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, window, unix_timestamp, max
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import math
import time

from delta import *
from delta.tables import *
from pyspark.sql.functions import col, to_json, struct, lit, current_timestamp, expr, when, from_json, window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    TimestampType,
    IntegerType,
)
import pandas as pd
import os
import uuid
import json

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [4]:
WAREHOUSE_DIR = "/home/jovyan/spark-warehouse"

def create_spark_session(app_name="FrequentRoutes"):
    """
    start spark session with kafka and delta support / memory config setup too
    """
    builder = SparkSession.builder.appName(app_name) \
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
        .config("spark.sql.session.timeZone", "UTC") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.sql.warehouse.dir", WAREHOUSE_DIR) \
        .config("spark.sql.catalogImplementation", "hive") \
        .config("spark.driver.memory", "5g") \
        .config("spark.executor.memory", "4g") \
        .config("spark.memory.offHeap.enabled", "true") \
        .config("spark.memory.offHeap.size", "2g") \
        .config("spark.driver.maxResultSize", "2g") \
        .config("spark.sql.shuffle.partitions", "100") \
        .config("spark.default.parallelism", "100") \
        .config("spark.memory.fraction", "0.8") \
        .config("spark.sql.debug.maxToStringFields", 100) \
        .enableHiveSupport()
    
    # delta config
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    
    # do not flood logs
    spark.sparkContext.setLogLevel("WARN")
    
    # Print configs for debugging
    print(f"Warehouse directory: {spark.conf.get('spark.sql.warehouse.dir')}")
    print(f"Catalog implementation: {spark.conf.get('spark.sql.catalogImplementation')}")
    
    return spark

spark = create_spark_session()


Warehouse directory: file:/home/jovyan/spark-warehouse
Catalog implementation: hive


In [5]:
#######################################
# Part 1: Define Schema and Read Cleaned Data as a Stream
#######################################
def create_raw_taxi_schema():
    return StructType([
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("trip_time_in_secs", IntegerType(), True),
        StructField("trip_distance", DoubleType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", DoubleType(), True),
        StructField("surcharge", DoubleType(), True),
        StructField("mta_tax", DoubleType(), True),
        StructField("tip_amount", DoubleType(), True),
        StructField("tolls_amount", DoubleType(), True),
        StructField("total_amount", DoubleType(), True),
    ])

raw_schema = create_raw_taxi_schema()
cleaned_parquet_path = "cleaned_data.parquet"


In [4]:
# Read the Parquet files as a stream; we simulate streaming with maxFilesPerTrigger.
df_raw = (spark.readStream
    .schema(raw_schema)
    .format("parquet")
    .option("maxFilesPerTrigger", 1)
    .load(cleaned_parquet_path)
)

In [5]:
#######################################
# Part 2: 250m x 250m Cell Mapping
#######################################
LAT_REF = 41.474937
LON_REF = -74.913585
EARTH_RADIUS = 6371000.0

def latlon_to_meters(lat, lon):
    lat_r = math.radians(lat)
    lon_r = math.radians(lon)
    lat_ref_r = math.radians(LAT_REF)
    lon_ref_r = math.radians(LON_REF)
    x = EARTH_RADIUS * (lon_r - lon_ref_r) * math.cos((lat_r + lat_ref_r)/2)
    y = EARTH_RADIUS * (lat_r - lat_ref_r)
    return (x, y)

def get_250m_cell(lat, lon):
    (x_m, y_m) = latlon_to_meters(lat, lon)
    cell_x = int(math.floor(x_m / 250.0)) + 1
    cell_y = int(math.floor((-1 * y_m) / 250.0)) + 1
    if 1 <= cell_x <= 600 and 1 <= cell_y <= 600:
        return [cell_x, cell_y]
    return None

@udf(ArrayType(IntegerType()))
def udf_get_250m_cell(lat, lon):
    if lat is None or lon is None:
        return None
    return get_250m_cell(lat, lon)

df_cells = (df_raw
    .withColumn("start_cell", udf_get_250m_cell(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("end_cell", udf_get_250m_cell(col("dropoff_latitude"), col("dropoff_longitude")))
    .withColumn("start_cell_x", expr("start_cell[0]"))
    .withColumn("start_cell_y", expr("start_cell[1]"))
    .withColumn("end_cell_x", expr("end_cell[0]"))
    .withColumn("end_cell_y", expr("end_cell[1]"))
    .filter(col("start_cell_x").isNotNull() & col("end_cell_x").isNotNull())
)


In [6]:
#######################################
# Part 3: Ephemeral Python State for Empty Taxi Tracking and Profit Aggregation
#######################################
# Global dictionary to track for each taxi its last dropoff state:
dropoff_state = {}  # key: medallion, value: (dropoff_datetime, end_cell_x, end_cell_y)
last_top10 = None   # to track previous top 10 result

def median(values):
    if not values:
        return None
    s = sorted(values)
    n = len(s)
    mid = n // 2
    return s[mid] if n % 2 == 1 else (s[mid-1] + s[mid]) / 2.0

In [15]:
def process_microbatch(batch_df, batch_id):
    global dropoff_state, last_top10

    read_time = time.time()

    
    # We'll do row-by-row iteration using toLocalIterator
    now_ts = datetime.datetime.utcnow()
    cutoff_15 = now_ts - datetime.timedelta(minutes=15)
    cutoff_30 = now_ts - datetime.timedelta(minutes=30)
    
    # We'll store partial results in:
    # 1) profit_map: (start_cell_x, start_cell_y) -> list of fare+tip for trips ended < 15 min
    profit_map = {}
    
    # Track max dropoff time in this microbatch for "triggering" event:
    max_dropoff_t = None
    max_pickup_t_for_that = None
    
    # Row-by-row iteration
    for row in batch_df.toLocalIterator():
        medallion = row["medallion"]
        pickup_t = row["pickup_datetime"]
        dropoff_t = row["dropoff_datetime"]
        start_x = row["start_cell_x"]
        start_y = row["start_cell_y"]
        end_x = row["end_cell_x"]
        end_y = row["end_cell_y"]
        fare = row["fare_amount"] or 0.0
        tip = row["tip_amount"] or 0.0
        fare_plus_tip = fare + tip
        
        # Update "triggering" event times
        if dropoff_t and (max_dropoff_t is None or dropoff_t > max_dropoff_t):
            max_dropoff_t = dropoff_t
            max_pickup_t_for_that = pickup_t
        
        # 1) Update dropoff_state
        old_val = dropoff_state.get(medallion, None)
        
        # If there's a new dropoff, store it
        if dropoff_t is not None:
            dropoff_state[medallion] = (dropoff_t, end_x, end_y)
        
        # If there's a pickup that is after the last dropoff, taxi is no longer empty
        if pickup_t and old_val:
            (old_drop_time, old_cell_x, old_cell_y) = old_val
            if pickup_t > old_drop_time:
                # remove from dropoff_state
                dropoff_state[medallion] = None
                if medallion in dropoff_state:
                    del dropoff_state[medallion]
        
        # 2) For the 15-min median, if dropoff_t <= now & dropoff_t >= cutoff_15
        if dropoff_t:
            if dropoff_t >= cutoff_15:
                k = (start_x, start_y)
                profit_map.setdefault(k, []).append(fare_plus_tip)
    
    # Now build empty_count_map from the ephemeral dropoff_state
    empty_count_map = {}
    for med, val in dropoff_state.items():
        if val is None:
            continue
        (d_time, cellx, celly) = val
        if d_time >= cutoff_30:
            # still within 30 min
            k = (cellx, celly)
            empty_count_map[k] = empty_count_map.get(k, 0) + 1
    
    # Build a profit_result map from profit_map
    #  => (cell_x, cell_y) -> median_fare_tip
    profit_result = {}
    for (cx, cy), fares in profit_map.items():
        mm = median(fares)
        profit_result[(cx, cy)] = mm
    
    # Build final list: (cell_x, cell_y, empties, median_fare_tip, profitability)
    results = []
    for (cx, cy), med_val in profit_result.items():
        empties = empty_count_map.get((cx, cy), 0)
        if empties > 0 and med_val is not None:
            profit_val = med_val / empties
        else:
            profit_val = None
        results.append((cx, cy, empties, med_val, profit_val))
    
    # Sort descending by profitability
    results.sort(key=lambda x: (x[4] if x[4] is not None else 0), reverse=True)
    top10 = results[:10]
    
    # Compare with last_top10
    has_changed = False
    if not last_top10 and top10:
        has_changed = True
    elif last_top10 and len(top10) != len(last_top10):
        has_changed = True
    else:
        for i in range(len(top10)):
            if (top10[i][0] != last_top10[i][0] or
                top10[i][1] != last_top10[i][1]):
                has_changed = True
                break
    
    if has_changed and top10:
        # Build the single-row output
        # "pickup_datetime, dropoff_datetime" from the "triggering" trip
        trigger_pickup = max_pickup_t_for_that
        trigger_dropoff = max_dropoff_t
        
        output_row = {
            "pickup_datetime": str(trigger_pickup) if trigger_pickup else None,
            "dropoff_datetime": str(trigger_dropoff) if trigger_dropoff else None
        }
        
        # fill columns for top10
        for i in range(10):
            idx = i+1
            if i < len(top10):
                (cx, cy, ecount, medp, prof) = top10[i]
                output_row[f"profitable_cell_id_{idx}"] = f"{cx}.{cy}"
                output_row[f"empty_taxies_in_cell_id_{idx}"] = ecount
                output_row[f"median_profit_in_cell_id_{idx}"] = medp
                output_row[f"profitability_of_cell_{idx}"] = prof
            else:
                output_row[f"profitable_cell_id_{idx}"] = None
                output_row[f"empty_taxies_in_cell_id_{idx}"] = None
                output_row[f"median_profit_in_cell_id_{idx}"] = None
                output_row[f"profitability_of_cell_{idx}"] = None
        
        # # 'delay' = (now - hypothetical read_time). We'll just put a placeholder
        # delay_val = 1.0
        # output_row["delay"] = delay_val

        # Actual delay measurement: difference between output_time and read_time
        output_time = time.time()
        delay_val = output_time - read_time
        output_row["delay"] = delay_val
        
        print("\n=== TOP 10 CHANGED === (batch_id = {})".format(batch_id))
        print(output_row)
        
        last_top10 = top10

In [None]:
#######################################
# Part 5: Launch the Streaming Query
#######################################
query = (df_cells
    .writeStream
    .outputMode("append")  # We simply need micro-batch triggers
    .format("console")     # Using console sink for demonstration
    .trigger(processingTime="10 seconds")
    .foreachBatch(process_microbatch)
    .start()
)

query.awaitTermination()

In [16]:
test_query = (df_raw.writeStream
    .outputMode("append")
    .format("console")
    .trigger(processingTime="10 seconds")
    .start())
test_query.awaitTermination(30)
test_query.stop()


25/03/29 14:46:15 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f8a218ff-2ce9-43fc-94fb-3368a415f0b4. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/29 14:46:15 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|medallion|hack_license|pickup_datetime|dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
+---------+------------+---------------+----------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+-------

25/03/29 14:46:23 ERROR Executor: Exception in task 11.0 in stage 1.0 (TID 12)4]
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2868/0x00000008411b7840.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1873)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataM

Py4JError: An error occurred while calling o83.awaitTermination

In [6]:
def create_raw_taxi_schema():
    return StructType([
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("trip_time_in_secs", IntegerType(), True),
        StructField("trip_distance", DoubleType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", DoubleType(), True),
        StructField("surcharge", DoubleType(), True),
        StructField("mta_tax", DoubleType(), True),
        StructField("tip_amount", DoubleType(), True),
        StructField("tolls_amount", DoubleType(), True),
        StructField("total_amount", DoubleType(), True),
    ])


In [7]:
raw_schema = create_raw_taxi_schema()

# Define paths
PARQUET_PATH = "cleaned_data.parquet"        # Path to your static cleaned parquet file
TEST_OUTPUT_PATH = "test_delta_table"       # Output path for the Delta table

# Read the cleaned Parquet file (batch mode)
static_df = spark.read.schema(raw_schema).parquet(PARQUET_PATH)

# Write the static data to a Delta table (batch mode)
static_df.write.format("delta").mode("overwrite").save(TEST_OUTPUT_PATH)

# Now, read the Delta table as a stream.
# Note: We use format("delta") here.
df_raw = (spark.readStream
    .format("delta")
    .load(TEST_OUTPUT_PATH)
    .option("maxFilesPerTrigger", 1)
)

AttributeError: 'DataFrame' object has no attribute 'option'

In [3]:

# Read the cleaned data as a stream (simulate live streaming with maxFilesPerTrigger).
df_raw = (spark.readStream
    .schema(raw_schema)
    .format("parquet")
    .option("maxFilesPerTrigger", 1)
    .load(cleaned_parquet_path)
    # Uncomment the following line to process only a small fraction for testing:
    # .filter("rand() < 0.01")
)

#######################################
# Part 2: 250m x 250m Cell Mapping
#######################################
# Grid reference point: (41.474937, -74.913585); each cell is 250m x 250m.
LAT_REF = 41.474937
LON_REF = -74.913585
EARTH_RADIUS = 6371000.0  # in meters

def latlon_to_meters(lat, lon):
    lat_r = math.radians(lat)
    lon_r = math.radians(lon)
    lat_ref_r = math.radians(LAT_REF)
    lon_ref_r = math.radians(LON_REF)
    x = EARTH_RADIUS * (lon_r - lon_ref_r) * math.cos((lat_r + lat_ref_r) / 2)
    y = EARTH_RADIUS * (lat_r - lat_ref_r)
    return (x, y)

def get_250m_cell(lat, lon):
    (x_m, y_m) = latlon_to_meters(lat, lon)
    cell_x = int(math.floor(x_m / 250.0)) + 1
    cell_y = int(math.floor((-1 * y_m) / 250.0)) + 1
    if 1 <= cell_x <= 600 and 1 <= cell_y <= 600:
        return [cell_x, cell_y]
    return None

@udf(ArrayType(IntegerType()))
def udf_get_250m_cell(lat, lon):
    if lat is None or lon is None:
        return None
    return get_250m_cell(lat, lon)

df_cells = (df_raw
    .withColumn("start_cell", udf_get_250m_cell(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("end_cell", udf_get_250m_cell(col("dropoff_latitude"), col("dropoff_longitude")))
    .withColumn("start_cell_x", expr("start_cell[0]"))
    .withColumn("start_cell_y", expr("start_cell[1]"))
    .withColumn("end_cell_x", expr("end_cell[0]"))
    .withColumn("end_cell_y", expr("end_cell[1]"))
    .filter(col("start_cell_x").isNotNull() & col("end_cell_x").isNotNull())
)

#######################################
# Part 3: Windowed Aggregations for Profit and Empty Taxi Count
#######################################
# Aggregation A: Profit Aggregation
# Compute the median of (fare_amount + tip_amount) for trips that ended in the last 15 minutes,
# grouped by the start cell.
df_profit = (df_cells
    .withColumn("fare_plus_tip", col("fare_amount") + col("tip_amount"))
    .withWatermark("dropoff_datetime", "20 minutes")
    .groupBy(
        window(col("dropoff_datetime"), "15 minutes"),
        col("start_cell_x"), col("start_cell_y")
    )
    .agg(
        percentile_approx("fare_plus_tip", 0.5).alias("median_fare_tip"),
        spark_max("pickup_datetime").alias("trigger_pickup"),
        spark_max("dropoff_datetime").alias("trigger_dropoff")
    )
    .select(
        col("window.start").alias("profit_window_start"),
        col("window.end").alias("profit_window_end"),
        col("start_cell_x"),
        col("start_cell_y"),
        col("median_fare_tip"),
        col("trigger_pickup"),
        col("trigger_dropoff")
    )
)


profit_query = (
    df_profit
    .writeStream
    .outputMode("append")          # Or "update", depending on your aggregation logic
    .format("console")
    .option("truncate", False)
    .start()
)

# Aggregation B: Empty Taxi Count
# Use approx_count_distinct on medallion for dropoffs in the last 30 minutes,
# grouped by the dropoff cell.
df_empty = (df_cells
    .withWatermark("dropoff_datetime", "35 minutes")
    .groupBy(
        window(col("dropoff_datetime"), "30 minutes"),
        col("end_cell_x").alias("cell_x"),
        col("end_cell_y").alias("cell_y")
    )
    .agg(
        approx_count_distinct("medallion").alias("empty_taxis")
    )
    .select(
        col("window.start").alias("empty_window_start"),
        col("window.end").alias("empty_window_end"),
        col("cell_x"),
        col("cell_y"),
        col("empty_taxis")
    )
)

empty_query = (
    df_empty
    .writeStream
    .outputMode("append")          # Or "update", depending on your aggregation logic
    .format("console")
    .option("truncate", False)
    .start()
)

# Join the two aggregations on cell coordinates.
df_joined = (df_profit.join(
    df_empty,
    (df_profit.start_cell_x == df_empty.cell_x) & (df_profit.start_cell_y == df_empty.cell_y),
    "inner"
)
.select(
    df_profit.trigger_pickup,
    df_profit.trigger_dropoff,
    df_profit.start_cell_x,
    df_profit.start_cell_y,
    df_profit.median_fare_tip,
    df_empty.empty_taxis
)
.withColumn("profitability", when(col("empty_taxis") > 0, col("median_fare_tip")/col("empty_taxis")).otherwise(None))
)


joined_query = (
    df_joined
    .writeStream
    .outputMode("append")          # Or "update", depending on your aggregation logic
    .format("console")
    .option("truncate", False)
    .start()
)

#######################################
# Part 4: foreachBatch to Print Top 10 Results
#######################################
def process_batch(batch_df, batch_id):
    # Order by profitability descending and limit to top 10.
    rows = batch_df.orderBy(desc("profitability")).limit(10).collect()
    if rows:
        output = {}
        # Use trigger times from the first row as the "trigger" event.
        trigger_pickup = rows[0]["trigger_pickup"]
        trigger_dropoff = rows[0]["trigger_dropoff"]
        output["pickup_datetime"] = str(trigger_pickup) if trigger_pickup else None
        output["dropoff_datetime"] = str(trigger_dropoff) if trigger_dropoff else None
        for i, row in enumerate(rows, start=1):
            cell_id = f"{row['start_cell_x']}.{row['start_cell_y']}"
            output[f"profitable_cell_id_{i}"] = cell_id
            output[f"empty_taxies_in_cell_id_{i}"] = row["empty_taxis"]
            output[f"median_profit_in_cell_id_{i}"] = row["median_fare_tip"]
            output[f"profitability_of_cell_{i}"] = row["profitability"]
        # Fill remaining ranks up to 10 with NULL if needed.
        for i in range(len(rows)+1, 11):
            output[f"profitable_cell_id_{i}"] = None
            output[f"empty_taxies_in_cell_id_{i}"] = None
            output[f"median_profit_in_cell_id_{i}"] = None
            output[f"profitability_of_cell_{i}"] = None
        # Compute delay as a placeholder.
        output["delay"] = 1.0
        print("\n=== Top 10 Profitable Areas (Batch {}) ===".format(batch_id))
        print(output)

# Write the joined stream using foreachBatch.
# IMPORTANT: We use outputMode "append" because joins between two streaming DataFrames
# require "append" mode.
query = (df_joined.writeStream
    .outputMode("append")
    .trigger(processingTime="10 seconds")
    .foreachBatch(process_batch)
    .start()
)

query.awaitTermination()


NameError: name 'cleaned_parquet_path' is not defined