In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, lit, percentile_approx, unix_timestamp,
    lead, when, pandas_udf
)
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from pyspark.sql.window import Window
import math
import pandas as pd

In [11]:
# 1. Initialize Spark & Load Data
################################
# Obtain the Spark session for this notebook under a descriptive app name.
spark = (
    SparkSession.builder
    .appName("DEBS Query 2 - Profitable Areas")
    .getOrCreate()
)

# Load the trip records from Parquet and print the schema as a quick sanity check.
df = spark.read.format("parquet").load("/content/cleaned_data.parquet")
df.printSchema()

root
 |-- kafka_key: string (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [12]:
# Grid origin: get_cell_id measures its east/south offsets from this point.
START_LAT, START_LON = 41.474937, -74.913585

# Edge length of one grid cell, in meters.
CELL_SIZE_METERS = 250

# Meters spanned by one degree; approximate for NYC.
METERS_PER_DEGREE = 111320


In [13]:

def get_cell_id(lat, lon):
    """
    Convert (lat, lon) to a "row.col" grid-cell id.

    Offsets are measured in meters from (START_LAT, START_LON): eastward for
    columns and southward for rows. Each axis is divided into
    CELL_SIZE_METERS steps, and cells are numbered starting at 1.

    Args:
        lat: Latitude in degrees, or None/NaN when the value is missing.
        lon: Longitude in degrees, or None/NaN when the value is missing.

    Returns:
        The cell id as the string "row.col", or None when a coordinate is
        missing or non-finite, or when the point is outside the 600x600 grid.
    """
    if lat is None or lon is None:
        return None
    # Inside a pandas UDF, null doubles arrive as NaN rather than None.
    # NaN (or inf) would otherwise reach the integer conversion below and raise.
    if not (math.isfinite(lat) and math.isfinite(lon)):
        return None

    # East-west distance shrinks with latitude, hence the cos(lat) factor.
    dx = (lon - START_LON) * math.cos(math.radians(lat)) * METERS_PER_DEGREE
    dy = (START_LAT - lat) * METERS_PER_DEGREE

    # floor(), not int(): int() truncates toward zero, which would fold a point
    # up to one cell width west/north of the origin into row/col 1 instead of
    # rejecting it as outside the grid.
    col_ = math.floor(dx / CELL_SIZE_METERS) + 1
    row_ = math.floor(dy / CELL_SIZE_METERS) + 1

    if 1 <= row_ <= 600 and 1 <= col_ <= 600:
        return f"{row_}.{col_}"
    return None

# Vectorized UDF to handle entire columns
@pandas_udf("string")
def udf_get_cell_id(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """Apply get_cell_id pairwise to latitude/longitude Series and return the cell ids as a Series."""
    cell_ids = list(map(get_cell_id, lat, lon))
    return pd.Series(cell_ids)


In [14]:
################################
# 3. Add Grid Columns & Filter Valid Cells
################################
# Tag both endpoints of every trip with their grid cell id.
df = (
    df
    .withColumn("pickup_cell", udf_get_cell_id(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("dropoff_cell", udf_get_cell_id(col("dropoff_latitude"), col("dropoff_longitude")))
)

# Keep a trip only when both cell ids resolved (a null id means get_cell_id returned None).
df = df.dropna(how="any", subset=["pickup_cell", "dropoff_cell"])

In [15]:
# 4. Add Profit Column
################################
# Profit of a trip = fare plus tip; the result is null when either input is null.
df = df.withColumn(
    "profit",
    df["fare_amount"] + df["tip_amount"],
)


In [16]:
# 5. Identify Taxis That Remain Empty
#    ("no following pickup within 30 minutes")
################################
# Per-taxi window: one partition per medallion, trips in pickup order.
window_medallion = Window.partitionBy("medallion").orderBy("pickup_datetime")

# NOTE(review): lead() only sees rows still present in `df` at this point; trips
# removed by earlier filters are not considered when looking for a follow-up
# pickup. Confirm that is intended.
df = (
    df
    # Pickup time of the same taxi's next trip (null for its last trip).
    .withColumn(
        "next_pickup_datetime",
        lead("pickup_datetime").over(window_medallion),
    )
    # is_empty = 1 when there is no next pickup, or when the next pickup starts
    # more than 1800 s (30 min) after this trip's dropoff; 0 otherwise.
    .withColumn(
        "is_empty",
        when(
            col("next_pickup_datetime").isNull()
            | ((unix_timestamp("next_pickup_datetime") - unix_timestamp("dropoff_datetime")) > 1800),
            1,
        ).otherwise(0),
    )
)

In [17]:
# 6. Determine "Current Time" & Filter for Profit (Last 15 min)
################################
# The latest dropoff in the data stands in for "now".
current_time = df.agg({"dropoff_datetime": "max"}).first()[0]
print(f"Current time (max dropoff): {current_time}")

# Trips ended in last 15 minutes => For profit
# (age of the dropoff relative to current_time, in seconds, within [0, 900])
filtered_15_df = df.filter(
    (unix_timestamp(lit(current_time)) - unix_timestamp(col("dropoff_datetime"))).between(0, 900)
)

# Approximate median (50th percentile) of profit, grouped by pickup cell.
profit_df = (
    filtered_15_df
    .groupBy("pickup_cell")
    .agg(percentile_approx(col("profit"), 0.5).alias("median_profit"))
)

profit_df.show()

Current time (max dropoff): 2014-01-01 00:36:00
+-----------+-------------+
|pickup_cell|median_profit|
+-----------+-------------+
|    337.306|         41.5|
|    370.383|        45.12|
|    323.310|         37.0|
|    317.314|         24.5|
+-----------+-------------+



In [18]:
# 7. Count Empty Taxis (Last 30 min, still empty)
################################
# Keep trips flagged is_empty whose dropoff is at most 1800 s before current_time.
empty_30_df = df.filter(
    (unix_timestamp(lit(current_time)) - unix_timestamp(col("dropoff_datetime"))).between(0, 1800)
    & (col("is_empty") == 1)
)

# One row per dropoff cell with the number of such trips (non-null medallions).
empty_taxi_df = (
    empty_30_df
    .groupBy("dropoff_cell")
    .agg(count("medallion").alias("empty_taxi_count"))
)

empty_taxi_df.show()

+------------+----------------+
|dropoff_cell|empty_taxi_count|
+------------+----------------+
|     356.322|               1|
|     335.315|               1|
|     314.310|               1|
|     275.352|               1|
|     338.312|               1|
|     328.309|               1|
|     311.327|               1|
|     309.322|               1|
|     321.319|               1|
|     349.311|               1|
|     342.329|               1|
|     319.322|               1|
|     289.327|               1|
|     315.335|               1|
|     280.359|               1|
|     282.327|               1|
|     314.350|               1|
|     293.329|               1|
|     328.306|               1|
|     353.319|               1|
+------------+----------------+
only showing top 20 rows



In [22]:
# 8. Join on Cell & Compute Profitability
################################
# Inner join: only cells that have both a median profit and at least one
# counted empty taxi survive, so the division below never sees a missing count.
result_df = (
    profit_df
    .join(
        empty_taxi_df,
        on=profit_df["pickup_cell"] == empty_taxi_df["dropoff_cell"],
        how="inner",
    )
    .select(
        profit_df["pickup_cell"],
        col("median_profit"),
        col("dropoff_cell"),
        col("empty_taxi_count"),
    )
    # profitability = median profit per empty taxi in the cell
    .withColumn("profitability", col("median_profit") / col("empty_taxi_count"))
)

result_df.show()

+-----------+-------------+------------+----------------+-------------+
|pickup_cell|median_profit|dropoff_cell|empty_taxi_count|profitability|
+-----------+-------------+------------+----------------+-------------+
|    323.310|         37.0|     323.310|               1|         37.0|
+-----------+-------------+------------+----------------+-------------+



In [23]:
# 9. Sort & Display Results
################################
# Most profitable cells first.
sorted_df = result_df.orderBy("profitability", ascending=False)

# Both timestamp columns carry current_time; the cell id is reported under
# the name profitable_cell_id.
report_columns = [
    lit(current_time).alias("pickup_datetime"),
    lit(current_time).alias("dropoff_datetime"),
    col("pickup_cell").alias("profitable_cell_id"),
    col("empty_taxi_count"),
    col("median_profit"),
    col("profitability"),
]
sorted_df.select(*report_columns).show(truncate=False)

+-------------------+-------------------+------------------+----------------+-------------+-------------+
|pickup_datetime    |dropoff_datetime   |profitable_cell_id|empty_taxi_count|median_profit|profitability|
+-------------------+-------------------+------------------+----------------+-------------+-------------+
|2014-01-01 00:36:00|2014-01-01 00:36:00|323.310           |1               |37.0         |37.0         |
+-------------------+-------------------+------------------+----------------+-------------+-------------+

