# Big Data Management Project 2:
## DESB GRAND CHALLENGE 2015

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, regexp_extract, col, count, udf, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType

from pyspark.sql import functions as F
from pyspark.sql.window import Window

import math
import time

In [7]:
spark = SparkSession.builder \
    .appName('BDM_Project2') \
    .getOrCreate()

### Creating a 1GB dataset from the original

Rename the resulting file to sorted_data_sample and move original file elsewhere

In [14]:
#df = spark.read.csv("input/sorted_data.csv", header=True, inferSchema=True)

#df_5m = df.limit(5000000)
#df_5m.write.csv("input", header=True, mode="overwrite")

### Query 0
Data Cleansing and Setup

In [8]:
start_time = time.time()  

# Defining the schema for faster reading of data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

# Creating a single dataframe of all the trip_data files
taxi_df_og = (
    spark.readStream
    .option("maxFilesPerTrigger", 1)
    .option("header", False)
    .schema(schema)
    .csv("input")
)

# Removing the trips with 0 passengers
# Transforming the data 
taxi_df = taxi_df_og.filter(
    (regexp_extract(col("medallion"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (regexp_extract(col("hack_license"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (col("pickup_datetime").isNotNull()) &
    (col("dropoff_datetime").isNotNull()) &               
    (col("trip_distance") > 0) &                    
    (col("fare_amount") > 0) &
    (col("tip_amount") >= 0)
)

# Convert timestamps to Unix format 
taxi_df = taxi_df.withColumn("pickup_ts", unix_timestamp("pickup_datetime")) \
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime")) \
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts")) \
    .select("*") \
    .dropna()  # Drop remaining null values

# Start the streaming query with trigger(once=True) to process data once and stop
query = (
    taxi_df.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "output/preprocessed_data")
    .option("checkpointLocation", "output/checkpoint")
    .trigger(once=True)  
    .start()
)

query.awaitTermination()

print("Execution time", time.time() - start_time)

Execution time 0.8897030353546143


### Grid Cells for Query 1

In [9]:
start_lat = 41.474937
start_long = -74.913585
cell_size = 0.044 # 500m to degrees for latitude (and longitude)

def grid_cells_q1(point_long, point_lat):

    # calculating the point values for latitude and longitude
    long = math.floor((point_long - start_long) / cell_size) + 1
    lat = math.floor((start_lat - point_lat) / cell_size) + 1
    
    # Ensure the cell is within valid grid bounds (300x300)
    if not (1 <= long <= 300 and 1 <= lat <= 300):
        return None 
    
    return float(f"{long}.{lat}") # Convert to X.X format

get_grid = udf(grid_cells_q1, FloatType())

### Query 1
Frequent Routes

In [20]:
from pyspark.sql.functions import col, window, count, date_format
from pyspark.sql.streaming import DataStreamWriter
import pandas as pd
from IPython.display import display, Markdown

# Reading in the stream
taxi_df = (spark.readStream
  .option("maxFilesPerTrigger", 1)
  .schema(schema)
  .parquet("output/preprocessed_data")
)

# Query 1: create a query to find the top most frequent routes during the last 30 minutes (Show only the 10 most frequent routes)
# The output query results must be: start_cell, end_cell, Number of Rides
# Aid from ChatGPT was used for the following code

# Adding start_cell and end_cell columns
taxi_df_q1 = taxi_df.withColumn("start_cell", get_grid(taxi_df.pickup_longitude, taxi_df.pickup_latitude))\
    .withColumn("end_cell", get_grid(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))\
    .filter(
        col("start_cell").isNotNull() & col("end_cell").isNotNull()  # Filter out trips outside of the grid
    )

# Defining a 30-minute tumbling window and aggregating by start_cell and end_cell
top_routes_df = (
    taxi_df_q1
    .withWatermark("dropoff_datetime", "30 minutes") 
    .groupBy(window(col("dropoff_datetime"), "30 minutes"), col("start_cell"), col("end_cell"))
    .agg(count("*").alias("Number of Rides"))
)

# Function for processing the data in batches
def process_batch(df, epoch_id):
    
    df = df.orderBy(col("window.start").desc(), col("Number of Rides").desc())

    # Collect unique time windows
    windows = df.select("window").distinct().orderBy("window.start").collect()
    
    # Finding the top routes for each "last 30 minutes"
    for window_row in windows:
        
        time_window = window_row["window"]
        start_time = time_window.start.strftime("%Y-%m-%d %H:%M:%S")
        end_time = time_window.end.strftime("%Y-%m-%d %H:%M:%S")

        display(Markdown(f"Time Window: {start_time} to {end_time}"))
        # Filter the top 10 rides for the given window
        top_routes = (
            df.filter(col("window.start") == time_window.start)
            .select("start_cell", "end_cell", "Number of Rides")
            .orderBy(col("Number of Rides").desc())
            .limit(10)
            .toPandas()  # Using to Pandas DataFrame to display the results in the notebook
        )

        display(top_routes)

# Streaming query
query: DataStreamWriter = (
    top_routes_df
    .writeStream
    .outputMode("update")
    .foreachBatch(process_batch)
    .start()
)

query.awaitTermination()

Time Window: 2013-01-01 00:00:00 to 2013-01-01 00:30:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,1138
1,22.16,22.16,730
2,21.17,21.17,584
3,21.17,22.17,433
4,22.17,22.16,422
5,22.17,21.17,355
6,22.16,22.17,294
7,21.17,21.18,212
8,21.18,21.18,169
9,21.18,21.17,166


Time Window: 2013-01-01 00:30:00 to 2013-01-01 01:00:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2280
1,22.16,22.16,1338
2,22.17,22.16,1019
3,21.17,21.17,1002
4,21.17,22.17,941
5,22.17,21.17,901
6,22.16,22.17,872
7,21.17,21.18,445
8,22.17,21.18,441
9,21.18,22.17,388


Time Window: 2013-01-01 01:00:00 to 2013-01-01 01:30:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2244
1,22.16,22.16,1396
2,22.17,22.16,1096
3,21.17,22.17,966
4,22.16,22.17,945
5,22.17,21.17,924
6,21.17,21.17,838
7,21.18,22.17,449
8,21.17,22.16,442
9,22.17,21.18,432


Time Window: 2013-01-01 01:30:00 to 2013-01-01 02:00:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2129
1,22.16,22.16,1397
2,22.17,22.16,1074
3,21.17,22.17,942
4,22.16,22.17,884
5,22.17,21.17,777
6,21.17,21.17,611
7,21.18,22.17,433
8,21.17,22.16,360
9,22.17,21.18,359


Time Window: 2013-01-01 02:00:00 to 2013-01-01 02:30:00

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2069
1,22.16,22.16,1349
2,22.17,22.16,1015
3,22.16,22.17,980
4,21.17,22.17,734
5,22.17,21.17,722
6,21.17,21.17,563
7,21.18,22.17,417
8,22.16,21.17,369
9,22.17,21.18,357


Time Window: 2013-01-01 02:30:00 to 2013-01-01 03:00:00

In [21]:
query.stop()

### Grid Cells for Query 2

In [14]:
start_lat = 41.474937
start_long = -74.913585
cell_size = 0.022 # 250m to degrees for latitude (and longitude)

def grid_cells_q2(point_long, point_lat):
    
    long = math.floor((point_long - start_long) / cell_size) + 1
    lat = math.floor((start_lat - point_lat) / cell_size) + 1
    
    # Ensure the cell is within valid grid bounds (600x600)
    if not (1 <= long <= 600 and 1 <= lat <= 600):
        return None 
    
    return float(f"{long}.{lat}") # Convert to X.X format

In [15]:
get_grid2 = udf(grid_cells_q2, FloatType())

taxi_df_q2 = taxi_df.withColumn("start_cell", get_grid2(taxi_df.pickup_longitude, taxi_df.pickup_latitude))\
    .withColumn("end_cell", get_grid2(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))\
    .filter(
        col("start_cell").isNotNull() & col("end_cell").isNotNull()  # Filter out trips outside of the grid
    )

taxi_df_q2.show(5, truncate=False)

+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+----------+--------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |passenger_count|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|pickup_ts |dropoff_ts|duration|start_cell|end_cell|
+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+----------+--------+
|319AE2555940BA65DB0749E1DD1

### Query 2
Profitable Areas

In [22]:
spark.stop() 