In [None]:
# importing functions we'll need for aggregations and date handling
from pyspark.sql.functions import (
    col, sum as _sum, avg, to_date, date_trunc, max as _max
)

# location of the yellow-taxi trip CSV (2020-01) in ADLS Gen2 storage
file_path = "abfss://nyctaxi@saptadipnyctaxi.dfs.core.windows.net/raw/yellow_tripdata_2020-01.csv"

# configure the reader once: comma-delimited, first row is the header,
# and let Spark infer the column types
csv_reader = spark.read.options(delimiter=",", header=True, inferSchema=True)
df = csv_reader.csv(file_path)

# sanity check: print the inferred schema and preview the first few rows
df.printSchema()
df.show(5)


In [None]:
# Revenue is the sum of all the fare-related charge columns on a trip
from pyspark.sql.functions import col

revenue_components = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "congestion_surcharge",
]

# add the columns left to right, one `+` at a time
# NOTE(review): Spark's `+` yields null if any operand is null, so a trip with a
# missing component gets a null Revenue - confirm that is the intended behavior
revenue_expr = col(revenue_components[0])
for component in revenue_components[1:]:
    revenue_expr = revenue_expr + col(component)

df_with_revenue = df.withColumn("Revenue", revenue_expr)

# preview a handful of inputs next to the computed Revenue column
preview_columns = ["VendorID", "fare_amount", "tip_amount", "congestion_surcharge", "Revenue"]
df_with_revenue.select(*preview_columns).show(10)


In [None]:
# total passengers per location, computed once for pickups and once for dropoffs
from pyspark.sql.functions import sum as _sum

# the same aggregate is used for both groupings
total_passengers = _sum("passenger_count").alias("total_passengers")

# pickup locations, largest passenger total first
pickup_passenger_df = (
    df_with_revenue
    .groupBy("PULocationID")
    .agg(total_passengers)
    .orderBy("total_passengers", ascending=False)
)

pickup_passenger_df.show(10)

# dropoff locations, largest passenger total first
dropoff_passenger_df = (
    df_with_revenue
    .groupBy("DOLocationID")
    .agg(total_passengers)
    .orderBy("total_passengers", ascending=False)
)

dropoff_passenger_df.show(10)


In [None]:
# per-vendor averages of fare, total amount, and Revenue
from pyspark.sql.functions import avg

# (source column, output alias) for each average we want
vendor_avg_specs = [
    ("fare_amount", "avg_fare"),
    ("total_amount", "avg_total_amount"),
    ("Revenue", "avg_revenue"),
]

vendor_stats_df = df_with_revenue.groupBy("VendorID").agg(
    *[avg(source).alias(label) for source, label in vendor_avg_specs]
)

vendor_stats_df.show()


In [None]:
# payment-type usage over time: truncate each pickup timestamp to its minute,
# then count trips per (minute, payment type)
from pyspark.sql.functions import date_trunc, count

pickup_minute_expr = date_trunc("minute", col("tpep_pickup_datetime"))
payment_over_time = df_with_revenue.withColumn("pickup_minute", pickup_minute_expr)

# number of trips in each minute bucket, split by payment type, oldest minute first
payment_count_expr = count("*").alias("payment_count")
moving_payment_count = (
    payment_over_time
    .groupBy("pickup_minute", "payment_type")
    .agg(payment_count_expr)
    .orderBy("pickup_minute", ascending=True)
)

moving_payment_count.show(10)


In [None]:
# vendor performance on a single day (15th Jan 2020)
from pyspark.sql.functions import to_date

target_day = "2020-01-15"

# derive the calendar date of each pickup, then keep only the target day
trips_with_date = df_with_revenue.withColumn("trip_date", to_date("tpep_pickup_datetime"))
filtered_date = trips_with_date.filter(col("trip_date") == target_day)

# per-vendor totals for revenue, passengers, and distance; highest revenue first
top_vendors = (
    filtered_date
    .groupBy("VendorID")
    .agg(
        _sum("Revenue").alias("total_revenue"),
        _sum("passenger_count").alias("total_passengers"),
        _sum("trip_distance").alias("total_distance"),
    )
    .orderBy(col("total_revenue").desc())
)

# display the two vendors with the highest revenue that day
top_vendors.show(2)


In [None]:
# rank pickup -> dropoff routes by the total number of passengers carried
route_keys = ["PULocationID", "DOLocationID"]

popular_routes_df = (
    df_with_revenue
    .groupBy(*route_keys)
    .agg(_sum("passenger_count").alias("total_passengers"))
    .orderBy(col("total_passengers").desc())
)

popular_routes_df.show(10)


In [None]:
# Pickup hotspots: which locations picked up the most passengers during the
# last 10 seconds covered by the data.
from datetime import timedelta
from pyspark.sql.functions import col, max as _max, sum as _sum

# latest pickup timestamp in the dataset (None when the DataFrame has no rows)
latest_ts = df_with_revenue.agg(_max("tpep_pickup_datetime")).first()[0]
print("Latest pickup timestamp in data:", latest_ts)

if latest_ts is None:
    # fail with a clear message instead of an AttributeError further down
    raise ValueError("No pickup timestamps found; cannot build the last-10-seconds window")

# Step back 10 seconds using timedelta arithmetic.
# datetime.replace(second=latest_ts.second - 10) raises ValueError whenever the
# seconds field is below 10, because replace() never borrows from the minute.
# Assumes tpep_pickup_datetime was inferred as a timestamp, so latest_ts is a
# datetime.datetime - TODO confirm against the printed schema.
window_start = latest_ts - timedelta(seconds=10)

# keep trips picked up in the half-open window (window_start, latest_ts]
recent_pickups = df_with_revenue.filter(
    (col("tpep_pickup_datetime") > window_start) &
    (col("tpep_pickup_datetime") <= latest_ts)
)

# total passengers picked up at each location within that window, busiest first
hotspots = recent_pickups.groupBy("PULocationID").agg(
    _sum("passenger_count").alias("passenger_count")
).orderBy("passenger_count", ascending=False)

hotspots.show()


In [None]:
# persist the query results as Parquet under the 'processed' folder in ADLS
processed_root = "abfss://nyctaxi@saptadipnyctaxi.dfs.core.windows.net/processed/"

# (sub-folder, DataFrame) pairs, written in the order listed
# NOTE(review): dropoff_passenger_df is computed earlier but is not written here -
# confirm whether that omission is intentional
processed_outputs = [
    ("revenue/", df_with_revenue),                      # Revenue Data
    ("passenger_by_pickup_area/", pickup_passenger_df),  # Passenger by Pickup Area
    ("vendor_averages/", vendor_stats_df),               # Vendor-wise Averages
    ("payment_moving_count/", moving_payment_count),     # Payment Type Over Time
    ("top_vendors_by_date/", top_vendors),               # Top Vendors on Specific Date
    ("popular_routes/", popular_routes_df),              # Most Popular Routes
    ("pickup_hotspots/", hotspots),                      # Recent Pickup Hotspots
]

# each write replaces whatever is already at the target path
for subfolder, result_df in processed_outputs:
    result_df.write.mode("overwrite").parquet(processed_root + subfolder)
