In [0]:
%sql
-- Make nyc_taxi the current catalog and nyc_taxi_schema the current schema for this session.
USE CATALOG nyc_taxi;
USE SCHEMA nyc_taxi_schema;

In [0]:
# Select the taxi catalog and schema for the Python-side Spark session.
for use_statement in ("USE CATALOG nyc_taxi", "USE SCHEMA nyc_taxi_schema"):
    spark.sql(use_statement)

# Source data for the whole analysis, referenced by its fully qualified name.
df = spark.table("nyc_taxi.nyc_taxi_schema.yellow_trips_csv_v")

In [0]:
# Print the column names and types of the source table, then render its rows.
df.printSchema()
display(df)

In [0]:
# Numeric trip and charge columns to profile.
num_cols = [
    "trip_distance",
    "passenger_count",
    "fare_amount",
    "tip_amount",
    "tolls_amount",
    "mta_tax",
    "extra",
    "improvement_surcharge",
    "total_amount",
]

# Descriptive statistics for each selected column.
numeric_profile = df.select(*num_cols).summary()
display(numeric_profile)


In [0]:

# Rows missing any of the core measures are dropped before filtering.
required_cols = ["fare_amount", "trip_distance", "passenger_count"]
clean = df.na.drop(subset=required_cols)

# A trip is kept only if it has a non-negative fare and distance, at least one
# passenger, a pickup strictly before its dropoff, and no zero coordinate.
plausible_trip = (
    (clean["fare_amount"] >= 0)
    & (clean["trip_distance"] >= 0)
    & (clean["passenger_count"] > 0)
    & (clean["tpep_pickup_datetime"] < clean["tpep_dropoff_datetime"])
    & (clean["pickup_longitude"] != 0)
    & (clean["pickup_latitude"] != 0)
    & (clean["dropoff_longitude"] != 0)
    & (clean["dropoff_latitude"] != 0)
)
clean = clean.filter(plausible_trip)

# Number of rows that survive cleaning.
print(clean.count())
     

In [0]:
from pyspark.sql import functions as F
# Duration in minutes, rounded to 2 decimals.
# NOTE(review): assumes the tpep_* columns are timestamps, so the long cast
# yields epoch seconds — confirm against the view's schema.
pickup_epoch = F.col("tpep_pickup_datetime").cast("long")
dropoff_epoch = F.col("tpep_dropoff_datetime").cast("long")
clean = clean.withColumn(
    "trip_duration (m)",
    F.round((dropoff_epoch - pickup_epoch) / 60, 2),
)

# Speed = distance / hours, where hours come from the rounded minutes column above.
duration_hours = F.col("trip_duration (m)") / 60
clean = clean.withColumn(
    "trip_speed(mph)",
    F.round(F.col("trip_distance") / duration_hours, 2),
)
display(clean)

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark.pandas as ps
import seaborn as sns
from pyspark.sql import SparkSession

# Mean fare and distance per passenger count, computed through the
# pandas-on-Spark API.
psdf = (
    clean.pandas_api()
    .groupby("passenger_count")[["fare_amount", "trip_distance"]]
    .mean()
    .reset_index()
)
display(psdf)

In [0]:
# Trip count per pickup hour, busiest hour first.
pickup_hour_expr = F.hour(F.col("tpep_pickup_datetime"))
by_hour = (
    clean
    .withColumn("pickup_hour", pickup_hour_expr)
    .groupBy("pickup_hour")
    .count()
    .orderBy(F.desc("count"))
)
display(by_hour)

In [0]:
# Round pickup coordinates to 2 decimals (a 0.01 grid) as a stand-in for neighbourhoods.
cells = (
    clean
    .withColumn("pickup_lat_r", F.round("pickup_latitude", 2))
    .withColumn("pickup_lon_r", F.round("pickup_longitude", 2))
)

# Average fare and trip count per grid cell; cells with fewer than 200 trips
# are dropped, and the rest are listed from most to least expensive.
cell_stats = [F.avg("fare_amount").alias("avg_fare"), F.count("*").alias("n")]
avg_fare_by_cell = (
    cells
    .groupBy("pickup_lat_r", "pickup_lon_r")
    .agg(*cell_stats)
    .filter("n >= 200")
    .orderBy(F.desc("avg_fare"))
)
display(avg_fare_by_cell)


In [0]:
# Histogram of trip distances in fixed-width bins. Trips at or beyond max_cap
# are pooled into a single overflow bin so the long tail does not stretch the axis.
bin_size = 1   # width of each distance bin
max_cap = 50   # distances >= max_cap all land in the last bin

distance = F.col("trip_distance")
clean_binned = clean.withColumn(
    "distance_bin",
    F.when(distance >= max_cap, max_cap)
     # Truncating distance / bin_size gives the bin index; scaling back gives the bin's lower edge.
     .otherwise((distance / bin_size).cast("int") * bin_size),
)

bin_counts = (
    clean_binned
    .groupBy("distance_bin")
    .count()
    .orderBy("distance_bin")
)

# Only the per-bin counts are collected to the driver for plotting.
bin_pdf = bin_counts.toPandas()
print(bin_pdf.head())


plt.figure(figsize=(10,6))
# Bar width follows bin_size so the bars stay contiguous if the bin width is changed.
plt.bar(bin_pdf["distance_bin"], bin_pdf["count"], width=bin_size, align="edge", edgecolor="black")
plt.title(f"Distribution of Trip Distances (0–{max_cap}+ miles)")
plt.xlabel("Trip distance (miles)")
plt.ylabel("Number of trips")
plt.xticks(range(0, max_cap+5, 5))
plt.show()

In [0]:
# Mean fare for each pickup hour, ordered by hour.
fare_by_hour_sdf = (
    clean
    .withColumn("pickup_hour", F.hour(F.col("tpep_pickup_datetime")))
    .groupBy("pickup_hour")
    .agg(F.avg("fare_amount").alias("avg_fare"))
    .orderBy("pickup_hour")
)

# One row per pickup hour is collected to the driver for plotting.
fare_by_hour = fare_by_hour_sdf.toPandas()

plt.figure(figsize=(9,5))
plt.bar(fare_by_hour["pickup_hour"], fare_by_hour["avg_fare"])
plt.title("Average Fare by Hour of Day")
plt.xlabel("Hour of day (0–23)")
plt.ylabel("Average fare (USD)")
plt.xticks(range(0,24))
plt.tight_layout()
# Render explicitly, matching the other plotting cells in this notebook.
plt.show()


### Step 7

a) Peak Taxi Usage Times

- Highest demand during morning (7–9 AM) and evening (5–8 PM) rush hours.
- Secondary peak after midnight (12–3 AM) linked to nightlife.
- Midday hours (10 AM–3 PM) show steady but lower activity.

b) Trip Duration, Speed, and Fare

- Most trips are short (1–5 miles) with fares typically under $20.
- Speeds are lowest during rush hours, making trips longer despite short distance.
- Long trips during congestion are disproportionately costly in both time and fare.

c) Geography-Related Patterns

- Manhattan has the highest concentration of pickups and short but expensive trips.
- Airport trips (JFK, LaGuardia) appear as long, higher-fare outliers.
- Outer borough trips are generally longer in distance but cheaper per mile.

In [0]:
#b1
# Trip duration in minutes (dropoff minus pickup, as long-cast values, divided by 60),
# rounded to 2 decimals.
trip_duration_mins = clean.withColumn(
    "trip_duration_mins",
    F.round(
        (F.col("tpep_dropoff_datetime").cast("long") - F.col("tpep_pickup_datetime").cast("long")) / 60,
        2,
    ),
)
# limit() keeps the 20-row preview a DataFrame so display() renders it as a table;
# head() would hand display() a Python list of Row objects.
display(trip_duration_mins.select("trip_duration_mins").limit(20))

In [0]:
#b2
# Hour of day taken from the pickup timestamp.
clean = clean.withColumn("pickup_hour", F.hour(F.col("tpep_pickup_datetime")))
# Spark's dayofweek() numbers days 1 = Sunday, 2 = Monday, ... 7 = Saturday.
clean = clean.withColumn("pickup_day", F.dayofweek(F.col("tpep_pickup_datetime")))
# limit() keeps the 20-row preview a DataFrame so display() renders it as a table.
display(clean.select("pickup_hour", "pickup_day").limit(20))

In [0]:
#b4
# Mean of the "trip_duration (m)" column for each pickup hour.
avg_trip_duration_by_hour = (
    clean
    .groupBy("pickup_hour")
    .agg(F.avg("trip_duration (m)").alias("avg_trip_duration"))
)
display(avg_trip_duration_by_hour)

# One row per pickup hour is collected to the driver for plotting.
avg_trip_duration_by_hour_pd = avg_trip_duration_by_hour.toPandas()

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(
    avg_trip_duration_by_hour_pd["pickup_hour"],
    avg_trip_duration_by_hour_pd["avg_trip_duration"],
)
ax.set_title("Average Trip Duration by Hour of Day")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Average Trip Duration (minutes)")
ax.set_xticks(range(24))
fig.tight_layout()
plt.show()


In [0]:
#b5 counting hotspots
# Rounded-coordinate column -> source column. Rounding to 2 decimals groups
# nearby points into the same grid cell.
rounded_coords = {
    "pickup_lat_r": "pickup_latitude",
    "pickup_lon_r": "pickup_longitude",
    "dropoff_lat_r": "dropoff_latitude",
    "dropoff_lon_r": "dropoff_longitude",
}
for rounded_name, source_name in rounded_coords.items():
    clean = clean.withColumn(rounded_name, F.round(source_name, 2))

# Trips per pickup cell, busiest first.
pickup_hotspots = (
    clean
    .groupBy("pickup_lat_r", "pickup_lon_r")
    .count()
    .orderBy(F.desc("count"))
)
display(pickup_hotspots)

# Trips per dropoff cell, busiest first.
dropoff_hotspots = (
    clean
    .groupBy("dropoff_lat_r", "dropoff_lon_r")
    .count()
    .orderBy(F.desc("count"))
)
display(dropoff_hotspots)


In [0]:
#b5 visualization
def _plot_hotspots(hotspots_pd, lon_col, lat_col, title):
    """Draw one marker per rounded coordinate cell, sized and coloured by trip count.

    Args:
        hotspots_pd: pandas DataFrame with `lon_col`, `lat_col` and a "count" column.
        lon_col: name of the column plotted on the x axis.
        lat_col: name of the column plotted on the y axis.
        title: figure title.
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=hotspots_pd, x=lon_col, y=lat_col, size="count", hue="count",
                    alpha=0.6, legend=False, sizes=(20, 200))
    plt.title(title)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.show()


# One row per grid cell is collected to the driver for each plot.
pickup_pd = pickup_hotspots.toPandas()
_plot_hotspots(pickup_pd, "pickup_lon_r", "pickup_lat_r", "Pickup Hotspots")

dropoff_pd = dropoff_hotspots.toPandas()
_plot_hotspots(dropoff_pd, "dropoff_lon_r", "dropoff_lat_r", "Dropoff Hotspots")

In [0]:
#b6
# Average fare and trip count per rounded pickup cell; cells with fewer than
# 200 trips are dropped, and the rest are listed from highest to lowest average fare.
fare_stats = [F.avg("fare_amount").alias("avg_fare"), F.count("*").alias("n")]
avg_fare_by_pickup_location = (
    clean
    .groupBy("pickup_lat_r", "pickup_lon_r")
    .agg(*fare_stats)
    .filter("n >= 200")
    .orderBy(F.desc("avg_fare"))
)
display(avg_fare_by_pickup_location)


In [0]:
#b7
# Pearson correlation matrix of the key numeric features.
numeric_columns = ["fare_amount", "trip_distance", "trip_duration (m)", "passenger_count"]

# Compute every pairwise correlation inside Spark in one aggregation, so only a
# single small row reaches the driver instead of every cleaned trip.
column_pairs = [
    (left, right)
    for position, left in enumerate(numeric_columns)
    for right in numeric_columns[position + 1:]
]
pair_corrs = clean.agg(
    *[
        F.corr(left, right).alias(f"corr_{k}")
        for k, (left, right) in enumerate(column_pairs)
    ]
).first()

# Assemble the symmetric matrix; the diagonal (a column against itself) is 1.0.
corr_matrix = pd.DataFrame(1.0, index=numeric_columns, columns=numeric_columns)
for k, (left, right) in enumerate(column_pairs):
    value = pair_corrs[f"corr_{k}"]
    # Spark may return null for an undefined correlation; show it as NaN.
    value = float("nan") if value is None else value
    corr_matrix.loc[left, right] = value
    corr_matrix.loc[right, left] = value

plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix of Key Features")
plt.tight_layout()
plt.show()