<a href="https://colab.research.google.com/github/lochanpatra/bigdata/blob/main/nyc_trip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every later cell.
# Driver and executor memory limits are requested here, at build time.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Data Analysis")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

Load the Parquet File

In [None]:
# Parquet file with the yellow-taxi trip records (2024-01, per the file name).
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is mounted before running.
file_path = "/content/drive/MyDrive/DATA FOR USES/yellow_tripdata_2024-01.parquet"
df = spark.read.format("parquet").load(file_path)


Preprocess the Data

In [None]:
from pyspark.sql.functions import to_timestamp, to_date, hour, unix_timestamp

# Derive the helper columns used by the later filters and plots:
#   pickup_datetime / dropoff_datetime - timestamp versions of the raw tpep_* columns
#   pickup_date / pickup_hour          - calendar date and hour of the pickup
#   trip_duration_minutes              - dropoff minus pickup, converted from seconds to minutes
df = (
    df
    .withColumn("pickup_datetime", to_timestamp("tpep_pickup_datetime"))
    .withColumn("dropoff_datetime", to_timestamp("tpep_dropoff_datetime"))
    .withColumn("pickup_date", to_date("tpep_pickup_datetime"))
    .withColumn("pickup_hour", hour("tpep_pickup_datetime"))
    .withColumn(
        "trip_duration_minutes",
        (unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")) / 60,
    )
)


Apply Filters (Date Range + Payment Type)

In [None]:
from datetime import datetime
from pyspark.sql.functions import col

# Example filter settings: pickup-date window (both bounds inclusive)
# and the single payment type to keep.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 31)
payment_type_selected = 1  # For example, 1 = Credit Card

# Keep only rows inside the date window that were paid with the selected type.
df = df.where(
    col("pickup_date").between(start_date, end_date)
    & (col("payment_type") == payment_type_selected)
)


Clean the Data

In [None]:
# Keep only trips with at least one passenger, a distance strictly between
# 0 and 100, and a positive fare.
df_clean = (
    df
    .where(col("passenger_count") > 0)
    .where((col("trip_distance") > 0) & (col("trip_distance") < 100))
    .where(col("fare_amount") > 0)
)


Aggregations for Plots

In [None]:
# Each aggregate is reduced in Spark and only the small result is pulled into pandas.

# Trip count for each pickup hour, ordered by hour
pdf_hourly = (
    df_clean.groupBy("pickup_hour")
    .count()
    .sort("pickup_hour")
    .toPandas()
)

# Summed fare per pickup date, in date order (result column is "sum(fare_amount)")
pdf_daily_fare = (
    df_clean.groupBy("pickup_date")
    .sum("fare_amount")
    .sort("pickup_date")
    .toPandas()
)

# Trip count per payment type, largest first
# NOTE(review): the earlier filter cell keeps a single payment_type, so this
# likely returns just one row — confirm that is intended.
pdf_payment = (
    df_clean.groupBy("payment_type")
    .count()
    .sort(col("count").desc())
    .toPandas()
)

# Seeded ~1% sample of (distance, fare) pairs below the 50 / 200 cut-offs, for the scatter plot
pdf_scatter = (
    df_clean.select("trip_distance", "fare_amount")
    .where((col("trip_distance") < 50) & (col("fare_amount") < 200))
    .sample(fraction=0.01, seed=42)
    .toPandas()
)


Plot the Results Using Seaborn/Matplotlib

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Every chart is drawn on its own explicit Figure/Axes pair, so titles, labels
# and tick settings are applied to the intended plot rather than to whatever
# pyplot currently considers the "active" figure.

# Trips per hour
fig, ax = plt.subplots(figsize=(10, 5))
# `hue` mirrors `x` so the palette colours each bar; passing `palette` without
# `hue` is deprecated in seaborn 0.13 and slated for removal in 0.14.
sns.barplot(
    data=pdf_hourly,
    x="pickup_hour",
    y="count",
    hue="pickup_hour",
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Number of Trips by Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Trip Count")
ax.grid(True)
plt.show()

# Daily total fare
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=pdf_daily_fare, x="pickup_date", y="sum(fare_amount)", marker="o", ax=ax)
ax.set_title("Total Fare per Day")
ax.set_xlabel("Date")
ax.set_ylabel("Fare Amount ($)")
ax.grid(True)
# Rotate the date labels so neighbouring ticks do not overlap.
ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()

# Payment type distribution
fig, ax = plt.subplots(figsize=(8, 5))
# Same hue-mirrors-x pattern as the hourly chart, for the same deprecation reason.
sns.barplot(
    data=pdf_payment,
    x="payment_type",
    y="count",
    hue="payment_type",
    palette="pastel",
    legend=False,
    ax=ax,
)
ax.set_title("Trip Counts by Payment Type")
ax.set_xlabel("Payment Type")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

# Trip distance vs fare
fig, ax = plt.subplots(figsize=(10, 6))
# Low alpha keeps dense regions readable where many points overlap.
sns.scatterplot(data=pdf_scatter, x="trip_distance", y="fare_amount", alpha=0.3, ax=ax)
ax.set_title("Trip Distance vs Fare")
ax.set_xlabel("Distance (miles)")
ax.set_ylabel("Fare ($)")
ax.grid(True)
plt.show()


In [None]:
# Shut down the Spark session created in the first code cell; the plotted
# results were already collected into pandas DataFrames by this point.
spark.stop()
