<a href="https://colab.research.google.com/github/lochanpatra/bigdata/blob/main/nyc_trip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session, requesting 8g for the driver
# and 16g for executors.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Data Analysis")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

In [None]:
%%writefile app.py
import streamlit as st
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    to_timestamp, to_date, hour, col, unix_timestamp
)
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Page chrome first: wide layout, then the dashboard heading.
st.set_page_config(layout="wide")
st.title("🚖 NYC Yellow Taxi Trip Analysis Dashboard")

# Obtain the Spark session used by every query below (reused if one exists).
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_Analysis")
    .getOrCreate()
)

# Load and prepare data (no caching due to Spark limitations)
def load_data(path):
    """Read the trip parquet at *path* and attach derived time columns.

    Uses the module-level ``spark`` session. The returned DataFrame carries
    the original columns plus:

    - ``pickup_datetime`` / ``dropoff_datetime``: the ``tpep_*`` columns
      converted with ``to_timestamp``.
    - ``pickup_date`` / ``pickup_hour``: date and hour of the pickup.
    - ``trip_duration_minutes``: dropoff minus pickup, in minutes.
    """
    raw_trips = spark.read.parquet(path)

    with_timestamps = (
        raw_trips
        .withColumn("pickup_datetime", to_timestamp("tpep_pickup_datetime"))
        .withColumn("dropoff_datetime", to_timestamp("tpep_dropoff_datetime"))
    )
    with_calendar = (
        with_timestamps
        .withColumn("pickup_date", to_date("tpep_pickup_datetime"))
        .withColumn("pickup_hour", hour("tpep_pickup_datetime"))
    )

    # Difference of the two epoch-second values, converted to minutes below.
    elapsed_seconds = (
        unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")
    )
    return with_calendar.withColumn("trip_duration_minutes", elapsed_seconds / 60)

# Read the January 2024 trip file; the spinner is shown while load_data runs.
file_path = "/content/drive/MyDrive/DATA FOR USES/yellow_tripdata_2024-01.parquet"
with st.spinner("Loading data..."):
    df = load_data(file_path)

# Sidebar: pick the inclusive pickup-date window to analyse.
sidebar = st.sidebar
sidebar.header("🛠️ Filters")
start_date = sidebar.date_input("Start Date", datetime(2024, 1, 1))
end_date = sidebar.date_input("End Date", datetime(2024, 1, 31))

# An inverted window cannot match anything; report it and halt this run.
if end_date < start_date:
    st.error("Start date must be before end date.")
    st.stop()

# Keep trips whose pickup date lies within [start_date, end_date].
df = df.filter(col("pickup_date").between(start_date, end_date))

# Payment Type Filter
# Collect the payment types present in the date-filtered data. distinct()
# gives no ordering guarantee, so sort the values to keep the selectbox
# options (and its default, the first entry) stable across reruns. tolist()
# yields native Python values instead of NumPy scalars for the Spark
# comparison below; missing values are dropped because they cannot be matched
# by an equality filter.
payment_types = df.select("payment_type").distinct().toPandas()
payment_type_options = sorted(payment_types["payment_type"].dropna().tolist())

# With no options the selectbox would return None and every chart would be
# built from an empty frame; stop early instead, like the date check above.
if not payment_type_options:
    st.warning("No trips found for the selected date range.")
    st.stop()

payment_type_selected = st.sidebar.selectbox("Select Payment Type", payment_type_options)
df = df.filter(df.payment_type == payment_type_selected)

# Data cleaning: keep only trips with at least one passenger, a distance
# strictly between 0 and 100, and a positive fare.
has_passengers = col("passenger_count") > 0
plausible_distance = (col("trip_distance") > 0) & (col("trip_distance") < 100)
positive_fare = col("fare_amount") > 0

df_clean = df.where(has_passengers & plausible_distance & positive_fare)

# Aggregate in Spark and bring only the small results to pandas for plotting.

# Trip count per pickup hour, in hour order.
hourly_counts = df_clean.groupBy("pickup_hour").count()
pdf_hourly = hourly_counts.orderBy("pickup_hour").toPandas()

# Total fare per pickup date, in date order (column is "sum(fare_amount)").
daily_fares = df_clean.groupBy("pickup_date").sum("fare_amount")
pdf_daily_fare = daily_fares.orderBy("pickup_date").toPandas()

# Trip count per payment type, largest first.
payment_counts = df_clean.groupBy("payment_type").count()
pdf_payment = payment_counts.orderBy(col("count").desc()).toPandas()

# 1% seeded sample of (distance, fare) pairs, restricted to distance < 50 and
# fare < 200, for the scatter plot.
scatter_points = df_clean.select("trip_distance", "fare_amount").where(
    (col("trip_distance") < 50) & (col("fare_amount") < 200)
)
pdf_scatter = scatter_points.sample(fraction=0.01, seed=42).toPandas()

# Plot: Trips per Hour
# Bar chart of trip counts for each pickup hour.
st.subheader("📊 Trips per Hour")
fig1, ax1 = plt.subplots(figsize=(10, 5))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version before changing the call.
sns.barplot(data=pdf_hourly, x="pickup_hour", y="count", palette="viridis", ax=ax1)
ax1.set_title("Number of Trips by Hour")
ax1.set_xlabel("Hour of Day")
ax1.set_ylabel("Trip Count")
ax1.grid(True)
st.pyplot(fig1)
# Streamlit re-executes this script on every interaction, and figures created
# through pyplot stay registered until closed; release it once rendered.
plt.close(fig1)

# Plot: Daily Fare
# Line chart of the summed fare amount for each pickup date.
st.subheader("📈 Daily Total Fare")
fig2, ax2 = plt.subplots(figsize=(12, 5))
# "sum(fare_amount)" is the column name produced by the Spark sum aggregation.
sns.lineplot(data=pdf_daily_fare, x="pickup_date", y="sum(fare_amount)", marker="o", ax=ax2)
ax2.set_title("Total Fare per Day")
ax2.set_xlabel("Date")
ax2.set_ylabel("Fare Amount ($)")
ax2.grid(True)
# Slant the date tick labels so they do not overlap.
fig2.autofmt_xdate()
st.pyplot(fig2)
# Streamlit re-executes this script on every interaction, and figures created
# through pyplot stay registered until closed; release it once rendered.
plt.close(fig2)

# Plot: Payment Type Distribution
# Bar chart of trip counts per payment type.
# NOTE(review): the data has already been narrowed to the single payment type
# chosen in the sidebar, so this chart will show one bar — confirm whether the
# distribution should be computed before that filter.
st.subheader("💳 Payment Type Distribution")
fig3, ax3 = plt.subplots(figsize=(8, 5))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version before changing the call.
sns.barplot(data=pdf_payment, x="payment_type", y="count", palette="pastel", ax=ax3)
ax3.set_title("Trip Counts by Payment Type")
ax3.set_xlabel("Payment Type")
ax3.set_ylabel("Count")
st.pyplot(fig3)
# Streamlit re-executes this script on every interaction, and figures created
# through pyplot stay registered until closed; release it once rendered.
plt.close(fig3)

# Plot: Distance vs Fare
# Scatter of the sampled (trip_distance, fare_amount) pairs; the low alpha
# keeps dense regions readable.
st.subheader("📉 Trip Distance vs Fare Amount")
fig4, ax4 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pdf_scatter, x="trip_distance", y="fare_amount", alpha=0.3, ax=ax4)
ax4.set_title("Trip Distance vs Fare")
ax4.set_xlabel("Distance (miles)")
ax4.set_ylabel("Fare ($)")
ax4.grid(True)
st.pyplot(fig4)
# Streamlit re-executes this script on every interaction, and figures created
# through pyplot stay registered until closed; release it once rendered.
plt.close(fig4)


In [None]:
# Install the third-party packages this notebook and app.py import
# (streamlit, pyngrok, pyspark). Run this cell before the cells that use them.
!pip install streamlit pyngrok pyspark

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the ngrok authtoken is a credential and must not be stored in the
# notebook. Read it from the NGROK_AUTHTOKEN environment variable, or prompt
# for it without echoing. Any token that was previously committed here should
# be revoked in the ngrok dashboard and replaced.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")

# Register the token with the ngrok installation that pyngrok manages.
ngrok.set_auth_token(ngrok_authtoken)

In [None]:
from pyngrok import ngrok

# Kill any existing tunnels so a re-run of this cell starts from a clean state
ngrok.kill()

# Start streamlit in the background (trailing "&") so the cell does not block
get_ipython().system_raw('streamlit run app.py &')

# Create a public URL
# NOTE(review): 8501 is assumed to be the port the Streamlit server listens on
# (no port is passed to `streamlit run` above) — confirm if that changes.
url = ngrok.connect(8501)
# Print the tunnel's address rather than the tunnel object's repr; fall back to
# the value itself in case connect() returned a plain string.
print("Streamlit app is live at:", getattr(url, "public_url", url))
