In [None]:
# Spark session setup
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session via getOrCreate().
session_builder = SparkSession.builder.appName("EDA")
spark = session_builder.getOrCreate()

In [None]:
# Load the trip data from its Parquet location on GCS.
# Parquet files embed their own schema and have no header row, so the CSV-only
# `header` / `inferSchema` options are intentionally not passed to the reader.
tripdata_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group_8/tripdata_cleaned")

In [None]:
# Column names and types of the loaded trip data
tripdata_df.printSchema()

# Preview of the first ten records
tripdata_df.show(n=10)

In [None]:
# Calculate and print percentage of missing values for all columns
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Build one "missing count" aggregate per column so that every column is
# measured in a single pass over the data instead of separate jobs per column.
missing_count_exprs = []
for column, dtype in tripdata_df.dtypes:
    is_missing = col(column).isNull()
    # dropna() also treats NaN as missing; isnan() is only valid on float/double columns.
    if dtype in ("float", "double"):
        is_missing = is_missing | F.isnan(col(column))
    # when() without otherwise() yields NULL for non-missing rows, which count() skips.
    missing_count_exprs.append(F.count(F.when(is_missing, 1)).alias(column))

# The first aggregate is the total row count; the remaining ones line up
# positionally with tripdata_df.columns.
summary_row = tripdata_df.agg(F.count("*"), *missing_count_exprs).first()
total_rows = summary_row[0]
missing_counts = summary_row[1:]

for column, missing_count in zip(tripdata_df.columns, missing_counts):
    # An empty DataFrame has no defined percentage; report NaN instead of dividing by zero.
    percent_missing = 100 * missing_count / total_rows if total_rows else float("nan")
    print(f"Percent missing in '{column}': {percent_missing}%")

In [None]:
# Number of trips per pickup location ID, largest count first
trips_per_pickup_location = tripdata_df.groupBy("PULocationID").count()
grouped_sorted_df = trips_per_pickup_location.sort("count", ascending=False)

# Preview the ten pickup locations with the most trips
grouped_sorted_df.show(n=10)

In [None]:
# Translate hvfhs_license_num codes into company names
from pyspark.sql.functions import when

code_to_company = {
    "HV0002": "Juno",
    "HV0003": "Uber",
    "HV0004": "Via",
    "HV0005": "Lyft"
}

license_col = col("hvfhs_license_num")

# Start from the raw license number so that codes missing from the mapping
# pass through unchanged, then wrap one when/otherwise layer per known code.
column_expr = license_col
for license_code, company_name in code_to_company.items():
    matches_code = license_col == license_code
    column_expr = when(matches_code, company_name).otherwise(column_expr)

# Attach the result as the "company" column
tripdata_df = tripdata_df.withColumn("company", column_expr)

# Spot-check the mapping on a few rows
tripdata_df.select("hvfhs_license_num", "company").show(n=10)

In [None]:
# Columns treated as unnecessary from this point on
columns_to_drop = ["request_datetime", "pickup_datetime"]
tripdata_df = tripdata_df.drop(*columns_to_drop)

# Confirm the columns are gone
tripdata_df.printSchema()

In [None]:
# Trip Counts by Company
from pyspark.sql import functions as F

# One row per company with its number of trips
company_groups = tripdata_df.groupBy("company")
trip_counts_by_company = company_groups.agg(F.count("*").alias("trip_count"))

# Collect the small aggregate to pandas and order it for plotting (largest first)
pandas_trip_counts = trip_counts_by_company.toPandas()
sorted_pandas_trip_counts = pandas_trip_counts.sort_values("trip_count", ascending=False)

In [None]:
# Bar chart: trip counts by company
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sorted_pandas_trip_counts, x="company", y="trip_count", ax=ax)
ax.set_title('Trip Counts by Company')
ax.set_xlabel('Company')
ax.set_ylabel('Trip Count')
# Tilt the company names on the x-axis by 45 degrees
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Trip Distribution by Day of the Week for Uber and Lyft
# Register the DataFrame so it can be queried with Spark SQL as "tripdata"
tripdata_df.createOrReplaceTempView("tripdata")

# Trip counts per (day of week, company), restricted to Uber and Lyft
query = """
SELECT
    request_day_of_week,
    company,
    COUNT(*) AS trip_count
FROM
    tripdata
WHERE
    company IN ('Uber', 'Lyft')
GROUP BY
    request_day_of_week, company
ORDER BY
    request_day_of_week
"""

trip_counts_day_of_week = spark.sql(query).toPandas()

# Per-company line styling; only the Lyft line is drawn semi-transparent
line_styles = {
    "Uber": {"color": "blue"},
    "Lyft": {"color": "green", "alpha": 0.6},
}

# Plotting: one line per company on a shared axis
fig, ax = plt.subplots(figsize=(12, 8))
for company_name, style_kwargs in line_styles.items():
    company_rows = trip_counts_day_of_week[trip_counts_day_of_week["company"] == company_name]
    sns.lineplot(
        data=company_rows,
        x="request_day_of_week",
        y="trip_count",
        label=company_name,
        ax=ax,
        **style_kwargs,
    )

ax.set_title('Trip Distribution by Day of the Week for Uber and Lyft', fontsize=24)
ax.set_xlabel('Day of the Week', fontsize=18)
ax.set_ylabel('Number of Trips', fontsize=18)

# NOTE(review): tick positions 0-6 are labelled Monday-Sunday, which assumes
# request_day_of_week is encoded as 0=Monday ... 6=Sunday — confirm against
# the step that produced this column.
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax.set_xticks(range(0, 7))
ax.set_xticklabels(days_of_week, fontsize=12)
ax.tick_params(axis="y", labelsize=12)
ax.legend(fontsize='large')
fig.tight_layout()
plt.show()

In [None]:
# Trip Data Schema
# Print the current schema of tripdata_df, i.e. after the "company" column was
# added and the two datetime columns were dropped in the cells above.
tripdata_df.printSchema()

In [None]:
# Proportion of shared rides per company: sum of the shared-match flag over the row count
# NOTE(review): summing the flag assumes shared_match_flag is stored as a numeric
# 0/1 column — confirm against the schema printed above.
shared_flag_total = F.sum("shared_match_flag")
trips_total = F.count("*")
shared_ride_proportion = (shared_flag_total / trips_total).alias("shared_ride_proportion")

shared_rides_by_company = tripdata_df.groupBy("company").agg(shared_ride_proportion)

# Collect the per-company aggregate to pandas for plotting
pandas_shared_rides = shared_rides_by_company.toPandas()

In [None]:
# Bar chart: proportion of shared rides by company
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=pandas_shared_rides, x="company", y="shared_ride_proportion", ax=ax)
ax.set_title('Proportion of Shared Rides by Company')
ax.set_xlabel('Company')
ax.set_ylabel('Proportion of Shared Rides')
# Tilt the company names on the x-axis by 45 degrees
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Average Wait Times by Time of Day
# Mean of wait_time within each pickup_time_of_day bucket
time_of_day_groups = tripdata_df.groupBy("pickup_time_of_day")
avg_wait_time_by_time_of_day = time_of_day_groups.agg(F.avg("wait_time").alias("avg_wait_time"))

# Collect the aggregate to pandas for plotting
pandas_avg_wait_time = avg_wait_time_by_time_of_day.toPandas()

In [None]:
# Bar chart: average wait time by time of day
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=pandas_avg_wait_time, x="pickup_time_of_day", y="avg_wait_time", ax=ax)
ax.set_title('Average Wait Time by Time of Day')
ax.set_xlabel('Time of Day')
ax.set_ylabel('Average Wait Time (Seconds)')
# Tilt the time-of-day labels on the x-axis by 45 degrees
ax.tick_params(axis="x", labelrotation=45)
plt.show()