In [None]:
# Initialization
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (reuses an already-running one if present)
spark = (
    SparkSession.builder
    .appName("DataCleaning")
    .getOrCreate()
)

In [None]:
# Load the raw trip data from GCS.
# Parquet files carry their own schema, so the CSV-only reader options
# `header` / `inferSchema` are not passed here.
tripdata_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group_8/tripdata")

In [None]:
# Inspect column names and types
tripdata_df.printSchema()

# Preview the first 10 rows
tripdata_df.show(n=10)

In [None]:
# Remove columns that are not needed for the analysis
columns_to_drop = [
    "dispatching_base_num",
    "originating_base_num",
]
tripdata_df = tripdata_df.drop(*columns_to_drop)

In [None]:
# Calculate and print percentage of missing values for all columns.
#
# One row count plus ONE aggregation over the whole frame, instead of two
# full count() jobs per column. "Missing" keeps the same meaning as
# DataFrame.dropna(): null for every column, and additionally NaN for
# float/double columns.
from pyspark.sql.functions import col, count, isnan, when

total_rows = tripdata_df.count()

if total_rows == 0:
    # Avoid a ZeroDivisionError: percentages are undefined without rows.
    print("DataFrame is empty; missing-value percentages are undefined.")
elif tripdata_df.columns:
    # isnan() is only valid on floating-point columns, so pick them out by dtype.
    floating_columns = {
        name for name, dtype in tripdata_df.dtypes if dtype in ("float", "double")
    }

    # count() skips nulls; when() without otherwise() turns NaN into null so
    # that NaN values are skipped as well.
    present_exprs = [
        count(when(~isnan(col(name)), col(name))).alias(name)
        if name in floating_columns
        else count(col(name)).alias(name)
        for name in tripdata_df.columns
    ]
    present_counts = tripdata_df.agg(*present_exprs).first()

    for column in tripdata_df.columns:
        percent_missing = 100 * (1 - (present_counts[column] / total_rows))
        print(f"Percent missing in '{column}': {percent_missing}%")

In [None]:
# Remove the columns that have a large share of missing values
columns_to_drop = [
    "on_scene_datetime",
    "airport_fee",
    "wav_match_flag",
]
tripdata_df = tripdata_df.drop(*columns_to_drop)

In [None]:
# Discard rows where any of the critical columns is missing
critical_columns = ["wav_request_flag"]
tripdata_df = tripdata_df.na.drop(subset=critical_columns)

In [None]:
# Discard every remaining row that still has a missing value in any column
tripdata_df = tripdata_df.dropna(how="any")

In [None]:
# Feature Engineering for 'request_datetime'
from pyspark.sql.functions import date_format, dayofweek, month, hour, when

# Function to categorize time of the day
def categorize_time_of_day(df, timestamp_column, output_column="request_time_of_day"):
    """Add a time-of-day label column derived from a timestamp column.

    The hour of ``timestamp_column`` is bucketed as:

    * 5-11  -> "morning"
    * 12-16 -> "afternoon"
    * 17-20 -> "evening"
    * anything else -> "night"

    Args:
        df: Spark DataFrame containing ``timestamp_column``.
        timestamp_column: Name of the timestamp column to bucket.
        output_column: Name of the label column to add. Defaults to
            "request_time_of_day", so existing two-argument calls behave
            as before.

    Returns:
        A new DataFrame with ``output_column`` added.
    """
    # Extract the hour once as an integer instead of formatting it to a
    # string ("HH") and casting back for every branch.
    hour_of_day = hour(timestamp_column)

    return df.withColumn(
        output_column,
        when(hour_of_day.between(5, 11), "morning")
        .when(hour_of_day.between(12, 16), "afternoon")
        .when(hour_of_day.between(17, 20), "evening")
        .otherwise("night"),
    )

# Label each request with its time-of-day bucket
tripdata_df = categorize_time_of_day(tripdata_df, "request_datetime")

# Add hour, day-of-week and month columns derived from request_datetime
tripdata_df = (
    tripdata_df
    .withColumn("request_hour", hour("request_datetime"))
    .withColumn("request_day_of_week", dayofweek("request_datetime"))
    .withColumn("request_month", month("request_datetime"))
)

# Preview the engineered request features
tripdata_df.select(
    "request_datetime",
    "request_time_of_day",
    "request_hour",
    "request_day_of_week",
    "request_month",
).show(10)

In [None]:
# Feature Engineering for 'pickup_datetime'
def categorize_time_of_day_pickup(df, timestamp_column, output_column="pickup_time_of_day"):
    """Add a time-of-day label column derived from a timestamp column.

    The hour of ``timestamp_column`` is bucketed as:

    * 5-11  -> "morning"
    * 12-16 -> "afternoon"
    * 17-20 -> "evening"
    * anything else -> "night"

    Args:
        df: Spark DataFrame containing ``timestamp_column``.
        timestamp_column: Name of the timestamp column to bucket.
        output_column: Name of the label column to add. Defaults to
            "pickup_time_of_day", so existing two-argument calls behave
            as before.

    Returns:
        A new DataFrame with ``output_column`` added.
    """
    # Extract the hour once as an integer instead of formatting it to a
    # string ("HH") and casting back for every branch.
    hour_of_day = hour(timestamp_column)

    return df.withColumn(
        output_column,
        when(hour_of_day.between(5, 11), "morning")
        .when(hour_of_day.between(12, 16), "afternoon")
        .when(hour_of_day.between(17, 20), "evening")
        .otherwise("night"),
    )

# Label each pickup with its time-of-day bucket
tripdata_df = categorize_time_of_day_pickup(tripdata_df, "pickup_datetime")

# Add hour, day-of-week and month columns derived from pickup_datetime
tripdata_df = (
    tripdata_df
    .withColumn("pickup_hour", hour("pickup_datetime"))
    .withColumn("pickup_day_of_week", dayofweek("pickup_datetime"))
    .withColumn("pickup_month", month("pickup_datetime"))
)

# Preview the engineered pickup features
tripdata_df.select(
    "pickup_datetime",
    "pickup_time_of_day",
    "pickup_hour",
    "pickup_day_of_week",
    "pickup_month",
).show(10)

In [None]:
# Data Export
# Destination for the cleaned dataset
output_path = "gs://msca-bdp-student-gcs/Group_8/tripdata_cleaned"

# Persist the DataFrame as Parquet in the GCS bucket, replacing any existing output
tripdata_df.write.parquet(output_path, mode="overwrite")