In [0]:
#pip install pyspark

In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell; the application name
# is what this job is registered under.
spark = (
    SparkSession.builder
    .appName("Integrate and manage Public Transport data")
    .getOrCreate()
)


In [0]:
import os

# Register the storage account access key with Spark so that abfss:// paths on
# the "omardbstorageaccount" account can be read and written.
#
# The key is taken from the environment instead of being embedded in the
# notebook, so the secret never ends up in version control or shared copies.
# NOTE(review): the key that was previously hardcoded in this cell must be
# treated as leaked and rotated.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    raise RuntimeError(
        "Set the AZURE_STORAGE_ACCOUNT_KEY environment variable to the access "
        "key of the 'omardbstorageaccount' storage account before running "
        "this notebook."
    )

spark.conf.set(
    "fs.azure.account.key.omardbstorageaccount.dfs.core.windows.net",
    storage_account_key,
)


In [0]:
import os
import time
from pyspark.sql.functions import date_format, year, month, dayofmonth, dayofweek, unix_timestamp, when, col
from pyspark.sql import functions as F

# Input and output directories in Data Lake Storage Gen2 (both end with "/").
raw_data_dir = "abfss://public-transport-data@omardbstorageaccount.dfs.core.windows.net/raw/"
processed_data_dir = "abfss://public-transport-data@omardbstorageaccount.dfs.core.windows.net/processed/"

# Months to process; each one corresponds to a raw CSV file named
# public_transport_data_<Month>.csv.
months = ["January", "February", "March", "April", "May"]

# Passenger count at or above which a trip is labelled "Peak".
threshold = 50

# Seconds to wait between two consecutive monthly batches.
batch_pause_seconds = 120

for batch_index, month_name in enumerate(months):
    # These are URIs, not local filesystem paths, so they are built by plain
    # concatenation (os.path.join uses the host OS's path rules).
    input_file = f"{raw_data_dir}public_transport_data_{month_name}.csv"
    output_file = f"{processed_data_dir}public_transport_data_{month_name}_cleaned.csv"

    # Read the raw month file; column types are inferred from the data.
    df = spark.read.csv(input_file, header=True, inferSchema=True)

    # Normalise "ArrivalTime" and "DepartureTime" to "HH:mm" strings.
    df = df.withColumn("ArrivalTime", date_format(df["ArrivalTime"], "HH:mm"))
    df = df.withColumn("DepartureTime", date_format(df["DepartureTime"], "HH:mm"))

    # Calendar parts derived from "Date".
    # Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    df = df.withColumn("Year", year(df["Date"]))
    df = df.withColumn("Month", month(df["Date"]))
    df = df.withColumn("Day", dayofmonth(df["Date"]))
    df = df.withColumn("DayOfWeek", dayofweek(df["Date"]))

    # Temporary timestamp versions of the "HH:mm" strings, used only to
    # compute the trip duration below.
    df = df.withColumn("DepartureTimeTimestamp", unix_timestamp(df["DepartureTime"], "HH:mm").cast("timestamp"))
    df = df.withColumn("ArrivalTimeTimestamp", unix_timestamp(df["ArrivalTime"], "HH:mm").cast("timestamp"))

    # Trip duration in minutes. When the arrival time-of-day is earlier than
    # the departure, the trip is taken to cross midnight, so the gap is
    # measured the other way round and subtracted from 1440 (minutes per day).
    df = df.withColumn("TripDurationMinutes",
        when(col("ArrivalTimeTimestamp") >= col("DepartureTimeTimestamp"),
            (col("ArrivalTimeTimestamp").cast("long") - col("DepartureTimeTimestamp").cast("long")) / 60)
        .otherwise(1440 - (col("DepartureTimeTimestamp").cast("long") - col("ArrivalTimeTimestamp").cast("long")) / 60))

    # Drop the intermediate timestamp columns.
    df = df.drop("DepartureTimeTimestamp", "ArrivalTimeTimestamp")

    # Bucket "Delay" into categories (labels are emitted as data and kept as-is).
    # NOTE(review): the bands only cover 0, 1-10, 11-20 and >20, so a
    # fractional value between bands (e.g. 10.5), a negative value or a null
    # ends up as "Unknown" - confirm Delay is always a non-negative integer.
    df = df.withColumn("DelayCategory",
        when(col("Delay") == 0, "Pas de Retard")
        .when((col("Delay") >= 1) & (col("Delay") <= 10), "Retard Court")
        .when((col("Delay") >= 11) & (col("Delay") <= 20), "Retard Moyen")
        .when(col("Delay") > 20, "Long Retard")
        .otherwise("Unknown"))

    # Per-route statistics for this month: mean delay (2 decimals), mean
    # passenger count rounded to an integer (a fractional part >= 0.5 rounds
    # up, otherwise down) and the number of trips.
    route_analysis = df.groupBy("Route").agg(
        F.round(F.avg("Delay"), 2).alias("AverageDelay"),
        F.when((F.avg("Passengers") % 1) >= 0.5, F.ceil(F.avg("Passengers"))).otherwise(F.floor(F.avg("Passengers"))).cast("int").alias("AveragePassengers"),
        F.count("*").alias("TotalTrips")
    )

    # Attach the route-level statistics to every trip row.
    df = df.join(route_analysis, on="Route", how="left")

    # Flag trips whose passenger count reaches the peak threshold.
    df = df.withColumn("HeureDePointe", when(col("Passengers") >= threshold, "Peak").otherwise("Off-Peak"))

    # Save the transformed DataFrame. "overwrite" keeps the cell re-runnable:
    # the default save mode raises an error when the output path already exists.
    df.write.mode("overwrite").option("header", "true").csv(output_file)

    # Pause before processing the next batch; there is nothing to wait for
    # after the final month.
    if batch_index < len(months) - 1:
        time.sleep(batch_pause_seconds)


In [0]:
# Preview the transformed data. `df` is whatever the processing loop assigned
# last, i.e. the frame for the final entry of `months` ("May").
df.show()

In [0]:
#file_location = "abfss://public-transport-data@omardbstorageaccount.dfs.core.windows.net/raw/public_transport_data_January.csv"

In [0]:
#df = spark.read.format("csv").option("inferSchema", "True").option("header",
#"True").option("delimiter",",").load(file_location)

In [0]:
#df.show()

In [0]:
#df.printSchema()

In [0]:
# Join route-level statistics with the original DataFrame on the "Route" column


In [0]:
#data = df.toPandas()
#data

In [0]:
#data.info()

In [0]:
# Define the file location for export
#file_location = "abfss://publictransportdata@omardbstorageaccount.dfs.core.windows.net/processed/"

# Export the DataFrame to the specified location in CSV format
#df.write.csv(file_location, header=True, mode="overwrite")


In [0]:
# Reduce the number of output partitions to 1
#df.repartition(1).write.option("header", "true").csv("abfss://public-transport-data@omardbstorageaccount.dfs.core.windows.net/processed/public_transport_data_February_cleaned.csv")
