In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("FlightDirectionAnalysis") \
    .getOrCreate()

try:
    # Load the flight data from your source (e.g., CSV, Parquet, etc.)
    flight_data = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load("/home/jovyan/data/2023-10-02.csv")

    # Count the number of arrival and departure flights
    flight_counts = flight_data.groupBy("flightDirection").count()

    # Show the results
    flight_counts.show()

    # Optionally, you can save the results to a file or another data source
    # flight_counts.write.format("parquet").mode("overwrite").save("path/to/save/results")

finally:
    # Stop the Spark session
    spark.stop()


+---------------+-----+
|flightDirection|count|
+---------------+-----+
|              D| 2469|
|              A| 2451|
+---------------+-----+



In [2]:

# Create a Spark session
spark = SparkSession.builder \
    .appName("LandingTimeAnalysis") \
    .getOrCreate()

try:
    # Load the flight data from your source (e.g., CSV, Parquet, etc.)
    flight_data = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load("/home/jovyan/data/2023-10-02.csv")

    # Convert timestamp columns to Spark TimestampType
    flight_data = flight_data.withColumn("actualLandingTime", col("actualLandingTime").cast("timestamp"))
    flight_data = flight_data.withColumn("estimatedLandingTime", col("estimatedLandingTime").cast("timestamp"))

    # Calculate the offset between estimated and actual landing times
    landing_time_offset = flight_data \
        .withColumn("landingTimeOffset", (col("actualLandingTime").cast("long") - col("estimatedLandingTime").cast("long")) / 60)  # Convert seconds to minutes

    # Calculate the average offset
    average_offset = landing_time_offset.agg({"landingTimeOffset": "avg"}).collect()[0][0]

    # Show the results
    print(f"Average landing time offset: {average_offset:.2f} minutes")

    # Optionally, you can save the results to a file or another data source
    # landing_time_offset.write.format("parquet").mode("overwrite").save("path/to/save/results")

finally:
    # Stop the Spark session
    spark.stop()


Average landing time offset: -0.10 minutes


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when

# Create a Spark session
spark = SparkSession.builder \
    .appName("DepartureGatePierUtilizationAnalysis") \
    .getOrCreate()

try:
    # Load the flight data from your source (e.g., CSV, Parquet, etc.)
    flight_data = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load("/home/jovyan/data/2023-10-02.csv")

    # Replace null or NaN values with a placeholder value (e.g., "Unknown")
    flight_data = flight_data.na.fill("Unknown")

    # Filter only departure flights
    departure_data = flight_data.filter(col("flightDirection") == "D")

    # Group by gate and pier, and count the number of departure flights for each
    gate_counts = departure_data.groupBy("gate").count().withColumnRenamed("count", "gateCount")
    pier_counts = departure_data.groupBy("pier").count().withColumnRenamed("count", "pierCount")

    # Calculate the total number of non-null departure flights
    total_departure_flights = departure_data.na.drop(subset=["gate", "pier"]).count()

    # Calculate the percentage of gate utilization for departures
    gate_utilization = gate_counts.withColumn("gatePercentage", (col("gateCount") / total_departure_flights) * 100)

    # Calculate the percentage of pier utilization for departures
    pier_utilization = pier_counts.withColumn("pierPercentage", (col("pierCount") / total_departure_flights) * 100)

    # Show the results
    print("Departure Gate Utilization:")
    gate_utilization.select("gate", "gatePercentage").show(truncate=False)

    print("Departure Pier Utilization:")
    pier_utilization.select("pier", "pierPercentage").show(truncate=False)

    # Optionally, you can save the results to a file or another data source
    # gate_utilization.write.format("parquet").mode("overwrite").save("path/to/save/gate_results")
    # pier_utilization.write.format("parquet").mode("overwrite").save("path/to/save/pier_results")

finally:
    # Stop the Spark session
    spark.stop()



Departure Gate Utilization:
+----+-------------------+
|gate|gatePercentage     |
+----+-------------------+
|D81 |0.8910490076954233 |
|C6  |0.9315512353179425 |
|B34 |1.6200891049007695 |
|C22 |0.16200891049007696|
|D28 |0.08100445524503848|
|H6  |0.12150668286755771|
|D16 |0.24301336573511542|
|D7  |0.688537869582827  |
|D5  |0.12150668286755771|
|B22 |1.0530579181855002 |
|B30 |2.67314702308627   |
|D29 |0.850546780072904  |
|D87 |1.0530579181855002 |
|M4  |0.24301336573511542|
|D27 |0.7695423248278656 |
|D53 |0.16200891049007696|
|D73 |0.6075334143377886 |
|B4  |1.7820980153908466 |
|M7  |0.04050222762251924|
|D18 |0.28351559335763465|
+----+-------------------+
only showing top 20 rows

Departure Pier Utilization:
+-------+------------------+
|pier   |pierPercentage    |
+-------+------------------+
|F      |3.1186715269339813|
|E      |5.30579181855002  |
|B      |31.632239773187525|
|M      |1.2960712839206157|
|Unknown|2.187120291616039 |
|D      |36.85702713649251 |
|C      |

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("ServiceTypePercentageAnalysis") \
    .getOrCreate()

try:
    # Load the flight data from your source (e.g., CSV, Parquet, etc.)
    flight_data = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load("/home/jovyan/data/2023-10-02.csv")

    # Replace null or NaN values with a placeholder value (e.g., "Unknown")
    flight_data = flight_data.na.fill("Unknown")

    # Filter only departure flights
    departure_data = flight_data.filter(col("flightDirection") == "D")

    # Group by service type and count the number of departure flights for each
    service_type_counts = departure_data.groupBy("serviceType").count().withColumnRenamed("count", "serviceTypeCount")

    # Calculate the total number of departure flights
    total_departure_flights = departure_data.count()

    # Calculate the percentage of service type utilization for departures
    service_type_percentage = service_type_counts.withColumn("percentage", (col("serviceTypeCount") / total_departure_flights) * 100)

    # Show the results
    print("Service Type Utilization for Departures:")
    service_type_percentage.select("serviceType", "percentage").show(truncate=False)

    # Optionally, you can save the results to a file or another data source
    # service_type_percentage.write.format("parquet").mode("overwrite").save("path/to/save/service_type_results")

finally:
    # Stop the Spark session
    spark.stop()


Service Type Utilization for Departures:
+-----------+-------------------+
|serviceType|percentage         |
+-----------+-------------------+
|F          |0.9315512353179425 |
|C          |0.3645200486026731 |
|J          |98.58242203321183  |
|P          |0.12150668286755771|
+-----------+-------------------+



In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, from_json, split
from pyspark.sql.types import StringType, MapType, ArrayType

# Create a Spark session
spark = SparkSession.builder \
    .appName("TopDestinationsAnalysis") \
    .getOrCreate()

try:
    # Load the flight data from your source (e.g., CSV, Parquet, etc.)
    flight_data = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load("/home/jovyan/data/2023-10-02.csv")

    # Replace null or NaN values with a placeholder value (e.g., "Unknown")
    flight_data = flight_data.na.fill("Unknown")

    # Filter only departure flights
    departure_data = flight_data.filter(col("flightDirection") == "D")

    # Convert the string representation of the route to a dictionary
    departure_data = departure_data.withColumn("route_dict", from_json(col("route"), MapType(StringType(), StringType())))

    # Split the destinations string into an array
    departure_data = departure_data.withColumn("destinations_array", split(col("route_dict.destinations"), ","))

    # Explode the destinations array to have one row per destination
    destinations_data = departure_data.select("destinations_array").withColumn("destination", explode("destinations_array"))

    # Group by destination and count the number of departure flights for each
    destination_counts = destinations_data.groupBy("destination").count().withColumnRenamed("count", "flightCount")
    print(destination_counts)
    
    # Sort the destinations by flight count in descending order
    sorted_destinations = destination_counts.orderBy(col("flightCount").desc())

    # Take the top 10 destinations
    top_10_destinations = sorted_destinations.limit(10)

    # Show the results
    print("Top 10 Destinations for Departures:")
    top_10_destinations.show(truncate=False)

    # Optionally, you can save the results to a file or another data source
    # top_10_destinations.write.format("parquet").mode("overwrite").save("path/to/save/top_destinations")

finally:
    # Stop the Spark session
    spark.stop()


DataFrame[destination: string, flightCount: bigint]
Top 10 Destinations for Departures:
+-----------+-----------+
|destination|flightCount|
+-----------+-----------+
+-----------+-----------+

