Set dataset file path

In [114]:
file_path = "/home/jovyan/data/flight_data/2023-10-03.csv"

Create sparksession and load flight data

In [115]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("FlightBatchProcessing") \
    .getOrCreate()

# Load the flight data from your source (e.g., CSV, Parquet, etc.)
flight_data = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load(file_path)


Count the number of incoming and outgoing flights for the entire day

In [116]:
# Count the number of arrival and departure flights
flight_counts = flight_data.groupBy("flightDirection").count()

# Show the results
flight_counts.show()

+---------------+-----+
|flightDirection|count|
+---------------+-----+
|              D| 2412|
|              A| 2388|
+---------------+-----+



Calculate the average difference between the actual and estimated landing time 

In [117]:
from  pyspark.sql.functions import abs

# Convert timestamp columns to Spark TimestampType
flight_data = flight_data \
            .withColumn("actualLandingTime", col("actualLandingTime") \
            .cast("timestamp")) \
            .withColumn("estimatedLandingTime", col("estimatedLandingTime") \
            .cast("timestamp"))

# Calculate the offset between estimated and actual landing times
landing_time_offset = flight_data \
    .withColumn("landingTimeOffset", abs(col("actualLandingTime").cast("long") - col("estimatedLandingTime").cast("long")))  # Convert seconds to minutes

# Calculate the average offset
average_offset = landing_time_offset.agg({"landingTimeOffset": "avg"}).collect()[0][0]

# Show the results
print(f"Average landing time offset: {average_offset:.2f} seconds")


Average landing time offset: 6.21 seconds


Check gate and pier utilization

In [118]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when


# Replace null or NaN values with a placeholder value (e.g., "Unknown")
flight_data = flight_data.na.fill("Unknown")

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Group by gate and pier, and count the number of departure flights for each
gate_counts = departure_data.groupBy("gate").count().withColumnRenamed("count", "gateCount")
pier_counts = departure_data.groupBy("pier").count().withColumnRenamed("count", "pierCount")

# Calculate the total number of non-null departure flights
total_departure_flights = departure_data.na.drop(subset=["gate", "pier"]).count()

# Calculate the percentage of gate utilization for departures
gate_utilization = gate_counts.withColumn("gatePercentage", (col("gateCount") / total_departure_flights) * 100)

# Calculate the percentage of pier utilization for departures
pier_utilization = pier_counts.withColumn("pierPercentage", (col("pierCount") / total_departure_flights) * 100)

# Show the results
print("Departure Gate Utilization:")
gate_utilization.select("gate", "gatePercentage").show(truncate=False)

print("Departure Pier Utilization:")
pier_utilization.select("pier", "pierPercentage").show(truncate=False)

# Define the condition for joining based on the first letter of the gate
condition = col("gate").substr(1, 1) == col("pier")

# Perform the join
result_df = gate_utilization.join(pier_utilization, condition, "inner")

# Show the result DataFrame
result_df.show()



Departure Gate Utilization:
+----+-------------------+
|gate|gatePercentage     |
+----+-------------------+
|D81 |0.24875621890547264|
|C6  |1.285240464344942  |
|D66 |0.9950248756218906 |
|B34 |0.9950248756218906 |
|C22 |0.12437810945273632|
|D28 |0.41459369817578773|
|D16 |0.41459369817578773|
|H6  |0.16583747927031509|
|D5  |0.08291873963515754|
|D7  |0.20729684908789386|
|B22 |1.6998341625207296 |
|B30 |2.155887230514096  |
|D29 |0.08291873963515754|
|D87 |0.8706467661691543 |
|E9  |0.3731343283582089 |
|M4  |0.24875621890547264|
|D27 |0.12437810945273632|
|D53 |0.16583747927031509|
|B4  |1.6998341625207296 |
|H1  |0.24875621890547264|
+----+-------------------+
only showing top 20 rows

Departure Pier Utilization:
+-------+------------------+
|pier   |pierPercentage    |
+-------+------------------+
|F      |3.316749585406302 |
|E      |4.1459369817578775|
|B      |31.38474295190713 |
|M      |0.9950248756218906|
|Unknown|4.353233830845771 |
|D      |36.81592039800995 |
|C      |

Check the ratio of charter and passenger flights

In [119]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Replace null or NaN values with a placeholder value (e.g., "Unknown")
flight_data = flight_data.na.fill("Unknown")

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Group by service type and count the number of departure flights for each
service_type_counts = departure_data.groupBy("serviceType").count().withColumnRenamed("count", "serviceTypeCount")

# Calculate the total number of departure flights
total_departure_flights = departure_data.count()

# Calculate the percentage of service type utilization for departures
service_type_percentage = service_type_counts.withColumn("percentage", (col("serviceTypeCount") / total_departure_flights) * 100)

# Show the results
print("Service Type Utilization for Departures:")
service_type_percentage.select("serviceType", "percentage").show(truncate=False)



Service Type Utilization for Departures:
+-----------+-------------------+
|serviceType|percentage         |
+-----------+-------------------+
|F          |0.8291873963515755 |
|C          |0.4975124378109453 |
|J          |98.424543946932    |
|P          |0.24875621890547264|
+-----------+-------------------+



Define a udf so that in the next code block we can change the string representation of the route dict into a dict again

In [157]:
from pyspark.sql.types import StringType, MapType, ArrayType
from pyspark.sql.functions import udf

# Define a UDF to parse the string and convert it to a dictionary
def string_to_dict_or_list(s):
    import ast
    return ast.literal_eval(s)

# Register the UDF
udf_string_to_dict = udf(string_to_dict_or_list, MapType(StringType(), StringType()))
udf_string_to_list = udf(string_to_dict_or_list, ArrayType(StringType()))


Check for the top 10 destinations

In [121]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, from_json, split

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Convert the string representation of the route to a dictionary
departure_data = departure_data.withColumn("route_dict", udf_string_to_dict(departure_data["route"]))

# Split the destinations string into an array
departure_data = departure_data.withColumn("destinations_array", split(col("route_dict.destinations"), ","))

# Explode the destinations array to have one row per destination
destinations_data = departure_data.select("destinations_array").withColumn("destination", explode("destinations_array"))

# Group by destination and count the number of departure flights for each
destination_counts = destinations_data.groupBy("destination").count().withColumnRenamed("count", "flightCount")

# Sort the destinations by flight count in descending order
sorted_destinations = destination_counts.orderBy(col("flightCount").desc())

# Take the top 10 destinations
top_10_destinations = sorted_destinations.limit(10)

# Show the results
print("Top 10 Destinations for Departures:")
top_10_destinations.show(truncate=False)


Top 10 Destinations for Departures:
+-----------+-----------+
|destination|flightCount|
+-----------+-----------+
|[CDG]      |98         |
|[MAN]      |72         |
|[LHR]      |71         |
|[CPH]      |70         |
|[ARN]      |59         |
|[MAD]      |56         |
|[OSL]      |51         |
|[FCO]      |49         |
|[BER]      |47         |
|[MUC]      |44         |
+-----------+-----------+



In [144]:
file_path_dests = "/home/jovyan/data/destinations_data/destinations_with_coords.csv"


In [163]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs


# Load the flight data from your source (e.g., CSV, Parquet, etc.)
destinations_data = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load(file_path_dests)

destinations_data.show()

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Convert the string representation of the route to a dictionary
departure_data = departure_data.withColumn("route_dict", udf_string_to_dict(departure_data["route"]))

departure_data.select("route_dict").show()

# Create a new column 'new_column' with the desired values
departure_data = departure_data.withColumn('destinations', col('route_dict').getItem(desired_key))

departure_data = departure_data.withColumn("destinations_list", udf_string_to_list(departure_data["destinations"]))

print(departure_data.dtypes)

departure_data.select("destinations").show()
departure_data.select("destinations_list").show()

+---+------------+--------------+----+------------------+------------------+----------+
|_c0|        city|       country|iata|        publicName|         longitude|  latitude|
+---+------------+--------------+----+------------------+------------------+----------+
|  0|    Al-Arish|         Egypt| AAC|          Al-Arish|        33.8032762| 31.132093|
|  1|      Annaba|       Algeria| AAE|            Annaba|7.7500122000000005| 36.897375|
|  2|Apalachicola|           USA| AAF|     Municipal, FL|       -84.9832435|29.7257675|
|  3|      Aachen|       Germany| AAH|        Merzbrueck| 6.083886800000001|50.7753455|
|  4|     Aalborg|       Danmark| AAL|           Aalborg|          9.921747|57.0488195|
|  5|      Al Ain|United Arab Em| AAN|            Al Ain|        55.8023118|24.1301619|
|  6|     Houston|           USA| AAP|Andrau Airpark, TX|       -95.3698028|29.7604267|
|  7|       Anapa|  Russia (CIS)| AAQ|         Vitiazevo|        37.3158041|44.8935914|
|  8|      Aarhus|       Danmark

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_162/379580448.py", line 7, in string_to_dict_or_list
  File "/opt/conda/lib/python3.11/ast.py", line 110, in literal_eval
    return _convert(node_or_string)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/ast.py", line 90, in _convert
    return list(map(_convert, node.elts))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/ast.py", line 109, in _convert
    return _convert_signed_num(node)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/ast.py", line 83, in _convert_signed_num
    return _convert_num(node)
           ^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/ast.py", line 74, in _convert_num
    _raise_malformed_node(node)
  File "/opt/conda/lib/python3.11/ast.py", line 71, in _raise_malformed_node
    raise ValueError(msg + f': {node!r}')
ValueError: malformed node or string on line 1: <ast.Name object at 0x7faefaad4940>


In [110]:
spark.stop()