In [20]:

import sys
!{sys.executable} -m pip install google-cloud-bigquery



Set dataset file path

In [21]:
file_path = 'gs://schiphol-flight-data-bucket/flight_data/2023-10-04.csv'
file_path_dests = 'gs://schiphol-flight-data-bucket/destination_data/destinations_with_coords.csv'

Create sparksession and load flight data

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("FlightBatchProcessing") \
    .getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Load the flight data from your source (e.g., CSV, Parquet, etc.)
flight_data = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load(file_path)


Count the number of incoming and outgoing flights for the entire day

In [23]:
# Count the number of arrival and departure flights
flight_counts = flight_data.groupBy("flightDirection", "scheduleDate").count()

# Show the results
flight_counts.show()

+---------------+------------+-----+
|flightDirection|scheduleDate|count|
+---------------+------------+-----+
|              D|  2023-10-01| 2348|
|              A|  2023-10-01| 2332|
+---------------+------------+-----+



Calculate the average difference between the actual and estimated landing time 

In [5]:
from  pyspark.sql.functions import abs

# Convert timestamp columns to Spark TimestampType
flight_data = flight_data \
            .withColumn("actualLandingTime", col("actualLandingTime") \
            .cast("timestamp")) \
            .withColumn("estimatedLandingTime", col("estimatedLandingTime") \
            .cast("timestamp"))

# Calculate the offset between estimated and actual landing times
landing_time_offset = flight_data \
    .withColumn("landingTimeOffset", abs(col("actualLandingTime").cast("long") - col("estimatedLandingTime").cast("long")))  # Convert seconds to minutes

# Calculate the average offset
average_offset = landing_time_offset.groupBy("scheduleDate").agg({"landingTimeOffset": "avg"})

# Show the results
average_offset.show()


+------------+----------------------+
|scheduleDate|avg(landingTimeOffset)|
+------------+----------------------+
|  2023-10-01|    13.884649122807017|
+------------+----------------------+



Check gate and pier utilization

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when


# Replace null or NaN values with a placeholder value (e.g., "Unknown")
flight_data = flight_data.na.fill("Unknown")

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Group by gate and pier, and count the number of departure flights for each
gate_counts = departure_data.groupBy("gate", "scheduleDate").count().withColumnRenamed("count", "gateCount")
pier_counts = departure_data.groupBy("pier", "scheduleDate").count().withColumnRenamed("count", "pierCount")

# Calculate the total number of non-null departure flights
total_departure_flights = departure_data.na.drop(subset=["gate", "pier"]).count()

# Calculate the percentage of gate utilization for departures
gate_utilization = gate_counts.withColumn("gatePercentage", (col("gateCount") / total_departure_flights) * 100)

# Calculate the percentage of pier utilization for departures
pier_utilization = pier_counts.withColumn("pierPercentage", (col("pierCount") / total_departure_flights) * 100)

# Show the results
print("Departure Gate Utilization:")
gate_table = gate_utilization.select("gate", "gatePercentage", "scheduleDate")

gate_table.show(truncate=False)

print("Departure Pier Utilization:")
pier_table = pier_utilization.select("pier", "pierPercentage", "scheduleDate")

pier_table.show(truncate=False)


Departure Gate Utilization:
+----+-------------------+------------+
|gate|gatePercentage     |scheduleDate|
+----+-------------------+------------+
|F8  |0.46848381601362865|2023-10-01  |
|D78 |1.4906303236797274 |2023-10-01  |
|G9  |0.2555366269165247 |2023-10-01  |
|E21 |0.9369676320272573 |2023-10-01  |
|D44 |0.2555366269165247 |2023-10-01  |
|D63 |0.5110732538330494 |2023-10-01  |
|B23 |0.6388415672913117 |2023-10-01  |
|B27 |0.9795570698466781 |2023-10-01  |
|B35 |0.7240204429301533 |2023-10-01  |
|B5  |0.5536626916524702 |2023-10-01  |
|C18 |1.6183986371379897 |2023-10-01  |
|C23 |0.12776831345826234|2023-10-01  |
|C6  |0.2555366269165247 |2023-10-01  |
|B17 |0.42589437819420783|2023-10-01  |
|D59 |0.596252129471891  |2023-10-01  |
|B28 |1.7035775127768313 |2023-10-01  |
|E5  |0.5536626916524702 |2023-10-01  |
|H3  |0.17035775127768313|2023-10-01  |
|D6  |9.412265758091994  |2023-10-01  |
|B32 |1.2776831345826234 |2023-10-01  |
+----+-------------------+------------+
only showing

Check the ratio of charter and passenger flights

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Replace null or NaN values with a placeholder value (e.g., "Unknown")
flight_data = flight_data.na.fill("Unknown")

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Group by service type and count the number of departure flights for each
service_type_counts = departure_data.groupBy("serviceType", "scheduleDate").count().withColumnRenamed("count", "serviceTypeCount")

# Calculate the total number of departure flights
total_departure_flights = departure_data.count()

# Calculate the percentage of service type utilization for departures
service_type_percentage = service_type_counts.withColumn("percentage", (col("serviceTypeCount") / total_departure_flights) * 100)

# Show the results
print("Service Type Utilization for Departures:")
service_type_table = service_type_percentage.select("serviceType", "percentage", "scheduleDate")
service_type_table.show(truncate=False)


Service Type Utilization for Departures:
+-----------+-------------------+------------+
|serviceType|percentage         |scheduleDate|
+-----------+-------------------+------------+
|C          |0.46848381601362865|2023-10-01  |
|J          |98.50936967632026  |2023-10-01  |
|F          |0.8517887563884157 |2023-10-01  |
|P          |0.17035775127768313|2023-10-01  |
+-----------+-------------------+------------+



Define a udf so that in the next code block we can change the string representation of the route dict into a dict again

In [8]:
from pyspark.sql.types import StringType, MapType, ArrayType
from pyspark.sql.functions import udf

# Define a UDF to parse the string and convert it to a dictionary
def string_to_dict_or_list(s):
    import ast
    return ast.literal_eval(s)

# Register the UDF
udf_string_to_dict = udf(string_to_dict_or_list, MapType(StringType(), StringType()))
udf_string_to_list = udf(string_to_dict_or_list, ArrayType(StringType()))


Check for the top 10 destinations

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, from_json, split

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Convert the string representation of the route to a dictionary
departure_data = departure_data.withColumn("route_dict", udf_string_to_dict(departure_data["route"]))

# Split the destinations string into an array
departure_data = departure_data.withColumn("destinations_array", split(col("route_dict.destinations"), ","))

# Explode the destinations array to have one row per destination
destinations_data = departure_data.select("destinations_array", "scheduleDate").withColumn("destination", explode("destinations_array"))

# Group by destination and count the number of departure flights for each
destination_counts = destinations_data.groupBy("destination", "scheduleDate").count().withColumnRenamed("count", "flightCount")

# Sort the destinations by flight count in descending order
sorted_destinations = destination_counts.orderBy(col("flightCount").desc())

# Take the top 10 destinations
top_10_destinations = sorted_destinations.limit(10)

# Show the results
print("Top 10 Destinations for Departures:")
top_10_destinations.show(truncate=False)


Top 10 Destinations for Departures:
+-----------+------------+-----------+
|destination|scheduleDate|flightCount|
+-----------+------------+-----------+
|[CDG]      |2023-10-01  |82         |
|[CPH]      |2023-10-01  |71         |
|[MAN]      |2023-10-01  |68         |
|[LHR]      |2023-10-01  |57         |
|[MAD]      |2023-10-01  |56         |
|[ARN]      |2023-10-01  |51         |
|[OSL]      |2023-10-01  |51         |
|[FCO]      |2023-10-01  |49         |
|[BCN]      |2023-10-01  |49         |
|[BER]      |2023-10-01  |47         |
+-----------+------------+-----------+



In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs


def extract_first_element(lst):
    return str(lst[0]) if lst else None

# Register the UDF
extract_first_element_udf = udf(extract_first_element, StringType())


# Load the flight data from your source (e.g., CSV, Parquet, etc.)
destinations_data = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load(file_path_dests)

destinations_data.show()

# Filter only departure flights
departure_data = flight_data.filter(col("flightDirection") == "D")

# Convert the string representation of the route to a dictionary
departure_data = departure_data.withColumn("route_dict", udf_string_to_dict(departure_data["route"]))

departure_data.select("route_dict").show()

# Create a new column 'new_column' with the desired values
departure_data = departure_data.withColumn('destinations', col('route_dict').getItem('destinations'))

departure_data = departure_data.withColumn("destinations_list", udf_string_to_list(departure_data["destinations"]))

departure_data.select("destinations").show()



+---+------------+--------------+----+------------------+------------------+----------+
|_c0|        city|       country|iata|        publicName|         longitude|  latitude|
+---+------------+--------------+----+------------------+------------------+----------+
|  0|    Al-Arish|         Egypt| AAC|          Al-Arish|        33.8032762| 31.132093|
|  1|      Annaba|       Algeria| AAE|            Annaba|7.7500122000000005| 36.897375|
|  2|Apalachicola|           USA| AAF|     Municipal, FL|       -84.9832435|29.7257675|
|  3|      Aachen|       Germany| AAH|        Merzbrueck| 6.083886800000001|50.7753455|
|  4|     Aalborg|       Danmark| AAL|           Aalborg|          9.921747|57.0488195|
|  5|      Al Ain|United Arab Em| AAN|            Al Ain|        55.8023118|24.1301619|
|  6|     Houston|           USA| AAP|Andrau Airpark, TX|       -95.3698028|29.7604267|
|  7|       Anapa|  Russia (CIS)| AAQ|         Vitiazevo|        37.3158041|44.8935914|
|  8|      Aarhus|       Danmark

In [15]:
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '~/.config/gcloud/application_default_credentials.json'
os.environ['GCLOUD_PROJECT'] = 'data-engineering-assignment-2'

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "tmp-bucket-for-data-engineering"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Saving the data to BigQuery
flight_counts.write.format('bigquery') \
  .option('table', 'schiphol_data.flight_counts') \
  .mode("overwrite") \
  .save()

top_10_destinations.write.format('bigquery') \
  .option('table', 'schiphol_data.popular_dests') \
  .mode("overwrite") \
  .save()

gate_table.write.format('bigquery') \
  .option('table', 'schiphol_data.gate_utilization') \
  .mode("overwrite") \
  .save()

pier_table.write.format('bigquery') \
  .option('table', 'schiphol_data.pier_utilization') \
  .mode("overwrite") \
  .save()

service_type_table.write.format('bigquery') \
  .option('table', 'schiphol_data.service_type_utilization') \
  .mode("overwrite") \
  .save()

Py4JJavaError: An error occurred while calling o224.save.
: com.google.cloud.spark.bigquery.repackaged.com.google.inject.ProvisionException: Unable to provision, see the following errors:

1) [Guice/ErrorInCustomProvider]: IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment.  Please set a project ID using the builder.
  at SparkBigQueryConnectorModule.provideSparkBigQueryConfig(SparkBigQueryConnectorModule.java:78)
  while locating SparkBigQueryConfig

Learn more:
  https://github.com/google/guice/wiki/ERROR_IN_CUSTOM_PROVIDER

1 error

======================
Full classname legend:
======================
SparkBigQueryConfig:          "com.google.cloud.spark.bigquery.SparkBigQueryConfig"
SparkBigQueryConnectorModule: "com.google.cloud.spark.bigquery.SparkBigQueryConnectorModule"
========================
End of classname legend:
========================

	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InternalProvisionException.toProvisionException(InternalProvisionException.java:251)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InjectorImpl$1.get(InjectorImpl.java:1104)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InjectorImpl.getInstance(InjectorImpl.java:1139)
	at com.google.cloud.spark.bigquery.write.CreatableRelationProviderHelper.createBigQueryInsertableRelationInternal(CreatableRelationProviderHelper.java:114)
	at com.google.cloud.spark.bigquery.write.CreatableRelationProviderHelper.createBigQueryInsertableRelation(CreatableRelationProviderHelper.java:95)
	at com.google.cloud.spark.bigquery.write.CreatableRelationProviderHelper.createRelation(CreatableRelationProviderHelper.java:47)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:107)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.IllegalArgumentException: A project ID is required for this service but could not be determined from the builder or the environment.  Please set a project ID using the builder.
	at com.google.cloud.spark.bigquery.repackaged.com.google.common.base.Preconditions.checkArgument(Preconditions.java:143)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.ServiceOptions.<init>(ServiceOptions.java:304)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryOptions.<init>(BigQueryOptions.java:92)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryOptions.<init>(BigQueryOptions.java:30)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryOptions$Builder.build(BigQueryOptions.java:87)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryOptions.getDefaultInstance(BigQueryOptions.java:162)
	at com.google.cloud.bigquery.connector.common.BigQueryConfigurationUtil.lambda$defaultBilledProject$1(BigQueryConfigurationUtil.java:41)
	at com.google.cloud.spark.bigquery.repackaged.com.google.common.base.Absent.or(Absent.java:61)
	at com.google.cloud.spark.bigquery.SparkBigQueryConfig.from(SparkBigQueryConfig.java:330)
	at com.google.cloud.spark.bigquery.SparkBigQueryConfig.from(SparkBigQueryConfig.java:247)
	at com.google.cloud.spark.bigquery.SparkBigQueryConnectorModule.lambda$provideSparkBigQueryConfig$0(SparkBigQueryConnectorModule.java:80)
	at java.base/java.util.Optional.orElseGet(Optional.java:364)
	at com.google.cloud.spark.bigquery.SparkBigQueryConnectorModule.provideSparkBigQueryConfig(SparkBigQueryConnectorModule.java:78)
	at com.google.cloud.spark.bigquery.SparkBigQueryConnectorModule$$FastClassByGuice$$1232932.GUICE$TRAMPOLINE(<generated>)
	at com.google.cloud.spark.bigquery.SparkBigQueryConnectorModule$$FastClassByGuice$$1232932.apply(<generated>)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.ProviderMethod$FastClassProviderMethod.doProvision(ProviderMethod.java:260)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.ProviderMethod.doProvision(ProviderMethod.java:171)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InternalProviderInstanceBindingImpl$CyclicFactory.provision(InternalProviderInstanceBindingImpl.java:185)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InternalProviderInstanceBindingImpl$CyclicFactory.get(InternalProviderInstanceBindingImpl.java:162)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.ProviderToInternalFactoryAdapter.get(ProviderToInternalFactoryAdapter.java:40)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.SingletonScope$1.get(SingletonScope.java:169)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InternalFactoryToProviderAdapter.get(InternalFactoryToProviderAdapter.java:45)
	at com.google.cloud.spark.bigquery.repackaged.com.google.inject.internal.InjectorImpl$1.get(InjectorImpl.java:1101)
	... 46 more


In [110]:
spark.stop()