# Big Data Management Project 3:
## Analysing Flight Interconnected Data


### Imports & Setup

In [1]:
import pyspark
!pip install delta-spark
from delta import configure_spark_with_delta_pip

!pip install graphframes
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# 🔧 Spark Session with GraphFrames
builder = pyspark.sql.SparkSession.builder.appName("FlightGraphAnalysis") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(
    builder, extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]
).getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)

import graphframes as gf # Import the module after installing




### Data download/extraction:

##### To simplify access to the dataset, we added a script to automatically check for the presence of the required data file (2009.csv). If the file is not found, the script downloads a zipped version from Google Drive (link from Moodle) using gdown, then extracts it into the right directory. This approach makes the data available locally without downloading. 

In [2]:
!pip install gdown
import gdown
import os
import zipfile

zip_filename = "input/2009.csv.zip"
csv_filename = "input/2009.csv"

if os.path.exists(csv_filename):
    print(f"{csv_filename} already exists. Nothing to do.")

elif os.path.exists(zip_filename):
    print(f"{csv_filename} not found, but {zip_filename} exists. Unzipping...")
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall("input/.")
    print("Unzipping complete.")

else:
    print("File is missing, importing from Google Drive")
    !gdown 1trFtRCe3xPBLr90hIWBF__OqppEnJPR_ -O input/
    print("File downloaded, going to unzip")
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall("input/.")
    print("Unzipping complete.")

input/2009.csv already exists. Nothing to do.


### 📂 Load Data and selecting

##### We loaded the flight dataset and selected only the necessary columns. This helped reduce memory usage and speeds up processing.

In [3]:
# Make sure the file is inside the container path (e.g., /app/input/flights.csv)
#df = spark.read.csv("/content/2009.csv", header=True, inferSchema=True)
#display(df.limit(20))

df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("input/2009.csv")
      # choose only rows we are interested in
      .select("ORIGIN", "DEST", "FL_DATE", "DISTANCE"))

df.limit(20).show()

+------+----+----------+--------+
|ORIGIN|DEST|   FL_DATE|DISTANCE|
+------+----+----------+--------+
|   DCA| EWR|2009-01-01|   199.0|
|   EWR| IAD|2009-01-01|   213.0|
|   EWR| DCA|2009-01-01|   199.0|
|   DCA| EWR|2009-01-01|   199.0|
|   IAD| EWR|2009-01-01|   213.0|
|   ATL| EWR|2009-01-01|   745.0|
|   CLE| ATL|2009-01-01|   554.0|
|   DCA| EWR|2009-01-01|   199.0|
|   EWR| DCA|2009-01-01|   199.0|
|   EWR| DCA|2009-01-01|   199.0|
|   DCA| EWR|2009-01-01|   199.0|
|   EWR| DCA|2009-01-01|   199.0|
|   CLE| DCA|2009-01-01|   310.0|
|   DCA| EWR|2009-01-01|   199.0|
|   ORD| EWR|2009-01-01|   719.0|
|   EWR| ORD|2009-01-01|   719.0|
|   ORD| EWR|2009-01-01|   719.0|
|   EWR| ORD|2009-01-01|   719.0|
|   ORD| EWR|2009-01-01|   719.0|
|   EWR| ORD|2009-01-01|   719.0|
+------+----+----------+--------+



### Graph
##### In this graph each airport is represented as a vertex and each flight as a directed edge. The `airports_df` shows unique airport codes from both origin and destination columns. The `edges_df` contains flight connections with `src` (origin) and `dst` (destination). Using these, we constructed the `flights_graph`.

In [26]:
# 👥 Vertices: Airports , needs to contain id column
airports_df = df.select(F.col("ORIGIN").alias("id")).union(
    df.select(F.col("DEST").alias("id"))
).distinct()

# ✈️ Edges: Flights , Needs to contain src and dst columns
edges_df = df.select(
    F.col("ORIGIN").alias("src"),
    F.col("DEST").alias("dst")
)

# 📊 GraphFrame
flights_graph = gf.GraphFrame(airports_df, edges_df)
print("Airports total:", flights_graph.vertices.count())
print("Flights total:", flights_graph.edges.count())
airports_df.cache()
edges_df.cache()
display(flights_graph)
print("The airports from data: ") 
display(flights_graph.vertices)
print("The flights between airports from data: ")
display(flights_graph.edges)

Airports total: 296
Flights total: 6429338


GraphFrame(v:[id: string], e:[src: string, dst: string])

The airports from data: 


id
DCA
CLT
AVL
BTV
BNA
XNA
GJT
LGB
MBS
TVC


The flights between airports from data: 


src,dst
DCA,EWR
EWR,IAD
EWR,DCA
DCA,EWR
IAD,EWR
ATL,EWR
CLE,ATL
DCA,EWR
EWR,DCA
EWR,DCA


### Query 1

### Custom in-degree, out-degree, degree, triangle count

##### First, we computed the in-degree by counting how many flights arrive at each airport (dst). Then, we calculated the out-degree by counting how many flights depart from each airport (src). Finally, we combined both to compute the total degree for each airport by summing in-degree and out-degree. Missing values were handled using coalesce.

In [5]:
# In-degree
in_degree_df = edges_df.groupBy("dst").count() \
    .withColumnRenamed("dst", "id") \
    .withColumnRenamed("count", "inDegree")

# Out-degree
out_degree_df = edges_df.groupBy("src").count() \
    .withColumnRenamed("src", "id") \
    .withColumnRenamed("count", "outDegree")

# Total degree (merge in & out)
degree_df = in_degree_df.join(out_degree_df, on="id", how="full_outer") \
    .withColumn("inDegree", F.coalesce(F.col("inDegree"), F.lit(0))) \
    .withColumn("outDegree", F.coalesce(F.col("outDegree"), F.lit(0))) \
    .withColumn("totalDegree", F.col("inDegree") + F.col("outDegree"))

degree_df.show()

+---+--------+---------+-----------+
| id|inDegree|outDegree|totalDegree|
+---+--------+---------+-----------+
|ABE|    4037|     4034|       8071|
|ABI|    2490|     2490|       4980|
|ABQ|   35577|    35582|      71159|
|ABY|     997|      995|       1992|
|ACK|     343|      342|        685|
|ACT|    1052|     1053|       2105|
|ACV|    3364|     3370|       6734|
|ACY|     522|      522|       1044|
|ADK|     103|      103|        206|
|ADQ|     631|      631|       1262|
|AEX|    2948|     2947|       5895|
|AGS|    3106|     3107|       6213|
|AKN|      77|       77|        154|
|ALB|   12020|    12018|      24038|
|ALO|     331|      330|        661|
|AMA|    6649|     6649|      13298|
|ANC|   17788|    17791|      35579|
|ASE|    4708|     4701|       9409|
|ATL|  417457|   417449|     834906|
|ATW|    5306|     5303|      10609|
+---+--------+---------+-----------+
only showing top 20 rows



##### Calculating the number of unique triangles in the graph by joining edges to form two-step paths (A → B → C) and then checking if a closing edge (C → A) exists. To avoid duplicate counting, we sorted and filtered nodes (A < B < C). The result of Q2 - total triangles in the graph is 16015

In [10]:
from pyspark.sql import functions as F

# Step 1: Edges (a -> b) ensuring a < b to avoid double counting
edges_filtered = edges_df.select(
    F.least("src", "dst").alias("src"),
    F.greatest("src", "dst").alias("dst")
).distinct().cache()

# Step 2: Find two-edge paths: (a -> b) join (b -> c)
paths_two = edges_filtered.alias("e1").join(
    edges_filtered.alias("e2"),
    F.col("e1.dst") == F.col("e2.src")
).select(
    F.col("e1.src").alias("A"),
    F.col("e1.dst").alias("B"),
    F.col("e2.dst").alias("C")
).filter("A < B AND B < C")

# Step 3: Close the triangles by checking (a,c) exists
triangles = paths_two.join(
    edges_filtered.alias("e3"),
    (F.col("e3.src") == F.col("A")) & (F.col("e3.dst") == F.col("C"))
).select("A", "B", "C").distinct()

# Count total triangles
triangle_count = triangles.count()
print(f"Total triangles in the graph: {triangle_count}")

# Display some triangles for verification
triangles.show(10, truncate=False)


Total triangles in the graph: 16015
+---+---+---+
|A  |B  |C  |
+---+---+---+
|IAH|JAX|MIA|
|IAH|JAX|TPA|
|IAH|JAX|STL|
|IAH|JAX|MEM|
|IAH|JAX|MSP|
|IAH|JAX|LGA|
|IAH|JAX|LAS|
|IAH|JAX|ORF|
|IAH|JAX|JFK|
|IAH|JAX|PHL|
+---+---+---+
only showing top 10 rows



### Query  2

In [11]:
#Total triangles
print(f"Total number of triangles in graph (again): {triangle_count}")

Total number of triangles in graph (again): 16015


### Query 3
#### Centrality Measure (custom) – Degree Centrality 
##### Degree Centrality Normalization:  We normalized the totalDegree by dividing it by the total number of airports in the data. Here we can see that the resulting values can be compared proportionally across the entire graph. This can help to understand how much each airport influences the flight network.

In [8]:
degree_centrality = degree_df.withColumn("degreeCentrality", F.col("totalDegree") / airports_df.count())
display(degree_centrality.orderBy("degreeCentrality", ascending=False))

id,inDegree,outDegree,totalDegree,degreeCentrality
ATL,417457,417449,834906,2820.6283783783783
ORD,313769,313848,627617,2120.3277027027025
DFW,264398,264396,528794,1786.4662162162165
DEN,235700,235675,471375,1592.483108108108
LAX,192916,192879,385795,1303.3614864864865
PHX,183491,183502,366993,1239.8412162162165
IAH,182088,182097,364185,1230.3547297297298
LAS,153984,153993,307977,1040.462837837838
DTW,152075,152081,304156,1027.554054054054
SFO,136532,136488,273020,922.3648648648648


### Query 4

In [28]:
#PageRank (custom)
# Custom iterative PageRank implementation (simplified version)
# This is a simplified version and assumes uniform weights and damping

N = airports_df.count()
damping = 0.85
iterations = 10

ranks = airports_df.withColumn("rank", F.lit(1.0 / N))

for i in range(iterations):
    contribs = edges_df.join(ranks, edges_df.src == ranks.id, "left") \
        .groupBy("dst") \
        .agg(F.sum(F.col("rank") / F.count("src")).alias("contrib"))

    ranks = contribs.withColumn("rank", (1 - damping) / N + damping * F.col("contrib")) \
        .select(F.col("dst").alias("id"), "rank") \
        .union(airports_df.join(contribs, airports_df.id == contribs.dst, "left_anti")
               .select("id").withColumn("rank", F.lit((1 - damping) / N)))

display(ranks.orderBy("rank", ascending=False))

AnalysisException: [NESTED_AGGREGATE_FUNCTION] It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.;
Aggregate [dst#4201], [dst#4201, sum((rank#4474 / cast(count(src#4200) as double))) AS contrib#4520]
+- Join LeftOuter, (src#4200 = id#4166)
   :- Project [ORIGIN#20 AS src#4200, DEST#21 AS dst#4201]
   :  +- Project [ORIGIN#20, DEST#21, FL_DATE#17, DISTANCE#38]
   :     +- Relation [FL_DATE#17,OP_CARRIER#18,OP_CARRIER_FL_NUM#19,ORIGIN#20,DEST#21,CRS_DEP_TIME#22,DEP_TIME#23,DEP_DELAY#24,TAXI_OUT#25,WHEELS_OFF#26,WHEELS_ON#27,TAXI_IN#28,CRS_ARR_TIME#29,ARR_TIME#30,ARR_DELAY#31,CANCELLED#32,CANCELLATION_CODE#33,DIVERTED#34,CRS_ELAPSED_TIME#35,ACTUAL_ELAPSED_TIME#36,AIR_TIME#37,DISTANCE#38,CARRIER_DELAY#39,WEATHER_DELAY#40,... 4 more fields] csv
   +- Project [id#4166, 0.0033783783783783786 AS rank#4474]
      +- Deduplicate [id#4166]
         +- Union false, false
            :- Project [ORIGIN#4480 AS id#4166]
            :  +- Project [ORIGIN#4480, DEST#4481, FL_DATE#4477, DISTANCE#4498]
            :     +- Relation [FL_DATE#4477,OP_CARRIER#4478,OP_CARRIER_FL_NUM#4479,ORIGIN#4480,DEST#4481,CRS_DEP_TIME#4482,DEP_TIME#4483,DEP_DELAY#4484,TAXI_OUT#4485,WHEELS_OFF#4486,WHEELS_ON#4487,TAXI_IN#4488,CRS_ARR_TIME#4489,ARR_TIME#4490,ARR_DELAY#4491,CANCELLED#4492,CANCELLATION_CODE#4493,DIVERTED#4494,CRS_ELAPSED_TIME#4495,ACTUAL_ELAPSED_TIME#4496,AIR_TIME#4497,DISTANCE#4498,CARRIER_DELAY#4499,WEATHER_DELAY#4500,... 4 more fields] csv
            +- Project [DEST#4174 AS id#4168]
               +- Project [ORIGIN#4173, DEST#4174, FL_DATE#4170, DISTANCE#4191]
                  +- Relation [FL_DATE#4170,OP_CARRIER#4171,OP_CARRIER_FL_NUM#4172,ORIGIN#4173,DEST#4174,CRS_DEP_TIME#4175,DEP_TIME#4176,DEP_DELAY#4177,TAXI_OUT#4178,WHEELS_OFF#4179,WHEELS_ON#4180,TAXI_IN#4181,CRS_ARR_TIME#4182,ARR_TIME#4183,ARR_DELAY#4184,CANCELLED#4185,CANCELLATION_CODE#4186,DIVERTED#4187,CRS_ELAPSED_TIME#4188,ACTUAL_ELAPSED_TIME#4189,AIR_TIME#4190,DISTANCE#4191,CARRIER_DELAY#4192,WEATHER_DELAY#4193,... 4 more fields] csv


### Query 5

In [None]:
# Most connected airports (based on total degree)
most_connected = degree_df.orderBy("totalDegree", ascending=False).limit(10)
display(most_connected)

# Optional: visualizations with matplotlib or seaborn if you extract data locally