# **Spark GraphFrames - Airport Delays Dataset**

In [1]:
import kagglehub

# Fetch the latest version of the flight delay dataset; `path` is the local
# directory that later cells read the CSV from.
path = kagglehub.dataset_download(
    "patrickzel/flight-delay-and-cancellation-dataset-2019-2023"
)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'flight-delay-and-cancellation-dataset-2019-2023' dataset.
Path to dataset files: /kaggle/input/flight-delay-and-cancellation-dataset-2019-2023


In [2]:
!java -version
!pip install "pyspark==3.5.0"
# Install Java 17
!sudo apt-get update
!sudo apt-get install -y openjdk-17-jdk-headless

!java -version

openjdk version "17.0.17" 2025-10-21
OpenJDK Runtime Environment (build 17.0.17+10-Ubuntu-122.04)
OpenJDK 64-Bit Server VM (build 17.0.17+10-Ubuntu-122.04, mixed mode, sharing)
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of con

In [3]:
# Python bindings for GraphFrames, pinned to 0.10.0 — the same version as the
# graphframes package coordinates passed to the Spark session config.
%pip install graphframes-py==0.10.0



In [28]:
# Stop any Spark session left over from a previous run so the builder cell
# starts from a clean state.
# Imported here because this cell sits before the cell that imports
# SparkSession; on a fresh kernel the name would otherwise be undefined and the
# cell would silently do nothing.
from pyspark.sql import SparkSession

active_session = SparkSession.getActiveSession()
# getActiveSession() returns None when no session is running.
if active_session is not None:
    active_session.stop()

In [4]:
# Point the environment at the Java 17 JDK before the session is built.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
from pyspark.sql import SparkSession

# Local-mode session; the GraphFrames package coordinates are passed through
# spark.jars.packages.
session_builder = (
    SparkSession.builder
    .appName("GraphFrames")
    .master("local[*]")
    .config("spark.jars.packages", "io.graphframes:graphframes-spark3_2.12:0.10.0")
)
spark = session_builder.getOrCreate()

print(f"spark version: {spark.version}")
print("spark session created with graphframes package specified!")

spark version: 3.5.0
spark session created with graphframes package specified!


In [8]:
# Imports
import os
import glob
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
import os
import glob
from pyspark.sql.functions import col
from graphframes import GraphFrame

In [6]:
# Load the flights CSV from the downloaded dataset directory: first row is the
# header, column types are inferred by Spark.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(f"{path}/flights_sample_3m.csv")
df.show(5)

+----------+--------------------+--------------------+------------+--------+---------+------+-------------------+----+--------------------+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+------------+--------+--------+-----------------+-----------------+-------------+------------------+-----------------------+
|   FL_DATE|             AIRLINE|         AIRLINE_DOT|AIRLINE_CODE|DOT_CODE|FL_NUMBER|ORIGIN|        ORIGIN_CITY|DEST|           DEST_CITY|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|DELAY_DUE_CARRIER|DELAY_DUE_WEATHER|DELAY_DUE_NAS|DELAY_DUE_SECURITY|DELAY_DUE_LATE_AIRCRAFT|
+----------+--------------------+--------------------+------------+--------+---------+------+-------------------+----+--------------------+------------+--------

### **Creating a Graph**

In [23]:
# Airports seen as an origin and airports seen as a destination, each reduced
# to (id, city); the vertex set is the de-duplicated union of the two.
origin_airports = df.select(col("ORIGIN").alias("id"), col("ORIGIN_CITY").alias("city"))
dest_airports = df.select(col("DEST").alias("id"), col("DEST_CITY").alias("city"))

vertices = origin_airports.union(dest_airports).distinct()

In [24]:
# Each row of `df` becomes an edge from ORIGIN (src) to DEST (dst), carrying
# the flight attributes listed below as edge columns.
flight_attributes = [
    "AIRLINE",
    "FL_NUMBER",
    "FL_DATE",
    "DEP_DELAY",
    "ARR_DELAY",
    "CANCELLED",
    "DISTANCE",
    "AIR_TIME",
]

edges = df.select(
    col("ORIGIN").alias("src"),
    col("DEST").alias("dst"),
    *flight_attributes,
)

In [25]:
# Assemble the full flight graph from the `vertices` (id, city) and `edges`
# (src, dst, flight attributes) DataFrames.
g = GraphFrame(vertices, edges)

In [10]:
# Size of the full graph: vertex rows and edge rows.
airport_count = g.vertices.count()
flight_count = g.edges.count()

print("Number of airports (vertices):", airport_count)
print("Number of flights (edges):", flight_count)

Number of airports (vertices): 381
Number of flights (edges): 3000000


In [11]:
# Peek at the first rows of the vertex table, then the edge table.
for graph_frame in (g.vertices, g.edges):
    graph_frame.show(5)

+---+--------------------+
| id|                city|
+---+--------------------+
|COS|Colorado Springs, CO|
|SDF|      Louisville, KY|
|PIR|          Pierre, SD|
|CLL|College Station/B...|
|MSN|         Madison, WI|
+---+--------------------+
only showing top 5 rows

+---+---+--------------------+---------+----------+---------+---------+---------+--------+--------+
|src|dst|             AIRLINE|FL_NUMBER|   FL_DATE|DEP_DELAY|ARR_DELAY|CANCELLED|DISTANCE|AIR_TIME|
+---+---+--------------------+---------+----------+---------+---------+---------+--------+--------+
|FLL|EWR|United Air Lines ...|     1562|2019-01-09|     -4.0|    -14.0|      0.0|  1065.0|   153.0|
|MSP|SEA|Delta Air Lines Inc.|     1149|2022-11-19|     -6.0|     -5.0|      0.0|  1399.0|   189.0|
|DEN|MSP|United Air Lines ...|      459|2022-07-22|      6.0|      0.0|      0.0|   680.0|    87.0|
|MSP|SFO|Delta Air Lines Inc.|     2295|2023-03-06|     -1.0|     24.0|      0.0|  1589.0|   249.0|
|MCO|DFW|    Spirit Air Lines|  

### **Degree Analysis**

In [12]:
# Top-10 airports by degree, shown in this order:
#   total connections, incoming flights, outgoing flights.
degree_views = (
    (g.degrees, "degree"),
    (g.inDegrees, "inDegree"),
    (g.outDegrees, "outDegree"),
)

for degree_df, degree_column in degree_views:
    degree_df.orderBy(col(degree_column).desc()).show(10)

+---+------+
| id|degree|
+---+------+
|ATL|307125|
|DFW|260104|
|ORD|245630|
|DEN|239511|
|CLT|189717|
|LAX|171493|
|PHX|150420|
|LAS|146932|
|SEA|141738|
|MCO|127701|
+---+------+
only showing top 10 rows

+---+--------+
| id|inDegree|
+---+--------+
|ATL|  153569|
|DFW|  129770|
|ORD|  123334|
|DEN|  119592|
|CLT|   95413|
|LAX|   85621|
|PHX|   75605|
|LAS|   73462|
|SEA|   70832|
|MCO|   63818|
+---+--------+
only showing top 10 rows

+---+---------+
| id|outDegree|
+---+---------+
|ATL|   153556|
|DFW|   130334|
|ORD|   122296|
|DEN|   119919|
|CLT|    94304|
|LAX|    85872|
|PHX|    74815|
|LAS|    73470|
|SEA|    70906|
|MCO|    63883|
+---+---------+
only showing top 10 rows



### **Page Rank (airport importance)**

Subsampling the dataset (only 2023 flights) because of limited resources.

In [14]:
# Restrict to flights whose FL_DATE starts with "2023".
df_2023 = df.where(col("FL_DATE").startswith("2023"))

# Rebuild the vertex and edge tables from the subsample.
# NOTE(review): this rebinds the notebook-level `vertices` and `edges` names
# that the full graph `g` was built from; re-running the earlier graph cell
# afterwards would pick up these 2023-only frames.
origin_ids = df_2023.select(col("ORIGIN").alias("id"))
dest_ids = df_2023.select(col("DEST").alias("id"))
vertices = origin_ids.union(dest_ids).distinct()

edges = df_2023.select(
    col("ORIGIN").alias("src"),
    col("DEST").alias("dst"),
    "AIRLINE",
    "FL_NUMBER",
    "FL_DATE",
    "DEP_DELAY",
    "ARR_DELAY",
    "CANCELLED",
    "DISTANCE",
    "AIR_TIME",
)

g_small = GraphFrame(vertices, edges)

# PageRank on the 2023 graph, then the ten highest-ranked airports.
pr = g_small.pageRank(resetProbability=0.15, tol=0.01)
ranked_airports = pr.vertices.orderBy(col("pagerank").desc())
ranked_airports.show(10)


+---+------------------+
| id|          pagerank|
+---+------------------+
|DFW|14.971593423888596|
|ATL|14.507191176323765|
|DEN| 14.40067801549485|
|ORD|12.315512358376749|
|CLT| 8.351375393057927|
|SEA| 7.719350369113929|
|LAX| 7.584317218445229|
|LAS|  7.53915066136384|
|PHX| 7.105252947695911|
|LGA| 6.729546189010543|
+---+------------------+
only showing top 10 rows



In [16]:
# Number of arriving flights
# Edges of the 2023 graph whose destination is DFW.
dfw_arrivals = g_small.edges.where("dst = 'DFW'")
dfw_arrivals.count()

19157

In [17]:
# Number of departing flights
# Edges of the 2023 graph whose source is DFW.
dfw_departures = g_small.edges.where("src = 'DFW'")
dfw_departures.count()

19271

### **Shortest Paths to JFK**

In [20]:
# Shortest-path distances on the 2023 graph with JFK as the only landmark.
paths = g_small.shortestPaths(landmarks=["JFK"])

# Sort by distance to JFK ascending (closest airports first).
distance_to_jfk = col("distances")["JFK"]
closest_first = paths.select("id", "distances").orderBy(distance_to_jfk)
closest_first.show(10, truncate=False)

+---+----------+
|id |distances |
+---+----------+
|JFK|{JFK -> 0}|
|DCA|{JFK -> 1}|
|SJU|{JFK -> 1}|
|ORF|{JFK -> 1}|
|MSY|{JFK -> 1}|
|SAV|{JFK -> 1}|
|BUR|{JFK -> 1}|
|CMH|{JFK -> 1}|
|SJC|{JFK -> 1}|
|AUS|{JFK -> 1}|
+---+----------+
only showing top 10 rows



In [22]:
# Sort descending (farthest airports first)
# Same listing, largest distance to JFK first.
jfk_distance_desc = col("distances").getItem("JFK").desc()
farthest_first = paths.select("id", "distances").orderBy(jfk_distance_desc)
farthest_first.show(10, truncate=False)

+---+----------+
|id |distances |
+---+----------+
|SCC|{JFK -> 3}|
|BET|{JFK -> 3}|
|WRG|{JFK -> 3}|
|PSG|{JFK -> 3}|
|IAG|{JFK -> 3}|
|TOL|{JFK -> 3}|
|BRW|{JFK -> 3}|
|HGR|{JFK -> 3}|
|CDV|{JFK -> 3}|
|OME|{JFK -> 3}|
+---+----------+
only showing top 10 rows



### **Route-level analysis**

Average delay per route

In [11]:
# Mean ARR_DELAY for every (src, dst) pair in the 2023 graph, worst routes first.
route_delay = g_small.edges.groupBy("src", "dst").avg("ARR_DELAY")
route_delay.orderBy(col("avg(ARR_DELAY)").desc()).show(10)


+---+---+------------------+
|src|dst|    avg(ARR_DELAY)|
+---+---+------------------+
|DEN|ABE|            1080.0|
|SFB|GFK|             866.5|
|IDA|PDX|             746.5|
|PSC|SAN|             620.0|
|SMX|LAS|336.42857142857144|
|LAS|AZA|             313.0|
|HTS|PGD|             265.0|
|GEG|ORD|             234.2|
|CHS|LCK|             218.0|
|FCA|DFW|207.53846153846155|
+---+---+------------------+
only showing top 10 rows



Cancellations

In [15]:
# Sum of the CANCELLED column for every (src, dst) pair, highest totals first.
route_cancellations = g_small.edges.groupBy("src", "dst").sum("CANCELLED")
route_cancellations.orderBy(col("sum(CANCELLED)").desc()).show(10)


+---+---+--------------+
|src|dst|sum(CANCELLED)|
+---+---+--------------+
|BOS|LGA|          25.0|
|LGA|ORD|          24.0|
|EWR|ORD|          23.0|
|ORD|LGA|          21.0|
|DFW|LGA|          20.0|
|FLL|LGA|          18.0|
|JFK|BOS|          18.0|
|LGA|BOS|          18.0|
|DEN|LAS|          17.0|
|EWR|FLL|          16.0|
+---+---+--------------+
only showing top 10 rows



### **Connected Components**

In [30]:
# Compute connected components
# Compute connected components of the 2023 graph.
# `cc` is reused by the following cells, which read its `id` and `component` columns.
cc = g_small.connectedComponents()

In [31]:
# Show each airport with its component id
# First ten airports together with the component id assigned to each.
airport_components = cc.select("id", "component")
airport_components.show(10)

+---+---------+
| id|component|
+---+---------+
|BGM|        0|
|PSE|        0|
|DLG|        0|
|INL|        0|
|MSY|        0|
|PPG|        0|
|GEG|        0|
|DRT|        0|
|SNA|        0|
|BUR|        0|
+---+---------+
only showing top 10 rows



In [32]:
# Count number of airports per component
# Number of airports in each component, largest component first.
component_sizes = cc.groupBy("component").count()
component_sizes.orderBy(col("count").desc()).show(10)

+---------+-----+
|component|count|
+---------+-----+
|        0|  348|
+---------+-----+



### **Triangle Count**

In [34]:
from pyspark import StorageLevel

In [35]:
# Register a checkpoint directory on the SparkContext before the triangle count runs.
# NOTE(review): presumably needed for GraphFrames checkpointing — confirm it is
# required for triangleCount in this version.
spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoint")

In [38]:
# Triangle count on the 2023 graph, with MEMORY_AND_DISK passed as the storage level.
triangles = g_small.triangleCount(storage_level=StorageLevel.MEMORY_AND_DISK)

# Airports ranked by the number of triangles they participate in.
triangles_ranked = triangles.select("id", "count").orderBy(col("count").desc())
triangles_ranked.show(10)

+---+-----+
| id|count|
+---+-----+
|DFW| 2199|
|ATL| 2161|
|DEN| 2146|
|ORD| 2069|
|CLT| 1897|
|LAS| 1768|
|MSP| 1765|
|PHX| 1560|
|IAH| 1551|
|LAX| 1535|
+---+-----+
only showing top 10 rows

