# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze the flight interconnected data to understand the popularity of the airports and flight patterns.

## Session setup

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder
builder = pyspark.sql.SparkSession.builder.appName("Project_3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "16g")

spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)

#spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [2]:
import graphframes as gf
import pyspark.sql.functions as F

## Creating the Graph

In [3]:
# Reading in the data

flight_df = spark.read.csv("input/2009.csv", header=True, inferSchema=True)

flight_df.limit(5).show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

In [4]:
# Vertices

flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)

display(flight_vertices)

DataFrame[id: string]

In [5]:
# Edges

flight_edges = (flight_df
               .withColumnRenamed("ORIGIN","src")
               .withColumnRenamed("DEST","dst")
)

display(flight_edges)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, src: string, dst: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

#### GraphFrame

In [6]:
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

flight_vertices.cache()
flight_edges.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 26 more fields])

## Query 1 (0)

Compute different statistics : in-degree, out-degree, total degree and triangle count (without existing functions). 

### In-degree

To calculate the in-degree of each node, we counted the number of edges where the node is the destination.

In [13]:
# Custom query
in_degree = flight_graph.edges.groupBy("dst").agg(F.count("*").alias("our_inDegree"))

# Adding the built-in function column for validation
bi_result = flight_graph.inDegrees
in_degree_renamed = in_degree.withColumnRenamed("dst", "id")
comparison = in_degree_renamed.join(bi_result, on="id", how="inner")

comparison.show()

+---+------------+--------+
| id|our_inDegree|inDegree|
+---+------------+--------+
|IAH|      182088|  182088|
|JAX|       28813|   28813|
|ABQ|       35577|   35577|
|IND|       38198|   38198|
|BOS|      110463|  110463|
|GRR|       13970|   13970|
|MEM|       71721|   71721|
|PBI|       25496|   25496|
|XNA|       13764|   13764|
|LBB|        8004|    8004|
|BTV|        6021|    6021|
|VPS|        6958|    6958|
|SYR|        9330|    9330|
|JFK|      119571|  119571|
|MBS|        3443|    3443|
|SBN|        4527|    4527|
|PDX|       52251|   52251|
|RDD|        1433|    1433|
|LNK|        2765|    2765|
|HPN|       10661|   10661|
+---+------------+--------+
only showing top 20 rows



### Out-degree

To calculate the out-degree of each node, we counted the number of edges where the node is the source.

In [15]:
# Custom query
out_degree = flight_graph.edges.groupBy("src").agg(F.count("*").alias("our_outDegree"))

# Adding the built-in function column for validation
bi_result = flight_graph.outDegrees # For validation
out_degree_renamed = out_degree.withColumnRenamed("src", "id")
comparison = out_degree_renamed.join(bi_result, on="id", how="inner")

comparison.show()

+---+-------------+---------+
| id|our_outDegree|outDegree|
+---+-------------+---------+
|IAH|       182097|   182097|
|JAX|        28810|    28810|
|ABQ|        35582|    35582|
|IND|        38201|    38201|
|GRR|        13973|    13973|
|LBB|         8002|     8002|
|MEM|        71713|    71713|
|BTV|         6028|     6028|
|BOS|       110460|   110460|
|PBI|        25500|    25500|
|XNA|        13755|    13755|
|VPS|         6959|     6959|
|SYR|         9336|     9336|
|JFK|       119574|   119574|
|MBS|         3444|     3444|
|SBN|         4526|     4526|
|PDX|        52242|    52242|
|RDD|         1433|     1433|
|LNK|         2765|     2765|
|HPN|        10657|    10657|
+---+-------------+---------+
only showing top 20 rows



### Total degree

To calculate the total degree of each node, we counted how many times it appears as either the source or destination in the edges

In [17]:
# Finding the total degrees (custom query)
total_degree = flight_graph.edges.select(F.col("src").alias("id")) \
    .union(flight_graph.edges.select(F.col("dst").alias("id"))) \
    .groupBy("id") \
    .agg(F.count("*").alias("our_degree"))

# Adding the built-in function column for validation
bi_result = flight_graph.degrees # For validation
comparison = total_degree.join(bi_result, on="id", how="inner")

comparison.show()

+---+----------+------+
| id|our_degree|degree|
+---+----------+------+
|IAH|    364185|364185|
|JAX|     57623| 57623|
|ABQ|     71159| 71159|
|IND|     76399| 76399|
|BOS|    220923|220923|
|GRR|     27943| 27943|
|LBB|     16006| 16006|
|MEM|    143434|143434|
|BTV|     12049| 12049|
|PBI|     50996| 50996|
|XNA|     27519| 27519|
|VPS|     13917| 13917|
|SYR|     18666| 18666|
|JFK|    239145|239145|
|MBS|      6887|  6887|
|SBN|      9053|  9053|
|PDX|    104493|104493|
|RDD|      2866|  2866|
|LNK|      5530|  5530|
|HPN|     21318| 21318|
+---+----------+------+
only showing top 20 rows



### Triangle count

We calculate how many times each node appears in by converting the graph into an undirected graph, then finding all triangles in the graph and finally counting how many triangles each node is part of. To find all triangles in the graph we first look for paths a -> b and b -> c and then select triangles where there is an edge c -> a. We sort the nodes of all triangles alphabetically to remove all duplicate triangles.

In [13]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Making the graph undirected to mimic the built-in funtion
undirected_edges = flight_edges_distinct.selectExpr("src", "dst") \
    .union(flight_edges_distinct.selectExpr("dst as src", "src as dst")).dropDuplicates()

# Finding all a -> b, b -> c pairs 
paths = undirected_edges.alias("e1") \
    .join(flight_edges_distinct.alias("e2"), F.col("e1.dst") == F.col("e2.src")) \
    .select(F.col("e1.src").alias("a"),
            F.col("e1.dst").alias("b"),
            F.col("e2.dst").alias("c"))

# Finding all triangles with c -> a
triangles = paths \
    .join(undirected_edges.alias("e3"), (F.col("e3.src") == F.col("c")) & (F.col("e3.dst") == F.col("a"))) \
    .select("a", "b", "c")

# Sorting the triangle nodes alphabetically to drop duplicates
triangles_sorted = triangles.select(F.array_sort(F.array("a", "b", "c")).alias("triangle")).dropDuplicates()

# Converting all triangles into one column of veritces [a, b, c] [c, d, a] -> a, b, c, c, d, a
triangle_vertices = triangles_sorted.select(F.explode("triangle").alias("id"))

# Counting the number of triangles each airport is part of 
triangle_counts = triangle_vertices.groupBy("id").count()

# Adding nodes that are part of 0 triangles to the result to match the built-in function
result = flight_graph.vertices.select("id") \
    .join(triangle_counts, on="id", how="left") \
    .withColumn("count", F.coalesce(F.col("count"), F.lit(0)))

result.show()

+---+-----+
| id|count|
+---+-----+
|IAH| 1338|
|JAX|  342|
|ABQ|  311|
|IND|  612|
|GRR|   96|
|LBB|   23|
|MEM| 1105|
|BTV|   34|
|BOS|  860|
|PBI|  168|
|XNA|   97|
|VPS|   10|
|SYR|   82|
|JFK|  942|
|MBS|    6|
|SBN|   13|
|PDX|  413|
|RDD|    1|
|LNK|   38|
|HPN|   36|
+---+-----+
only showing top 20 rows



In [14]:
# Comparing our results with the built-in function to validate all results
joined_triangles = flight_graph.triangleCount().alias('a').join(result.alias('b'), on='id')
diff_counts = joined_triangles.filter(F.col('a.count') != F.col('b.count')) # Should be empty if all is correct
print("Differences in the results (should be empty):")
diff_counts.select('id', F.col('a.count').alias('Built-in_count'), F.col('b.count').alias('Custom_count')).show()

Differences in the results (should be empty):
+---+--------------+------------+
| id|Built-in_count|Custom_count|
+---+--------------+------------+
+---+--------------+------------+



## Query 2 (1)
Compute the total number of triangles in the graph.

The query to find the total number of triangles in the graph uses the same logic as the previous query, but it just counts the total number of unique triangles, not node appearances.

In [15]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Making the graph undirected by adding reversed edges (src <-> dst) to mimic the built-in funtion
undirected_edges = flight_edges_distinct.selectExpr("src", "dst") \
    .union(flight_edges_distinct.selectExpr("dst as src", "src as dst")).dropDuplicates()

# Finding all a -> b, b -> c pairs 
paths = undirected_edges.alias("e1") \
    .join(flight_edges_distinct.alias("e2"), F.col("e1.dst") == F.col("e2.src")) \
    .select(F.col("e1.src").alias("a"),
            F.col("e1.dst").alias("b"),
            F.col("e2.dst").alias("c"))

# Finding all triangles with c -> a
triangles = paths \
    .join(undirected_edges.alias("e3"), (F.col("e3.src") == F.col("c")) & (F.col("e3.dst") == F.col("a"))) \
    .select("a", "b", "c")

# Sorting the triangle nodes alphabetically to drop duplicates
triangles_sorted = triangles.select(F.array_sort(F.array("a", "b", "c")).alias("triangle")).dropDuplicates()

# Count the total triangles in the graph
total_triangle_count = triangles_sorted.count()

print(f"Total number of triangles in the flight graph is {total_triangle_count}.")

Total number of triangles in the flight graph is 16015.


To find the total number of of unique triangles in the graph using the built-in function replica, we sum all the individual node triangle counts and divided it by 3 since each triangle is included three times (once for each of its 3 vertices).

In [16]:
# Finding the sum of all triangles in the graph using the triangle count results from q1
total_triangle_count = triangle_counts.agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph is {total_triangle_count}.")

Total number of triangles in the flight graph is 16015.


In [17]:
# For validation
triangle_count_val = flight_graph.triangleCount().agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph based on the built-in function is {triangle_count_val}.")

Total number of triangles in the flight graph based on the built-in function is 16015.


## Query 3 (2)
Compute a centrality measure of your choice natively on Spark using Graphframes.

## Query 4 (3)
Implement the PageRank algorithm natively on Spark using Graphframes.

## Query 5 (4)
Find the group of the most connected airports.

To find the group of most connected components, we replicated the connectedComponents() function. As preparation, we made the graph undirected and removed all duplicate edges for less processing. 

To find all connected components, we first initialized the components so that each node belongs to a separate component. After this, we added all nodes that are reachable from the initial node to respective components. This process is repeated until the results converge (in this specific case, it takes 4 iterations, and the graph is fully connected). Finally, the largest connected component is identified by selecting the component with the largest size.

In [61]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Making the graph undirected by adding reversed edges (src <-> dst) to mimic the built-in function
undirected_edges = flight_edges_distinct.select("src", "dst") \
    .union(flight_edges_distinct.selectExpr("dst as src", "src as dst")) \
    .dropDuplicates() \
    .repartition(200, "src")

# Creating initial components where each node starts as its own component
components = flight_vertices.select("id").withColumnRenamed("id", "node").withColumn("label", F.col("node"))

# After 4 iterations there is 1 large component (meaning the graph is fully connected)
for i in range(4):
    # Joining the edges with the components to propagate the component labels from neighbors
    neighbors = undirected_edges.join(components, undirected_edges.src == labels.node, "left") \
        .select(F.col("dst").alias("node"), "label") \
        .repartition(200, "node")

    # Updating the components using the alpabetically first node as the label for the component
    components = neighbors.groupBy("node") \
        .agg(F.min("label").alias("label")) \
        .repartition(200, "node")

    # Counting the number of connected components after this iteration for tracking
    component_sizes = components.groupBy("label").agg(F.count("*").alias("size"))
    print(f"Iteration {i+1}: {component_sizes.count()} components.")

# Extracting the largest component label (alpabetically first node)
largest_component_label = component_sizes.orderBy(F.desc("size")).limit(1).collect()[0]["label"]
most_connected_airports = labels.filter(F.col("label") == largest_component_label).select("node")

print(f"The most connected group contains {most_connected_airports.count()} out of {flight_vertices.count()} airports.")

Iteration 1: 44 components.
Iteration 2: 7 components.
Iteration 3: 2 components.
Iteration 4: 1 components.
The most connected group contains 296 out of 296 airports.
