# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze the flight interconnected data to understand the popularity of the airports and flight patterns.

## Session setup

In [20]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder
builder = pyspark.sql.SparkSession.builder.appName("Project_3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "16g")

spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)

#spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [21]:
import graphframes as gf
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
import json
import numpy as np

## Creating the Graph

In [22]:
# Reading in the data

flight_df = spark.read.csv("input/2009.csv", header=True, inferSchema=True)
display(flight_df)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, ORIGIN: string, DEST: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

In [23]:
# Vertices

flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)

display(flight_vertices)

DataFrame[id: string]

In [24]:
# Edges

flight_edges = (flight_df
               .withColumnRenamed("ORIGIN","src")
               .withColumnRenamed("DEST","dst")
)

display(flight_edges)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, src: string, dst: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

#### GraphFrame

In [25]:
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

flight_vertices.cache()
flight_edges.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 26 more fields])

## Query 1 (0)

Compute different statistics : in-degree, out-degree and total degree (without existing functions). 

### In-degree

In [19]:
in_degree = flight_graph.edges.groupBy("dst").agg(F.count("*").alias("inDegree"))

in_degree.show()

+---+--------+
|dst|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



In [8]:
# For validation
flight_graph.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



### Out-degree

In [17]:
out_degree = flight_graph.edges.groupBy("src").agg(F.count("*").alias("outDegree"))
out_degree.show()

+---+---------+
|src|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



In [18]:
# For validation
flight_graph.outDegrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



### Total degree

In [16]:
# Finding the total degrees
total_degree = flight_graph.edges.select(F.col("src").alias("id")) \
    .union(flight_graph.edges.select(F.col("dst").alias("id"))) \
    .groupBy("id") \
    .agg(F.count("*").alias("degree"))

total_degree.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|BOS|220923|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



In [12]:
# For validation
flight_graph.degrees.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|BOS|220923|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



### Triangle count

We calculate how many times each node appears in by converting the graph into an undirected graph, then finding all triangles in the graph and finally counting how many triangles each node is part of. To find all triangles in the graph we first look for paths a -> b and b -> c and then select triangles where there is an edge c -> a. We sort the nodes of all triangles alphabetically to remove all duplicate triangles.

In [13]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Making the graph undirected to mimic the built-in funtion
undirected_edges = flight_edges_distinct.selectExpr("src", "dst") \
    .union(flight_edges_distinct.selectExpr("dst as src", "src as dst")).dropDuplicates()

# Finding all a -> b, b -> c pairs 
paths = undirected_edges.alias("e1") \
    .join(flight_edges_distinct.alias("e2"), F.col("e1.dst") == F.col("e2.src")) \
    .select(F.col("e1.src").alias("a"),
            F.col("e1.dst").alias("b"),
            F.col("e2.dst").alias("c"))

# Finding all triangles with c -> a
triangles = paths \
    .join(undirected_edges.alias("e3"), (F.col("e3.src") == F.col("c")) & (F.col("e3.dst") == F.col("a"))) \
    .select("a", "b", "c")

# Sorting the triangle nodes alphabetically to drop duplicates
triangles_sorted = triangles.select(F.array_sort(F.array("a", "b", "c")).alias("triangle")).dropDuplicates()

# Converting all triangles into one column of veritces [a, b, c] [c, d, a] -> a, b, c, c, d, a
triangle_vertices = triangles_sorted.select(F.explode("triangle").alias("id"))

# Counting the number of triangles each airport is part of 
triangle_counts = triangle_vertices.groupBy("id").count()

# Adding nodes that are part of 0 triangles to the result to match the built-in function
result = flight_graph.vertices.select("id") \
    .join(triangle_counts, on="id", how="left") \
    .withColumn("count", F.coalesce(F.col("count"), F.lit(0)))

result.show()

+---+-----+
| id|count|
+---+-----+
|IAH| 1338|
|JAX|  342|
|ABQ|  311|
|IND|  612|
|GRR|   96|
|LBB|   23|
|MEM| 1105|
|BTV|   34|
|BOS|  860|
|PBI|  168|
|XNA|   97|
|VPS|   10|
|SYR|   82|
|JFK|  942|
|MBS|    6|
|SBN|   13|
|PDX|  413|
|RDD|    1|
|LNK|   38|
|HPN|   36|
+---+-----+
only showing top 20 rows



In [14]:
# Comparing our results with the built-in function to validate all results
joined_triangles = flight_graph.triangleCount().alias('a').join(result.alias('b'), on='id')
diff_counts = joined_triangles.filter(F.col('a.count') != F.col('b.count')) # Should be empty if all is correct
print("Differences in the results (should be empty):")
diff_counts.select('id', F.col('a.count').alias('Built-in_count'), F.col('b.count').alias('Custom_count')).show()

Differences in the results (should be empty):
+---+--------------+------------+
| id|Built-in_count|Custom_count|
+---+--------------+------------+
+---+--------------+------------+



## Query 2 (1)
Compute the total number of triangles in the graph.

The query to find the total number of triangles in the graph uses the same logic as the previous query, but it just counts the total number of unique triangles, not node appearances.

In [15]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Making the graph undirected to mimic the built-in funtion
undirected_edges = flight_edges_distinct.selectExpr("src", "dst") \
    .union(flight_edges_distinct.selectExpr("dst as src", "src as dst")).dropDuplicates()

# Finding all a -> b, b -> c pairs 
paths = undirected_edges.alias("e1") \
    .join(flight_edges_distinct.alias("e2"), F.col("e1.dst") == F.col("e2.src")) \
    .select(F.col("e1.src").alias("a"),
            F.col("e1.dst").alias("b"),
            F.col("e2.dst").alias("c"))

# Finding all triangles with c -> a
triangles = paths \
    .join(undirected_edges.alias("e3"), (F.col("e3.src") == F.col("c")) & (F.col("e3.dst") == F.col("a"))) \
    .select("a", "b", "c")

# Sorting the triangle nodes alphabetically to drop duplicates
triangles_sorted = triangles.select(F.array_sort(F.array("a", "b", "c")).alias("triangle")).dropDuplicates()

# Count the total triangles in the graph
total_triangle_count = triangles_sorted.count()

print(f"Total number of triangles in the flight graph is {total_triangle_count}.")

Total number of triangles in the flight graph is 16015.


To find the total number of of unique triangles in the graph using the built-in function replica, we sum all the individual node triangle counts and divided it by 3 since each triangle is included three times (once for each of its 3 vertices).

In [16]:
# Finding the sum of all triangles in the graph using the triangle count results from q1
total_triangle_count = triangle_counts.agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph is {total_triangle_count}.")

Total number of triangles in the flight graph is 16015.


In [17]:
# For validation
triangle_count_val = flight_graph.triangleCount().agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph based on the built-in function is {triangle_count_val}.")

Total number of triangles in the flight graph based on the built-in function is 16015.


## Precomputation for query 3 and 4

In [82]:
flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)
flight_edges = (flight_df
                .select(F.col('ORIGIN').alias('src'), F.col('DEST').alias('dst'), F.col('DISTANCE'))
                .distinct()
)
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

N = flight_graph.vertices.count()

# lookup table ID:'str' -> idx:int
indexes = {}
i = 0
for node in flight_graph.vertices.toLocalIterator():
    indexes[node.id] = i
    i += 1

# Adjacency matrix, distance matrix and column counter
adjM = np.zeros(N * N).reshape((N, N))
distM = np.zeros(N * N).reshape((N, N))
colCounts = np.zeros(N)
rowCounts = np.zeros(N)

for edge in flight_graph.edges.toLocalIterator():
    fromIdx = indexes[edge.src] 
    toIdx = indexes[edge.dst]
    adjM[toIdx][fromIdx] = 1
    distM[toIdx][fromIdx] = edge.DISTANCE
    colCounts[fromIdx] += 1
    rowCounts[toIdx] += 1

## Query 3 (2)
Compute a centrality measure of your choice natively on Spark using Graphframes.

In [33]:
# Closeness Centrality

# BFS function for calculating a measure of closeness centrality from a given node

# Shortest path -> least edges taken
def bfs_edges(vertices, lookup, adjM, v: str):
    lookup = json.loads(lookup) # For mapping vertex names to their indexes
    adjM = json.loads(adjM) # Represents graph's adjacency matrix
    N = len(vertices)
    distances = [float('inf')] * N # initial list of distances (all infinity)
    distances[lookup[v]] = 0 # distance to the start node is 0
    visited = set() # set of visited nodes (tracked by index)
    q = [v] # queue

    # Processes nodes while queue not empty
    while len(q) != 0:
        curr = q.pop(0) # Take the first node
        currIdx = lookup[curr] # Get the index of current node
        
        # Check if the node has already been visited
        if currIdx in visited:
            continue
        visited.add(currIdx) # Mark node as visited

        # Check all other nodes to see if they are directly connected to the current node
        for i in range(N):
            # If there's an edge from current node to node i and node i has not been visited
            if adjM[i][currIdx] == 1 and not i in visited:
                distances[i] = min(distances[i], distances[currIdx] + 1) # If a shorter path is found then update the distance to node i
                q.append(vertices[i])

    return (N - 1)/sum(distances)

# Shortest path -> actual physical shortest path
def bfs_dist(vertices, lookup, distM, v: str):
    lookup = json.loads(lookup) # For mapping vertex names to their indexes
    distM = json.loads(distM) # Represents graph's adjacency matrix
    N = len(vertices)
    distances = [float('inf')] * N # initial list of distances (all infinity)
    distances[lookup[v]] = 0 # distance to the start node is 0
    visited = set() # set of visited nodes (tracked by index)
    q = [v] # queue

    # Processes nodes while queue not empty
    while len(q) != 0:
        curr = q.pop(0) # Take the first node
        currIdx = lookup[curr] # Get the index of current node
        
        # Check if the node has already been visited
        if currIdx in visited:
            continue
        visited.add(currIdx) # Mark node as visited

        # Check all other nodes to see if they are directly connected to the current node
        for i in range(N):
            # If there's an edge from current node to node i and node i has not been visited
            if distM[i][currIdx] != 0 and not i in visited:
                distances[i] = min(distances[i], distances[currIdx] + distM[i][currIdx]) # If a shorter path is found then update the distance to node i
                q.append(vertices[i])

    return (N - 1)/sum(distances)

bfs_edges_udf = F.udf(bfs_edges, FloatType())
bfs_dist_udf = F.udf(bfs_dist, FloatType())

In [34]:
vertexList = [node.id for node in flight_graph.vertices.collect()]
centralities = flight_graph.vertices.withColumn('centrality_edges', bfs_edges_udf(F.lit(vertexList), F.lit(json.dumps(indexes)), F.lit(json.dumps(adjM.tolist())), 'id'))
centralities = centralities.withColumn('centrality_dist', bfs_dist_udf(F.lit(vertexList), F.lit(json.dumps(indexes)), F.lit(json.dumps(distM.tolist())), 'id'))
centralities.show()

+---+----------------+---------------+
| id|centrality_edges|centrality_dist|
+---+----------------+---------------+
|IAH|       0.5983773|    7.851529E-4|
|JAX|      0.47504026|   6.9578097E-4|
|ABQ|      0.50687283|   7.6872966E-4|
|IND|       0.5139373|    8.692603E-4|
|GRR|      0.47049442|    8.364784E-4|
|LBB|      0.40466392|    7.721947E-4|
|MEM|       0.5662188|   8.7641896E-4|
|BTV|      0.43382353|   6.0750963E-4|
|BOS|      0.53636366|    5.937573E-4|
|PBI|      0.45807454|    6.000716E-4|
|XNA|      0.47580644|    8.764086E-4|
|VPS|      0.42753622|   7.3864305E-4|
|SYR|      0.43768546|    6.703997E-4|
|JFK|       0.5462963|    6.488807E-4|
|MBS|      0.41143653|   8.0058404E-4|
|SBN|      0.43703705|   8.5602776E-4|
|PDX|       0.5305755|     5.75475E-4|
|RDD|      0.36019537|    5.204612E-4|
|LNK|      0.45807454|    8.191714E-4|
|HPN|      0.43255132|    6.062599E-4|
+---+----------------+---------------+
only showing top 20 rows



## Query 4 (3)
Implement the PageRank algorithm natively on Spark using Graphframes.

In [103]:
w = np.zeros(N * N).reshape((N, N)) # Create matrix with zeroes

# Fill the matrix with normalized values from adjacency matrix
for i in range(N):
    for j in range(N):
        w[i][j] = adjM[i][j] / rowCounts[j]
        
r = np.ones(N) / N # ranks during iteration t
w = w * 0.85
nr = w @ r + 0.15/N # ranks during iteration t+1

# Process until the rank vector converges
while np.linalg.norm(nr - r, ord=1) >= 0.01:
    r = nr
    nr = w @ r + 0.15/N

r *= N
rankdf = spark.createDataFrame([(node.id, float(r[i])) for i, node in enumerate(flight_graph.vertices.collect())], ['id', 'self rank'])
vertices = flight_graph.vertices.join(rankdf, on='id', how='inner')

In [104]:
result = flight_graph.pageRank(resetProbability=0.85 ,tol=0.01)

In [105]:
result = result.vertices.join(vertices, on='id', how='inner')
result.show()

+---+------------------+-------------------+
| id|          pagerank|          self rank|
+---+------------------+-------------------+
|IND|1.0009248630869763| 1.9872908637142588|
|SUX|0.8547643370409228|0.20584785522286422|
|ABQ|1.0470169632067747| 1.7721722590685787|
|MEM|1.7618723649930412| 4.6333392062929475|
|DAB|0.8585545978595113|0.25667906681486363|
|LAX|2.2024418486180797| 4.9877256037344715|
|SBA|  0.88351542165024| 0.5705655864182047|
|AUS|1.0443137636229094| 1.9092641741169052|
|MSO|0.9131334775607175| 0.5295752394299095|
|FCA|0.8701195610092587| 0.4270449341124207|
|MEI| 0.855231567478034|0.20958017015728558|
|BGR|0.8676373811590247|0.39071623026270175|
|SMX|0.8868960363108772| 0.2907971338088261|
|FLG|0.8540750826920838|0.19877930158941198|
|PIA|0.9159183717213099| 0.6003720181317782|
|CMX|0.8547643370409228|0.20584785522286422|
|GST|0.8744363963773579| 0.3166086214156089|
|TUS|0.9110820605095287| 0.9722184194800675|
|CHS|0.8981776667630528|  0.835503537572304|
|FAT|0.877

In [106]:
ranksum = result.agg(F.sum('pagerank'))
ranksum.show()
ranksum = result.agg(F.sum('self rank'))
ranksum.show()

+------------------+
|     sum(pagerank)|
+------------------+
|296.00000000000006|
+------------------+

+------------------+
|    sum(self rank)|
+------------------+
|296.67947010320694|
+------------------+



## Query 5 (4)
Find the group of the most connected airports.