# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze the flight interconnected data to understand the popularity of the airports and flight patterns.

## Session setup

In [30]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder
builder = pyspark.sql.SparkSession.builder.appName("Project_3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g")

spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)

#spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [31]:
import graphframes as gf
import pyspark.sql.functions as F

## Creating the Graph

In [3]:
# Reading in the data

flight_df = spark.read.csv("input/2009.csv", header=True, inferSchema=True)
display(flight_df)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, ORIGIN: string, DEST: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

In [4]:
# Vertices

flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)

display(flight_vertices)

DataFrame[id: string]

In [5]:
# Edges

flight_edges = (flight_df
               .withColumnRenamed("ORIGIN","src")
               .withColumnRenamed("DEST","dst")
)

display(flight_edges)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, src: string, dst: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

#### GraphFrame

In [6]:
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

flight_vertices.cache()
flight_edges.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 26 more fields])

## Query 1 (0)

Compute different statistics : in-degree, out-degree, total degree and triangle count (without existing functions). 

### In-degree

In [11]:
in_degree = flight_graph.edges.groupBy("dst").agg(F.count("*").alias("inDegree"))
in_degree.show()

+---+--------+
|dst|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



In [12]:
# For validation
flight_graph.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



### Out-degree

In [13]:
out_degree = flight_graph.edges.groupBy("src").agg(F.count("*").alias("outDegree"))
out_degree.show()

+---+---------+
|src|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



In [14]:
# For validation
flight_graph.outDegrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



### Total degree

In [15]:
# Finding the total degrees
total_degree = flight_graph.edges.select(F.col("src").alias("id")) \
    .union(flight_graph.edges.select(F.col("dst").alias("id"))) \
    .groupBy("id") \
    .agg(F.count("*").alias("degree"))

total_degree.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|BOS|220923|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



In [15]:
# For validation
flight_graph.degrees.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|BOS|220923|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



## Query 2 (1)
Compute the total number of triangles in the graph.

## Query 3 (2)
Compute a centrality measure of your choice natively on Spark using Graphframes.

In [63]:
# Precomputation for query 3 and 4
from pyspark.sql.types import FloatType
import json

flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)
flight_edges = (flight_df
                .select(F.col('ORIGIN').alias('src'), F.col('DEST').alias('dst'))
                .distinct()
)
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

# lookup table ID:'str' -> idx:int
indexes = {}
i = 0
for node in flight_graph.vertices.toLocalIterator():
    indexes[node.id] = i
    i += 1

# Adjacency matrix and column counter
adjM = np.zeros(N * N).reshape((N, N))
colCounts = np.zeros(N)

for edge in flight_graph.edges.toLocalIterator():
    fromIdx = indexes[edge.src] 
    toIdx = indexes[edge.dst]
    adjM[toIdx][fromIdx] = 1
    colCounts[fromIdx] += 1

In [75]:
# Closeness Centrality 
def bfs(vertices, lookup, adjM, v: str):
    lookup = json.loads(lookup)
    adjM = json.loads(adjM)
    N = len(vertices)
    distances = [float('inf')] * N # initial list of distances (all 0)
    distances[lookup[v]] = 0
    visited = set() # set of visited nodes (tracked by index)
    q = [v] # queue

    while len(q) != 0:
        curr = q.pop(0)
        currIdx = lookup[curr]
        if currIdx in visited:
            continue
        visited.add(currIdx)

        for i in range(N):
            if adjM[i][currIdx] == 1 and not i in visited:
                distances[i] = min(distances[i], distances[currIdx] + 1)
                q.append(vertices[i])

    return (N - 1)/sum(distances)

bfs_udf = F.udf(bfs, FloatType())

In [76]:
vertexList = [node.id for node in flight_graph.vertices.collect()]
centralities = flight_graph.vertices.withColumn('centrality', bfs_udf(F.lit(vertexList), F.lit(json.dumps(indexes)), F.lit(json.dumps(adjM.tolist())), 'id'))
centralities.show()

+---+----------+
| id|centrality|
+---+----------+
|IAH| 0.5983773|
|JAX|0.47504026|
|ABQ|0.50687283|
|IND| 0.5139373|
|GRR|0.47049442|
|LBB|0.40466392|
|MEM| 0.5662188|
|BTV|0.43382353|
|BOS|0.53636366|
|PBI|0.45807454|
|XNA|0.47580644|
|VPS|0.42753622|
|SYR|0.43768546|
|JFK| 0.5462963|
|MBS|0.41143653|
|SBN|0.43703705|
|PDX| 0.5305755|
|RDD|0.36019537|
|LNK|0.45807454|
|HPN|0.43255132|
+---+----------+
only showing top 20 rows



## Query 4 (3)
Implement the PageRank algorithm natively on Spark using Graphframes.

In [24]:
import numpy as np
from pyspark.sql.functions import lit
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, FloatType


N = flight_graph.vertices.count()

w = np.zeros(N * N).reshape((N, N))
for i in range(N):
    for j in range(N):
        w[i][j] = adjM[i][j] / colCounts[j]
        
r = np.ones(N) / N # ranks during iteration t
nr = w @ r * 0.85 + 0.15/N # ranks during iteration t+1
while np.linalg.norm(r - nr) >= 1e-2:
    r = nr
    nr = w @ r * 0.85 + 0.15/N

r *= N

rankdf = spark.createDataFrame([(node.id, float(r[i])) for i, node in enumerate(flight_graph.vertices.collect())], ['id', 'self rank'])
vertices = flight_graph.vertices.join(rankdf, on='id', how='inner')

In [25]:
result = flight_graph.pageRank(resetProbability=0.85 ,tol=0.01)

In [26]:
result = result.vertices.join(vertices, on='id', how='inner')
result.show()

+---+------------------+-------------------+
| id|          pagerank|          self rank|
+---+------------------+-------------------+
|IAH| 2.027076736800265|  5.913136023821437|
|JAX| 1.033090823110532| 1.5451739595830511|
|ABQ| 1.047016963206775| 1.7157395513566578|
|IND|1.0009248630869765| 1.9064005964082287|
|GRR|0.8989095494757796| 0.7785236524416105|
|LBB|0.8898990536255568|  0.521209893271091|
|MEM|1.7618723649930417|  4.868021491387658|
|BTV|0.8799527840681606| 0.5429734807644605|
|BOS|1.1694393218877233|  2.781233411986623|
|PBI|0.9459136986320067| 1.0849908569552607|
|XNA| 0.912472699735275| 0.8439770022541137|
|VPS|0.8685686928734259|0.38008814568417404|
|SYR|0.8930956614063718| 0.7127668218904905|
|JFK|1.4405034683667703|  3.494621818147398|
|MBS|0.8649332623310891|0.33323498052345785|
|SBN|0.8689849339031348| 0.3857359105319412|
|PDX|1.2397638228141772|  2.386689800914159|
|RDD|0.9188603905438856|0.29306971894077566|
|LNK|0.8968211619161669|  0.563556437861472|
|HPN| 0.89

## Query 5 (4)
Find the group of the most connected airports.