# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze the flight interconnected data to understand the popularity of the airports and flight patterns.

## Session setup

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder
builder = pyspark.sql.SparkSession.builder.appName("Project_3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "16g")

spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)

#spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [2]:
import graphframes as gf
import pyspark.sql.functions as F

## Creating the Graph

In [3]:
# Reading in the data

flight_df = spark.read.csv("input/2009.csv", header=True, inferSchema=True)

flight_df.limit(5).show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

In [4]:
# Vertices

flight_vertices = (flight_df
                  .select(F.col("ORIGIN").alias("id"))
                  .union(flight_df
                        .select(F.col("DEST").alias("id")))
                  .distinct()
)

display(flight_vertices)

DataFrame[id: string]

In [5]:
# Edges

flight_edges = (flight_df
               .withColumnRenamed("ORIGIN","src")
               .withColumnRenamed("DEST","dst")
)

display(flight_edges)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, src: string, dst: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

#### GraphFrame

In [6]:
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

flight_vertices.cache()
flight_edges.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 26 more fields])

## Query 1 (0)

Compute different statistics : in-degree, out-degree, total degree and triangle count (without existing functions). 

### In-degree

In [7]:
in_degree = flight_graph.edges.groupBy("dst").agg(F.count("*").alias("inDegree"))

in_degree.show()

+---+--------+
|dst|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



In [8]:
# For validation
flight_graph.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



### Out-degree

In [9]:
out_degree = flight_graph.edges.groupBy("src").agg(F.count("*").alias("outDegree"))

out_degree.show()

+---+---------+
|src|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



In [10]:
# For validation
flight_graph.outDegrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



### Total degree

In [11]:
# Finding the total degrees
total_degree = flight_graph.edges.select(F.col("src").alias("id")) \
    .union(flight_graph.edges.select(F.col("dst").alias("id"))) \
    .groupBy("id") \
    .agg(F.count("*").alias("degree"))

total_degree.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|BOS|220923|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



In [7]:
# For validation
flight_graph.degrees.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|BOS|220923|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



### Triangle count

In [47]:
# Removing all duplicate edges from the graph based on src and dst
flight_edges_distinct = flight_edges.dropDuplicates(['src', 'dst'])

# Finding all a -> b pairs
paths = flight_edges_distinct.alias("e1") \
    .join(flight_edges_distinct.alias("e2"), F.col("e1.dst") == F.col("e2.src")) \
    .select(F.col("e1.src").alias("a"),
            F.col("e1.dst").alias("b"),
            F.col("e2.dst").alias("c"))

# Finding all triangles so b -> c and c -> a
triangles = paths \
    .join(flight_edges_distinct.alias("e3"), (F.col("e3.src") == F.col("c")) & (F.col("e3.dst") == F.col("a"))) \
    .select("a", "b", "c")

# a, b, c -> [a, b, c] sorted to exclude duplicates in different orders, eg. [a, b, c] and [b, a, c]
canonical_triangles = triangles \
    .withColumn("triangle", F.array_sort(F.array("a", "b", "c"))) \
    .dropDuplicates(["triangle"])

# Converting all triangles into one column of veritces [a, b, c] [c, d, a] -> a, b, c, c, d, a
triangle_vertices = canonical_triangles.select(F.explode("triangle").alias("id"))

# Counting the number of triangles each airport is part of 
triangle_counts = triangle_vertices.groupBy("id").count()

# Adding nodes that are part of 0 triangles to the result to match the built-in function
result = flight_graph.vertices.select("id") \
    .join(triangle_counts, on="id", how="left") \
    .withColumn("count", F.coalesce(F.col("count"), F.lit(0)))

result.show()

+---+-----+
| id|count|
+---+-----+
|IAH| 1338|
|JAX|  342|
|ABQ|  311|
|IND|  612|
|GRR|   96|
|LBB|   23|
|MEM| 1105|
|BTV|   34|
|BOS|  860|
|PBI|  168|
|XNA|   97|
|VPS|   10|
|SYR|   82|
|JFK|  942|
|MBS|    6|
|SBN|   13|
|PDX|  413|
|RDD|    1|
|LNK|   38|
|HPN|   36|
+---+-----+
only showing top 20 rows



In [12]:
# For validation
flight_graph.triangleCount().show()

+-----+---+
|count| id|
+-----+---+
| 1338|IAH|
|  342|JAX|
|  311|ABQ|
|  612|IND|
|   96|GRR|
|   23|LBB|
| 1105|MEM|
|   34|BTV|
|  860|BOS|
|  168|PBI|
|   97|XNA|
|   10|VPS|
|   82|SYR|
|  942|JFK|
|    6|MBS|
|   13|SBN|
|  413|PDX|
|    1|RDD|
|   38|LNK|
|   36|HPN|
+-----+---+
only showing top 20 rows



In [46]:
# Comparing our results with the built-in function to validate all results
joined_triangles = flight_graph.triangleCount().alias('a').join(result.alias('b'), on='id')
diff_counts = joined_triangles.filter(F.col('a.count') != F.col('b.count'))
diff_counts.select('id', F.col('a.count').alias('count_df1'), F.col('b.count').alias('count_df2')).show() # Should be empty if all is correct

+---+---------+---------+
| id|count_df1|count_df2|
+---+---------+---------+
|IAD|      971|      970|
|ORD|     1678|     1677|
|STT|       52|       51|
+---+---------+---------+



## Query 2 (1)
Compute the total number of triangles in the graph.

To find the total number of of unique triangles in the graph, we sum all the individual node triangle counts and divided it by 3 since each triangle is included three times (once for each of its 3 vertices).

In [55]:
# Finding the sum of all triangles in the graph using the results from q1
total_triangle_count = triangle_counts.agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph is {total_triangle_count}.")

Total number of triangles in the flight graph is 16014.


In [None]:
# For validation
triangle_count_val = flight_graph.triangleCount().agg(F.sum("count")).first()[0] // 3
print(f"Total number of triangles in the flight graph based on the built-in function is {triangle_count_val}.")

## Query 3 (2)
Compute a centrality measure of your choice natively on Spark using Graphframes.

## Query 4 (3)
Implement the PageRank algorithm natively on Spark using Graphframes.

## Query 5 (4)
Find the group of the most connected airports.