# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze interconnected flight data in order to understand the popularity of airports and flight patterns.

## Session setup

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder: register the Delta Lake SQL extension and catalog,
# and give both the driver and the executors 4 GB of memory.
builder = pyspark.sql.SparkSession.builder.appName("Project_3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g")

# GraphFrames is requested as an extra package; the version string targets Spark 3.5 / Scala 2.12.
spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()

# Align the number of shuffle partitions with the context's default parallelism.
# Use the public `sparkContext` accessor instead of the private `_sc` attribute.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

#spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [2]:
import graphframes as gf
import pyspark.sql.functions as F

## Creating the Graph

In [3]:
# Load the 2009 flight records. The first CSV line supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): the inferred schema ends with a string column named "Unnamed: 27";
# presumably an artifact of how the CSV was exported -- confirm before relying on it.
flight_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("input/2009.csv")
)
display(flight_df)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, ORIGIN: string, DEST: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

In [4]:
# Vertices: every airport code that appears as an origin or as a destination,
# exposed under the `id` column name that GraphFrame expects.
origin_ids = flight_df.select(F.col("ORIGIN").alias("id"))
destination_ids = flight_df.select(F.col("DEST").alias("id"))

# union() keeps duplicates, so distinct() is what collapses the result to one row per airport.
flight_vertices = origin_ids.union(destination_ids).distinct()

display(flight_vertices)

DataFrame[id: string]

In [5]:
# Edges: one row per flight record. Only the endpoint columns are renamed to the
# `src` / `dst` names that GraphFrame expects; all other columns are kept as
# edge attributes in their original positions.
flight_edges = flight_df.withColumnsRenamed({"ORIGIN": "src", "DEST": "dst"})

display(flight_edges)

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, src: string, dst: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

#### GraphFrame

In [6]:
# Mark both inputs for caching; cache() is lazy, so nothing is materialised
# until the first action that touches them.
flight_vertices.cache()
flight_edges.cache()

# Assemble the graph from the vertex and edge DataFrames.
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 26 more fields])

## Query 1 (0)

Compute different statistics: in-degree, out-degree and total degree (without using the existing built-in functions).

### In-degree

In [19]:
# In-degree: number of edges (flight records) per destination airport.
in_degree = (
    flight_graph.edges
    .groupBy("dst")
    .count()
    .withColumnRenamed("count", "inDegree")
)
in_degree.show()

+---+--------+
|dst|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



In [21]:
# Validation: GraphFrames' built-in in-degree, printed so it can be compared
# by eye with the manual `in_degree` result.
flight_graph.inDegrees.show(n=20, truncate=True)

+---+--------+
| id|inDegree|
+---+--------+
|IAH|  182088|
|JAX|   28813|
|ABQ|   35577|
|IND|   38198|
|BOS|  110463|
|GRR|   13970|
|MEM|   71721|
|PBI|   25496|
|XNA|   13764|
|LBB|    8004|
|BTV|    6021|
|VPS|    6958|
|SYR|    9330|
|JFK|  119571|
|MBS|    3443|
|SBN|    4527|
|PDX|   52251|
|RDD|    1433|
|LNK|    2765|
|HPN|   10661|
+---+--------+
only showing top 20 rows



### Out-degree

In [17]:
# Out-degree: number of edges (flight records) per origin airport.
out_degree = (
    flight_graph.edges
    .groupBy("src")
    .count()
    .withColumnRenamed("count", "outDegree")
)
out_degree.show()

+---+---------+
|src|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



In [18]:
# Validation: GraphFrames' built-in out-degree, printed so it can be compared
# by eye with the manual `out_degree` result.
flight_graph.outDegrees.show(n=20, truncate=True)

+---+---------+
| id|outDegree|
+---+---------+
|IAH|   182097|
|JAX|    28810|
|ABQ|    35582|
|IND|    38201|
|GRR|    13973|
|LBB|     8002|
|MEM|    71713|
|BTV|     6028|
|BOS|   110460|
|PBI|    25500|
|XNA|    13755|
|VPS|     6959|
|SYR|     9336|
|JFK|   119574|
|MBS|     3444|
|SBN|     4526|
|PDX|    52242|
|RDD|     1433|
|LNK|     2765|
|HPN|    10657|
+---+---------+
only showing top 20 rows



### Total degree

In [16]:
# Total degree: each edge contributes one row for its source endpoint and one
# row for its destination endpoint; counting rows per airport id then gives
# in-degree + out-degree.
edge_sources = flight_graph.edges.select(F.col("src").alias("id"))
edge_destinations = flight_graph.edges.select(F.col("dst").alias("id"))

total_degree = (
    edge_sources
    .union(edge_destinations)  # union() keeps duplicates, which is what the count relies on
    .groupBy("id")
    .count()
    .withColumnRenamed("count", "degree")
)

total_degree.show()

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|BOS|220923|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



In [15]:
# Validation: GraphFrames' built-in total degree, printed so it can be compared
# by eye with the manual `total_degree` result.
flight_graph.degrees.show(n=20, truncate=True)

+---+------+
| id|degree|
+---+------+
|IAH|364185|
|JAX| 57623|
|ABQ| 71159|
|IND| 76399|
|BOS|220923|
|GRR| 27943|
|LBB| 16006|
|MEM|143434|
|BTV| 12049|
|PBI| 50996|
|XNA| 27519|
|VPS| 13917|
|SYR| 18666|
|JFK|239145|
|MBS|  6887|
|SBN|  9053|
|PDX|104493|
|RDD|  2866|
|LNK|  5530|
|HPN| 21318|
+---+------+
only showing top 20 rows



## Query 2 (1)
Compute the total number of triangles in the graph.

## Query 3 (2)
Compute a centrality measure of your choice natively on Spark using Graphframes.

## Query 4 (3)
Implement the PageRank algorithm natively on Spark using Graphframes.

## Query 5 (4)
Find the group of the most connected airports.