# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Luis Antonio Pelayo Sierra

**Professor**: Pablo Camarillo Ramirez

# Dataset description

https://www.kaggle.com/datasets/alenreuel/airport-network

Origen: Source Airport

Destino: Destination Airport

Aristas: Airlines

# Data ingestion

In [11]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [12]:
# Build schema
# Import your module
from luis_pelayo.spark_utils import SparkUtils
schema = SparkUtils.generate_schema([("Airline", "string"), 
                                     ("Airline ID","int"), 
                                     ("Source airport","string"), 
                                     ("Source airport ID","int"), 
                                     ("Destination airport","string"), 
                                     ("Destination airport ID","int"), 
                                     ("Codeshare","string"), 
                                     ("Stops","int"), 
                                     ("Equipment","string")])

path = "/opt/spark/work-dir/data/airline/"

df_airlines = spark.read \
                .option("header", "true") \
                .schema(schema) \
                .csv(path + "airline_network.csv")


df_airlines = df_airlines.dropna(how='any')
df_airlines = df_airlines.dropDuplicates()

df_airlines.show()


[Stage 0:>                                                          (0 + 1) / 1]

+-------+----------+--------------+-----------------+-------------------+----------------------+---------+-----+---------+
|Airline|Airline ID|Source airport|Source airport ID|Destination airport|Destination airport ID|Codeshare|Stops|Equipment|
+-------+----------+--------------+-----------------+-------------------+----------------------+---------+-----+---------+
|     7H|     16726|           SCM|             7209|                BET|                  3599|        Y|    0|      CNA|
|     9W|      3000|           GAU|             6173|                AJL|                  3039|        Y|    0|      AT7|
|     A3|        96|           CPH|              609|                SKG|                  1486|        Y|    0|      320|
|     AA|        24|           BOS|             3448|                PIT|                  3570|        Y|    0|  E70 CRJ|
|     AA|        24|           OAK|             3453|                KOA|                  3514|        Y|    0|      737|
|     AA|       

                                                                                

# Transformations

In [13]:
# Add the code for your transformations to create nodes and edges DataFrames HERE
from pyspark.sql.functions import col

airports = df_airlines.select(col("Source airport ID").alias("id"), col("Source airport").alias("name")).dropDuplicates(["id"])
airports.show()

connections = df_airlines.select(col("Source airport ID").alias("src"), col("Destination airport ID").alias("dst"), col("Airline").alias("airline")).dropDuplicates()
connections.show()


+---+----+
| id|name|
+---+----+
|  5| POM|
| 16| KEF|
| 21| YAM|
| 27| YBC|
| 28| YBG|
| 29| YBK|
| 30| YBL|
| 33| YCD|
| 34| YCG|
| 41| YZS|
| 45| YDF|
| 49| YEG|
| 50| YEK|
| 55| YFB|
| 56| YFC|
| 61| YGK|
| 63| YGP|
| 65| YGR|
| 73| YHZ|
| 78| YKA|
+---+----+
only showing top 20 rows
+----+----+-------+
| src| dst|airline|
+----+----+-------+
| 132| 147|     5T|
|6173|3057|     9W|
|4037|3670|     AA|
|3644|3577|     AA|
|3670|3488|     AA|
|3697|3645|     AA|
| 507| 687|     AA|
|3576|3488|     AA|
|3731| 507|     AA|
|3577| 156|     AA|
| 507|1613|     AB|
| 346|2179|     AB|
| 146| 149|     AC|
| 182| 146|     AC|
|3682|3875|     AF|
|3875|3682|     AF|
|4041|3682|     AF|
| 495|1265|     AF|
|3448|3682|     AM|
|3576|1819|     AM|
+----+----+-------+
only showing top 20 rows


# Writing Data in Neo4j

In [14]:
# Add the code to write a graph from PySpark's DataFrames to Neo4j
neo4j_url = "bolt://neo4j-iteso:7687"
neo4j_user = "neo4j"
neo4j_passwd = "neo4j@1234"

airports.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Airports") \
  .option("node.keys", "id") \
  .save()

connections.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "AIRLINE") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Airports") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Airports") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()


                                                                                

# Read and Query Graphs with PySpark

In [15]:
# Add the code to read a data frame from Neo4J and run a simple query to verify 
table = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", neo4j_url) \
    .option("authentication.basic.username", neo4j_user) \
    .option("authentication.basic.password", neo4j_passwd) \
    .option("query","MATCH (S:Airports)-[A:AIRLINE]->(D:Airports) RETURN S.name AS Source, A.airline AS Airline, D.name AS Destination")\
    .load()

table.show()

+------+-------+-----------+
|Source|Airline|Destination|
+------+-------+-----------+
|   POM|     QF|        CNS|
|   POM|     QF|        SYD|
|   POM|     VA|        BNE|
|   KEF|     W2|        CPH|
|   KEF|     W2|        SXF|
|   KEF|     W2|        LGW|
|   KEF|     W2|        ALC|
|   KEF|     W2|        CDG|
|   KEF|     AY|        HEL|
|   YAM|     AC|        YYZ|
|   YBC|     AC|        YYY|
|   YBC|     AC|        YUL|
|   YBG|     AC|        YUL|
|   YBK|     5T|        YCS|
|   YBK|     5T|        YXN|
|   YBL|     AC|        YQQ|
|   YCD|     AC|        YVR|
|   YCG|     AC|        YVR|
|   YCG|     AC|        YYC|
|   YZS|     5T|        YRT|
+------+-------+-----------+
only showing top 20 rows


In [6]:
sc.stop()