# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Pablo Quintero

**Professor**: Pablo Camarillo Ramirez

# Dataset description

# Data ingestion

In [10]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Lab06") \
    .master("spark://f5f576f1e425:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [11]:
from PabloQuintero.spark_utils import SparkUtils
# Build schema
schema_breakingbad= SparkUtils.generate_schema([("Season", "string"), ("Characters", "string")])

# Import your module
df_Bad = spark.read.schema(schema_breakingbad).option("header", True).csv("/opt/spark/work-dir/data/BreakingBad")
df_Bad.show(truncate=False)


+--------+------------------------------------+
|Season  |Characters                          |
+--------+------------------------------------+
|Season_1|Bryan Cranston as Walter White      |
|Season_1|Anna Gunn as Skyler White           |
|Season_1|Aaron Paul as Jesse Pinkman         |
|Season_1|Dean Norris as Hank Schrader        |
|Season_1|Betsy Brandt as Marie Schrader      |
|Season_1|RJ Mitte as Walter White Jr.        |
|Season_1|Max Arciniega as Krazy-8            |
|Season_1|John Koyama as Emilio               |
|Season_1|Steven Michael Quezada as Gomez     |
|Season_1|Marius Stan as Bogdan               |
|Season_1|Aaron Hill as Jock                  |
|Season_1|Greg Chase as Dr. Belknap           |
|Season_1|Carmen Serano as Carmen             |
|Season_1|Evan Bobrick as Chad                |
|Season_1|Roberta Marquez as Chad's Girlfriend|
|Season_1|Christopher Dempsey as E.M.T.       |
|Season_1|Allan Pacheco as Irving             |
|Season_1|Jason Byrd as Chemistry Studen

# Transformations

In [12]:
from pyspark.sql.functions import col, split, trim, lit, when
# Add the code for your transformations to create nodes and edges
processed_df = df_Bad \
    .where(col("Characters").contains(" as ")) \
    .withColumn("split_char", split(col("Characters"), " as ")) \
    .withColumn("ActorName", trim(col("split_char").getItem(0))) \
    .withColumn("CharacterName", trim(col("split_char").getItem(1))) \
    .select("Season", "ActorName", "CharacterName") \
    .na.drop()
# Nodos de tipo 'Actor'
actor_nodes = processed_df.select(
    col("ActorName").alias("id")
).distinct().withColumn("label", lit("Actor"))
# Nodos de tipo 'Personaje'
character_nodes = processed_df.withColumn(
    "id",
    when(col("CharacterName") == "Himself", col("ActorName"))
    .otherwise(col("CharacterName"))
).select("id").distinct().withColumn("label", lit("Personaje"))
# Nodos de tipo 'Temporada'
season_nodes = processed_df.select(
    col("Season").alias("id")
).distinct().withColumn("label", lit("Temporada"))
nodes = actor_nodes.unionByName(character_nodes).unionByName(season_nodes)
# Relaciones: (Actor) (Personaje)
plays_edges = processed_df.select(
    col("ActorName").alias("src"),
    when(col("CharacterName") == "Himself", col("ActorName"))
    .otherwise(col("CharacterName")).alias("dst")
).distinct().withColumn("relationship", lit("PLAYS"))
# Relaciones: (Personaje) (Temporada)
appears_in_edges = processed_df.select(
    when(col("CharacterName") == "Himself", col("ActorName"))
    .otherwise(col("CharacterName")).alias("src"),
    col("Season").alias("dst")
).distinct().withColumn("relationship", lit("APPEARS_IN"))
edges = plays_edges.unionByName(appears_in_edges)
# DOS DataFrames finales
print("--- DataFrame Final de Nodos ---")
nodes.show(20, truncate=False)

print("--- DataFrame Final de Relaciones ---")
edges.show(20, truncate=False)

--- DataFrame Final de Nodos ---
+----------------------+-----+
|id                    |label|
+----------------------+-----+
|Bryan Cranston        |Actor|
|John Koyama           |Actor|
|Steven Michael Quezada|Actor|
|Evan Bobrick          |Actor|
|Jon Kristian Moore    |Actor|
|Robert Anthony Brass  |Actor|
|Adam Godley           |Actor|
|Marc Mouchet          |Actor|
|Frederic Doss         |Actor|
|Dennis Keiffer        |Actor|
|Kristen Loree         |Actor|
|Vic Browder           |Actor|
|Matthew Lee Jones     |Actor|
|Tom Kiesche           |Actor|
|Drew Waters           |Actor|
|Liam Ruggles          |Actor|
|Caleb Jones           |Actor|
|John Christopher Hicks|Actor|
|Tina Parker           |Actor|
|Jimmy Daniels         |Actor|
+----------------------+-----+
only showing top 20 rows
--- DataFrame Final de Relaciones ---
+----------------------+--------------------+------------+
|src                   |dst                 |relationship|
+----------------------+------------------

# Writing Data in Neo4j

In [None]:
# Add the code to write a graph from PySpark's DataFrames to Neo4j

neo4j_url = "bolt://neo4j-iteso:7687"
neo4j_user = "neo4j"
neo4j_passwd = "neo4j@1234"

actor_nodes_to_write = nodes.where(col("label") == "Actor").select("id")
actor_nodes_to_write.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Actor") \
  .option("node.keys", "id") \
  .save()
print(f"-> {actor_nodes_to_write.count()} nodos :Actor escritos en Neo4j")

character_nodes_to_write = nodes.where(col("label") == "Personaje").select("id")
character_nodes_to_write.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Personaje") \
  .option("node.keys", "id") \
  .save()
print(f"-> {character_nodes_to_write.count()} nodos :Personaje escritos en Neo4j")


season_nodes_to_write = nodes.where(col("label") == "Temporada").select("id")
season_nodes_to_write.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Temporada") \
  .option("node.keys", "id") \
  .save()
print(f"-> {season_nodes_to_write.count()} nodos :Temporada escritos en Neo4j")


plays_edges_to_write = edges.where(col("relationship") == "PLAYS")
plays_edges_to_write.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "PLAYS") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Actor") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Personaje") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()
print(f"-> {plays_edges_to_write.count()} relaciones :PLAYS escritas en Neo4j")

appears_in_edges_to_write = edges.where(col("relationship") == "APPEARS_IN")
appears_in_edges_to_write.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "APPEARS_IN") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Personaje") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Temporada") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()
print(f"-> {appears_in_edges_to_write.count()} relaciones :APPEARS_IN escritas en Neo4j")

                                                                                

-> 417 nodos :Actor escritos en Neo4j
-> 436 nodos :Personaje escritos en Neo4j
-> 6 nodos :Temporada escritos en Neo4j


                                                                                

-> 482 relaciones :PLAYS escritas en Neo4j
-> 602 relaciones :APPEARS_IN escritas en Neo4j


# Read and Query Graphs with PySpark

In [16]:
# Add the code to read a data frame from Neo4J and run a simple query to verify 
print("\n Amigos de reparto de Bryan Cranston")

co_actors_df = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", neo4j_url) \
    .option("authentication.basic.username", neo4j_user) \
    .option("authentication.basic.password", neo4j_passwd) \
    .option("query",
            """
            MATCH (aaron:Actor {id: 'Bryan Cranston'})-[:PLAYS]->(:Personaje)-[:APPEARS_IN]->(s:Temporada)
            MATCH (coactor:Actor)-[:PLAYS]->(:Personaje)-[:APPEARS_IN]->(s)
            WHERE aaron <> coactor
            RETURN coactor.id AS ColegaDeReparto, s.id AS TemporadaEnComun
            ORDER BY ColegaDeReparto, TemporadaEnComun
            """) \
    .load()
co_actors_df.show()


 Amigos de reparto de Bryan Cranston
+--------------------+----------------+
|     ColegaDeReparto|TemporadaEnComun|
+--------------------+----------------+
|          "Anna Gunn|        Season_3|
|"Michael ""Yak"" ...|        Season_4|
|          Aaron Hill|        Season_1|
|          Aaron Paul|        Season_1|
|          Aaron Paul|        Season_2|
|          Aaron Paul|        Season_3|
|          Aaron Paul|        Season_4|
|          Aaron Paul|       Season_5A|
|          Aaron Paul|       Season_5B|
|        Aaron Wright|       Season_5B|
|         Adam Godley|        Season_1|
|         Adam Godley|       Season_5B|
|    Alex Gianopoulos|        Season_4|
|    Alex Gianopoulos|       Season_5B|
|         Alex Knight|       Season_5A|
|       Allan Pacheco|        Season_1|
|     Amanda Fresquez|        Season_4|
|    Amanda Schofield|        Season_3|
|    Amanda Schofield|        Season_4|
|    Amanda Schofield|       Season_5A|
+--------------------+----------------+
on

In [None]:
sc.stop()