# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Axel Escoto García

**Professor**: Pablo Camarillo Ramirez

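# Dataset description

The lab uses the Witcher character network (`witcher_network.csv`), an edge list of character interactions. Each row has an `id`, the `source` and `target` character names, the edge `type` (e.g. `Undirected`), an integer `weight`, and the `book` number in which the interaction appears.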

# Data ingestion

In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this lab. The Neo4j connector is
# requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only surface ERROR-level log messages.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1588d46e-8ecd-4666-970e-214b0291f041;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [3]:
# Build schema
# Import your module
from axel2293.spark_utils import SparkUtils

# (column name, type name) pairs handed to the SparkUtils helper, whose result
# is used below as the explicit read schema.
witcher_schema = SparkUtils.generate_schema([
    ("id", "string"),
    ("source", "string"),
    ("target", "string"),
    ("type", "string"),
    ("weight", "int"),
    ("book", "int"),
])

# Load the Witcher CSV with that schema; the file's first line is a header row.
witcher_df = (
    spark.read
    .schema(witcher_schema)
    .option("header", True)
    .csv("/opt/spark/work-dir/data/witcher_network/witcher_network.csv")
)
witcher_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+-------+-------+----------+------+----+
| id| source| target|      type|weight|book|
+---+-------+-------+----------+------+----+
|  0|Velerad| Geralt|Undirected|     1|   1|
|  1| Geralt|Foltest|Undirected|     5|   1|
|  2|Foltest| Geralt|Undirected|     4|   1|
|  3|   Adda| Geralt|Undirected|     1|   1|
|  4| Geralt| Ostrit|Undirected|     4|   1|
+---+-------+-------+----------+------+----+
only showing top 5 rows


                                                                                

# Transformations

In [8]:
# Add the code for your transformations to create nodes and edges DataFrames HERE
from pyspark.sql.functions import col

# Nodes: one single-column ("id") DataFrame per edge endpoint column.
nodes_source, nodes_target = (
    witcher_df.select(col(endpoint).alias("id"))
    for endpoint in ("source", "target")
)

# The same character can appear in many rows and on either side of an edge,
# so the union of both endpoint frames is de-duplicated.
nodes_df = (
    nodes_source
    .union(nodes_target)
    .distinct()
)
nodes_df.show()

+-----------+
|         id|
+-----------+
|      Roach|
|      Sword|
|   Vysogota|
|    Radovid|
|      Baron|
|   Aelirenn|
|    Carduin|
|     Rience|
|     Ostrit|
|      Munro|
|    Venzlav|
|    Visenna|
| Akerspaark|
|Ihuarraquax|
|Filavandrel|
|    Everett|
|    Esterad|
|      Rayla|
|     Civril|
|     Little|
+-----------+
only showing top 20 rows


In [7]:
# Edges: one row per interaction, with the endpoints renamed to src/dst.
# Column names are spelled exactly as declared in the read schema (lowercase)
# so the select does not rely on Spark's case-insensitive column resolution.
edges_df = witcher_df.select(
    col("source").alias("src"),
    col("target").alias("dst"),
    # weight/book are declared as "int" in the read schema; the casts are kept
    # as a guard so the edge properties are integers regardless.
    col("weight").cast("int"),
    col("book").cast("int"),
)
edges_df.show(5)

+-------+-------+------+----+
|    src|    dst|weight|book|
+-------+-------+------+----+
|Velerad| Geralt|     1|   1|
| Geralt|Foltest|     5|   1|
|Foltest| Geralt|     4|   1|
|   Adda| Geralt|     1|   1|
| Geralt| Ostrit|     4|   1|
+-------+-------+------+----+
only showing top 5 rows


# Writing Data in Neo4j

In [11]:
# Write the graph (character nodes, then INTERACTED relationships) to Neo4j.

# Connection settings can be overridden through environment variables so real
# credentials do not have to live in the notebook. The fallbacks are the values
# this notebook originally hard-coded, so it keeps working unchanged.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j-iteso:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_passwd = os.environ.get("NEO4J_PASSWORD", "neo4j@1234")


def _neo4j_writer(df):
    """Return a writer for ``df`` pre-configured with the shared Neo4j options.

    Args:
        df: DataFrame to be written through the Neo4j Spark connector.

    Returns:
        The DataFrame's writer with the connector format, "Overwrite" save
        mode, URL and basic-auth credentials already set. Callers add the
        node- or relationship-specific options and then call ``save()``.
    """
    return (
        df.write
        .format("org.neo4j.spark.DataSource")
        .mode("Overwrite")
        .option("url", neo4j_url)
        .option("authentication.basic.username", neo4j_user)
        .option("authentication.basic.password", neo4j_passwd)
    )


# Nodes: one :Characters node per distinct character, keyed on "id".
_neo4j_writer(nodes_df) \
  .option("labels", ":Characters") \
  .option("node.keys", "id") \
  .save()

print(f"{nodes_df.count()} characters nodes wrote in Neo4j")

# Relationships: match existing :Characters nodes on id (src -> dst) and link
# them with INTERACTED.
# NOTE(review): with "Overwrite" the connector appears to MERGE one
# relationship per (source, target) pair, so rows for the same pair (e.g. in
# different books) may collapse into a single relationship. The count printed
# below is the DataFrame row count, not the number of relationships stored.
# TODO confirm against the connector documentation / a Cypher count.
_neo4j_writer(edges_df) \
  .option("relationship", "INTERACTED") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Characters") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Characters") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()

print(f"{edges_df.count()} Interaction edges wrote in Neo4j")

224 characters nodes wrote in Neo4j
2600 Interaction edges wrote in Neo4j


# Read and Query Graphs with PySpark

In [25]:
# Verification read: pull back from Neo4j the INTERACTED relationships whose
# weight is above 20, ordered by weight descending in the Cypher query.
cypher_df = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query",
            """
            MATCH (a:Characters)-[r:INTERACTED]->(b:Characters)
            WHERE r.weight > 20
            ORDER BY r.weight DESC
            RETURN a.id AS Source, b.id AS Target, r.weight, r.book
            """)
    .load()
)

cypher_df.show()

+--------+----------+--------+------+
|  Source|    Target|r.weight|r.book|
+--------+----------+--------+------+
|Yennefer|      Ciri|      44|     7|
|    Ciri|  Yennefer|      40|     7|
|  Kelpie|      Ciri|      36|     7|
|  Geralt|      Ciri|      34|     7|
|  Zoltan|    Geralt|      33|     5|
|  Rience| Dandelion|      29|     3|
|  Geralt|      Essi|      29|     2|
|  Geralt| Dandelion|      28|     7|
|    Ciri|    Geralt|      28|     7|
|  Zoltan|     Milva|      27|     5|
|  Geralt|     Yurga|      27|     2|
|Yennefer|    Geralt|      25|     7|
|    Ciri|    Kelpie|      25|     7|
| Istredd|    Geralt|      24|     2|
|  Geralt|   Istredd|      24|     2|
|    Essi|    Geralt|      23|     2|
|  Geralt| Fringilla|      22|     7|
|  Geralt|Vilgefortz|      22|     7|
|   Yurga|    Geralt|      21|     2|
|    Ciri|     Fabio|      21|     4|
+--------+----------+--------+------+



In [29]:
# Geralt's most important relationships: INTERACTED relationships touching
# Geralt in either direction with weight above 10, heaviest first.
cypher_df = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query",
            """
            MATCH (g:Characters {id:"Geralt"})-[r:INTERACTED]-(n:Characters)
            WHERE r.weight > 10
            ORDER BY r.weight DESC
            RETURN g.id AS Source, n.id AS Target, r.weight, r.book
            """)
    .load()
)

cypher_df.show()

+------+----------+--------+------+
|Source|    Target|r.weight|r.book|
+------+----------+--------+------+
|Geralt|      Ciri|      34|     7|
|Geralt|    Zoltan|      33|     5|
|Geralt|      Essi|      29|     2|
|Geralt|      Ciri|      28|     7|
|Geralt| Dandelion|      28|     7|
|Geralt|     Yurga|      27|     2|
|Geralt|  Yennefer|      25|     7|
|Geralt|   Istredd|      24|     2|
|Geralt|   Istredd|      24|     2|
|Geralt|      Essi|      23|     2|
|Geralt| Fringilla|      22|     7|
|Geralt|Vilgefortz|      22|     7|
|Geralt|     Yurga|      21|     2|
|Geralt|    Eithné|      18|     2|
|Geralt|    Braenn|      18|     2|
|Geralt|   Visenna|      17|     2|
|Geralt|    Braenn|      17|     2|
|Geralt| Dandelion|      15|     7|
|Geralt| Chireadan|      15|     1|
|Geralt|  Herbolth|      15|     2|
+------+----------+--------+------+
only showing top 20 rows


In [None]:
# Stop the SparkContext at the end of the notebook.
sc.stop()