# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Mateo Garcia Lopez

**Professor**: Pablo Camarillo Ramirez

# Dataset description

# Data ingestion

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ffc8f543-187a-4ad4-af8e-69073a17ccc8;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [2]:
# Build schema
# Import your module
from mateogarcial.spark_utils import SparkUtils

# 1. Define the data and column information
users_data = [("u1", "Mateo"), ("u2", "Juan"), ("u3", "Carla")]
users_columns_info = [("user_id", "string"), ("name", "string")]

transactions_data = [("u1", "Laptop", 6900), ("u2", "Phone", 999), ("u1", "Mouse", 125), ("u3", "Laptop", 4337)]
transactions_columns_info = [("user_id", "string"), ("product", "string"), ("amount", "int")]

friends_data = [("u1", "u2"), ("u2", "u3")]
friends_columns_info = [("user_id_1", "string"), ("user_id_2", "string")]

# 2. Generate schemas using my utility
users_schema = SparkUtils.generate_schema(users_columns_info)
transactions_schema = SparkUtils.generate_schema(transactions_columns_info)
friends_schema = SparkUtils.generate_schema(friends_columns_info)

# 3. Create the DataFrames with the generated schemas
users_df = spark.createDataFrame(users_data, users_schema)
transactions_df = spark.createDataFrame(transactions_data, transactions_schema)
friends_df = spark.createDataFrame(friends_data, friends_schema)

# 4. Verify the results
print("DataFrames created successfully using generated schemas:")
users_df.show()
users_df.printSchema()

transactions_df.show()
transactions_df.printSchema()

friends_df.show()
friends_df.printSchema()

DataFrames created successfully using generated schemas:


                                                                                

+-------+-----+
|user_id| name|
+-------+-----+
|     u1|Mateo|
|     u2| Juan|
|     u3|Carla|
+-------+-----+

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)

+-------+-------+------+
|user_id|product|amount|
+-------+-------+------+
|     u1| Laptop|  6900|
|     u2|  Phone|   999|
|     u1|  Mouse|   125|
|     u3| Laptop|  4337|
+-------+-------+------+

root
 |-- user_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)

+---------+---------+
|user_id_1|user_id_2|
+---------+---------+
|       u1|       u2|
|       u2|       u3|
+---------+---------+

root
 |-- user_id_1: string (nullable = true)
 |-- user_id_2: string (nullable = true)



# Transformations

In [3]:
# Add the code for your transformations to create nodes and edges DataFrames HERE
from pyspark.sql.functions import col

# 1. Create a DF for User nodes
user_nodes_df = users_df.select(
    col("user_id").alias("id"),
    "name"
)

# 2. Create a DF for Product nodes
product_nodes_df = transactions_df.select("product").distinct().withColumnRenamed("product", "id")

# 3. Create a DF for PURCHASED edges
purchased_edges_df = transactions_df.select(
    col("user_id").alias("src"),
    col("product").alias("dst"),
    "amount"
)

# 4. Create a DF for FRIEND edges
friends_edges_df = friends_df.select(
    col("user_id_1").alias("src"),
    col("user_id_2").alias("dst")
)

print("Four DataFrames: user_nodes_df, product_nodes_df, purchased_edges_df, friends_edges_df")
user_nodes_df.show()
product_nodes_df.show()
purchased_edges_df.show()
friends_edges_df.show()

Four DataFrames: user_nodes_df, product_nodes_df, purchased_edges_df, friends_edges_df
+---+-----+
| id| name|
+---+-----+
| u1|Mateo|
| u2| Juan|
| u3|Carla|
+---+-----+



                                                                                

+------+
|    id|
+------+
| Phone|
|Laptop|
| Mouse|
+------+

+---+------+------+
|src|   dst|amount|
+---+------+------+
| u1|Laptop|  6900|
| u2| Phone|   999|
| u1| Mouse|   125|
| u3|Laptop|  4337|
+---+------+------+

+---+---+
|src|dst|
+---+---+
| u1| u2|
| u2| u3|
+---+---+



# Writing Data in Neo4j

In [4]:
# --- Define Connection Details ---
NEO4J_URL = "bolt://neo4j-iteso:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j@1234"

# Write User nodes
user_nodes_df.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Overwrite") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", "MERGE (u:User {id: event.id}) SET u.name = event.name") \
    .save()

# Write Product nodes
product_nodes_df.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", "MERGE (p:Product {id: event.id})") \
    .save()

# Write PURCHASED relationships
purchased_edges_df.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("relationship", "PURCHASED") \
    .option("relationship.save.strategy", "keys") \
    .option("relationship.source.labels", ":User") \
    .option("relationship.source.node.properties", "src:id") \
    .option("relationship.target.labels", ":Product") \
    .option("relationship.target.node.properties", "dst:id") \
    .option("relationship.properties", "amount") \
    .save()

# Write FRIEND relationships
friends_edges_df.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("relationship", "FRIEND") \
    .option("relationship.save.strategy", "keys") \
    .option("relationship.source.labels", ":User") \
    .option("relationship.source.node.properties", "src:id") \
    .option("relationship.target.labels", ":User") \
    .option("relationship.target.node.properties", "dst:id") \
    .save()

print("Graph data has been successfully written.")

Graph data has been successfully written.


# Read and Query Graphs with PySpark

In [5]:
# Add the code to read a data frame from Neo4J and run a simple query to verify 
cypher_query = """
MATCH (u:User)-[:PURCHASED]->(p:Product {id: 'Laptop'})
OPTIONAL MATCH (u)-[:FRIEND]->(friend:User)
RETURN u.name AS user, p.id AS product, collect(friend.name) AS friends
"""

graph_data = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", NEO4J_URL) \
    .option("authentication.type", "basic") \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", cypher_query) \
    .load()

print("Query results from Neo4j:")
graph_data.show()

Query results from Neo4j:
+-----+-------+--------------------+
| user|product|             friends|
+-----+-------+--------------------+
|Carla| Laptop|[Juan, Carla, Mat...|
| Juan| Laptop|[Juan, Carla, Mat...|
|Mateo| Laptop|[Carla, Juan, Mat...|
+-----+-------+--------------------+



In [6]:
sc.stop()