# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Storage Solutions (Neo4j)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Neo4j setup
## Install Neo4j with Docker

Go to the **spark** directory and run:

```
docker run \
    -d --restart always \
    --publish=7474:7474 --publish=7687:7687 \
    --env NEO4J_AUTH=neo4j/neo4j@1234 \
    --volume=data_neo4j:/data \
    --name neo4j-iteso \
    --network spark_default \
    neo4j:2025.09.0
```

# Create SparkSession

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Extra Spark settings: the Neo4j connector (resolved as a Maven package when
# the session starts) and the port used by the Spark UI.
spark_settings = {
    "spark.jars.packages": "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3",
    "spark.ui.port": "4040",
}

# Name the application and point it at the Spark master, then apply each
# setting before creating (or reusing) the session.
session_builder = SparkSession.builder
session_builder = session_builder.appName("Examples on storage solutions with Neo4j")
session_builder = session_builder.master("spark://011cc7b3c81d:7077")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Only show ERROR-level log messages in the notebook output.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Optimization: the example datasets are tiny, so use few shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", "5")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b7b03080-73b6-45ba-968f-b50c0c3ffd13;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

# Create DataFrames

In [2]:
from pcamarillor.spark_utils import SparkUtils

# Example Users dataset: every field is a string.
user_fields = ("user_id", "name", "country")
users_data = [
    dict(zip(user_fields, record))
    for record in (
        ("U1", "Alice", "USA"),
        ("U2", "Bob", "Canada"),
        ("U3", "Carla", "Mexico"),
    )
]
users_schema = SparkUtils.generate_schema([(field, "string") for field in user_fields])

# Example Transactions dataset: (column name, type) pairs drive both the
# record keys and the schema.
transaction_columns = [("user_id", "string"), ("product", "string"), ("amount", "int"), ("date", "string")]
transaction_fields = [name for name, _ in transaction_columns]
transactions_data = [
    dict(zip(transaction_fields, record))
    for record in (
        ("U1", "Laptop", 1200, "2025-01-05"),
        ("U2", "Phone", 800, "2025-01-07"),
        ("U1", "Mouse", 20, "2025-01-10"),
        ("U3", "Laptop", 1150, "2025-01-15"),
    )
]
transactions_schema = SparkUtils.generate_schema(transaction_columns)

# Materialize both datasets as DataFrames with their explicit schemas.
users_df = spark.createDataFrame(users_data, schema=users_schema)
transactions_df = spark.createDataFrame(transactions_data, schema=transactions_schema)


# Perform transformations

In [3]:
from pyspark.sql.functions import col

# --- NODES ---

# User nodes: user_id is exposed as "id"; keep one row per id.
user_columns = [col("user_id").alias("id"), col("name"), col("country")]
user_nodes = users_df.select(*user_columns).dropDuplicates(["id"])
user_nodes.show()

# Product nodes: the product name is the node "id"; keep one row per product.
product_columns = [col("product").alias("id")]
product_nodes = transactions_df.select(*product_columns).dropDuplicates(["id"])
product_nodes.show()


# --- EDGES ---

# Purchases: one User -> Product relationship per transaction row.
edge_columns = [
    col("user_id").alias("src"),   # source node (User)
    col("product").alias("dst"),   # destination node (Product)
    col("amount"),
    col("date"),
]
purchase_edges = transactions_df.select(*edge_columns)
purchase_edges.show()

                                                                                

+---+-----+-------+
| id| name|country|
+---+-----+-------+
| U1|Alice|    USA|
| U2|  Bob| Canada|
| U3|Carla| Mexico|
+---+-----+-------+



                                                                                

+------+
|    id|
+------+
|Laptop|
| Phone|
| Mouse|
+------+

+---+------+------+----------+
|src|   dst|amount|      date|
+---+------+------+----------+
| U1|Laptop|  1200|2025-01-05|
| U2| Phone|   800|2025-01-07|
| U1| Mouse|    20|2025-01-10|
| U3|Laptop|  1150|2025-01-15|
+---+------+------+----------+



# Write data to a Neo4j Graph

In [None]:
# Connection settings for the Neo4j instance (Bolt endpoint and basic auth).
neo4j_url = "bolt://neo4j-iteso:7687"
neo4j_user = "neo4j"
neo4j_passwd = "neo4j@1234"


def _neo4j_writer(df):
    """Return a writer for *df* preset with the options shared by every Neo4j write.

    The format, save mode, URL and basic-auth credentials are the same for all
    three writes below; callers add the node or relationship options and then
    call ``save()``.
    """
    return (
        df.write
        .format("org.neo4j.spark.DataSource")
        .mode("Overwrite")
        .option("url", neo4j_url)
        .option("authentication.basic.username", neo4j_user)
        .option("authentication.basic.password", neo4j_passwd)
    )


# User nodes, labelled :User and keyed by "id".
(
    _neo4j_writer(user_nodes)
    .option("labels", ":User")
    .option("node.keys", "id")
    .save()
)

print(f"{user_nodes.count()} users wrote in Neo4j")


# Product nodes, labelled :Product and keyed by "id".
(
    _neo4j_writer(product_nodes)
    .option("labels", ":Product")
    .option("node.keys", "id")
    .save()
)

print(f"{product_nodes.count()} product nodes wrote in Neo4j")

# PURCHASED relationships: source and target nodes use save mode "match",
# with the src/dst columns given as the node keys ("src:id", "dst:id").
relationship_options = (
    ("relationship", "PURCHASED"),
    ("relationship.save.strategy", "keys"),
    ("relationship.source.labels", ":User"),
    ("relationship.source.save.mode", "match"),
    ("relationship.source.node.keys", "src:id"),
    ("relationship.target.labels", ":Product"),
    ("relationship.target.save.mode", "match"),
    ("relationship.target.node.keys", "dst:id"),
)
edge_writer = _neo4j_writer(purchase_edges)
for option_name, option_value in relationship_options:
    edge_writer = edge_writer.option(option_name, option_value)
edge_writer.save()

print(f"{purchase_edges.count()} purchase edges wrote in Neo4j")

# Read data from a Neo4j Graph

In [None]:
# Cypher query: user name, product id and amount for purchases above 1000.
high_value_purchases_query = """
            MATCH (u:User)-[r:PURCHASED]->(p:Product)
            WHERE r.amount > 1000
            RETURN u.name, p.id, r.amount
            """

# Load the query result through the Neo4j data source, using the connection
# settings defined in the write cell.
cypher_df = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query", high_value_purchases_query)
    .load()
)

cypher_df.show()

In [None]:
# Stop through the session rather than the bare context: SparkSession.stop()
# stops the underlying SparkContext (sc) and also clears the session state,
# so a later getOrCreate() starts from a clean session.
spark.stop()