# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**:

**Professor**: Pablo Camarillo Ramirez

# Dataset description

# Data ingestion

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://011cc7b3c81d:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bcd4c61e-be67-40f7-8f7c-e0b35fc22bbf;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [3]:
from luis_gonzalez.spark_utils import SparkUtils

schema_columns = [
    ("Rank", "int"),
    ("Title", "string"),
    ("Genre", "string"),
    ("Description", "string"),
    ("Director", "string"),
    ("Actors", "string"),
    ("Year", "int"),
    ("Runtime (Minutes)", "string"), 
    ("Rating", "float"),
    ("Votes", "int"),
    ("Revenue (Millions)", "float"),
    ("Metascore", "int")
]

movie_schema = SparkUtils.generate_schema(schema_columns)


base_path = "/opt/spark/work-dir/data/movies/"

df_movies = spark.read \
           .option("header", "true") \
           .schema(movie_schema) \
           .csv(base_path + "IMDB-Movie-Data.csv")

df_movies = df_movies.filter(df_movies.Rank > 5)
df_movies.show(n=5)


                                                                                

+----+------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|Rank|             Title|               Genre|         Description|            Director|              Actors|Year|Runtime (Minutes)|Rating| Votes|Revenue (Millions)|Metascore|
+----+------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|   6|    The Great Wall|Action,Adventure,...|European mercenar...|         Yimou Zhang|Matt Damon, Tian ...|2016|              103|   6.1| 56036|             45.13|       42|
|   7|        La La Land|  Comedy,Drama,Music|A jazz pianist fa...|     Damien Chazelle|Ryan Gosling, Emm...|2016|              128|   8.3|258682|            151.06|       93|
|   8|          Mindhorn|              Comedy|"A has-been actor...| whom he believes...|          Sean Foley|NULL|      

# Transformations

In [4]:
from pyspark.sql.functions import split, col, explode

#NODES

# 1. Create a DF for Director nodes
df_directors = df_movies.select(
    col("Director").alias("Director_Name")
)
df_directors.show()

# 2. Create a DF for Actor nodes

df_actors = df_movies.withColumn("Actor_Name", explode(split(col("Actors"), ","))) \
                    .select("Actor_Name") \
                    .distinct()
df_actors.show()

# 3. Create a DF for Movie nodes
df_movies_nodes = df_movies.select(
    col("Title").alias("Movie_Title"),
    col("Description"),
    col("Year"),
    col("Runtime (Minutes)").alias("Runtime"),
    col("Rating"),
    col("Revenue (Millions)").alias("Revenue")
).distinct()
df_movies_nodes.show()

# 4. Create a DF for Genre nodes
df_genres = df_movies.withColumn("Genre_Name", explode(split(col("Genre"), ","))) \
                    .select("Genre_Name") \
                    .distinct()
df_genres.show()

#EDGES

# 1. (Director)-[:DIRECTED]->(Movie)
edges_director_movie = df_movies.select(
    col("Director").alias("Director_Name"),
    col("Title").alias("Movie_Title")
)
edges_director_movie.show()


# 2. (Actor)-[:ACTED_IN]->(Movie)
edges_actor_movie = df_movies.withColumn("Actor_Name", explode(split(col("Actors"), ","))) \
                             .select(col("Title").alias("Movie_Title"), "Actor_Name")
edges_actor_movie.show()

# 3. (Movie)-[:BELONGS_TO]->(Genre)
edges_movie_genre = df_movies.withColumn("Genre_Name", explode(split(col("Genre"), ","))) \
                             .select(col("Title").alias("Movie_Title"), "Genre_Name")
edges_movie_genre.show()


+--------------------+
|       Director_Name|
+--------------------+
|         Yimou Zhang|
|     Damien Chazelle|
| whom he believes...|
|          James Gray|
|       Morten Tyldum|
|         David Yates|
|      Theodore Melfi|
|      Gareth Edwards|
|        Ron Clements|
|     Nacho Vigalondo|
|        Chris Renaud|
|          Mel Gibson|
|     Paul Greengrass|
|         Garth Davis|
|    Denis Villeneuve|
|      Stephen Gaghan|
|    Kenneth Lonergan|
|           Ben Young|
|          Walt Dohrn|
|     Roland Emmerich|
+--------------------+
only showing top 20 rows


                                                                                

+-------------------+
|         Actor_Name|
+-------------------+
|          Amy Adams|
|      Chris Messina|
| Raoul Max Trujillo|
|   Daniel Day-Lewis|
|  Genesis Rodriguez|
|      Peter Riegert|
|      Nicholas Hope|
|         Osric Chau|
|     Craig Robinson|
|       Oliver Platt|
|        Mateo Arias|
|          Dax Flame|
|     Tadanobu Asano|
|       Michael Peña|
|         Emma Stone|
|          Anna Camp|
|    Rodrigo Santoro|
|       Rebecca Hall|
|      John Travolta|
|   Benicio Del Toro|
+-------------------+
only showing top 20 rows
+--------------------+--------------------+----+-------+------+-------+
|         Movie_Title|         Description|Year|Runtime|Rating|Revenue|
+--------------------+--------------------+----+-------+------+-------+
|Underworld: Blood...|Vampire death dea...|2016|     91|   5.8|  30.35|
|Kingsman: The Sec...|A spy organizatio...|2014|    129|   7.7| 128.25|
|              Trance|An art auctioneer...|2013|    101|   7.0|   2.32|
|The Hobbit: An

# Writing Data in Neo4j

In [5]:
NEO4J_URL = "bolt://neo4j-iteso:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j@1234"

# Nodes

# Actor nodes
print("Writing Actor nodes...")
df_actors.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Overwrite") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", "MERGE (a:Actor {name: event.Actor_Name})") \
    .save()

# Director nodes
print("Writing Director nodes...")
df_directors.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", "MERGE (d:Director {name: event.Director_Name})") \
    .save()

# Genre nodes
print("Writing Genre nodes...")
df_genres.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", "MERGE (g:Genre {name: event.Genre_Name})") \
    .save()

# D) Movie nodes
print("Writing Movie nodes...")
df_movies_nodes.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("query", """
        MERGE (m:Movie {title: event.Movie_Title})
        SET m.description = event.Description,
            m.year = event.Year,
            m.runtime = event.Runtime,
            m.rating = event.Rating,
            m.revenue = event.Revenue
    """) \
    .save()


# Relationships

# (Director)-[:DIRECTED]->(Movie) relationships
print("Writing DIRECTED relationships...")
edges_director_movie.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("batch.size", 1000) \
    .option("relationship", "DIRECTED") \
    .option("relationship.save.strategy", "keys") \
    .option("relationship.source.labels", ":Director") \
    .option("relationship.source.node.properties", "Director_Name:name") \
    .option("relationship.target.labels", ":Movie") \
    .option("relationship.target.node.properties", "Movie_Title:title") \
    .save()

# (Actor)-[:ACTED_IN]->(Movie)
print("Writing ACTED_IN relationships...")
edges_actor_movie.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("batch.size", 1000) \
    .option("relationship", "ACTED_IN") \
    .option("relationship.save.strategy", "keys") \
    .option("relationship.source.labels", ":Actor") \
    .option("relationship.source.node.properties", "Actor_Name:name") \
    .option("relationship.target.labels", ":Movie") \
    .option("relationship.target.node.properties", "Movie_Title:title") \
    .save()

# (Movie)-[:BELONGS_TO]->(Genre)
print("Writing BELONGS_TO relationships...")
edges_movie_genre.write \
    .format("org.neo4j.spark.DataSource") \
    .mode("Append") \
    .option("url", NEO4J_URL) \
    .option("authentication.basic.username", NEO4J_USER) \
    .option("authentication.basic.password", NEO4J_PASSWORD) \
    .option("batch.size", 1000) \
    .option("relationship", "BELONGS_TO") \
    .option("relationship.save.strategy", "keys") \
    .option("relationship.source.labels", ":Movie") \
    .option("relationship.source.node.properties", "Movie_Title:title") \
    .option("relationship.target.labels", ":Genre") \
    .option("relationship.target.node.properties", "Genre_Name:name") \
    .save()


Writing Actor nodes...


Py4JJavaError: An error occurred while calling o151.save.
: org.neo4j.driver.exceptions.ServiceUnavailableException: Unable to connect to neo4j-iteso:7687, ensure the database is running and that there is a working network connection to it.
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:111)
	at org.neo4j.driver.internal.InternalSession.run(InternalSession.java:62)
	at org.neo4j.driver.internal.InternalSession.run(InternalSession.java:47)
	at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:34)
	at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:39)
	at org.neo4j.caniuse.Neo4jDetector.detect(Neo4jDetector.kt:29)
	at org.neo4j.spark.DataSource.getNeo4jInfo(DataSource.scala:77)
	at org.neo4j.spark.DataSource.getTable(DataSource.scala:103)
	at org.apache.spark.sql.classic.DataFrameWriter.getTable$1(DataFrameWriter.scala:157)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:175)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:126)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)
	Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.databaseUnavailableError(ChannelConnectedListener.java:84)
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.operationComplete(ChannelConnectedListener.java:78)
		at org.neo4j.driver.internal.async.connection.ChannelConnectedListener.operationComplete(ChannelConnectedListener.java:37)
		at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
		at io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583)
		at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559)
		at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
		at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
		at io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629)
		at io.netty.util.concurrent.DefaultPromise.setFailure(DefaultPromise.java:110)
		at io.netty.channel.DefaultChannelPromise.setFailure(DefaultChannelPromise.java:89)
		at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:228)
		at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:47)
		at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:189)
		at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:175)
		at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
		at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:557)
		at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
		at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
		at io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:625)
		at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:105)
		at io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
		at io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:988)
		at io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:515)
		at io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:428)
		at io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:485)
		at io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
		at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
		at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
		at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
		at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
		at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
		at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
		... 1 more
Caused by: java.net.UnknownHostException: neo4j-iteso
	at java.base/java.net.InetAddress$CachedAddresses.get(InetAddress.java:801)
	at java.base/java.net.InetAddress.getAllByName0(InetAddress.java:1533)
	at java.base/java.net.InetAddress.getAllByName(InetAddress.java:1385)
	at java.base/java.net.InetAddress.getAllByName(InetAddress.java:1306)
	at org.neo4j.driver.internal.DefaultDomainNameResolver.resolve(DefaultDomainNameResolver.java:35)
	at org.neo4j.driver.internal.async.connection.NettyDomainNameResolver.doResolve(NettyDomainNameResolver.java:41)
	at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61)
	at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53)
	at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55)
	at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31)
	at io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106)
	at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:220)
	at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:47)
	at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:189)
	at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:175)
	at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
	at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:557)
	at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492)
	at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636)
	at io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:625)
	at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:105)
	at io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:988)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:515)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:428)
	at io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:485)
	at io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	... 1 more


# Read and Query Graphs with PySpark

In [None]:
# Add the code to read a data frame from Neo4J and run a simple query to verify 

In [None]:
sc.stop()