# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Diego Orozco Alvarado

**Professor**: Pablo Camarillo Ramirez

# Dataset description

#### Lista de canciones y sus artistas
 https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres?select=artists-data.csv

 Relación: el enlace del artista (`ALink`) se conecta con el nombre de la canción (`SName`) mediante la relación «escribió» (`WROTE`).

# Data ingestion

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session; the Neo4j connector is requested
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master("local[*]")
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Only report errors in the driver log.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ab570660-8bdb-47c5-b545-bc4274ce2b08;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [2]:
# Print the notebook's current working directory (shell command).
!pwd

/opt/spark/work-dir


In [14]:
# Report the on-disk size of each input folder (paths relative to the working directory).
!du -sh data/songs_data/artists
!du -sh data/songs_data/lyrics
# Absolute root of the data folder. It already ends with "/", and the read
# calls append "/songs_data/...", so the resulting paths contain "//".
base_path = "/opt/spark/work-dir/data/"

224K	data/songs_data/artists


417M	data/songs_data/lyrics


In [49]:
# Import your module
from diego_orozco.spark_utils import SparkUtils

# Maximum number of rows kept from each source. Both reads share this cap so
# the sample size is changed in one place instead of two literals.
SAMPLE_ROWS = 1000

# Build schema
schema_artists = SparkUtils.generate_schema([
    ("Artist", "string"),
    ("Genres", "string"),
    ("Songs", "int"),
    ("Popularity", "double"),
    ("ALink", "string"),
])
# Artists: read with the explicit schema, cap the sample, then discard rows
# that have no ALink.
df_artists = (
    spark.read
    .schema(schema_artists)
    .option("header", True)
    .csv(base_path + "/songs_data/artists")
    .limit(SAMPLE_ROWS)
    .na.drop(subset=["ALink"])
)

schema_lyrics = SparkUtils.generate_schema([
    ("ALink", "string"),
    ("SName", "string"),
    ("SLink", "string"),
    ("Lyric", "string"),
    ("language", "string"),
])
# Lyrics: same pattern, plus CSV options for quoted, multi-line text fields;
# rows without a song name are discarded.
df_lyrics = (
    spark.read
    .schema(schema_lyrics)
    .option("header", True)
    .option("quote", '"')           # fields may be wrapped in double quotes
    .option("escape", '"')          # a doubled quote inside a field is a literal quote ("" -> ")
    .option("multiLine", True)      # quoted fields may contain line breaks
    .option("ignoreLeadingWhiteSpace", True)
    .option("ignoreTrailingWhiteSpace", True)
    .csv(base_path + "/songs_data/lyrics")
    .limit(SAMPLE_ROWS)
    .na.drop(subset=["SName"])
)

## NOTA: 
Agregué el límite porque la lectura del DataFrame de lyrics estaba tardando demasiado,
como se puede apreciar en la siguiente imagen:
![Captura de Pantalla 2025-10-04 a la(s) 10.18.25.png](<attachment:Captura de Pantalla 2025-10-04 a la(s) 10.18.25.png>)

# Transformations

In [43]:
# Add the code for your transformations to create nodes and edges DataFrames HERE
from pyspark.sql.functions import col

# --- NODES ---

# Artist nodes: the artist attributes, with one row kept per ALink.
artist_nodes = (
    df_artists
    .select("ALink", "Artist", "Genres", "Songs", "Popularity")
    .dropDuplicates(["ALink"])
)

In [44]:
# Preview the artist node rows.
artist_nodes.show()

+--------------------+--------------------+--------------------+-----+----------+
|               ALink|              Artist|              Genres|Songs|Popularity|
+--------------------+--------------------+--------------------+-----+----------+
|            /14-bis/|              14 Bis|      MPB; Pop; Folk|  121|       1.0|
|              /2ne1/|                2NE1|K-Pop/K-Rock; Pop...|  103|       0.0|
| /2pac-tupac-shakur/|        Tupac Shakur|Hip Hop; Rap; Bla...|  383|      10.3|
|              /3oh3/|               3OH!3|Electronica; Pop;...|   88|       0.0|
|        /5-elemento/|         5º Elemento|Black Music; Hip ...|   36|       0.0|
|           /50-cent/|             50 Cent|Hip Hop; Rap; Bla...|  469|      40.7|
|/a-banda-mais-bon...|A Banda Mais Boni...|    Indie; Folk; MPB|   50|       1.0|
|           /aaliyah/|             Aaliyah|    Black Music; R&B|   97|       0.0|
|              /abba/|                ABBA|   Disco; Pop; Dance|  166|      11.8|
|      /above-be

In [50]:

from pyspark.sql.functions import concat_ws, sha2, col

# Lyrics nodes: keep one row per song title, then attach a SongId computed as
# the SHA-256 of "SName:SLink".
# NOTE(review): duplicates are removed on SName alone, so songs that share a
# title are collapsed into a single node even when their SLink differs —
# confirm this is intended.
song_id = sha2(concat_ws(":", col("SName"), col("SLink")), 256)
lyrics_nodes = (
    df_lyrics
    .select("SName", "SLink", "Lyric", "language")
    .dropDuplicates(["SName"])
    .withColumn("SongId", song_id)
)


In [34]:
# Preview the lyrics rows as read from the CSV files.
df_lyrics.show()

+---------------+--------------------+--------------------+--------------------+--------+
|          ALink|               SName|               SLink|               Lyric|language|
+---------------+--------------------+--------------------+--------------------+--------+
|/ivete-sangalo/|               Arerê|/ivete-sangalo/ar...|Tudo o que eu que...|      pt|
|/ivete-sangalo/|Se Eu Não Te Amas...|/ivete-sangalo/se...|Meu coração\nSem ...|      pt|
|/ivete-sangalo/|         Céu da Boca|/ivete-sangalo/ch...|É de babaixá!\nÉ ...|      pt|
|/ivete-sangalo/|Quando A Chuva Pa...|/ivete-sangalo/qu...|Quando a chuva pa...|      pt|
|/ivete-sangalo/|        Sorte Grande|/ivete-sangalo/so...|A minha sorte gra...|      pt|
|/ivete-sangalo/|    A Lua Q Eu T Dei|/ivete-sangalo/a-...|Posso te falar do...|      pt|
|/ivete-sangalo/|Mulheres Não Têm ...|/ivete-sangalo/mu...|Hey, girl\nLevant...|      pt|
|/ivete-sangalo/|Eva / Alô Paixão ...|/ivete-sangalo/ev...|"EVA"\n(Giancarlo...|      pt|
|/ivete-sa

In [51]:
# Preview the lyrics node rows, including the derived SongId column.
lyrics_nodes.show()


[Stage 74:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------+--------------------+
|               SName|               SLink|               Lyric|language|              SongId|
+--------------------+--------------------+--------------------+--------+--------------------+
|           100% Você|/chiclete-com-ban...|Um céu sem estrel...|      pt|5b71a4b71bae235ab...|
|            40 Graus|/banda-eva/40-gra...|Eu acho que tô co...|      pt|298c393b046b8dd36...|
|      A Casa Amarela|/ivete-sangalo/a-...|Papai pintou\nA c...|      pt|8b56d24ed22813a2f...|
|       A Cor da Vida|/chiclete-com-ban...|Em cada lugar do ...|      pt|e8f04e7f608dbcd4a...|
|   A Dança do Ventre|/e-o-tchan/a-danc...|Neguinha maravilh...|      pt|a9b103c4add549b74...|
|  A Dona do Terreiro|/chiclete-com-ban...|A dona do terreir...|      pt|8bc94a3c333a799c5...|
|        A Fila Andou|/chiclete-com-ban...|Nananá Nananananá...|      pt|f4803dd72f175b757...|
|A Fórmula do Amor...|/claudia-leitte/a...|Eu tenh

                                                                                

In [53]:

# --- EDGES ---

# WROTE edges: one row per lyrics record, pairing a song title with an artist link.
# Mind the column naming: "src" carries the song title (SName) and "dst"
# carries the artist link (ALink).
wrote_edges = df_lyrics.select(
    df_lyrics["SName"].alias("src"),
    df_lyrics["ALink"].alias("dst"),
)


In [37]:
# Peek at the first five edge rows.
wrote_edges.head(5)

[Row(src='Arerê', dst='/ivete-sangalo/'),
 Row(src='Se Eu Não Te Amasse Tanto Assim', dst='/ivete-sangalo/'),
 Row(src='Céu da Boca', dst='/ivete-sangalo/'),
 Row(src='Quando A Chuva Passar', dst='/ivete-sangalo/'),
 Row(src='Sorte Grande', dst='/ivete-sangalo/')]

# Writing Data in Neo4j

In [23]:
# Add the code to write a graph from PySpark's DataFrames to Neo4j
import os

# Connection settings for Neo4j. Each value can be overridden through an
# environment variable so real credentials do not have to be stored in the
# notebook; the fallbacks are the values this lab was originally run with.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j-iteso:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_passwd = os.environ.get("NEO4J_PASSWORD", "neo4j@1234")

In [29]:

# Cypher that iterates over every node and DETACH DELETEs it in batches of
# 10000 (requires the APOC procedure apoc.periodic.iterate).
query = """
CALL apoc.periodic.iterate(
  'MATCH (n) RETURN n',
  'DETACH DELETE n',
  {batchSize: 10000}
)
"""

# NOTE(review): this write query is submitted through the connector's read
# API; confirm that load() actually executes it and the graph ends up empty.
(
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query", query)
    .load()
)

DataFrame[]

In [45]:


# Write the artist rows to Neo4j with the :Artist label, using ALink as the node key.
(
    artist_nodes.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("labels", ":Artist")
    .option("node.keys", "ALink")
    .save()
)

# The number printed is the DataFrame's row count, computed by a separate count().
print(f"{artist_nodes.count()} artist wrote in Neo4j")


                                                                                

1000 artist wrote in Neo4j


In [52]:

# Write the lyrics rows to Neo4j with the :Lyrics label, using SongId as the
# node key and a connector batch size of 10000.
(
    lyrics_nodes.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("labels", ":Lyrics")
    .option("node.keys", "SongId")
    .option("batch.size", "10000")
    .save()
)

# The number printed is the DataFrame's row count, computed by a separate count().
print(f"{lyrics_nodes.count()} lyrics nodes wrote in Neo4j")


                                                                                

978 lyrics nodes wrote in Neo4j


In [54]:

# Write the WROTE relationships between nodes that already exist (both ends
# use save mode "match").
#   source node: :Artist, matched on ALink taken from the "dst" column
#   target node: :Lyrics, matched on SName taken from the "src" column
# NOTE(review): the target is matched on SName, not on SongId (the key the
# Lyrics nodes are written with) — confirm title-based matching is intended.
(
    wrote_edges.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("relationship", "WROTE")
    .option("relationship.save.strategy", "keys")
    .option("relationship.source.labels", ":Artist")
    .option("relationship.source.save.mode", "match")
    .option("relationship.source.node.keys", "dst:ALink")
    .option("relationship.target.labels", ":Lyrics")
    .option("relationship.target.save.mode", "match")
    .option("relationship.target.node.keys", "src:SName")
    .save()
)

# The number printed is the row count of the edges DataFrame, computed by a
# separate count().
print(f"{wrote_edges.count()} wrote edges wrote in Neo4j")

                                                                                

1000 wrote edges wrote in Neo4j


# Read and Query Graphs with PySpark

In [61]:
# Read the graph back to verify it: every (:Artist)-[:WROTE]->(:Lyrics) pair,
# returned as a DataFrame.
verification_query = """
            MATCH (a:Artist)-[w:WROTE]->(l:Lyrics)
            RETURN a.Artist,l.SName, l.SLink , a.Popularity
            """

cypher_df = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query", verification_query)
    .load()
)

cypher_df.show()

+-------------+--------------------+--------------------+------------+
|     a.Artist|             l.SName|             l.SLink|a.Popularity|
+-------------+--------------------+--------------------+------------+
|Ivete Sangalo|    A Lua Q Eu T Dei|/ivete-sangalo/a-...|         4.4|
|    Banda Eva|               Arerê|/ivete-sangalo/ar...|         2.3|
|Ivete Sangalo|               Arerê|/ivete-sangalo/ar...|         4.4|
|    Banda Eva|         Carro Velho|/ivete-sangalo/ca...|         2.3|
|Ivete Sangalo|         Carro Velho|/ivete-sangalo/ca...|         4.4|
|Ivete Sangalo|         Céu da Boca|/ivete-sangalo/ch...|         4.4|
|Ivete Sangalo|Eva / Alô Paixão ...|/ivete-sangalo/ev...|         4.4|
|Ivete Sangalo|      Flor do Reggae|/ivete-sangalo/fl...|         4.4|
|Ivete Sangalo|Mulheres Não Têm ...|/ivete-sangalo/mu...|         4.4|
|Ivete Sangalo|Quando A Chuva Pa...|/ivete-sangalo/qu...|         4.4|
|Ivete Sangalo|Se Eu Não Te Amas...|/ivete-sangalo/se...|         4.4|
|Ivete

In [62]:
# Stop the SparkContext now that the notebook is finished.
sc.stop()