# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 06**: Data pipeline with Neo4j

**Date**: October 5th 2025

**Student Name**:Axel Ivan Gallardo Terriquez

**Professor**: Pablo Camarillo Ramirez

# Dataset description

IGN games from best to worst
Listado de Videojuegos ordenado por calificacion de IGN con Titulo del videojuego, Calificacion numérica, Calificación resumida, Plataforma, Género, y Fecha dividida en año, mes y día.
Relacion de Título al resto de columnas.

# Data ingestion

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")
spark.conf.set("spark.sql.shuffle.partitions", "5")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d205eec3-e6fb-402d-8f36-5e7fccf10a84;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [5]:
from axelgallardo.spark_utils import SparkUtils
from pyspark.sql.functions import col, trim, split, explode, array_distinct, when, regexp_replace, coalesce, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

csv_path = "/opt/spark/work-dir/data/lab06/"

ign_schema_columns = [
    ("title", "string"),
    ("score", "string"),
    ("score_phrase", "string"),
    ("platform", "string"),
    ("genre", "string"),
    ("release_year", "int"),
    ("release_month", "int"),
    ("release_day", "int"),
]
ign_schema = SparkUtils.generate_schema(ign_schema_columns)

df = (spark.read
      .option("header", "true")
      .schema(ign_schema)
      .csv(csv_path)
)

df = (df
      .withColumn("score", regexp_replace(col("score"), ",", "."))
      .withColumn("title", trim(col("title")))
      .withColumn("score_phrase", trim(col("score_phrase")))
      .withColumn("platform", trim(col("platform")))
      .withColumn("genre", trim(col("genre")))
)

df.show(5, truncate=False)
df.printSchema()

games_schema_columns = [
    ("title", "string"),
    ("score", "double"),
    ("score_phrase", "string"),
    ("release_year", "int"),
    ("release_month", "int"),
    ("release_day", "int"),
]
game_schema = SparkUtils.generate_schema(games_schema_columns)

games_df = (df.select(
                col("title").alias("title"),
                col("score").cast("double").alias("score"),
                col("score_phrase").alias("score_phrase"),
                col("release_year").cast("int").alias("release_year"),
                col("release_month").cast("int").alias("release_month"),
                col("release_day").cast("int").alias("release_day"),
                col("platform").alias("platform"),
                col("genre").alias("genre")
            )
            .where(col("title").isNotNull() & (col("title") != ""))
            .dropDuplicates(["title"])
           )

games_df.printSchema()
print(game_schema.simpleString())

platform_schema_columns = [("name", "string")]
platform_schema = SparkUtils.generate_schema(platform_schema_columns)

platform_df = (games_df
    .select(trim(col("platform")).alias("name"))
    .where(col("name").isNotNull() & (col("name") != ""))
    .dropDuplicates(["name"])
)

platform_df.printSchema()
print(platform_schema.simpleString())

genre_schema_columns = [("name", "string")]
genre_schema = SparkUtils.generate_schema(genre_schema_columns)

genres_exploded = (
    games_df
    .withColumn(
        "genre_arr",
        array_distinct(
            split(coalesce(col("genre"), lit("")), ",")
        )
    )
    .withColumn("genre_item", explode(col("genre_arr")))  # primero explota
    .withColumn("genre_item", trim(col("genre_item")))    # luego trim
    .where(col("genre_item") != "")
)

genre_df = (genres_exploded
    .select(col("genre_item").alias("name"))
    .dropDuplicates(["name"])
)

genre_df.printSchema()
print(genre_schema.simpleString())

year_schema_columns = [("value", "int")]
year_schema = SparkUtils.generate_schema(year_schema_columns)

year_df = (games_df
    .select(col("release_year").alias("value"))
    .where(col("value").isNotNull())
    .dropDuplicates(["value"])
)

year_df.printSchema()
print(year_schema.simpleString())

games_df.show(5, truncate=False)
platform_df.show(5, truncate=False)
genre_df.show(5, truncate=False)
year_df.show(5, truncate=False)


+-------------------+-----+------------+--------------+-----------------+------------+-------------+-----------+
|title              |score|score_phrase|platform      |genre            |release_year|release_month|release_day|
+-------------------+-----+------------+--------------+-----------------+------------+-------------+-----------+
|Checkered Flag     |10.0 |Masterpiece |Lynx          |Racing           |1999        |7            |6          |
|Chrono Trigger     |10.0 |Masterpiece |Wii           |Action, RPG      |2011        |5            |25         |
|Dragon Warrior III |10.0 |Masterpiece |Game Boy Color|RPG              |2001        |7            |20         |
|Grand Theft Auto IV|10.0 |Masterpiece |Xbox 360      |Action, Adventure|2008        |4            |25         |
|Grand Theft Auto IV|10.0 |Masterpiece |PlayStation 3 |Action, Adventure|2008        |4            |25         |
+-------------------+-----+------------+--------------+-----------------+------------+----------

                                                                                

+---------------------------------------------------------------+-----+------------+------------+-------------+-----------+-------------+------+
|title                                                          |score|score_phrase|release_year|release_month|release_day|platform     |genre |
+---------------------------------------------------------------+-----+------------+------------+-------------+-----------+-------------+------+
|"Ivan ""Ironman"" Stewart's Super Off Road"                    |7.1  |Good        |2005        |6            |20         |Wireless     |Racing|
|"SnoCross 2 Featuring ""Air"" Blair Morgan"                    |4.2  |Bad         |2007        |2            |7          |PlayStation 2|Racing|
|"Stubbs the Zombie in ""Rebel without a Pulse"""               |8.1  |Great       |2005        |10           |24         |Xbox         |Action|
|"World Championship Poker: Featuring Howard Lederer ""All In"""|6.6  |Okay        |2006        |9            |11         |PlaySta

                                                                                

+--------------+
|name          |
+--------------+
|Xbox One      |
|PlayStation 3 |
|PlayStation   |
|Nintendo 64   |
|Game Boy Color|
+--------------+
only showing top 5 rows
+-------+
|name   |
+-------+
|Racing |
|Pinball|
|Board  |
|Hunting|
|Music  |
+-------+
only showing top 5 rows
+-----+
|value|
+-----+
|2008 |
|2001 |
|2012 |
|2006 |
|1998 |
+-----+
only showing top 5 rows


# Transformations

In [6]:
from pyspark.sql.functions import concat_ws, coalesce, lit, trim, split, explode, array_distinct

games_df = games_df.withColumn(
    "game_key",
    concat_ws("||", col("title"), col("platform"), col("release_year"))
)

# ====== NODES ======
# Game nodes
game_nodes = games_df.select(
    col("game_key").alias("id"),
    col("title"),
    col("score"),
    col("score_phrase"),
    col("release_year"),
    col("release_month"),
    col("release_day")
).dropDuplicates(["id"])

# Platform nodes
platform_nodes = platform_df.select(
    col("name").alias("id")
).dropDuplicates(["id"])

# Genre nodes
genre_nodes = genre_df.select(
    col("name").alias("id")
).dropDuplicates(["id"])

# Year nodes
year_nodes = year_df.select(
    col("value").alias("id")
).dropDuplicates(["id"])

game_nodes.show(5, truncate=False)
platform_nodes.show(5, truncate=False)
genre_nodes.show(5, truncate=False)
year_nodes.show(5, truncate=False)


# ====== EDGES ======
# Game -> Platform
game_platform_edges = games_df.select(
    col("game_key").alias("src"),
    col("platform").alias("dst")
).where(col("dst").isNotNull() & (col("dst") != "")).dropDuplicates(["src", "dst"])

# Game -> Genre  (separa géneros del string)
genres_exploded_edges = (
    games_df
    .withColumn("genre_arr", array_distinct(split(coalesce(col("genre"), lit("")), ",")))
    .withColumn("genre_item", explode(col("genre_arr")))
    .withColumn("genre_item", trim(col("genre_item")))
    .where(col("genre_item") != "")
)

game_genre_edges = genres_exploded_edges.select(
    col("game_key").alias("src"),
    col("genre_item").alias("dst")
).dropDuplicates(["src", "dst"])

# Game -> Year
game_year_edges = games_df.select(
    col("game_key").alias("src"),
    col("release_year").alias("dst")
).where(col("dst").isNotNull()).dropDuplicates(["src", "dst"])

game_platform_edges.show(5, truncate=False)
game_genre_edges.show(5, truncate=False)
game_year_edges.show(5, truncate=False)


                                                                                

+------------------------------------------------------------------------------------+---------------------------------------------------------------+-----+------------+------------+-------------+-----------+
|id                                                                                  |title                                                          |score|score_phrase|release_year|release_month|release_day|
+------------------------------------------------------------------------------------+---------------------------------------------------------------+-----+------------+------------+-------------+-----------+
|"Ivan ""Ironman"" Stewart's Super Off Road"||Wireless||2005                         |"Ivan ""Ironman"" Stewart's Super Off Road"                    |7.1  |Good        |2005        |6            |20         |
|"SnoCross 2 Featuring ""Air"" Blair Morgan"||PlayStation 2||2007                    |"SnoCross 2 Featuring ""Air"" Blair Morgan"                    |4.2  |Bad     

                                                                                

+------------------------------------------------+-----------+
|src                                             |dst        |
+------------------------------------------------+-----------+
|.hack//INFECTION (Part 1)||PlayStation 2||2003  |Action     |
|101 Shark Pets||Nintendo DSi||2010              |Simulation |
|101-in-1 Explosive Megamix||Wii||2011           |Compilation|
|18-Wheeler American Pro Trucker||Dreamcast||2001|Racing     |
|187 Ride or Die||PlayStation 2||2005            |Action     |
+------------------------------------------------+-----------+
only showing top 5 rows
+---------------------------------------------+----+
|src                                          |dst |
+---------------------------------------------+----+
|.deTuned||PlayStation 3||2009                |2009|
|.hack//OUTBREAK (Part 3)||PlayStation 2||2003|2003|
|007 Racing||PlayStation||2000                |2000|
|10-Pin Bowling||Game Boy Color||1999         |1999|
|1001 BlockBusters||Nintendo DSi||2011

# Writing Data in Neo4j

In [16]:
neo4j_url = "bolt://neo4j-iteso:7687"
neo4j_user = "neo4j"
neo4j_passwd = "neo4j@1234"


# --- escribe NODES ---
db_name = "neo4j"  # ajusta si usas otra DB

game_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Game") \
  .option("node.keys", "id") \
  .save()

platform_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Platform") \
  .option("node.keys", "id") \
  .save()

genre_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Genre") \
  .option("node.keys", "id") \
  .save()

year_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Year") \
  .option("node.keys", "id") \
  .save()



# Game -[:RELEASED_ON]-> Platform
game_platform_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
    .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "RELEASED_ON") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Game") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Platform") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()
print(f"{game_platform_edges.count()} RELEASED_ON edges wrote in Neo4j")

# Game -[:HAS_GENRE]-> Genre
game_genre_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
    .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "HAS_GENRE") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Game") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Genre") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()
print(f"{game_genre_edges.count()} HAS_GENRE edges wrote in Neo4j")

# Game -[:RELEASED_IN]-> Year
game_year_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
    .option("database", db_name) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "RELEASED_IN") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Game") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "src:id") \
  .option("relationship.target.labels", ":Year") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "dst:id") \
  .save()
print(f"{game_year_edges.count()} RELEASED_IN edges wrote in Neo4j")


                                                                                

12589 RELEASED_ON edges wrote in Neo4j


                                                                                

14217 HAS_GENRE edges wrote in Neo4j


                                                                                

12589 RELEASED_IN edges wrote in Neo4j


# Read and Query Graphs with PySpark

In [23]:
cypher_df = spark.read.format("org.neo4j.spark.DataSource") \
  .option("url", neo4j_url).option("database", "neo4j") \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("query",
        """
        MATCH (g:Game)-[:RELEASED_ON]->(p:Platform)
        RETURN g.title                    AS title,
               toInteger(g.release_year)  AS release_year,
               coalesce(p.id, p.name)     AS platform_id,
               toFloat(g.score)           AS score,
               g.score_phrase             AS score_phrase
        ORDER BY score DESC, release_year DESC, title ASC
        """) \
  .load()

cypher_df.limit(10).show(truncate=False)


+------------------------------------+------------+-------------+-----+------------+
|title                               |release_year|platform_id  |score|score_phrase|
+------------------------------------+------------+-------------+-----+------------+
|Inside                              |2016        |PlayStation 4|10.0 |Masterpiece |
|The Witness                         |2016        |PlayStation 4|10.0 |Masterpiece |
|Undertale                           |2016        |PC           |10.0 |Masterpiece |
|Metal Gear Solid V: The Phantom Pain|2015        |Xbox One     |10.0 |Masterpiece |
|The Last of Us: Remastered          |2014        |PlayStation 4|10.0 |Masterpiece |
|Grand Theft Auto V                  |2013        |PlayStation 3|10.0 |Masterpiece |
|The Last of Us                      |2013        |PlayStation 3|10.0 |Masterpiece |
|Chrono Trigger                      |2011        |Wii          |10.0 |Masterpiece |
|Infinity Blade II                   |2011        |iPhone       |

In [25]:
cypher_df = spark.read.format("org.neo4j.spark.DataSource") \
  .option("url", neo4j_url).option("database", "neo4j") \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("query",
        """
        MATCH (g:Game)-[:RELEASED_ON]->(p:Platform)
        RETURN g.title                    AS title,
               toInteger(g.release_year)  AS release_year,
               coalesce(p.id, p.name)     AS platform_id,
               toFloat(g.score)           AS score,
               g.score_phrase             AS score_phrase
        ORDER BY score ASC, release_year DESC, title ASC
        """) \
  .load()

cypher_df.limit(10).show(truncate=False)


+---------------------------------------+------------+------------+-----+------------+
|title                                  |release_year|platform_id |score|score_phrase|
+---------------------------------------+------------+------------+-----+------------+
|Looney Tunes: Back in Action: Zany Race|2003        |Wireless    |0.5  |Disaster    |
|Extreme PaintBrawl                     |1998        |PC          |0.7  |Disaster    |
|Action Girlz Racing                    |2009        |Wii         |0.8  |Disaster    |
|Self-Defense Training Camp             |2011        |Xbox 360    |1.0  |Unbearable  |
|Step Up                                |2011        |Wii         |1.0  |Unbearable  |
|Hints Hunter                           |2010        |Nintendo DSi|1.0  |Unbearable  |
|Aquarium                               |2008        |Nintendo DS |1.0  |Unbearable  |
|Fantasy Aquarium                       |2008        |Nintendo DS |1.0  |Unbearable  |
|Kidz Sports: Basketball                |20

In [None]:
sc.stop()