# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Renata Tejeda

**Professor**: Pablo Camarillo Ramirez

# Dataset description

## Nodes

- **Tournament**
    - Properties: name, year, surface, prize
- **Player**
    - Properties: name, handedness (R/L)

## Relationships

1. **(Player) –[:WON {winner_atp_ranking}]→ (Tournament)**
    - Source column: WINNER
    - Destination column: TOURNAMENT
    - Edge properties: WINNER ATP RANKING
2. **(Player) –[:LOST_TO {runner_up_atp_ranking, year}]→ (Player)**
    - Source column: RUNNER UP
    - Destination column: WINNER
    - Edge properties: RUNNER UP ATP RANKING, YEAR
3. **(Player) –[:WAS_RUNNER_UP]→ (Tournament)**
    - Source column: RUNNER UP
    - Destination column: TOURNAMENT

# Data ingestion

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://34c8a8d7a9e7:7077") \
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.neo4j#neo4j-connector-apache-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5e05db69-6123-4cc8-aac1-689cc917338f;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.13;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.13_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.

In [2]:
# Import your module
from codrenatat.spark_utils import SparkUtils

# Define schema
tennis_schema_columns = [
    ("YEAR", "int"),
    ("TOURNAMENT", "string"),
    ("WINNER", "string"),
    ("RUNNER_UP", "string"),
    ("WINNER_NATIONALITY", "string"),
    ("WINNER_ATP_RANKING", "int"),
    ("RUNNER-UP_ATP_RANKING", "int"),
    ("WINNER_LEFT_OR_RIGHT_HANDED", "string"),
    ("TOURNAMENT_SURFACE", "string"),
    ("WINNER_PRIZE", "long")
]

tennis_schema = SparkUtils.generate_schema(tennis_schema_columns)

# Read CSV 
base_path = "/opt/spark/work-dir/data/"
df_tennis = (
    spark.read
        .option("header", "true")
        .schema(tennis_schema)
        .csv(base_path + "tenis/")
)

df_tennis.printSchema()
df_tennis.show(5, truncate=False)

root
 |-- YEAR: integer (nullable = true)
 |-- TOURNAMENT: string (nullable = true)
 |-- WINNER: string (nullable = true)
 |-- RUNNER_UP: string (nullable = true)
 |-- WINNER_NATIONALITY: string (nullable = true)
 |-- WINNER_ATP_RANKING: integer (nullable = true)
 |-- RUNNER-UP_ATP_RANKING: integer (nullable = true)
 |-- WINNER_LEFT_OR_RIGHT_HANDED: string (nullable = true)
 |-- TOURNAMENT_SURFACE: string (nullable = true)
 |-- WINNER_PRIZE: long (nullable = true)

+----+---------------+--------------+------------------+------------------+------------------+---------------------+---------------------------+---------------------+------------+
|YEAR|TOURNAMENT     |WINNER        |RUNNER_UP         |WINNER_NATIONALITY|WINNER_ATP_RANKING|RUNNER-UP_ATP_RANKING|WINNER_LEFT_OR_RIGHT_HANDED|TOURNAMENT_SURFACE   |WINNER_PRIZE|
+----+---------------+--------------+------------------+------------------+------------------+---------------------+---------------------------+---------------------+----

                                                                                

# Transformations

In [4]:
# Add the code for your transformations to create nodes and edges DataFrames HERE
from pyspark.sql.functions import col, lit

# Nodes

# Players
player_nodes = (
    df_tennis
      .select(col("WINNER").alias("name"))
      .union(df_tennis.select(col("RUNNER_UP").alias("name")))
      .distinct()
      .withColumn("node_type", lit("Player"))
)

# Tournaments
tournament_nodes = (
    df_tennis
      .select(
          col("TOURNAMENT").alias("name"),
          col("YEAR").alias("year"),
          col("TOURNAMENT_SURFACE").alias("surface"),
          col("WINNER_PRIZE").alias("prize")
      )
      .distinct()
      .withColumn("node_type", lit("Tournament"))
)

# Edges

# WINNER -> TOURNAMENT (WON)
won_edges = (
    df_tennis
      .select(
          col("WINNER").alias("source"),
          col("TOURNAMENT").alias("destination"),
          lit("WON").alias("rel_type"),
          col("YEAR").alias("year")
      )
)

# RUNNER_UP -> TOURNAMENT (WAS_RUNNER_UP)
runnerup_edges = (
    df_tennis
      .select(
          col("RUNNER_UP").alias("source"),
          col("TOURNAMENT").alias("destination"),
          lit("WAS_RUNNER_UP").alias("rel_type"),
          col("YEAR").alias("year")
      )
)

# RUNNER UP -> WINNER (LOST_TO)
lost_to_edges = (
    df_tennis
      .select(
          col("RUNNER_UP").alias("source"),
          col("WINNER").alias("destination"),
          lit("LOST_TO").alias("rel_type"),
          col("YEAR").alias("year")
      )
)

edges_df = won_edges.unionByName(runnerup_edges).unionByName(lost_to_edges)

print("player nodes")
player_nodes.show(5, truncate=False)

print("tournament nodes")
tournament_nodes.show(5, truncate=False)

print("edges df")
edges_df.show(10, truncate=False)



player nodes
+-------------------+---------+
|name               |node_type|
+-------------------+---------+
|Juan Carlos Ferrero|Player   |
|Arthur Larsen      |Player   |
|Mark Edmondson     |Player   |
|Guillermo Vilas    |Player   |
|Andres Gimeno      |Player   |
+-------------------+---------+
only showing top 5 rows
tournament nodes
+---------------+----+-------------------+-------+----------+
|name           |year|surface            |prize  |node_type |
+---------------+----+-------------------+-------+----------+
|U.S. Open      |1995|DecoTurf - outdoors|575000 |Tournament|
|Australian Open|1973|Grass              |6750   |Tournament|
|Wimbledon      |2014|Grass / Outdoor    |1760000|Tournament|
|U.S. Open      |1998|DecoTurf - outdoors|700000 |Tournament|
|Australian Open|1984|Grass              |100000 |Tournament|
+---------------+----+-------------------+-------+----------+
only showing top 5 rows
edges df
+---------------+---------------+--------+----+
|source         |de

# Writing Data in Neo4j

In [5]:
# Add the code to write a graph from PySpark's DataFrames to Neo4j
neo4j_url = "bolt://neo4j-iteso:7687"
neo4j_user = "neo4j"
neo4j_passwd = "neo4j@1234"

# Player Nodes
player_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Player") \
  .option("node.keys", "name") \
  .save()

print(f"{player_nodes.count()} player nodes wrote in Neo4j")

# Tournament Nodes
tournament_nodes.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("labels", ":Tournament") \
  .option("node.keys", "name,year") \
  .save()

print(f"{tournament_nodes.count()} tournament nodes wrote in Neo4j")

# WON Relationships (Player → Tournament)
won_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "WON") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Player") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "source:name") \
  .option("relationship.target.labels", ":Tournament") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "destination:name,year:year") \
  .save()

print(f"{won_edges.count()} WON relationships wrote in Neo4j")

# WAS_RUNNER_UP Relationship (Player → Tournament)
runnerup_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "WAS_RUNNER_UP") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Player") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "source:name") \
  .option("relationship.target.labels", ":Tournament") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "destination:name,year:year") \
  .save()

print(f"{runnerup_edges.count()} WAS_RUNNER_UP relationships wrote in Neo4j")

# LOST_TO Relationship (Player → Player)
lost_to_edges.write \
  .format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", neo4j_url) \
  .option("authentication.basic.username", neo4j_user) \
  .option("authentication.basic.password", neo4j_passwd) \
  .option("relationship", "LOST_TO") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":Player") \
  .option("relationship.source.save.mode", "match") \
  .option("relationship.source.node.keys", "source:name") \
  .option("relationship.target.labels", ":Player") \
  .option("relationship.target.save.mode", "match") \
  .option("relationship.target.node.keys", "destination:name") \
  .save()

print(f"{lost_to_edges.count()} LOST_TO relationships wrote in Neo4j")


                                                                                

158 player nodes wrote in Neo4j
292 tournament nodes wrote in Neo4j
292 WON relationships wrote in Neo4j
292 WAS_RUNNER_UP relationships wrote in Neo4j
292 LOST_TO relationships wrote in Neo4j


# Read and Query Graphs with PySpark

In [6]:
# Add the code to read a data frame from Neo4J and run a simple query to verify 

# All tournaments won by Rafael Nadal
cypher_df = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", neo4j_url) \
    .option("authentication.basic.username", neo4j_user) \
    .option("authentication.basic.password", neo4j_passwd) \
    .option("query",
            """
            MATCH (p:Player {name:'Rafael Nadal'})-[:WON]->(t:Tournament)
            RETURN p.name AS player, t.name AS tournament, t.year AS year, t.surface AS surface
            ORDER BY t.year DESC
            """) \
    .load()

cypher_df.show(truncate=False)

# Players defeated by Novak Djokovic
djokovic_df = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", neo4j_url) \
    .option("authentication.basic.username", neo4j_user) \
    .option("authentication.basic.password", neo4j_passwd) \
    .option("query",
            """
            MATCH (r:Player)-[:LOST_TO]->(w:Player {name:'Novak Djokovic'})
            RETURN r.name AS runner_up, w.name AS winner
            """) \
    .load()

djokovic_df.show(truncate=False)

# Count of titles per player
titles_df = spark.read \
    .format("org.neo4j.spark.DataSource") \
    .option("url", neo4j_url) \
    .option("authentication.basic.username", neo4j_user) \
    .option("authentication.basic.password", neo4j_passwd) \
    .option("query",
            """
            MATCH (p:Player)-[:WON]->(t:Tournament)
            RETURN p.name AS player, COUNT(t) AS titles
            ORDER BY titles DESC
            """) \
    .load()

titles_df.show(truncate=False)


+------------+---------------+----+---------------------+
|player      |tournament     |year|surface              |
+------------+---------------+----+---------------------+
|Rafael Nadal|Australian Open|2022|Plexicushion Prestige|
|Rafael Nadal|French Open    |2022|Clay                 |
|Rafael Nadal|French Open    |2020|Clay                 |
|Rafael Nadal|French Open    |2019|Clay                 |
|Rafael Nadal|U.S. Open      |2019|DecoTurf - outdoors  |
|Rafael Nadal|French Open    |2018|Clay                 |
|Rafael Nadal|French Open    |2017|Clay                 |
|Rafael Nadal|U.S. Open      |2017|DecoTurf - outdoors  |
|Rafael Nadal|French Open    |2014|Clay                 |
|Rafael Nadal|U.S. Open      |2013|DecoTurf - outdoors  |
|Rafael Nadal|French Open    |2013|Clay                 |
|Rafael Nadal|French Open    |2012|Clay                 |
|Rafael Nadal|French Open    |2011|Clay                 |
|Rafael Nadal|Wimbledon      |2010|Grass / Outdoor      |
|Rafael Nadal|

In [None]:
sc.stop()