# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 05**: Data pipeline with Neo4j

**Date**: October 2nd 2025

**Student Name**: Ivan Estrella

**Professor**: Pablo Camarillo Ramirez

# Dataset description

This dataset contains information about a Stack Overflow technology tag network.
The nodes are technology tags (such as Python, JavaScript, or React) that appear in developer stories, and the edges represent co-occurrences of these tags.

Link for dataset: https://www.kaggle.com/datasets/stackoverflow/stack-overflow-tag-network?select=stack_network_links.csv

Necessary files: 2 CSV files (links and nodes)

# Data ingestion

In [23]:
# Locate the local Spark installation before importing anything from pyspark.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session, pulling in the Neo4j connector package.
# NOTE(review): the master host looks like a container ID and may need to be
# updated if the cluster container is recreated — confirm.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master("spark://e3b502141eaf:7077")
    .config(
        "spark.jars.packages",
        "org.neo4j:neo4j-connector-apache-spark_2.13:5.3.10_for_spark_3",
    )
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Use 5 shuffle partitions instead of Spark's default.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [24]:
# Build the schemas and load both CSV inputs (nodes and links)
from IvanE.SparkUtilsIvan import SparkUtils
from pyspark.sql.functions import regexp_replace, split, col

# --- Nodes: name, group and nodesize columns ---
nodes_schema = SparkUtils.generate_schema(
    [("name", "string"), ("group", "int"), ("nodesize", "float")]
)
df_nodes = spark.read.csv(
    "/opt/spark/work-dir/data/lab06neo/nodes",
    schema=nodes_schema,
    header=True,
)
df_nodes.show(5, truncate=False)

# --- Links: source, target and value columns ---
links_schema = SparkUtils.generate_schema(
    [("source", "string"), ("target", "string"), ("value", "float")]
)
df_links = spark.read.csv(
    "/opt/spark/work-dir/data/lab06neo/links",
    schema=links_schema,
    header=True,
)
df_links.show(5, truncate=False)


+---------+-----+--------+
|name     |group|nodesize|
+---------+-----+--------+
|html     |6    |272.45  |
|css      |6    |341.17  |
|hibernate|8    |29.83   |
|spring   |8    |52.84   |
|ruby     |3    |70.14   |
+---------+-----+--------+
only showing top 5 rows
+----------------+------+---------+
|source          |target|value    |
+----------------+------+---------+
|azure           |.net  |20.933193|
|sql-server      |.net  |32.322525|
|asp.net         |.net  |48.40703 |
|entity-framework|.net  |24.370903|
|wpf             |.net  |32.350925|
+----------------+------+---------+
only showing top 5 rows


# Transformations

In [25]:
# Transformations that turn the raw DataFrames into graph nodes and edges

from pyspark.sql.functions import col

# Nodes: rename name -> id and nodesize -> size, keeping one row per id.
node_columns = [
    col("name").alias("id"),
    col("group"),
    col("nodesize").alias("size"),
]
tech_nodes = df_nodes.select(*node_columns).dropDuplicates(["id"])

tech_nodes.show(5, truncate=False)

# Edges: rename source/target/value to src/dst/weight and drop fully
# duplicated rows.
edge_columns = [
    col("source").alias("src"),
    col("target").alias("dst"),
    col("value").alias("weight"),
]
tech_edges = df_links.select(*edge_columns).dropDuplicates()

tech_edges.show(5, truncate=False)


+------+-----+------+
|id    |group|size  |
+------+-----+------+
|spring|8    |52.84 |
|jquery|6    |208.29|
|mysql |6    |165.43|
|.net  |2    |75.08 |
|less  |6    |9.73  |
+------+-----+------+
only showing top 5 rows
+-----------------+--------------+---------+
|src              |dst           |weight   |
+-----------------+--------------+---------+
|android          |android-studio|33.661083|
|typescript       |angular       |31.036482|
|typescript       |angular2      |38.879982|
|twitter-bootstrap|angularjs     |24.153687|
|express          |angularjs     |24.433828|
+-----------------+--------------+---------+
only showing top 5 rows


# Writing Data in Neo4j

In [28]:
# Write the graph held in PySpark's DataFrames to Neo4j
import os

# Connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be committed with the
# notebook; the fallbacks are the values this lab originally hard-coded.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j-iteso:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_passwd = os.environ.get("NEO4J_PASSWORD", "neo4j@1234")

# Options shared by every write that goes through the Neo4j connector.
neo4j_write_options = {
    "url": neo4j_url,
    "authentication.basic.username": neo4j_user,
    "authentication.basic.password": neo4j_passwd,
}

# Nodes: written with the :Tech label, using `id` as the node key.
(
    tech_nodes.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .options(**neo4j_write_options)
    .option("labels", ":Tech")
    .option("node.keys", "id")
    .save()
)

print(f"{tech_nodes.count()} nodes written to Neo4j")

# Edges: CO_OCCURS relationships between :Tech nodes. "src:id" / "dst:id"
# map the DataFrame columns `src` and `dst` to the `id` key of the source
# and target nodes respectively.
(
    tech_edges.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .options(**neo4j_write_options)
    .option("relationship", "CO_OCCURS")
    .option("relationship.save.strategy", "keys")
    .option("relationship.source.labels", ":Tech")
    .option("relationship.source.node.keys", "src:id")
    .option("relationship.target.labels", ":Tech")
    .option("relationship.target.node.keys", "dst:id")
    .save()
)

print(f"{tech_edges.count()} edges written to Neo4j")



115 users wrote in Neo4j
490 edges written to Neo4j


# Read and Query Graphs with PySpark

In [38]:
# Cypher query: every CO_OCCURS relationship that starts at the 'azure' tag,
# returned as (source_tag, target_tag, cooccurrence) rows where cooccurrence
# is the relationship's weight.
azure_cooccurrence_query = """
            MATCH (t1:Tech)-[r:CO_OCCURS]->(t2:Tech)
            WHERE t1.id = 'azure'
            WITH t1, t2, r.weight AS cooccurrence
            RETURN t1.id AS source_tag, t2.id AS target_tag, cooccurrence
            """

# Run the query through the Neo4j connector and load the result as a DataFrame.
cypher_df = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_passwd)
    .option("query", azure_cooccurrence_query)
    .load()
)

cypher_df.show(truncate=False)


+----------+-------------------+------------------+
|source_tag|target_tag         |cooccurrence      |
+----------+-------------------+------------------+
|azure     |amazon-web-services|21.30994987487793 |
|azure     |asp.net-web-api    |21.585695266723633|
|azure     |.net               |20.93319320678711 |
|azure     |c#                 |22.144487380981445|
|azure     |asp.net            |23.76407241821289 |
+----------+-------------------+------------------+



In [None]:
# Stop the SparkContext taken from the session (`sc = spark.sparkContext`)
# now that the lab is finished.
sc.stop()