In [18]:
import os

import numpy as np
from pyspark.ml.feature import BucketedRandomProjectionLSH, VectorAssembler
from pyspark.ml.functions import array_to_vector
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, asc, lit, when, isnan, isnull, broadcast
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, ArrayType, FloatType

In [19]:
# Print the PySpark version installed in this kernel so it can be compared
# against the version running on the Spark side.
import pyspark
print(pyspark.__version__)  # VERSION MUST MATCH THE SPARK CONTAINER VERSION

3.5.3


In [20]:
# Connection settings shared by the cells below.
# The Neo4j URL embeds credentials, so it can be overridden through the
# NEO4J_URI environment variable instead of editing the notebook; the default
# is the value that was previously hardcoded here.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://neo4j:password@neo4j:7687")

# Name of the GDS graph projection that every GDS call in this notebook targets.
graph_name = "AnalysisGraph"

In [21]:
# Build (or reuse, via getOrCreate) the Spark session used by the cells below.
# Neo4j credentials are read from NEO4J_USERNAME / NEO4J_PASSWORD when those
# environment variables are set; the fallbacks are the values that were
# previously hardcoded, so an unconfigured environment behaves as before.
spark = (
    SparkSession.builder.appName("NodeSimilarityAnalysis")
    .master("spark://spark:7077")
    # Package coordinates of the Neo4j Spark connector to load with the session.
    .config("spark.jars.packages", "neo4j-contrib:neo4j-spark-connector:5.3.1-s_2.12")
    .config("neo4j.url", NEO4J_URI)
    .config(
        "neo4j.authentication.basic.username",
        os.environ.get("NEO4J_USERNAME", "neo4j"),
    )
    .config(
        "neo4j.authentication.basic.password",
        os.environ.get("NEO4J_PASSWORD", "password"),
    )
    .config("neo4j.database", "neo4j")
    # Adaptive query execution is explicitly switched off for this session.
    .config("spark.sql.adaptive.enabled", "false")
    .getOrCreate()
)

print("Spark session created successfully")
spark

Spark session created successfully


In [22]:
# Ask GDS whether a graph projection named `graph_name` is already present.
# Nothing is dropped here: the answer is only used further down to decide
# whether the projection has to be created.
graph_exists = (
    spark.read.format("org.neo4j.spark.DataSource")
    .options(
        **{
            "gds": "gds.graph.exists",
            "gds.graphName": graph_name,
        }
    )
    .load()
)

In [23]:
# Take the first result row and keep only its "exists" column value.
# NOTE(review): this rebinds `graph_exists` from the DataFrame built in the
# previous cell to a plain value, so re-running this cell on its own fails
# until that cell is executed again — consider a separate name for the
# DataFrame.
graph_exists = graph_exists.first()["exists"]
graph_exists

True

In [24]:
# Create the GDS projection only when the existence check reported that it is
# missing; an existing projection with the same name is reused as-is.
if not graph_exists:
    (
        spark.read.format("org.neo4j.spark.DataSource")
        .option("gds", "gds.graph.project")
        .option("gds.graphName", graph_name)
        .option("gds.nodeProjection", ["Paper", "Keyword"])  # Include both Paper and Keyword nodes
        .option(
            "gds.relationshipProjection",
            """
            {
            "KEYWORD": {"orientation": "UNDIRECTED"}
            }
            """,
        )
        .load()
        .show(truncate=False)
    )

In [26]:
# Jaccard node similarity, with Paper as both the source and target node filter.
similarity_stream = (
    spark.read.format("org.neo4j.spark.DataSource")
    .options(
        **{
            "gds": "gds.nodeSimilarity.stream",
            "gds.graphName": graph_name,
            "gds.sourceNodeFilter": "Paper",
            "gds.targetNodeFilter": "Paper",
            "gds.topK": "5",  # keep the 5 most similar papers for each paper
            "gds.similarityCutoff": "0.1",
            "gds.similarityMetric": "JACCARD",
        }
    )
    .load()
)
# Highest similarity first; print up to 50 rows without truncating values.
similarity_stream.orderBy(desc("similarity")).show(50, truncate=False)

+-----+-----+----------+
|node1|node2|similarity|
+-----+-----+----------+
|18776|18503|1.0       |
|18783|19039|1.0       |
|18737|18479|1.0       |
|18802|19158|1.0       |
|18501|18372|1.0       |
|18802|19155|1.0       |
|18503|18776|1.0       |
|18807|18533|1.0       |
|18533|18807|1.0       |
|18807|19164|1.0       |
|18533|19164|1.0       |
|18807|18192|1.0       |
|18533|18192|1.0       |
|18192|18807|1.0       |
|18826|19199|1.0       |
|18192|18533|1.0       |
|18826|19183|1.0       |
|18379|18953|1.0       |
|18855|18565|1.0       |
|18543|18206|1.0       |
|18565|18855|1.0       |
|19164|18192|1.0       |
|18950|18698|1.0       |
|18696|18954|1.0       |
|18698|18950|1.0       |
|18699|18953|1.0       |
|18699|18379|1.0       |
|18953|18699|1.0       |
|18770|19033|1.0       |
|18953|18379|1.0       |
|18727|18411|1.0       |
|18954|18696|1.0       |
|18206|18543|1.0       |
|18974|18713|1.0       |
|18379|18699|1.0       |
|19033|18770|1.0       |
|19183|19199|1.0       |


In [27]:
# Same node-similarity stream as before, but scored with the COSINE metric
# and without any source/target node filter.
print("Running Node Similarity with COSINE metric...")
cosine_similarity = (
    spark.read.format("org.neo4j.spark.DataSource")
    .options(
        **{
            "gds": "gds.nodeSimilarity.stream",
            "gds.graphName": graph_name,
            "gds.topK": "5",
            "gds.similarityCutoff": "0.1",
            "gds.similarityMetric": "COSINE",
        }
    )
    .load()
)
# Highest similarity first; print the top 10 rows without truncating values.
cosine_similarity.orderBy(col("similarity").desc()).show(10, truncate=False)

Running Node Similarity with COSINE metric...
+-----+-----+----------+
|node1|node2|similarity|
+-----+-----+----------+
|18372|18501|1.0       |
|18533|18807|1.0       |
|18533|19164|1.0       |
|18192|19164|1.0       |
|18411|18727|1.0       |
|18192|18807|1.0       |
|18479|18737|1.0       |
|18379|18953|1.0       |
|18501|18372|1.0       |
|18206|18543|1.0       |
+-----+-----+----------+
only showing top 10 rows



In [28]:
# Jaccard node similarity with a stricter cutoff (0.2) and lower/upper bounds
# on node degree.
print("Running Node Similarity with degree cutoffs...")
filtered_similarity = (
    spark.read.format("org.neo4j.spark.DataSource")
    .options(
        **{
            "gds": "gds.nodeSimilarity.stream",
            "gds.graphName": graph_name,
            "gds.topK": "5",
            "gds.similarityCutoff": "0.2",
            "gds.similarityMetric": "JACCARD",
            "gds.degreeCutoff": "2",  # only nodes with degree >= 2
            "gds.upperDegreeCutoff": "100",  # only nodes with degree <= 100
        }
    )
    .load()
)

print("Similarity with degree filtering (degree 2-100):")
# Highest similarity first; print the top 15 rows without truncating values.
filtered_similarity.orderBy(col("similarity").desc()).show(15, truncate=False)

Running Node Similarity with degree cutoffs...
Similarity with degree filtering (degree 2-100):
+-----+-----+----------+
|node1|node2|similarity|
+-----+-----+----------+
|18501|18372|1.0       |
|18699|18379|1.0       |
|18503|18776|1.0       |
|18192|19164|1.0       |
|18533|18807|1.0       |
|18206|18543|1.0       |
|18533|19164|1.0       |
|18358|18682|1.0       |
|18533|18192|1.0       |
|18379|18953|1.0       |
|18543|18206|1.0       |
|18565|18855|1.0       |
|18699|18953|1.0       |
|18479|18737|1.0       |
|18372|18501|1.0       |
+-----+-----+----------+
only showing top 15 rows



In [29]:
# Stop the Spark session now that the analysis is finished.
spark.stop()