In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import collect_list, size
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pyspark

print(pyspark.__version__)  # VERSION MUST MATCH THE SPARK CONTAINER VERSION

3.5.3


In [3]:
NEO4J_URI = "bolt://neo4j:password@neo4j:7687"
graph_name = "PeopleKnowledge"
pipeline_name = "LinkPrediction"

In [4]:
spark = (
    SparkSession.builder.appName("MLPipeline")
    .master("spark://spark:7077")
    .config("spark.jars.packages", "neo4j-contrib:neo4j-spark-connector:5.3.1-s_2.12")
    .config("neo4j.url", NEO4J_URI)
    .config("neo4j.authentication.basic.username", "neo4j")
    .config("neo4j.authentication.basic.password", "password")
    .config("neo4j.database", "neo4j")
    .getOrCreate()
)
spark

# Create the projection graph

In [5]:
# Drop Graph if exists
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"CALL gds.graph.drop('{graph_name}', false) YIELD graphName RETURN graphName",
    )
    .option("partitions", "1")
    .load()
    .show()
)

+---------+
|graphName|
+---------+
+---------+



In [6]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.graph.project")
    .option("gds.graphName", graph_name)
    .option("gds.nodeProjection", ["Person", "Paper", "Volume"])
    .option(
        "gds.relationshipProjection",
        """
        {
        "AUTHOR": {"orientation": "UNDIRECTED"},
        "EDITOR": {"orientation": "UNDIRECTED"},
        "CONTAINS": {"orientation": "UNDIRECTED"}
        }
        """,
    )
    .load()
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------+-----------------+-------------+
|nodeProjection                                                                                                                          |relationshipProjection                                                                                                                                                                                                                                                                                               

In [31]:
vol_rel = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("relationship", "EDITOR")
    .option("relationship.source.labels", "Volume")
    .option("relationship.target.labels", "Person")
    .load()
    .select(
        col("`<source.id>`").alias("source_id"), col("`<target.id>`").alias("person_id")
    )
)
vol_rel

DataFrame[source_id: bigint, person_id: bigint]

In [32]:
pap_rel = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("relationship", "AUTHOR")
    .option("relationship.source.labels", "Paper")
    .option("relationship.target.labels", "Person")
    .load()
    .select(
        col("`<source.id>`").alias("source_id"), col("`<target.id>`").alias("person_id")
    )
)
pap_rel

DataFrame[source_id: bigint, person_id: bigint]

In [42]:
df = vol_rel.union(pap_rel)

df1 = df.alias("df1")
df2 = df.alias("df2")

df = df1.join(
    df2,
    (col("df1.source_id") == col("df2.source_id"))
    & (col("df1.person_id") < col("df2.person_id")),
).select(col("df1.person_id").alias("p1"), col("df2.person_id").alias("p2"))


df.show()

+----+----+
|  p1|  p2|
+----+----+
|5377|5378|
|5376|5378|
|5381|5384|
|5377|5384|
|5380|5384|
|5383|5384|
|5376|5384|
|5382|5384|
|5379|5384|
|5378|5384|
|5377|5379|
|5376|5379|
|5378|5379|
|5381|5382|
|5377|5382|
|5380|5382|
|5376|5382|
|5379|5382|
|5378|5382|
|5381|5385|
+----+----+
only showing top 20 rows



In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.graph.project")
    .option("gds.graphName", graph_name)
    .option("gds.nodeProjection", ["Person", "Paper", "Volume"])
    .option(
        "gds.relationshipProjection",
        """
        {
        "AUTHOR": {"orientation": "UNDIRECTED"},
        "EDITOR": {"orientation": "UNDIRECTED"},
        "CONTAINS": {"orientation": "UNDIRECTED"}
        }
        """,
    )
    .load()
    .show(truncate=False)
)

In [None]:
spark.read.format("org.neo4j.spark.DataSource")
.option("gds", "gds.graph.relationship.write")


# Creating a pipeline

In [None]:
# Drop Graph if exists
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("query", f"CALL gds.beta.pipeline.linkPrediction.create('{pipeline_name}')")
    .load()
    .show()
)

IllegalArgumentException: Query not compiled for the following exception: ClientException: Query cannot conclude with CALL together with YIELD (line 2, column 1 (offset: 32))
"CALL gds.beta.pipeline.linkPrediction.create('LinkPrediction') YIELD name"
 ^

In [None]:
f"CALL gds.beta.pipeline.linkPrediction.create('{pipeline_name}')"

"CALL gds.beta.pipeline.linkPrediction.create('LinkPrediction')"