In [31]:
from pyspark.sql import SparkSession
from random import randint

In [32]:
import pyspark

print(pyspark.__version__)  # VERSION MUST MATCH THE SPARK CONTAINER VERSION

3.5.3


In [33]:
NEO4J_URI = "bolt://neo4j:password@neo4j:7687"
graph_name = "PeopleKnowledge"
pipeline_name = f"LinkPrediction-{randint(0, 10**9)}"

In [34]:
spark = (
    SparkSession.builder.appName("MLPipeline")
    .master("spark://spark:7077")
    .config("spark.jars.packages", "neo4j-contrib:neo4j-spark-connector:5.3.1-s_2.12")
    .config("neo4j.url", NEO4J_URI)
    .config("neo4j.authentication.basic.username", "neo4j")
    .config("neo4j.authentication.basic.password", "password")
    .config("neo4j.database", "neo4j")
    .getOrCreate()
)
spark

# Create the projection graph

In [35]:
# Drop Graph if exists
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"CALL gds.graph.drop('{graph_name}', false) YIELD graphName RETURN graphName",
    )
    .option("partitions", "1")
    .load()
)
# Create graph
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.graph.project")
    .option("gds.graphName", graph_name)
    .option("gds.nodeProjection", ["Person", "Paper"])
    .option(
        "gds.relationshipProjection",
        '{"AUTHOR": {"orientation": "UNDIRECTED"}}',
    )
    .load()
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+---------------+---------+-----------------+-------------+
|nodeProjection                                                                            |relationshipProjection                                                                                               |graphName      |nodeCount|relationshipCount|projectMillis|
+------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+---------------+---------+-----------------+-------------+
|{Person -> {"properties":{},"label":"Person"}, Paper -> {"properties":{},"label":"Paper"}}|{AUTHOR -> {"aggregation":"DEFAULT","orientation":"UNDIRECTED","type":"AUTHOR","properties":{},"index

In [36]:
# vol_rel = (
#     spark.read.format("org.neo4j.spark.DataSource")
#     .option("relationship", "EDITOR")
#     .option("relationship.source.labels", "Volume")
#     .option("relationship.target.labels", "Person")
#     .load()
#     .select(
#         col("`<source.id>`").alias("source_id"), col("`<target.id>`").alias("person_id")
#     )
# )
# vol_rel

In [37]:
# pap_rel = (
#     spark.read.format("org.neo4j.spark.DataSource")
#     .option("relationship", "AUTHOR")
#     .option("relationship.source.labels", "Paper")
#     .option("relationship.target.labels", "Person")
#     .load()
#     .select(
#         col("`<source.id>`").alias("source_id"), col("`<target.id>`").alias("person_id")
#     )
# )
# pap_rel

In [38]:
# # match connections between people that collaborated in papers and volumes
# df = vol_rel.union(pap_rel)

# df1 = df.alias("df1")
# df2 = df.alias("df2")

# df = df1.join(
#     df2,
#     (col("df1.source_id") == col("df2.source_id"))
#     & (col("df1.person_id") < col("df2.person_id")),
# ).select(col("df1.person_id").alias("p1"), col("df2.person_id").alias("p2"))


# df.show()

In [39]:
# (
#     spark.read.format("org.neo4j.spark.DataSource")
#     .option("gds", "gds.graph.project")
#     .option("gds.graphName", graph_name)
#     .option("gds.nodeProjection", ["Person", "Paper"])
#     .option(
#         "gds.relationshipProjection",
#         """
#         {
#         "AUTHOR": {"orientation": "UNDIRECTED"},
#         }
#         """,
#     )
#     .load()
#     .show(truncate=False)
# )

In [40]:
# spark.read.format("org.neo4j.spark.DataSource")
# .option("gds", "gds.graph.relationship.write")

## Creating a pipeline

In [41]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""CALL gds.beta.pipeline.drop('{pipeline_name}', false)
        YIELD pipelineName, pipelineType, creationTime
        RETURN pipelineName, pipelineType, creationTime""",
    )
    .option("partitions", "1")
    .load()
    .show(truncate=False)
)
try:
    (
        spark.read.format("org.neo4j.spark.DataSource")
        .option(
            "query",
            f"""CALL gds.beta.pipeline.linkPrediction.create('{pipeline_name}')
            YIELD name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace
            RETURN name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace""",
        )
        .option("partitions", "1")
        .load()
        .show(truncate=False)
    )
except:
    pass

+------------+------------+------------+
|pipelineName|pipelineType|creationTime|
+------------+------------+------------+
+------------+------------+------------+



### Adding node properties

In [43]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty(
  '{pipeline_name}',
  'fastRP',
  {{
    mutateProperty: 'embedding',
    embeddingDimension: 256,
    randomSeed: 42
  }}
)
YIELD name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace
RETURN name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace
""",
    )
    .option("partitions", "1")
    .load()
    .show(truncate=False)
)

Py4JJavaError: An error occurred while calling o254.load.
: org.neo4j.driver.exceptions.ClientException: Failed to invoke procedure `gds.beta.pipeline.linkPrediction.addNodeProperty`: Caused by: java.lang.IllegalArgumentException: The value of `mutateProperty` is expected to be unique, but embedding was already specified in the gds.fastRP.mutate procedure.
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:111)
	at org.neo4j.driver.internal.InternalResult.blockingGet(InternalResult.java:107)
	at org.neo4j.driver.internal.InternalResult.list(InternalResult.java:88)
	at org.neo4j.spark.service.SchemaService.retrieveSchema(SchemaService.scala:111)
	at org.neo4j.spark.service.SchemaService.structForQuery(SchemaService.scala:233)
	at org.neo4j.spark.service.SchemaService.struct(SchemaService.scala:341)
	at org.neo4j.spark.DataSource.$anonfun$inferSchema$1(DataSource.scala:29)
	at org.neo4j.spark.util.Neo4jUtil$.callSchemaService(Neo4jUtil.scala:173)
	at org.neo4j.spark.DataSource.inferSchema(DataSource.scala:29)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:90)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:140)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:210)
	at scala.Option.flatMap(Option.scala:271)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at jdk.internal.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
	Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
		at org.neo4j.driver.internal.util.ErrorUtil.newNeo4jError(ErrorUtil.java:76)
		at org.neo4j.driver.internal.async.inbound.InboundMessageDispatcher.handleFailureMessage(InboundMessageDispatcher.java:107)
		at org.neo4j.driver.internal.messaging.common.CommonMessageReader.unpackFailureMessage(CommonMessageReader.java:75)
		at org.neo4j.driver.internal.messaging.common.CommonMessageReader.read(CommonMessageReader.java:53)
		at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:81)
		at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:37)
		at org.neo4j.driver.internal.shaded.io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
		at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:346)
		at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:318)
		at org.neo4j.driver.internal.async.inbound.MessageDecoder.channelRead(MessageDecoder.java:42)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
		at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:346)
		at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:318)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
		at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1407)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440)
		at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
		at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:918)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:650)
		at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:994)
		at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
		at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
		... 1 more


In [44]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""
CALL gds.beta.pipeline.linkPrediction.addNodeProperty(
  pipelineName: "{pipeline_name}",
  procedureName: 'degree',
  configuration: {{
    mutateProperty: 'degree'
  }}
)
""",
    )
    .option("partitions", "1")
    # .option("gds", "gds.beta.pipeline.linkPrediction.addNodeProperty")
    # .option("gds.pipelineName", pipeline_name)
    # .option("gds.procedureName", "degree")
    # .option("gds.procedureConfiguration.mutateProperty", "degree")
    .load()
    # .show()
)
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""
        CALL gds.beta.pipeline.linkPrediction.addNodeProperty(
  pipelineName: "{pipeline_name}",
  procedureName: 'alpha.scaleProperties',
  configuration: {{
    nodeProperties: ['degree'],
    mutateProperty: 'scaledDegree',
    scaler: 'MinMax'
  }}
)
""",
    )
    .option("partitions", "1")
    # .option("gds", "gds.beta.pipeline.linkPrediction.addNodeProperty")
    # .option("gds.pipelineName", pipeline_name)
    # .option("gds.procedureName", "alpha.scaleProperties")
    # .option("gds.procedureConfiguration.nodeProperties", ["degree"])
    # .option("gds.procedureConfiguration.mutateProperty", "scaledDegree")
    # .option("gds.procedureConfiguration.scaler", "MinMax")
    .load()
    # .show()
)

DataFrame[]

### Adding link properties

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""
CALL gds.beta.pipeline.linkPrediction.addFeature(
  pipelineName: "{pipeline_name}",
  featureType: 'hadamard',
  configuration: {{
    nodeProperties: ['scaledDegree']
  }}
)
""",
    )
    .option("partitions", "1")
    # .option("gds", "gds.beta.pipeline.linkPrediction.addFeature")
    # .option("gds.pipelineName", pipeline_name)
    # .option("gds.featureType", "hadamard")
    # .option("gds.configuration.nodeProperties", ["scaledDegree"])
    .load()
    # .show()
)

DataFrame[]

### Configuring the relationship splits

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""CALL gds.beta.pipeline.linkPrediction.train(
  pipelineName: "{pipeline_name}",
  graphName: "{graph_name}",
  configuration: {{
    testFraction: 0.1,
    trainFraction: 0.1,
    validationFolds: 3
  }}
)
            """,
    )
    .option("partitions", "1")
    # .option("gds", "gds.beta.pipeline.linkPrediction.addFeature")
    # .option("gds.pipelineName", pipeline_name)
    # .option("gds.configuration.testFraction", 0.1)
    # .option("gds.configuration.trainFraction", 0.1)
    # .option("gds.configuration.validationFolds", 3)
    .load()
    # .show()
)

DataFrame[]

### Adding model candidates

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option(
        "query",
        f"""CALL gds.beta.pipeline.linkPrediction.addLogisticRegression(
  pipelineName: "{pipeline_name}"
)
            """,
    )
    .option("partitions", "1")
    # .option("gds", "gds.beta.pipeline.linkPrediction.addLogisticRegression")
    # .option("gds.pipelineName", pipeline_name)
    .load()
    # .show()
)

DataFrame[]

### Model training

In [None]:
# estimate the training cost
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.beta.pipeline.linkPrediction.train.estimate")
    .option("gds.graphName", graph_name)
    .option("gds.configuration.modelName", "collaborationPrediction")
    .option("gds.configuration.pipeline", pipeline_name)
    .load()
    # .show()
)

DataFrame[requiredMemory: string, treeView: string, mapView: map<string,string>, bytesMin: bigint, bytesMax: bigint, nodeCount: bigint, relationshipCount: bigint, heapPercentageMin: double, heapPercentageMax: double]

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.beta.pipeline.linkPrediction.train")
    .option("gds.graphName", graph_name)
    .option("gds.configuration.modelName", "collaborationPrediction")
    .option("gds.configuration.pipeline", pipeline_name)
    .option("gds.configuration.metrics", ["AUCPR"])
    .option("gds.configuration.randomSeed", 42)
    .load()
    # .show()
)

DataFrame[modelSelectionStats: map<string,string>, trainMillis: bigint, modelInfo: map<string,string>, configuration: map<string,string>]

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.beta.pipeline.linkPrediction.predict.mutate")
    .option("gds.graphName", graph_name)
    .option("gds.configuration.modelName", "collaborationPrediction")
    .option("gds.configuration.mutateRelationshipType", "AUTHOR_APPROX_PREDICTED")
    .option("gds.configuration.topN", 40)
    .option("gds.configuration.threshold", 0.45)
    .load()
    # .show()
)

DataFrame[relationshipsWritten: bigint, probabilityDistribution: map<string,string>, samplingStats: map<string,string>, mutateMillis: bigint, postProcessingMillis: bigint, preProcessingMillis: bigint, computeMillis: bigint, configuration: map<string,string>]

In [None]:
(
    spark.read.format("org.neo4j.spark.DataSource")
    .option("gds", "gds.beta.pipeline.linkPrediction.predict.stream")
    .option("gds.graphName", graph_name)
    .option("gds.configuration.modelName", "collaborationPrediction")
    .option("gds.configuration.topN", 10)
    .load()
    # .toPandas()
    # .show()
)

DataFrame[node1: bigint, node2: bigint, probability: double]

In [None]:
# spark.stop()