In [1]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
# Loosen pandas' display limits so the wide, dict-heavy result rows returned
# by the GDS client are shown in full instead of being truncated.
for option, value in (
    ('display.width', 0),
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('max_colwidth', 1000),
):
    pd.set_option(option, value)

In [2]:
# Single seed shared by the stochastic steps below: it is passed as
# `randomSeed` to both the FastRP embedding and the pipeline training call.
RANDOM_SEED = 7474

In [3]:
# Connection settings for the Neo4j instance that hosts GDS.
# Each value can be overridden through an environment variable so that real
# credentials never need to be written into (or committed with) the notebook.
# The fallbacks are the original local-development values, so the notebook
# behaves exactly as before when none of the variables are set.
HOST = os.environ.get('NEO4J_URI', 'neo4j://localhost')
USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
PASSWORD = os.environ.get('NEO4J_PASSWORD', 'password')

# Client handle used by every later cell to call GDS procedures.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

In [4]:
# The graph catalog lives on the database server and outlives the notebook
# kernel. Drop a projection left over from an earlier run so this cell can be
# re-executed (Restart & Run All) without failing on the already-used name.
if gds.graph.exists('proj')['exists']:
    gds.graph.drop(gds.graph.get('proj'))

# Project User and Website nodes together with the three relationship types,
# each treated as UNDIRECTED, into the in-memory graph 'proj'.
# The summary row is bound to a real name rather than `_`: assigning to `_`
# stops IPython from maintaining its last-output variable.
g, project_result = gds.graph.project('proj', ['User', 'Website'], {
    'SAME_AS': {'orientation': 'UNDIRECTED'},
    'CHILD_OF': {'orientation': 'UNDIRECTED'},
    'VISITED': {'orientation': 'UNDIRECTED'}
})
project_result

nodeProjection                                                                                                                                                                                                                                         {'User': {'label': 'User', 'properties': {}}, 'Website': {'label': 'Website', 'properties': {}}}
relationshipProjection    {'SAME_AS': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'SAME_AS', 'properties': {}}, 'VISITED': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'VISITED', 'properties': {}}, 'CHILD_OF': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'CHILD_OF', 'properties': {}}}
graphName                                                                                                                                                                                                                                                                                                               

In [5]:
# FastRP settings. The embedding is written back into the projection under the
# node property 'embedding'. Only CHILD_OF and VISITED relationships are used;
# SAME_AS (the relationship type the pipeline is trained on further down) is
# left out -- presumably to keep the prediction target out of the features.
fastrp_settings = {
    'mutateProperty': 'embedding',
    'relationshipTypes': ['CHILD_OF', 'VISITED'],
    'iterationWeights': [0.0, 1.0, 0.7, 0.5, 0.8],
    'embeddingDimension': 128,
    'randomSeed': RANDOM_SEED,
}
gds.fastRP.mutate(g, **fastrp_settings)

nodePropertiesWritten                                                                                                                                                                                                                                                                                                                                                                                         2433861
mutateMillis                                                                                                                                                                                                                                                                                                                                                                                                        1
nodeCount                                                                                                                                                                                   

In [6]:
# The pipeline catalog is server-side state: remove a pipeline named 'pipe'
# left behind by a previous run so that this cell can be re-executed cleanly.
if gds.beta.pipeline.exists('pipe')['exists']:
    gds.beta.pipeline.drop(gds.pipeline.get('pipe'))

# Create an empty link-prediction training pipeline and display its initial
# configuration. The info row gets a real name instead of `_`, because
# assigning to `_` stops IPython from maintaining its last-output variable.
pipe, pipe_info = gds.beta.pipeline.linkPrediction.create("pipe")
pipe_info

name                                                                                                            pipe
nodePropertySteps                                                                                                 []
featureSteps                                                                                                      []
splitConfig          {'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}
parameterSpace                                                        {'RandomForest': [], 'LogisticRegression': []}
Name: 0, dtype: object

In [7]:
# Register an 'l2' link feature computed from the 'embedding' node property;
# the returned pipeline summary is displayed as the cell output.
pipe.addFeature('l2', nodeProperties=['embedding'])

name                                                                                                            pipe
nodePropertySteps                                                                                                 []
featureSteps                                           [{'name': 'L2', 'config': {'nodeProperties': ['embedding']}}]
splitConfig          {'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}
parameterSpace                                                        {'RandomForest': [], 'LogisticRegression': []}
Name: 0, dtype: object

In [8]:
# Register a second link feature, 'cosine', over the same 'embedding' node
# property; the returned pipeline summary is displayed as the cell output.
pipe.addFeature('cosine', nodeProperties=['embedding'])

name                                                                                                                                           pipe
nodePropertySteps                                                                                                                                []
featureSteps         [{'name': 'L2', 'config': {'nodeProperties': ['embedding']}}, {'name': 'COSINE', 'config': {'nodeProperties': ['embedding']}}]
splitConfig                                         {'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}
parameterSpace                                                                                       {'RandomForest': [], 'LogisticRegression': []}
Name: 0, dtype: object

In [9]:
# Override the pipeline's default split configuration: test/train fractions,
# number of validation folds and the negative sampling ratio.
split_settings = {
    'testFraction': 0.2,
    'trainFraction': 0.4,
    'validationFolds': 5,
    'negativeSamplingRatio': 2.0,
}
pipe.configureSplit(**split_settings)

name                                                                                                                                           pipe
nodePropertySteps                                                                                                                                []
featureSteps         [{'name': 'L2', 'config': {'nodeProperties': ['embedding']}}, {'name': 'COSINE', 'config': {'nodeProperties': ['embedding']}}]
splitConfig                                         {'negativeSamplingRatio': 2.0, 'testFraction': 0.2, 'validationFolds': 5, 'trainFraction': 0.4}
parameterSpace                                                                                       {'RandomForest': [], 'LogisticRegression': []}
Name: 0, dtype: object

In [10]:
# Two logistic-regression candidates are added to the pipeline's parameter
# space. They share the same patience and tolerance and differ only in the
# penalty and the epoch budget.
lr_shared = {'patience': 3, 'tolerance': 0.00001}
pipe.addLogisticRegression(penalty=0.0, maxEpochs=2000, **lr_shared)
pipe.addLogisticRegression(penalty=0.01, maxEpochs=1000, **lr_shared)

name                                                                                                                                                                                                                                                                                                                                                      pipe
nodePropertySteps                                                                                                                                                                                                                                                                                                                                           []
featureSteps                                                                                                                                                                                                                    [{'name': 'L2', 'config': {'nodeProperties': ['embedding']}}, {'name': 'CO

In [11]:
# The model catalog is server-side state: drop a model with this name left
# over from an earlier run so that training can be re-executed without
# failing on the already-used model name.
if gds.beta.model.exists('entity-resolution-model')['exists']:
    gds.beta.model.drop(gds.model.get('entity-resolution-model'))

# Train the link-prediction pipeline on SAME_AS relationships between User
# nodes. `negativeClassWeight` is 1/2, i.e. the reciprocal of the
# negativeSamplingRatio of 2.0 configured on the pipeline split.
# Returns the trained model handle and the training result row.
trained_pipe_model, res = pipe.train(g,
    modelName='entity-resolution-model',
    randomSeed=RANDOM_SEED,
    nodeLabels=['User'],
    relationshipTypes=['SAME_AS'],
    negativeClassWeight=1.0/2.0)

In [19]:
# Look up the AUCPR metric's "test" entry from the trained model's metrics.
trained_pipe_model.metrics()["AUCPR"]["test"]

0.9332965423634675

In [17]:
# Display the configuration the model was trained with (pipeline, seed,
# graph name, node labels, relationship types, ...).
trained_pipe_model.train_config()

pipeline                                  pipe
randomSeed                                7474
graphName                                 proj
modelName              entity-resolution-model
negativeClassWeight                        0.5
nodeLabels                              [User]
sudo                                     False
relationshipTypes                    [SAME_AS]
username                                  None
concurrency                                  4
dtype: object

In [26]:
# Stream predicted SAME_AS links between User nodes from the trained model,
# asking for the top 20 (topN=20) with a sample rate of 1.0. The result has
# one row per predicted pair: node1, node2 and the predicted probability.
predicted_links = trained_pipe_model.predict_stream(
    g,
    sampleRate=1.0,
    topN=20,
    nodeLabels=['User'],
    relationshipTypes=['SAME_AS'],
)
predicted_links

Unnamed: 0,node1,node2,probability
0,33845,33846,0.838466
1,33806,33807,0.838466
2,1563,15803,0.838214
3,15803,21642,0.837809
4,1563,21642,0.837587
5,10644,13318,0.837585
6,12812,15803,0.837573
7,4592,18438,0.837123
8,12812,13319,0.836847
9,13319,15803,0.83683
