## Link Prediction Pipelines

In [1]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
# Display settings for result frames shown in this notebook.
# Every option goes through the same pd.set_option entry point.
pd.set_option('display.width', 0)
pd.set_option('display.max_columns', None)   # no cap on displayed columns
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 1000)

In [2]:
# Seed shared by the stochastic steps of this notebook: it is passed as
# `randomSeed` to the FastRP embedding and to pipeline training.
RANDOM_SEED = 7474

## Connect to Graph Data Science

In [3]:
# Connection settings come from the environment so real credentials never have
# to be saved inside the notebook. The fallbacks are the previous hard-coded
# local-development values, so an unconfigured local run behaves as before.
#   NEO4J_URI       bolt/neo4j URI of the database
#   NEO4J_USERNAME  database user
#   NEO4J_PASSWORD  database password
HOST = os.environ.get('NEO4J_URI', 'neo4j://localhost')
USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
PASSWORD = os.environ.get('NEO4J_PASSWORD', 'password')

# `gds` is the client handle used by every later cell.
# aura_ds=False is kept from the original configuration
# (presumably a self-managed instance rather than AuraDS; confirm for your deployment).
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

## Configure Machine Learning Pipeline for Entity Resolution

In [19]:
# Make the cell safe to re-run: projecting under a name that is already in the
# graph catalog raises an error, so drop any stale 'proj' projection first.
if gds.graph.exists('proj')['exists']:
    gds.graph.drop(gds.graph.get('proj'))

# Project User and Website nodes with all three relationship types as
# UNDIRECTED. `g` is the in-memory graph handle used by the later cells; the
# projection statistics returned alongside it are not needed here.
g, _ = gds.graph.project('proj', ['User', 'Website'], {
    'SAME_AS': {'orientation': 'UNDIRECTED'},
    'CHILD_OF': {'orientation': 'UNDIRECTED'},
    'VISITED': {'orientation': 'UNDIRECTED'}
})

In [20]:
# FastRP node embeddings, stored on the in-memory graph as 'embedding'.
# Only CHILD_OF and VISITED are traversed, so SAME_AS (the relationship type
# the model is trained on later) is not an input to the embedding.
fastrp_settings = dict(
    mutateProperty='embedding',
    relationshipTypes=['CHILD_OF' , 'VISITED'],
    iterationWeights=[0.0, 1.0, 0.7, 0.5, 0.8],
    embeddingDimension=128,
    randomSeed=RANDOM_SEED,
)
_ = gds.fastRP.mutate(g, **fastrp_settings)

In [6]:
# Create the link-prediction training pipeline named "pipe". `pipe` is the
# handle configured by the following cells; the second return value is unused.
pipe, _ = gds.beta.pipeline.linkPrediction.create("pipe")

In [7]:
# Link feature #1: 'l2' combiner over the 'embedding' node property.
_ = pipe.addFeature(
    'l2',
    nodeProperties=['embedding'],
)

In [8]:
# Link feature #2: 'cosine' combiner over the 'embedding' node property.
_ = pipe.addFeature(
    'cosine',
    nodeProperties=['embedding'],
)

In [9]:
# Relationship split used by training: test/train fractions, number of
# cross-validation folds, and the negative-to-positive sampling ratio.
split_settings = {
    'testFraction': 0.2,
    'trainFraction': 0.4,
    'validationFolds': 5,
    'negativeSamplingRatio': 2.0,
}
_ = pipe.configureSplit(**split_settings)

In [10]:
# Logistic-regression model candidates, registered in this order:
# one without a penalty term and one with a small penalty.
logistic_regression_candidates = [
    {'penalty': 0.0, 'patience': 3, 'maxEpochs': 2000, 'tolerance': 0.00001},
    {'penalty': 0.01, 'patience': 3, 'maxEpochs': 1000, 'tolerance': 0.00001},
]
for candidate_params in logistic_regression_candidates:
    _ = pipe.addLogisticRegression(**candidate_params)

## Train Model Candidates, Evaluate, Select Best, and Deploy

In [11]:
# Train the pipeline on User nodes and SAME_AS relationships of the projected
# graph and store the result as 'entity-resolution-model'.
# NOTE(review): negativeClassWeight = 1/2 looks chosen to offset the
# negativeSamplingRatio of 2.0 set in the split configuration; confirm.
training_options = dict(
    modelName='entity-resolution-model',
    randomSeed=RANDOM_SEED,
    nodeLabels=['User'],
    relationshipTypes=['SAME_AS'],
    negativeClassWeight=1.0 / 2.0,
)
trained_pipe_model, res = pipe.train(g, **training_options)

In [12]:
# Report the AUCPR the trained model achieved on the held-out test set.
aucpr_scores = trained_pipe_model.metrics()["AUCPR"]
print(aucpr_scores["test"])

0.9332965423634675


## Predict Entity Resolution Links In Production Database

In [21]:
# Run the trained model over User nodes and add its predictions to the
# in-memory graph as PREDICTED_SAME_AS relationships.
prediction_options = dict(
    sampleRate=0.001,
    nodeLabels=['User'],
    relationshipTypes=['SAME_AS'],
    mutateRelationshipType='PREDICTED_SAME_AS',
)
_ = trained_pipe_model.predict_mutate(g, **prediction_options)

In [17]:
# Write the PREDICTED_SAME_AS relationships from the in-memory graph back to
# the database. No relationship property is passed, so only the relationships
# themselves are written (the recorded output shows propertiesWritten = 0).
# Left as the last expression so the write summary is displayed.
gds.graph.writeRelationship(g,'PREDICTED_SAME_AS')

writeMillis                         13011
graphName                            proj
relationshipType        PREDICTED_SAME_AS
relationshipProperty                 None
relationshipsWritten               678820
propertiesWritten                       0
Name: 0, dtype: object