In [None]:
#Mat Scalpello Dundee University MSc data Science Project 2020
#Original code from Graph Algorithms - Mark Needham and Amy Hodler 2019 (O'Reilly Media)
#Modifications by Mat Scalpello as indicated

# Code to test V3-V4 model
#You will need to add paths to a model and for file output
# Test model link prediction and community detection features
# Test triangles and clustering

In [None]:
# Standard library
import os
import pickle
from collections import Counter

# Third-party
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Notebook-wide display settings: plot theme and 3-decimal float rendering.
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Connection to the local Neo4j instance used by every graph.run(...) call below.
# Credentials are read from NEO4J_USER / NEO4J_PASSWORD when those environment
# variables are set, so real passwords do not have to live in the notebook;
# the fallbacks keep the values this notebook used previously.
graph = Graph(
    "bolt://localhost",
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "123"),
    ),
)

In [None]:
#MSCALPELLO
#You will need to put in a file path to the model location
filename = 'file path here V3.sav'

# Load the trained classifier. The prediction cell at the end of the notebook
# calls lrclassifier.predict / predict_proba, so the model has to be in scope
# before that cell runs.
# NOTE(review): assumes the model was saved with pickle (pickle is imported by
# this notebook) - confirm against the notebook that trained and saved it.
# SECURITY: unpickling can execute arbitrary code; only load model files you trust.
with open(filename, 'rb') as model_file:
    lrclassifier = pickle.load(model_file)

# Feature column names, in the order they are handed to the classifier
# NOTE(review): presumably this must match the column order used at training time - confirm
columns = [
    "cn","aa", "pa", "tn","ra", # graph features
    "lp", "lv",
    "minT", "maxT", "minC", "maxC"    # triangle features     
]

In [None]:
def evaluate_model(predictions, actual):
    """Summarise prediction quality as a small table.

    Parameters
    ----------
    predictions : predicted labels.
    actual : true labels, aligned with ``predictions``.

    Returns
    -------
    pandas.DataFrame with a "Measure" column (Accuracy, Precision, Recall)
    and a "Score" column holding the matching sklearn metric, each called as
    ``metric(actual, predictions)``.
    """
    scorers = [
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    ]
    measure_names = [label for label, _ in scorers]
    measure_scores = [scorer(actual, predictions) for _, scorer in scorers]
    return pd.DataFrame({"Measure": measure_names, "Score": measure_scores})

def feature_importance(columns, classifier):
    """Display a bar chart of the classifier's feature importances.

    Parameters
    ----------
    columns : feature names, paired positionally with
        ``classifier.feature_importances_``.
    classifier : fitted estimator exposing ``feature_importances_``.

    Shows the heading "Feature Importance" followed by a bar plot sorted from
    the most to the least important feature. Returns nothing.
    """
    display("Feature Importance")
    importance_table = pd.DataFrame({
        "Feature": columns,
        "Importance": classifier.feature_importances_,
    }).sort_values("Importance", ascending=False)
    axes = importance_table.plot.bar(x='Feature', y='Importance', legend=None)
    # The feature names already label the bars, so blank out the axis title.
    axes.xaxis.set_label_text("")
    plt.tight_layout()
    plt.show()

In [None]:
# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
# Added Resource allocation and Adamic Adar
def create_LP_features(data, rel_type):
    """Attach link-prediction scores to every (node1, node2) pair in ``data``.

    Each pair's Person nodes are looked up by ``connectionid`` and scored with
    the ``algo.linkprediction`` functions named in the query:
    commonNeighbors (cn), adamicAdar (aa), preferentialAttachment (pa),
    totalNeighbors (tn) and resourceAllocation (ra).

    Parameters
    ----------
    data : pandas.DataFrame with ``node1`` and ``node2`` columns holding
        connectionid values.
    rel_type : relationship type passed to each function as ``relationshipQuery``.

    Returns
    -------
    ``data`` merged with the feature columns on (node1, node2). The merge uses
    pandas' default join, so pairs for which the query returned no row are
    not present in the result.

    Uses the module-level ``graph`` connection.
    """
    cypher = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           algo.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           algo.linkprediction.adamicAdar(
               p1, p2, {relationshipQuery: $relType}) AS aa,
           algo.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           algo.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn,
           algo.linkprediction.resourceAllocation(
               p1, p2, {relationshipQuery: $relType}) AS ra
               
    """
    # One {"node1": ..., "node2": ...} mapping per input row, for UNWIND $pairs.
    node_pairs = data[["node1", "node2"]].values.tolist()
    pair_params = [dict(node1=first, node2=second) for first, second in node_pairs]
    cursor = graph.run(cypher, {"pairs": pair_params, "relType": rel_type})
    # Repeated pairs would otherwise multiply rows in the merge below.
    lp_features = cursor.to_data_frame().drop_duplicates()
    return data.merge(lp_features, on=["node1", "node2"])

# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def create_community_features(data, triangles_prop, coefficient_prop):
    """Attach triangle / clustering-coefficient features to every pair in ``data``.

    For each (node1, node2) pair the two Person nodes are looked up by
    ``connectionid``; the query then returns the min and max of their
    ``triangles_prop`` values (minT, maxT) and of their ``coefficient_prop``
    values (minC, maxC), computed with ``apoc.coll.min`` / ``apoc.coll.max``.

    Parameters
    ----------
    data : pandas.DataFrame with ``node1`` and ``node2`` columns.
    triangles_prop : name of the node property read for minT / maxT.
    coefficient_prop : name of the node property read for minC / maxC.

    Returns
    -------
    ``data`` merged with the four feature columns on (node1, node2), using
    pandas' default join.

    Uses the module-level ``graph`` connection.
    """
    cypher = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$trianglesProp], p2[$trianglesProp]]) AS minT,
    apoc.coll.max([p1[$trianglesProp], p2[$trianglesProp]]) AS maxT,
    apoc.coll.min([p1[$coefficientProp], p2[$coefficientProp]]) AS minC,
    apoc.coll.max([p1[$coefficientProp], p2[$coefficientProp]]) AS maxC
    """
    # One {"node1": ..., "node2": ...} mapping per input row, for UNWIND $pairs.
    node_pairs = data[["node1", "node2"]].values.tolist()
    query_params = dict(
        pairs=[dict(node1=first, node2=second) for first, second in node_pairs],
        trianglesProp=triangles_prop,
        coefficientProp=coefficient_prop,
    )
    triangle_features = graph.run(cypher, query_params).to_data_frame().drop_duplicates()
    return data.merge(triangle_features, on=["node1", "node2"])

# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def communitydetection_features(data, partition_prop, louvain_prop):
    """Attach same-community indicator features to every pair in ``data``.

    For each (node1, node2) pair the two Person nodes are looked up by
    ``connectionid`` and ``algo.linkprediction.sameCommunity`` is evaluated
    twice: on ``partition_prop`` (returned as ``lp``) and on ``louvain_prop``
    (returned as ``lv``).

    Parameters
    ----------
    data : pandas.DataFrame with ``node1`` and ``node2`` columns.
    partition_prop : node property name compared for the ``lp`` feature.
    louvain_prop : node property name compared for the ``lv`` feature.

    Returns
    -------
    ``data`` merged with ``lp`` and ``lv`` on (node1, node2), using pandas'
    default join.

    Uses the module-level ``graph`` connection.
    """
    cypher = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    algo.linkprediction.sameCommunity(p1, p2, $partitionProp) AS lp,    
    algo.linkprediction.sameCommunity(p1, p2, $louvainProp) AS lv
    """
    # One {"node1": ..., "node2": ...} mapping per input row, for UNWIND $pairs.
    node_pairs = data[["node1", "node2"]].values.tolist()
    query_params = dict(
        pairs=[dict(node1=first, node2=second) for first, second in node_pairs],
        partitionProp=partition_prop,
        louvainProp=louvain_prop,
    )
    community_features = graph.run(cypher, query_params).to_data_frame().drop_duplicates()
    return data.merge(community_features, on=["node1", "node2"])

# MSCALPELLO compare surname ids and return a 1 if the same
# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def samename_features(data, samename_prop):
    """Attach a same-name indicator (``sn``) to every pair in ``data``.

    For each (node1, node2) pair the two Person nodes are looked up by
    ``connectionid`` and ``algo.linkprediction.sameCommunity`` is evaluated on
    ``samename_prop``; its result is returned as ``sn``.

    Parameters
    ----------
    data : pandas.DataFrame with ``node1`` and ``node2`` columns.
    samename_prop : node property name compared between the two nodes.

    Returns
    -------
    ``data`` merged with ``sn`` on (node1, node2), using pandas' default join.

    Uses the module-level ``graph`` connection.
    """
    cypher = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    algo.linkprediction.sameCommunity(p1, p2, $samenameProp) AS sn
"""
    # One {"node1": ..., "node2": ...} mapping per input row, for UNWIND $pairs.
    node_pairs = data[["node1", "node2"]].values.tolist()
    query_params = dict(
        pairs=[dict(node1=first, node2=second) for first, second in node_pairs],
        samenameProp=samename_prop,
    )
    name_features = graph.run(cypher, query_params).to_data_frame().drop_duplicates()
    return data.merge(name_features, on=["node1", "node2"])

In [None]:
#MSCALPELLO
#Load negative links for a subgraph
#You need to put a name in here
person_name = "Name here"

# Candidate pairs: every Person reachable from the chosen person within
# 1..3 Knows hops that is not already a direct Knows neighbour, labelled 0.
# - The name is passed as a query parameter instead of being edited into the
#   Cypher text.
# - p1 <> p2 excludes the self-pair: an undirected variable-length path can
#   loop back to p1 through a triangle, and p1 has no Knows edge to itself,
#   so (p1, p1) would otherwise pass the "not directly connected" filter.
# - DISTINCT collapses the many paths that reach the same p2, so duplicates
#   are removed in the database rather than shipped to pandas.
load_links = graph.run("""
MATCH (p1:Person {FullName: $name})
WHERE (p1)-[:Knows]-()
MATCH (p1)-[:Knows*1..3]-(p2:Person)
WHERE p1 <> p2 AND not((p1)-[:Knows]-(p2))
RETURN DISTINCT p1.connectionid AS node1, p2.connectionid AS node2, 0 AS label
""", {"name": person_name}).to_data_frame()

In [None]:
# Keep a single row (the first occurrence) for each repeated candidate pair.
load_links = load_links.drop_duplicates(keep="first")

In [None]:
#MSCALPELLO
#Used for including hand picked node pairs that the model 
#might predict positively against
#Load missing link data - you will need to include a file path
# NOTE(review): the CSV presumably needs node1 / node2 columns (connectionid
# values) to line up with load_links - confirm against the file
nolinks = pd.read_csv('file path here noedges.csv')

#MSCALPELLO
# Add the nodes with no link
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True renumbers the combined rows as before.
all_links = pd.concat([load_links, nolinks], ignore_index=True)

#MSCALPELLO
# Apply graphs algorithms to node pairs
# use the main graph not a sub-graph
all_links = create_LP_features(all_links, "Knows")

In [None]:
#MSCALPELLO - Added iterations on LP to ensure it will converge
# Apply community features to data
# Setup label propagation
# Runs over Person nodes and Knows relationships in both directions, with
# 20 iterations and partitionProperty set to "labelprop".
label_propagation_call = """
CALL algo.labelPropagation("Person", "Knows", "BOTH",
{iterations:20, partitionProperty: "labelprop"});
"""
graph.run(label_propagation_call).to_data_frame()

In [None]:
# Stream Louvain results (including intermediate communities) for Person /
# Knows and store the first entry of each node's `communities` list on the
# node as its `louvain` property.
louvain_call = """
CALL algo.louvain.stream("Person", "Knows", {includeIntermediateCommunities:true})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS node, communities[0] AS smallestCommunity
SET node.louvain = smallestCommunity;
"""
graph.run(louvain_call).stats()

In [None]:
# Build the lp / lv same-community features.
# The label propagation call in this notebook sets partitionProperty to
# "labelprop", so that is the property read for lp. The value used before,
# "community", is not written by any cell in this notebook.
# NOTE(review): if the model was trained against a "community" property
# produced elsewhere, confirm which property name is intended.
all_links = communitydetection_features(all_links, "labelprop", "louvain")

In [None]:
# Apply Triangle features to node pairs
# Setup data in graphDB
# The call is configured with write:true, writeProperty 'triangles' and
# clusteringCoefficientProperty 'clustercoef'; the same two property names
# are then read back by create_community_features.
triangle_count_call = """
CALL algo.triangleCount('Person', 'Knows', { write:true,
writeProperty:'triangles', clusteringCoefficientProperty:'clustercoef'});
"""
graph.run(triangle_count_call).to_data_frame()

all_links = create_community_features(
    all_links,
    triangles_prop="triangles",
    coefficient_prop="clustercoef",
)

In [None]:
#MSCALPELLO
#save output data
# Full feature table, written before prediction.
all_links.to_csv('file path here Verify_V3.csv')

#MSCALPELLO
#Run the classifier
# lrclassifier is not defined in this cell; it must already be in scope.
# Only the feature columns listed in `columns` are passed to the model.
feature_matrix = all_links[columns]
predictions = lrclassifier.predict(feature_matrix)
predictpct = lrclassifier.predict_proba(feature_matrix)

#MSCALPELLO
#save output data predictions
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_csv('file path here V3_predictions.csv')

#MSCALPELLO
#save output data predictions - percentages
probabilities_frame = pd.DataFrame(predictpct)
probabilities_frame.to_csv('file path here V3_predictpct.csv')