In [None]:
#Mat Scalpello Dundee University MSc data Science Project 2020
#Original code from Graph Algorithms - Mark Needham and Amy Hodler 2019 (O'Reilly Media)
#Modifications by Mat Scalpello as indicated
# V8 can also be used for models V10-12
# Added surname community detection features
# Same as Version three except using connectionID instead of nodeID (which changes)
# Added community detection features - label prop and louvain
# Added triangles and clustering coefficient
# Used largest Louvain community 

In [None]:
# Standard library
import os
import pickle
from collections import Counter

# Third-party
import matplotlib 
import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Display configuration: plot theme and three-decimal float formatting.
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Neo4j connection settings. Each value can be overridden through an
# environment variable so credentials need not live in the notebook; the
# fallbacks are the values this notebook has always used.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "123")

# Shared database handle used by every query cell below.
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [None]:
#Function to down sample negative examples
#MSCALPELLO - Changed downsample function to be able to specify a percentage of negative links required V10-12
def down_sample(df, negative_ratio=1.0, random_state=1):
    """Randomly discard negative examples (label 0) and return a shuffled copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column holding 0 (negative) and 1 (positive).
        It is not modified; the work is done on a copy.
    negative_ratio : float, default 1.0
        Number of negatives to keep, expressed as a multiple of the number
        of positives. 1.0 keeps as many negatives as positives (the original
        behaviour); 0.6 / 0.8 reproduce the "60%" / "80%" variants.
    random_state : int, default 1
        Seed used both for choosing the negatives to drop and for the final
        shuffle, so repeated runs give the same rows in the same order.

    Returns
    -------
    pandas.DataFrame
        The remaining rows in shuffled order, original index labels retained.
    """
    balanced = df.copy()

    negative_mask = balanced["label"] == 0
    negatives = int(negative_mask.sum())
    positives = int((balanced["label"] == 1).sum())

    # Rows to discard. Clamp at zero: when there are already too few
    # negatives the difference is negative and DataFrame.sample would raise.
    surplus = max(0, round(negatives - positives * negative_ratio))

    if surplus:
        discard = balanced[negative_mask].sample(n=surplus, random_state=random_state).index
        balanced = balanced.drop(discard)

    # Seeded shuffle keeps the notebook reproducible under Restart & Run All.
    return balanced.sample(frac=1, random_state=random_state)

def evaluate_model(predictions, actual):
    """Tabulate accuracy, precision and recall for a set of predictions.

    Parameters
    ----------
    predictions : array-like
        Labels produced by the classifier.
    actual : array-like
        Ground-truth labels, passed to each scorer as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns ``Measure`` and ``Score``.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    measure_names = [title for title, _ in scorers]
    measure_values = [scorer(actual, predictions) for _, scorer in scorers]
    return pd.DataFrame({"Measure": measure_names, "Score": measure_values})

def feature_importance(columns, classifier):
    """Display a bar chart of the classifier's feature importances.

    Parameters
    ----------
    columns : list of str
        Feature names, in the same order as ``classifier.feature_importances_``.
    classifier : fitted estimator
        Any object exposing a ``feature_importances_`` attribute.

    The bars are ordered from most to least important. Nothing is returned;
    the heading and the figure are emitted as notebook output.
    """
    display("Feature Importance")

    ranked = pd.DataFrame(
        {"Feature": columns, "Importance": classifier.feature_importances_}
    ).sort_values("Importance", ascending=False)

    axes = ranked.plot.bar(x='Feature', y='Importance', legend=None)
    # The tick labels already name the features, so blank the axis title.
    axes.xaxis.set_label_text("")
    plt.tight_layout()
    plt.show()

In [None]:
# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
# Added Resource allocation and Adamic Adar
def create_LP_features(data, rel_type):
    """Attach link-prediction scores to every (node1, node2) pair in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding ``connectionid``
        values of ``Person`` nodes.
    rel_type : str
        Relationship type handed to each scoring function as
        ``relationshipQuery``.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged (pandas' default join) with the de-duplicated query
        result on ``node1``/``node2``; adds the columns ``cn``, ``aa``,
        ``pa``, ``tn`` and ``ra``.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           algo.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           algo.linkprediction.adamicAdar(
               p1, p2, {relationshipQuery: $relType}) AS aa,
           algo.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           algo.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn,
           algo.linkprediction.resourceAllocation(
               p1, p2, {relationshipQuery: $relType}) AS ra
               
    """
    # One parameter map per row of `data`, in row order.
    pair_rows = data[["node1", "node2"]].values.tolist()
    pairs = [{"node1": left, "node2": right} for left, right in pair_rows]

    parameters = {"pairs": pairs, "relType": rel_type}
    # Repeated input pairs produce repeated result rows; keep one of each so
    # the merge does not multiply rows of `data`.
    features = graph.run(query, parameters).to_data_frame().drop_duplicates()
    return pd.merge(data, features, on=["node1", "node2"])

# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def create_community_features(data, triangles_prop, coefficient_prop):
    """Attach min/max triangle and coefficient features to each node pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding ``connectionid``
        values of ``Person`` nodes.
    triangles_prop : str
        Name of the node property read for ``minT`` / ``maxT``.
    coefficient_prop : str
        Name of the node property read for ``minC`` / ``maxC``.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged (pandas' default join) with the de-duplicated query
        result on ``node1``/``node2``; adds ``minT``, ``maxT``, ``minC``
        and ``maxC``.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$trianglesProp], p2[$trianglesProp]]) AS minT,
    apoc.coll.max([p1[$trianglesProp], p2[$trianglesProp]]) AS maxT,
    apoc.coll.min([p1[$coefficientProp], p2[$coefficientProp]]) AS minC,
    apoc.coll.max([p1[$coefficientProp], p2[$coefficientProp]]) AS maxC
    """
    # One parameter map per row of `data`, in row order.
    pair_rows = data[["node1", "node2"]].values.tolist()
    pairs = [{"node1": left, "node2": right} for left, right in pair_rows]

    params = {
        "pairs": pairs,
        "trianglesProp": triangles_prop,
        "coefficientProp": coefficient_prop,
    }
    # Keep one result row per pair so the merge does not multiply rows.
    features = graph.run(query, params).to_data_frame().drop_duplicates()
    return pd.merge(data, features, on=["node1", "node2"])

# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def communitydetection_features(data, partition_prop, louvain_prop):
    """Attach same-community indicators for two node properties to each pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding ``connectionid``
        values of ``Person`` nodes.
    partition_prop : str
        Node property compared to produce the ``lp`` column.
    louvain_prop : str
        Node property compared to produce the ``lv`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged (pandas' default join) with the de-duplicated query
        result on ``node1``/``node2``; adds ``lp`` and ``lv``.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    algo.linkprediction.sameCommunity(p1, p2, $partitionProp) AS lp,    
    algo.linkprediction.sameCommunity(p1, p2, $louvainProp) AS lv
    """
    # One parameter map per row of `data`, in row order.
    pair_rows = data[["node1", "node2"]].values.tolist()
    pairs = [{"node1": left, "node2": right} for left, right in pair_rows]

    params = {
        "pairs": pairs,
        "partitionProp": partition_prop,
        "louvainProp": louvain_prop,
    }
    # Keep one result row per pair so the merge does not multiply rows.
    features = graph.run(query, params).to_data_frame().drop_duplicates()
    return pd.merge(data, features, on=["node1", "node2"])

# MSCALPELLO compare surname ids and return a 1 if the same
# MSCALPELLO Modified to use connectionid and an appropriate index
# Using NEO4J IDs does not work consistently
def samename_features(data, samename_prop):
    """Attach an ``sn`` column comparing one node property across each pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding ``connectionid``
        values of ``Person`` nodes.
    samename_prop : str
        Node property passed to ``algo.linkprediction.sameCommunity`` (the
        notebook passes the surname identifier property).

    Returns
    -------
    pandas.DataFrame
        ``data`` merged (pandas' default join) with the de-duplicated query
        result on ``node1``/``node2``; adds ``sn``.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) 
    USING INDEX p1:Person(connectionid) 
    WHERE p1.connectionid = pair.node1
    MATCH (p2:Person) 
    USING INDEX p2:Person(connectionid) 
    WHERE p2.connectionid = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    algo.linkprediction.sameCommunity(p1, p2, $samenameProp) AS sn
"""
    # One parameter map per row of `data`, in row order.
    pair_rows = data[["node1", "node2"]].values.tolist()
    pairs = [{"node1": left, "node2": right} for left, right in pair_rows]

    params = {"pairs": pairs, "samenameProp": samename_prop}
    # Keep one result row per pair so the merge does not multiply rows.
    features = graph.run(query, params).to_data_frame().drop_duplicates()
    return pd.merge(data, features, on=["node1", "node2"])

In [None]:
# Feature column names for the full link prediction feature set,
# grouped by the step that produces them.
columns = [
    "cn", "aa", "pa", "tn", "ra",     # link prediction features
    "lp", "lv",                       # community detection features
    "minT", "maxT", "minC", "maxC",   # triangle features
    "sn",                             # same name feature
]

# MSCALPELLO Use ExtraTrees classifier.
# The variable keeps its `rfclassifier` name although the estimator is an
# ExtraTreesClassifier; later cells refer to it by this name.
rfclassifier = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=50,
    random_state=0,
    class_weight="balanced",
)

In [None]:
# MSCALPELLO
# TRAINING - positive examples
# Each SUB_GRAPH_TRAINING relationship becomes one (node1, node2) row with label 1.
train_positive_links = (
    graph
    .run("""
MATCH (p1:Person)-[:SUB_GRAPH_TRAINING]->(p2:Person)
RETURN p1.connectionid AS node1, p2.connectionid AS node2, 1 AS label
""")
    .to_data_frame()
)

In [None]:
# MSCALPELLO
# TEST - positive examples
# Each SUB_GRAPH_TEST relationship becomes one (node1, node2) row with label 1.
test_positive_links = (
    graph
    .run("""
MATCH (p1:Person)-[:SUB_GRAPH_TEST]->(p2:Person)
RETURN p1.connectionid AS node1, p2.connectionid AS node2, 1 AS label
""")
    .to_data_frame()
)

In [None]:
# MSCALPELLO
# TRAINING - negative examples
# Pairs joined by a SUB_GRAPH_TRAINING path of two or three hops but by no
# direct SUB_GRAPH_TRAINING relationship; every row is labelled 0.
# NOTE(review): nothing in the query excludes p2 = p1 or keeps only one
# orientation of a pair - confirm whether self-pairs / mirrored pairs are intended.
train_negative_links = (
    graph
    .run("""
MATCH (p1:Person)
WHERE (p1)-[:SUB_GRAPH_TRAINING]-()
MATCH (p1)-[:SUB_GRAPH_TRAINING*2..3]-(p2)
WHERE not((p1)-[:SUB_GRAPH_TRAINING]-(p2))
RETURN p1.connectionid AS node1, p2.connectionid AS node2, 0 AS label
""")
    .to_data_frame()
)

In [None]:
# MSCALPELLO
# TEST - negative examples
# Pairs joined by a SUB_GRAPH_TEST path of two or three hops but by no
# direct SUB_GRAPH_TEST relationship; every row is labelled 0.
# NOTE(review): nothing in the query excludes p2 = p1 or keeps only one
# orientation of a pair - confirm whether self-pairs / mirrored pairs are intended.
test_negative_links = (
    graph
    .run("""
MATCH (p1:Person)
WHERE (p1)-[:SUB_GRAPH_TEST]-()
MATCH (p1)-[:SUB_GRAPH_TEST*2..3]-(p2)
WHERE not((p1)-[:SUB_GRAPH_TEST]-(p2))
RETURN p1.connectionid AS node1, p2.connectionid AS node2, 0 AS label
""")
    .to_data_frame()
)

In [None]:
#MSCALPELLO
# TRAINING
# Remove duplicates - training set (negative pairs only)
train_negative_links = train_negative_links.drop_duplicates()

# Stack negative and positive examples into one frame with a fresh index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
training_df = pd.concat([train_negative_links, train_positive_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

# Down sample negative links
training_df = down_sample(training_df)

In [None]:
# TEST
# Remove duplicates - test set (negative pairs only)
test_negative_links = test_negative_links.drop_duplicates()

# Create DataFrame from positive and negative examples with a fresh index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
test_df = pd.concat([test_negative_links, test_positive_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

# Down sample negative examples
test_df = down_sample(test_df)

In [None]:
# TRAINING
# Add the link prediction columns (cn, aa, pa, tn, ra), scored over the
# SUB_GRAPH_TRAINING relationships.
training_df = create_LP_features(data=training_df, rel_type="SUB_GRAPH_TRAINING")

In [None]:
# TEST
# Add the link prediction columns (cn, aa, pa, tn, ra), scored over the
# SUB_GRAPH_TEST relationships.
test_df = create_LP_features(data=test_df, rel_type="SUB_GRAPH_TEST")

In [None]:
# MSCALPELLO - iteration count added on label propagation to help it converge
# TRAINING
# Community detection - label propagation over SUB_GRAPH_TRAINING;
# the partition property is named "partitionTrain".
(
    graph
    .run("""
CALL algo.labelPropagation("Person", "SUB_GRAPH_TRAINING", "BOTH",
{iterations:20, partitionProperty: "partitionTrain"});
""")
    .to_data_frame()
)

In [None]:
# MSCALPELLO - iteration count added on label propagation to help it converge
# TEST
# Community detection - label propagation over SUB_GRAPH_TEST;
# the partition property is named "partitionTest".
(
    graph
    .run("""
CALL algo.labelPropagation("Person", "SUB_GRAPH_TEST", "BOTH",
{iterations:20, partitionProperty: "partitionTest"});
""")
    .to_data_frame()
)

In [None]:
# MSCALPELLO
# TRAINING
# Community detection - Louvain over SUB_GRAPH_TRAINING.
# The yielded `community` value is stored on each node as `louvainTrain`;
# the `communities` list is yielded but not used. The Cypher alias
# `smallestCommunity` is therefore misleading - presumably `community` is
# the final, coarsest level, matching the header note "Used largest Louvain
# community" (confirm against the Neo4j Graph Algorithms docs).
(
    graph
    .run("""
CALL algo.louvain.stream("Person", "SUB_GRAPH_TRAINING", {includeIntermediateCommunities:true})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS node, community AS smallestCommunity
SET node.louvainTrain = smallestCommunity;
""")
    .stats()
)

In [None]:
# MSCALPELLO
# TEST
# Community detection - Louvain over SUB_GRAPH_TEST.
# The yielded `community` value is stored on each node as `louvainTest`;
# the `communities` list is yielded but not used. The Cypher alias
# `smallestCommunity` is therefore misleading - presumably `community` is
# the final, coarsest level, matching the header note "Used largest Louvain
# community" (confirm against the Neo4j Graph Algorithms docs).
(
    graph
    .run("""
CALL algo.louvain.stream("Person", "SUB_GRAPH_TEST", {includeIntermediateCommunities:true})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS node, community AS smallestCommunity
SET node.louvainTest = smallestCommunity;
""")
    .stats()
)

In [None]:
# Turn the label propagation and Louvain properties seeded into the graph
# into the pairwise `lp` and `lv` features.
# TRAINING
training_df = communitydetection_features(
    data=training_df,
    partition_prop="partitionTrain",
    louvain_prop="louvainTrain",
)
# TEST
test_df = communitydetection_features(
    data=test_df,
    partition_prop="partitionTest",
    louvain_prop="louvainTest",
)

In [None]:
#training_df.count()

In [None]:
# TRAINING
# Triangle count over SUB_GRAPH_TRAINING with write-back enabled, using the
# property names 'trianglesTrain' and 'coefficientTrain'.
(
    graph
    .run("""
CALL algo.triangleCount('Person', 'SUB_GRAPH_TRAINING', { write:true,
writeProperty:'trianglesTrain', clusteringCoefficientProperty:'coefficientTrain'});
""")
    .to_data_frame()
)

In [None]:
# TEST
# Triangle count over SUB_GRAPH_TEST with write-back enabled, using the
# property names 'trianglesTest' and 'coefficientTest'.
(
    graph
    .run("""
CALL algo.triangleCount('Person', 'SUB_GRAPH_TEST', { write:true,
writeProperty:'trianglesTest', clusteringCoefficientProperty:'coefficientTest'});
""")
    .to_data_frame()
)

In [None]:
# Add the min/max triangle and coefficient columns (minT, maxT, minC, maxC).
# TRAINING
training_df = create_community_features(
    data=training_df,
    triangles_prop="trianglesTrain",
    coefficient_prop="coefficientTrain",
)
# TEST
test_df = create_community_features(
    data=test_df,
    triangles_prop="trianglesTest",
    coefficient_prop="coefficientTest",
)

In [None]:
# MSCALPELLO Add the `sn` column: does the node pair share the same surnameID?
# TRAINING
training_df = samename_features(data=training_df, samename_prop="surnameID")
# TEST
test_df = samename_features(data=test_df, samename_prop="surnameID")

In [None]:
# Sanity check: show the last five rows of the finished training frame.
training_df.tail(n=5)

In [None]:
# Fit the classifier on the training features and labels.
X, y = training_df[columns], training_df["label"]
rfclassifier.fit(X, y)

# Predict on the test features and keep the true labels for comparison.
y_test = test_df["label"]
predictions = rfclassifier.predict(test_df[columns])

# Report accuracy / precision / recall, then plot the feature importances.
display(evaluate_model(predictions=predictions, actual=y_test))
feature_importance(columns=columns, classifier=rfclassifier)

In [None]:
#MSCALPELLO
# Persist the fitted classifier to disk with pickle.
# NOTE(review): the file name below looks like a placeholder - set it to
# the real destination path before running.
filename = 'File path eher V5.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() left the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(rfclassifier, model_file)