# DeepGL - Deep Feature Learning for graphs

We've implemented the [DeepGL](https://arxiv.org/abs/1704.08829) algorithm as a Neo4j procedure and this notebook shows our experiments with it against a SNAP email dataset.

First up let's import some things...

In [2]:
driver = GraphDatabase.driver("bolt://localhost", auth=("neo4j", "neo"))

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import pandas as pd
from neo4j.v1 import GraphDatabase
import tensorflow as tf


# load data
from sklearn.preprocessing import StandardScaler

In [4]:
edge_list_file = "https://github.com/meltzerpete/Embedding-Vis/raw/master/emails/emails.edgelist"
labels_file = "https://github.com/meltzerpete/Embedding-Vis/raw/master/emails/emails.labels"
attributes_file = None  # attributes file should have nodeIds in column 0, followed by one column per attribute

with driver.session() as session:
    session.run("CREATE CONSTRAINT ON (n:Node) ASSERT n.id IS UNIQUE")
    
    result = session.run("""\
        LOAD CSV FROM $edgelistFile AS row
        FIELDTERMINATOR " "
        MERGE (e1:Node {id: row[0]})
        MERGE (e2:Node {id: row[1]})
        MERGE (e1)-[:LINK]->(e2)
        """, {"edgelistFile": edge_list_file})
    print(result.summary().counters)

    result = session.run("""\
        LOAD CSV FROM $labelsFile AS row
        FIELDTERMINATOR " "
        MATCH (e:Node {id: row[0]})
        SET  e.label = toInteger(row[1])-1
        """, {"labelsFile": labels_file})
    print(result.summary().counters)

    if attributes_file is not None:
        result = session.run("""\
            load csv from $attributesFile  as row
            FIELDTERMINATOR " "
            with toString(toInteger(row[0])) AS nodeId, row[1..] AS properties
            MATCH (s:Node {id: nodeId})
            WITH s, properties
            UNWIND range(0, size(properties)-1) AS index
            CALL apoc.create.setProperty(s, "property_" + index, toFloat(properties[index])) YIELD node
            return count(*)
            """, {"attributesFile": attributes_file})
        print(result.summary().counters)

{'labels_added': 1005, 'relationships_created': 25571, 'nodes_created': 1005, 'properties_set': 1005}


{'properties_set': 1005}


Time to load our data into Neo4j.

In [9]:
embedding_property_name = "embedding-python"
node_features = []
pruning_lambda = 0.6
diffusions = 3
iterations = 3

with driver.session() as session:
    params = {
        "writeProperty": embedding_property_name,
        # this tells the procedure the name of the node attribute properties in the neo database
        # the import function above calls them "property_0", "property_1", "property_2", ...
        # this node_features argument should take a list of these names
        # for the case of enzymes it should be from "property_0" to "property_17"
        "nodeFeatures": node_features,
        "pruningLambda": pruning_lambda,
        "diffusions": diffusions,
        "iterations": iterations
    }
    result = session.run("""
    call
    algo.deepgl(
        null,
        null,
        {nodeFeatures: $nodeFeatures,
         pruningLambda: $pruningLambda,
         diffusions: $diffusions,
         iterations: $iterations,
         writeProperty: $writeProperty})
    """, params)
    print(result.peek())

<Record loadMillis=52 computeMillis=63542 writeMillis=24 nodes=1005 writeProperty='embedding-python' embeddingSize=18 numberOfLayers=4 features=['max_out_neighbourhood( max_in_neighbourhood( IN_DEGREE))', 'max_out_neighbourhood( IN_DEGREE)', 'diffuse( hadamard_in_neighbourhood( max_out_neighbourhood( max_in_neighbourhood( IN_DEGREE))))', 'max_out_neighbourhood( mean_out_neighbourhood( IN_DEGREE))', 'diffuse( hadamard_out_neighbourhood( diffuse( rbf_out_neighbourhood( max_in_neighbourhood( IN_DEGREE)))))', 'hadamard_out_neighbourhood( IN_DEGREE)', 'max_out_neighbourhood( mean_out_neighbourhood( hadamard_out_neighbourhood( IN_DEGREE)))', 'IN_DEGREE', 'diffuse( rbf_out_neighbourhood( max_in_neighbourhood( IN_DEGREE)))', 'mean_out_neighbourhood( max_out_neighbourhood( IN_DEGREE))', 'max_in_neighbourhood( IN_DEGREE)', 'mean_out_neighbourhood( max_in_neighbourhood( IN_DEGREE))', 'mean_in_neighbourhood( max_out_neighbourhood( IN_DEGREE))', 'mean_out_neighbourhood( sum_out_neighbourhood( diffu

In [16]:
with driver.session() as session:
    result = session.run("""\
    MATCH (n) 
    WITH n.label as class, count(*) AS c
    ORDER BY c DESC
    WITH class WHERE c > 50
    WITH class ORDER BY class
    with collect(class) AS biggestClasses
    MATCH (p:Node) WHERE p.label IN biggestClasses
    RETURN p.`%s` AS embedding, apoc.coll.indexOf(biggestClasses, p.label) AS label, p.label as initialLabel
    ORDER BY label
    """ % embedding_property_name)

    df = pd.DataFrame(dict(row) for row in result)

emb = df["embedding"].apply(pd.Series).values
labels = df["label"].values
emb.shape

(433, 18)

In [11]:
# Heatmap
colours = ['r', 'g', 'b', 'black', 'y', 'orange']
cols = pd.DataFrame(labels).apply(lambda x: colours[int(x)], axis=1).values

dist = np.ndarray([len(emb), len(emb)])

for i, e1 in enumerate(emb):
    for j, e2 in enumerate(emb):
        dist.itemset((i, j), np.linalg.norm(e1 - e2, 2))

plt.imshow(dist)
plt.axes().xaxis.tick_top()
plt.xticks(np.arange(len(dist)), labels)
plt.yticks(np.arange(len(dist)), labels)
plt.show()

# 2D Visualisation
# from: https://baoilleach.blogspot.com/2014/01/convert-distance-matrix-to-2d.html
adist = dist
amax = np.amax(adist)
adist /= amax

mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
results = mds.fit(adist)

coords = results.embedding_

plt.subplots_adjust(bottom=0.1)
plt.scatter(
    coords[:, 0], coords[:, 1], marker='o', c=cols
)

plt.show()



In [12]:
X = pd.DataFrame(emb)
y = labels

X = StandardScaler().fit_transform(X)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.4, random_state=42)

clf = MLPClassifier(solver='sgd',
                    activation='tanh',
                    learning_rate_init=0.001,
                    alpha=1e-5,
                    hidden_layer_sizes=(30, 30),
                    max_iter=10000,
                    batch_size=X.shape[0],
                    random_state=0)

clf.n_outputs_ = 6
clf.out_activation_ = "softmax"
print(clf.get_params())
clf.fit(train_x, train_y)

mean_acc = clf.score(test_x, test_y)
print(mean_acc)

{'activation': 'tanh', 'alpha': 1e-05, 'batch_size': 433, 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (30, 30), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 10000, 'momentum': 0.9, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 0, 'shuffle': True, 'solver': 'sgd', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}




0.7816091954022989


In [14]:
X.shape

(433, 18)