In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from neo4j import GraphDatabase

In [2]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "test1234"), encrypted=False)

In [3]:
def createGraphProjection(tx):
    return tx.run(
        """CALL gds.graph.create(
            'payment', 
            'Client', 
            { PAYS: { type: 'PAYS', orientation:'NATURAL'}}, 
            {
                nodeProperties: { },
                relationshipProperties: {amount: { property: 'amount', aggregation: 'NONE', defaultValue: 0.0 }}
            }
        )"""
    ).consume()

In [4]:
def runFastRP(tx): 
    return tx.run(
        """CALL gds.fastRP.write(
            'payment', 
            {
                embeddingDimension: 8,
                iterationWeights: [0.0, 0.20, 0.40, 0.60, 0.80, 1.00, 1.20, 1.40, 1.60, 1.80, 2.00],
                relationshipWeightProperty: 'amount',
                writeProperty:'fastRP'
            }
            )"""
    ).consume()    

In [5]:
def dropGraphProjection(tx):
    return tx.run("CALL gds.graph.drop('payment')").consume()

In [6]:
def getEmbeddings(tx, embedding, category, nodeIds):
    data = tx.run("MATCH (c:Client) where exists(c.fastRP) with c, c:Mule as isMule RETURN {id: c.id, embedding: c.fastRP, isMule:isMule} AS result limit 30000")
    for record in data: 
        embedding.append(record["result"]["embedding"])
        category.append(record["result"]["isMule"])
        nodeIds.append(record["result"]["id"])

In [7]:
# Create projection 
with driver.session(database = "paysim") as session:
    result = session.read_transaction(createGraphProjection)
    print(result)

<neo4j.work.summary.ResultSummary object at 0x139ba7a00>


In [8]:
# Run algorithm and write back result
with driver.session(database = "paysim") as session:
    result = session.write_transaction(runFastRP)
    print(result)

<neo4j.work.summary.ResultSummary object at 0x105f54100>


In [9]:
# Drop projection
with driver.session(database = "paysim") as session:
    result = session.read_transaction(dropGraphProjection)
    print(result)

<neo4j.work.summary.ResultSummary object at 0x139c312e0>


In [10]:
embedding = []
category = []
nodeIds = []
with driver.session(database = "paysim") as session:
    session.read_transaction(getEmbeddings, embedding, category, nodeIds)
    

In [11]:
# X will hold the n-dimensional input features
X = np.array(embedding)
# y holds the corresponding target values
y = np.array(category)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=None)

print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (15336, 8)
 y_train = (15336,)
 X_test = (6574, 8)
 y_test = (6574,)


In [12]:
clf = LogisticRegressionCV(
    Cs=10, cv=10, scoring="accuracy", verbose=True, multi_class="ovr", max_iter=300, solver='lbfgs'
)
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


LogisticRegressionCV(cv=10, max_iter=300, multi_class='ovr', scoring='accuracy',
                     verbose=True)

In [13]:
# Feature Importance
importance = clf.coef_[0]
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: -0.07338
Feature: 1, Score: -0.22293
Feature: 2, Score: -0.02176
Feature: 3, Score: 0.18499
Feature: 4, Score: -0.49267
Feature: 5, Score: 0.27256
Feature: 6, Score: 0.49659
Feature: 7, Score: -0.34022


In [14]:
y_pred = clf.predict(X_test)

In [15]:
accuracy_score(y_test, y_pred)

0.9119257681776696

In [16]:
print(classification_report(y_test, y_pred, labels=[True, False]))
print("y_test mules = {}\ny_pred mules = {}".format(np.count_nonzero(y_test == True), np.count_nonzero(y_pred == True) ))

              precision    recall  f1-score   support

        True       0.49      1.00      0.66       556
       False       1.00      0.90      0.95      6018

    accuracy                           0.91      6574
   macro avg       0.74      0.95      0.80      6574
weighted avg       0.96      0.91      0.92      6574

y_test mules = 556
y_pred mules = 1135
