In [None]:

!pip install neo4j

In [97]:
import logging
import os
import sys
import ast
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from neo4j import GraphDatabase
from neo4j.exceptions import DriverError, SessionError

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import optuna

class CoraNeo4jConnector:

    def __init__(self, uri, user, password, logfile):
        if os.path.exists(logfile):
            logging.basicConfig(filename=logfile, encoding='utf-8', level=logging.INFO)
            self.driver = GraphDatabase.driver(uri, auth=(user, password))
            assert self.__verify_driver_and_connection() is True, "Neo4j connection cannot be verified!"
            logging.info("Driver has been initialized and connected successfully!")
            logging.info(f'''
Server Info:
"Address: {self.driver.get_server_info().address}
"Protocol Version: {self.driver.get_server_info().protocol_version}
"Agent: {self.driver.get_server_info().agent}
''')
        else:
            raise FileNotFoundError("Logger file does not exist!")

    def __verify_driver_and_connection(self):
        if self.driver is None:
            return False
        try:
            self.driver.verify_connectivity()
            return True
        except DriverError as err:
            logging.error(f"Connection cannot be verified!, {err}")
            self.driver.close()
            return False

    def query(self, cypher, *parameters):
        assert self.__verify_driver_and_connection() is True, "Neo4j connection cannot be verified!"
        try:
            with self.driver.session() as session:
                result = list(session.run(cypher, parameters))
                logging.info(f"Query: {cypher} is sent to the database and the result is returned to the caller!")
                return result
        except SessionError as err:
            print(f"Query: {cypher} is failed!")
            logging.error(f"Query: {cypher} is failed!")

    def close(self):
        assert self.__verify_driver_and_connection() is True, "Neo4j connection cannot be verified!"
        self.driver.close()

    def clear_entire_db(self):
        assert self.__verify_driver_and_connection() is True, "Neo4j connection cannot be verified!"
        try:
            with self.driver.session() as session:
                session.execute_write(CoraNeo4jConnector.__clear_entire_db, "MATCH (n) DETACH DELETE n")
            logging.info("Database is cleared successfully!")
        except SessionError as err:
            logging.error(f"cannot clear entire db {err}")

    @staticmethod
    def __clear_entire_db(tx, cypher):
        # all nodes and relationships
        result = tx.run(cypher)
        result.consume()

    def load_cora_nodes(self):
        assert self.__verify_driver_and_connection() is True, "Neo4j connection cannot be verified!"
        try:
            with self.driver.session() as session:
                result = session.run(
                    "CREATE CONSTRAINT paper_constraint IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE")
                _ = result.single()
                logging.info("Constraint is created on Paper for unique id!")
                record = session.execute_write(CoraNeo4jConnector.__load_cora_nodes_and_return_node_count)
                _ = record[0]
                logging.info("Nodes are loaded to database successfully!")
        except IndexError:
            logging.error(f"Record does not have given index, load_cora_nodes!")
            sys.exit(-1)
        except SessionError as err:
            logging.error(f"cannot load nodes to db, {err}")

    @staticmethod
    def __load_cora_nodes_and_return_node_count(tx):

        cypher = """
LOAD CSV WITH HEADERS FROM 'https://cora-dataset.s3.eu-central-1.amazonaws.com/dataset/nodes.csv' AS row
WITH row
MERGE (p:Paper {id: row.paper_id, subject: row.subject, words: row.words})
REMOVE p.features
RETURN COUNT(*)
"""
        # idempotent operation
        result = tx.run(cypher)
        return result.single()

    def load_cora_edges(self):
        try:
            with self.driver.session() as session:
                record = session.execute_write(CoraNeo4jConnector.__load_cora_edges_and_return_node_count)
                _ = record[0]
                logging.info("Edges are loaded successfully")
        except IndexError:
            logging.error(f"Record does not have given index, load_cora_edges!")
            sys.exit(-1)
        except SessionError as err:
            logging.error(f"cannot load given edges to db, {err}")

    @staticmethod
    def __load_cora_edges_and_return_node_count(tx):

        cypher = """
LOAD CSV WITH HEADERS FROM 'https://cora-dataset.s3.eu-central-1.amazonaws.com/dataset/edges.csv' AS row
WITH row
MATCH (to:Paper {id: row.cited_paper_id})
MATCH (from:Paper {id: row.citing_paper_id})
MERGE (from)-[:CITED]->(to)
RETURN COUNT(*)
"""
        # idempotent operation
        result = tx.run(cypher)
        return result.single()

In [98]:
def load_cora_to_neo4j(d):
    try:
        d.clear_entire_db()
        d.load_cora_nodes()
        d.load_cora_edges()
    except Exception:
        print("Cora dataset cannot be loaded to neo4j!")
    return

def create_in_memory_graph(d):
    d.query('''
    CALL gds.graph.project(
       'Cora',
       'Paper',
       {CITED:
           {
               orientation: 'UNDIRECTED'
           }
       }
    )''')
    return

with open("auth.json", "r") as auth:
    auth = json.load(auth)
    driver = CoraNeo4jConnector(auth["URI"],
                            auth["USER"],
                            auth["PASSWORD"],
                            auth["LOGFILE"])
    # cora dataset is already loaded to neo4j instance in the cloud
    # load_cora_to_neo4j(driver)
    # create_in_memory_graph(driver)


In [99]:
# loading database back to DataFrame to apply machine learning tasks
query = "MATCH (p:Paper) RETURN p.id AS id, p.subject AS subject, p.words AS words"
df_cora = pd.DataFrame([dict(row) for row in driver.query(query)])
df_cora['words'] = df_cora['words'].apply(ast.literal_eval)
df_cora['words'] = df_cora['words'].apply(lambda x: np.array(x, dtype=float))

In [None]:

def random_forest_without_embedding(k_folds=5, show_matrix=True):
    acc_scores = []

    # create X and y value for the scikit-learn
    X = np.array(df_cora['words'].to_list())
    y = np.array(df_cora['subject'].factorize()[0], dtype=float)

    for _ in range(0, k_folds):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)
        clf = RandomForestClassifier(max_depth=500, random_state=0)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        acc = accuracy_score(pred, y_test)
        acc_scores.append(acc)

    print('Accuracy scores: ', acc_scores)
    print('Mean accuracy: ', np.mean(acc_scores))

    if show_matrix:
        disp = ConfusionMatrixDisplay.from_estimator(
            clf,
            X_test,
            y_test,
            display_labels=clf.classes_,
            cmap=plt.cm.Blues,
            normalize='true'
        )
        disp.ax_.set_title("Random forest without embedding")
    return

random_forest_without_embedding()

In [None]:
def create_fastrp_embedding(dim=10):
    query = """CALL gds.fastRP.write(
               'Cora',
               {
                   embeddingDimension: %d,
                   iterationWeights: [0.0, 0.0, 1.0, 1.0],
                   writeProperty: 'fastrp_embedding'
               }
           )
    """ % (dim)
    driver.query(query)
    return


In [None]:
def random_forest_with_fastrp(dim, k_folds=5, show_matrix=True):
    acc_scores = []

    create_fastrp_embedding(dim)
    # create X and y value for the scikit-learn
    query = """MATCH (p:Paper) RETURN p.id AS id, p.subject AS subject, p.fastrp_embedding AS fastrp_embedding"""
    df_embedding = pd.DataFrame([dict(row) for row in driver.query(query)])
    df_embedding['X'] = np.array(df_embedding['fastrp_embedding'].apply(lambda x: np.array(x)))
    X = np.array(df_embedding['X'].to_list())
    y = np.array(df_cora['subject'].factorize()[0], dtype=float)

    for _ in range(0, k_folds):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        clf = RandomForestClassifier(max_depth=500, random_state=0)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        acc = accuracy_score(pred, y_test)
        acc_scores.append(acc)

    print('Accuracy scores: ', acc_scores)
    print('Mean accuracy: ', np.mean(acc_scores))

    if show_matrix:
        disp = ConfusionMatrixDisplay.from_estimator(
            clf,
            X_test,
            y_test,
            display_labels=clf.classes_,
            cmap=plt.cm.Blues,
            normalize='true'
        )
        disp.ax_.set_title("Random forest with FastRP embedding")
    return

random_forest_with_fastrp(dim=506)

In [102]:
def create_node2vec_embedding(embedding_dimension=128, walk_length=250, return_factor=1.0, in_out_factor=2.0):
    query = """CALL gds.beta.node2vec.write(
               'Cora',
               {
                   embeddingDimension: %d,
                   returnFactor: %d,
                   inOutFactor: %d,
                   walkLength: %d,
                   writeProperty: 'node2vec_embedding'
               }
           )
    """ % (embedding_dimension, return_factor, in_out_factor, walk_length)
    driver.query(query)
    return
# create X and y value for the scikit-learn
query = """MATCH (p:Paper) RETURN p.id AS id, p.subject AS subject, p.node2vec_embedding AS node2vec_embedding"""
df_embedding = pd.DataFrame([dict(row) for row in driver.query(query)])
df_embedding['X'] = np.array(df_embedding['node2vec_embedding'].apply(lambda x: np.array(x)))
X = np.array(df_embedding['X'].to_list())
y = np.array(df_cora['subject'].factorize()[0], dtype=float)

def opt_for_node2vec(params):
    acc_scores = []
    k_folds = 5

    for _ in range(0, k_folds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        clf = RandomForestClassifier(max_depth=500, random_state=0)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        acc = accuracy_score(pred, y_test)
        acc_scores.append(acc)

    return np.mean(acc_scores)

def objective_learn_node2vec():

    def objective(params):

        embedding_dimension = params['embeddingDimension']
        return_factor = params['returnFactor']
        in_out_factor = params['inOutFactor']
        walk_length = params['walkLength']

        # Create embeddings
        query = """CALL gds.beta.node2vec.write(
               'Cora',
               {
                   embeddingDimension: %d,
                   returnFactor: %d,
                   inOutFactor: %d,
                   walkLength: %d,
                   writeProperty: 'node2vec_embedding'
               }
           )
    """ % (embedding_dimension, return_factor, in_out_factor, walk_length)

        driver.query(query)

        return opt_for_node2vec(params)

    return objective

def optuna_objective_node2vec(trial):

    params = {
        'embeddingDimension': trial.suggest_int('embeddingDimension', 30, 700, log=True),
        'returnFactor': trial.suggest_float('returnFactor', 0.1, 5.0),
        'inOutFactor': trial.suggest_float('inOutFactor', 0.1, 5.0),
        'walkLength': trial.suggest_int('walkLength', 2, 700)
    }

    objective = objective_learn_node2vec()

    return objective(params)

def optuna_apply():
    initial_params = {
        'embeddingDimension': 111,
        'returnFactor': 2.9,
        'inOutFactor': 3.1,
        'walkLength': 150
    }

    study = optuna.create_study(direction='maximize')
    study.enqueue_trial(initial_params)
    study.optimize(optuna_objective_node2vec, n_trials=100)

    print('Accuracy: {}'.format(study.best_trial.value))
    print('Best hyperparameters: {}'.format(study.best_trial.params))

def random_forest_with_node2vec(embedding_dimension=128, k_folds=5, show_matrix=True):
    acc_scores = []

    create_node2vec_embedding(embedding_dimension=embedding_dimension, walk_length=80, return_factor=0.5, in_out_factor=2.0)
    # create X and y value for the scikit-learn
    query = """MATCH (p:Paper) RETURN p.id AS id, p.subject AS subject, p.node2vec_embedding AS node2vec_embedding"""
    df_embedding = pd.DataFrame([dict(row) for row in driver.query(query)])
    df_embedding['X'] = np.array(df_embedding['node2vec_embedding'].apply(lambda x: np.array(x)))
    X = np.array(df_embedding['X'].to_list())
    y = np.array(df_cora['subject'].factorize()[0], dtype=float)

    for _ in range(0, k_folds):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        clf = RandomForestClassifier(max_depth=500, random_state=0)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        acc = accuracy_score(pred, y_test)
        acc_scores.append(acc)

    print('Accuracy scores: ', acc_scores)
    print('Mean accuracy: ', np.mean(acc_scores))

    if show_matrix:
        disp = ConfusionMatrixDisplay.from_estimator(
            clf,
            X_test,
            y_test,
            display_labels=clf.classes_,
            cmap=plt.cm.Blues,
            normalize='true'
        )
        disp.ax_.set_title("Random forest with Node2vec embedding")
    return

optuna_apply()
# random_forest_with_node2vec(embedding_dimension=506)

[32m[I 2023-01-11 06:32:11,861][0m A new study created in memory with name: no-name-ee3262ca-5612-4485-bfc1-58cb7281954d[0m
[32m[I 2023-01-11 06:32:27,391][0m Trial 0 finished with value: 0.86794682422452 and parameters: {'embeddingDimension': 111, 'returnFactor': 2.9, 'inOutFactor': 3.1, 'walkLength': 150}. Best is trial 0 with value: 0.86794682422452.[0m
[32m[I 2023-01-11 06:33:24,107][0m Trial 1 finished with value: 0.863810930576071 and parameters: {'embeddingDimension': 234, 'returnFactor': 3.6292918987687357, 'inOutFactor': 4.588967834962466, 'walkLength': 540}. Best is trial 0 with value: 0.86794682422452.[0m
[32m[I 2023-01-11 06:34:07,550][0m Trial 2 finished with value: 0.862629246676514 and parameters: {'embeddingDimension': 50, 'returnFactor': 4.8709780962860085, 'inOutFactor': 0.7556254254369404, 'walkLength': 259}. Best is trial 0 with value: 0.86794682422452.[0m
[32m[I 2023-01-11 06:35:33,212][0m Trial 3 finished with value: 0.8596750369276218 and parameters

Accuracy: 0.8720827178729691
Best hyperparameters: {'embeddingDimension': 323, 'returnFactor': 0.5666657792831505, 'inOutFactor': 2.5259004945121823, 'walkLength': 2}
