In [1]:
import pickle
import random as rd
import numpy as np
import scipy.sparse as sp
from scipy.io import loadmat
import copy as cp
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, average_precision_score
from collections import defaultdict

def sparse_to_adjlist(sp_matrix, filename):
    """
    Transfer sparse matrix to adjacency list and pickle it to disk.

    A self loop is added for every node, and each stored entry (row, col) is
    recorded in both directions, so the resulting adjacency list is symmetric.

    :param sp_matrix: the sparse matrix (square, since an identity of size
        ``sp_matrix.shape[0]`` is added to it)
    :param filename: the filename of adjlist (opened in 'wb' mode, so an
        existing file is overwritten)
    :return: None; the adjacency list (a defaultdict of sets) is only written to disk
    """
    # add self loop
    homo_adj = sp_matrix + sp.eye(sp_matrix.shape[0])
    # create adj_list
    adj_lists = defaultdict(set)
    rows, cols = homo_adj.nonzero()
    # walk the (row, col) pairs together instead of re-indexing the column array
    for src, dst in zip(rows, cols):
        adj_lists[src].add(dst)
        adj_lists[dst].add(src)
    # the context manager closes the file; no explicit close() is required
    with open(filename, 'wb') as file:
        pickle.dump(adj_lists, file)

PATH_TO_MAT = '../../CARE-GNN/data/YelpChi.mat'
PATH_TO_PICKLE = 'data/yelp_homo_adjlists.pickle'

# Load the .mat file and turn its 'homo' sparse matrix into a pickled adjacency list.
data_file = loadmat(PATH_TO_MAT)
yelp_homo = data_file['homo']
sparse_to_adjlist(yelp_homo, PATH_TO_PICKLE)

# Read the adjacency list back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open(PATH_TO_PICKLE, 'rb') as file:
    homo = pickle.load(file)

# Flatten the label entry to a 1-D array.
labels = data_file['label'].flatten()
# toarray() returns a plain ndarray directly, avoiding the np.matrix detour of todense().A
feat_data = data_file['features'].toarray()



In [19]:
from tqdm import tqdm
print(np.shape(feat_data))
print(np.shape(labels))

# Each neighbour entry in the adjacency list becomes one column of edge_index,
# so the column count is the total number of neighbour entries.
no_edges = sum(len(homo[i]) for i in range(len(homo)))
edge_index = np.zeros((2, no_edges), dtype=int)

# Row 0 holds the source node id, row 1 the neighbour id.
current_edge = 0
for i in tqdm(range(len(homo))):
    for j in homo[i]:
        # plain assignment: every slot is written exactly once, nothing to accumulate
        edge_index[0, current_edge] = i
        edge_index[1, current_edge] = j
        current_edge += 1

assert current_edge == no_edges, "edge_index was not completely filled"

(45954, 32)
(45954,)


100%|█████████████| 45954/45954 [00:07<00:00, 6472.44it/s]


3869956.0

In [20]:
from gqlalchemy import Memgraph

# Address of the Memgraph instance this notebook talks to.
MEMGRAPH_HOST = "127.0.0.1"
MEMGRAPH_PORT = 7687

memgraph = Memgraph(MEMGRAPH_HOST, MEMGRAPH_PORT)
# Start from an empty database before importing anything.
# NOTE(review): destructive - presumably removes all existing data; confirm before
# pointing this at a shared instance.
memgraph.drop_database()

In [21]:
# Query the number of nodes currently stored in the database.
node_count_query = """
    MATCH (n) RETURN count(n) AS number_of_nodes ;
    """
results = memgraph.execute_and_fetch(node_count_query)
# Only the first fetched row is printed.
first_row = next(results)
print(first_row)

{'number_of_nodes': 0}


In [59]:
# Inspect how one feature row serialises to a Python list (the form embedded in the
# Cypher statements). An explicit index is used instead of a loop variable left over
# from another cell; -1 is the last row.
# NOTE(review): assumes the leftover index was the last node when this was run - confirm.
print(feat_data[-1].tolist())

[0.022375547147414496, 0.0704948438311448, 0.42868165294161287, 0.9999851621040137, 0.9999851621040137, 0.39845685881741966, 0.8235922546182951, 0.49702500185473697, 0.9654573781437792, 0.1502633726537577, 0.9999851621040137, 0.5832183396394391, 0.5839157207507976, 0.3814823058090363, 0.3816455226648861, 0.9999737277671229, 0.6430917163649739, 0.9999737277671229, 0.8025116254630481, 0.7833591676956625, 0.755169061818564, 0.7705120458187741, 0.9480597956020282, 0.8677718519296955, 0.9950248756218906, 0.9104477611940298, 0.07960199004975121, 0.00995024875621886, 0.014925373134328401, 0.5920398009950248, 0.13930348258706468, 0.4975124378109453]


In [22]:
# Serialise the graph to a file of Cypher statements: one CREATE per node,
# then the index, then one MATCH ... CREATE per edge_index column.
# The context manager closes the file even if one of the writes raises.
with open("yelp.cypherl", "w") as f:
    for i in tqdm(range(len(feat_data))):
        f.write(f"CREATE( :NODE {{ id: {i}, features: {(feat_data[i]).tolist()}, class: {labels[i]} }} );\n")

    # The index on :NODE(id) has to precede the edge statements (don't load edges without this).
    # NOTE(review): this statement has no trailing newline, so it ends up on the same
    # line as the first MATCH statement. Left unchanged on purpose: anything that counts
    # lines of this file (e.g. a 45954 + 7739912 = 7785866 check) depends on it. If the
    # consumer needs exactly one statement per line, add "\n" here and adjust that count.
    f.write("CREATE INDEX ON :NODE(id);")

    for i in tqdm(range(len(edge_index[0]))):
        source, target = edge_index[:, i].tolist()
        f.write(f"MATCH (a:NODE {{id:{source}}} ), (b:NODE {{id:{target}}}) CREATE (a)-[r:EDGE]->(b);\n")

100%|██████████████████████████████████████| 45954/45954 [00:01<00:00, 44208.29it/s]
100%|█████████████████████████████████| 7739912/7739912 [00:08<00:00, 950736.57it/s]


In [9]:
# Sanity check of the generated file: 45954 node statements + 7739912 edge statements
# (the index statement is written without its own newline, so it adds no extra line).
EXPECTED_LINE_COUNT = 7785866
with open("yelp.cypherl", "r") as f:
    # count lazily instead of materialising millions of lines with readlines()
    line_count = sum(1 for _ in f)
assert EXPECTED_LINE_COUNT == line_count, "not properly loaded"

In [111]:
# Load the graph straight into Memgraph, one statement per round trip.
# yelp.cypherl is intentionally NOT opened here: opening it in "w" mode would truncate
# the previously generated file although this cell never writes to it.
# NOTE(review): the interrupted run below managed roughly 960 statements/s, i.e. hours
# for the full edge list - importing the .cypherl file is likely the faster route.

memgraph.execute("CREATE INDEX ON :NODE(id);\n") #don't load edges without this

# Nodes are created before the edges: if the MERGE statements ran first they would
# create bare :NODE {id} nodes, and the CREATE statements would then add a second node
# per id, leaving features and edges on different nodes.
for i in tqdm(range(len(feat_data))):
    memgraph.execute(f"CREATE( :NODE {{ id: {i}, features: {(feat_data[i]).tolist()}, class: {labels[i]} }} );")

# With the nodes in place, MERGE finds the existing endpoints and only the edge is created.
for i in tqdm(range(len(edge_index[0]))):
    edge = edge_index[:,i].tolist()
    memgraph.execute(f"MERGE (a:NODE {{id:{edge[0]}}} ) MERGE (b:NODE {{id:{edge[1]}}}) CREATE (a)-[r:EDGE]->(b);\n")


  0%|      | 23857/7739912 [00:24<2:13:39, 962.16it/s]


KeyboardInterrupt: 