# Loading data from torch_geometric.datasets (CORA) to Memgraph

In [1]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from tqdm import tqdm

# Load the Cora dataset through Planetoid, rooted at data/Planetoid, with
# NormalizeFeatures passed as the dataset transform.
dataset = Planetoid(
    root='data/Planetoid',
    name='Cora',
    transform=NormalizeFeatures(),
)

# Dataset-level summary.
print()
print(f'Dataset: {dataset}:')
print('======================')
for summary_line in (
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
):
    print(summary_line)

data = dataset[0]  # The first graph object; every later cell reads from it.

print()
print(data)
print('===========================================================================================================')

# Graph-level statistics.
node_count = data.num_nodes
edge_count = data.num_edges
training_nodes = data.train_mask.sum()

print(f'Number of nodes: {node_count}')
print(f'Number of edges: {edge_count}')
print(f'Average node degree: {edge_count / node_count:.2f}')
print(f'Number of training nodes: {training_nodes}')
print(f'Training node label rate: {int(training_nodes) / node_count:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

  from .autonotebook import tqdm as notebook_tqdm



Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.05
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [2]:
from gqlalchemy import Memgraph
# Connect to the Memgraph instance listening on 127.0.0.1:7687.
memgraph = Memgraph(host="127.0.0.1", port=7687)

# NOTE: destructive. drop_database() is called so the import starts from a
# clean slate; anything already stored in this instance is presumably lost.
memgraph.drop_database()

In [64]:
# Sanity check: ask Memgraph how many nodes it currently holds.
count_query = """
    MATCH (n) RETURN count(n) AS number_of_nodes ;
    """
results = memgraph.execute_and_fetch(count_query)

# The query yields one row; print it as returned (a dict keyed by the alias).
print(next(results))

{'number_of_nodes': 271}


In [4]:
# Create one :PAPER node per graph node, carrying its index as `id`, its
# feature row as a list and its class label.
for node_id in tqdm(range(data.num_nodes)):
    features = data.x[node_id].numpy().tolist()
    label = data.y[node_id].numpy().tolist()
    memgraph.execute(
        f"CREATE( :PAPER {{ id: {node_id}, features: {features}, class: {label} }} );"
    )
    

100%|███████████████████| 2708/2708 [00:44<00:00, 60.71it/s]


In [5]:
# Create an index on :PAPER(id) before inserting any edges. The edge-loading
# loop matches both endpoints of every edge by :PAPER id, so presumably those
# lookups become far slower without it.
memgraph.execute("CREATE INDEX ON :PAPER(id);") # don't load edges without this

In [6]:
# Each column of edge_index is one (source, target) pair of node ids; look up
# both :PAPER endpoints by id and connect them with a CITES relationship.
for edge_idx in tqdm(range(data.num_edges)):
    source_id, target_id = data.edge_index[:, edge_idx].numpy().tolist()
    memgraph.execute(
        f"MATCH (a:PAPER {{id:{source_id}}} ), (b:PAPER {{id:{target_id}}}) CREATE (a)-[r:CITES]->(b);"
    )

100%|████████████████| 10556/10556 [00:15<00:00, 700.99it/s]


## Load 90% of the dataset (randomly drop 10% of the nodes)

In [51]:
import numpy as np

DROP_FRACTION = 0.1  # share of the nodes to hold out
SEED = 42            # fixed seed so the same nodes are held out on every run

# Let the database count the nodes instead of streaming every id back to
# Python only to count the rows.
results = memgraph.execute_and_fetch(
    """
    MATCH (n) RETURN count(n) AS number_of_nodes ;
    """
)
total = next(results)["number_of_nodes"]

# Round up, so a non-empty graph always holds out at least one node.
num_dropped = int(np.ceil(total * DROP_FRACTION))

# mask[k] is True when the node with id k is held out. The mask is positional:
# it relies on the nodes having been created with ids 0 .. total-1, which is
# how the node-loading loop assigns them (id = index in range(data.num_nodes)).
mask = np.zeros(total, dtype=bool)
mask[:num_dropped] = True

# Shuffle with a seeded generator so the split is reproducible after
# Restart Kernel -> Run All.
mask = np.random.default_rng(SEED).permutation(mask)



<class 'int'>


In [54]:
# Delete every held-out node (truthy mask entry) together with its
# relationships. The pattern names the :PAPER label so the lookup can be
# served by the :PAPER(id) index; without a label the index does not apply
# and each delete has to search all nodes.
for i in range(total):
    if mask[i]:
        memgraph.execute(
            f"MATCH (n:PAPER {{id: {i}}}) DETACH DELETE n ;"
        )

# Report how many nodes are left in the database after the deletions.
count_query = """
    MATCH (n) RETURN count(n) AS number_of_nodes ;
    """
results = memgraph.execute_and_fetch(count_query)

# The query yields one row; print it as returned (a dict keyed by the alias).
print(next(results))

{'number_of_nodes': 2437}


## Now load only the previously dropped data

In [60]:
# Start over: reset the database (destructive), then recreate one :PAPER node
# per graph node with its index as `id`, its feature row and its class label.
memgraph.drop_database()
for node_id in tqdm(range(data.num_nodes)):
    features = data.x[node_id].numpy().tolist()
    label = data.y[node_id].numpy().tolist()
    memgraph.execute(
        f"CREATE( :PAPER {{ id: {node_id}, features: {features}, class: {label} }} );"
    )

100%|███████████████████| 2708/2708 [00:50<00:00, 53.61it/s]


In [61]:
# Request the :PAPER(id) index again before inserting edges; the edge-loading
# loop matches both endpoints of every edge by :PAPER id.
# NOTE(review): the same CREATE INDEX already ran before the database was
# reset. Whether drop_database() removes indexes is not visible here, so
# confirm this repeat is needed (and harmless if the index still exists).
memgraph.execute("CREATE INDEX ON :PAPER(id);") # don't load edges without this

In [62]:
# Each column of edge_index is one (source, target) pair of node ids; look up
# both :PAPER endpoints by id and connect them with a CITES relationship.
for edge_idx in tqdm(range(data.num_edges)):
    source_id, target_id = data.edge_index[:, edge_idx].numpy().tolist()
    memgraph.execute(
        f"MATCH (a:PAPER {{id:{source_id}}} ), (b:PAPER {{id:{target_id}}}) CREATE (a)-[r:CITES]->(b);"
    )

100%|████████████████| 10556/10556 [00:15<00:00, 690.01it/s]


In [63]:
# Keep only the held-out nodes: skip ids whose mask entry is truthy and
# delete every other :PAPER node along with its relationships.
for node_id in tqdm(range(total)):
    if mask[node_id]:
        continue
    memgraph.execute(
        f"MATCH (n:PAPER {{id: {node_id}}}) DETACH DELETE n ;"
    )

100%|██████████████████| 2708/2708 [00:02<00:00, 925.38it/s]
