In [None]:
import os

import torch
import torch.optim as optim
from graphdatascience import GraphDataScience
from torch_geometric.data import Data, download_url
from torch_geometric.nn import TransE

In [None]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the notebook; the fallbacks reproduce the
# original local-development values.
gds = GraphDataScience(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "neo4jneo4j"),
    ),
    database=os.environ.get("NEO4J_DATABASE", "fb15k-237"),
)

In [None]:
# Remote location of the FB15k-237 split files and the local download folder.
url = 'https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237'
raw_file_names = ['train.txt', 'valid.txt', 'test.txt']
raw_dir = './data_from_url'

# Fetch every split file into `raw_dir`.
for split_file in raw_file_names:
    split_url = url + '/' + split_file
    download_url(split_url, raw_dir)

In [None]:
def process():
    """Parse the downloaded split files into PyG ``Data`` objects.

    Reads every file listed in ``raw_file_names`` from ``raw_dir``. Each
    non-empty line must hold a tab-separated ``source<TAB>relation<TAB>target``
    triple. Entities and relations are numbered in order of first appearance,
    and the numbering is shared by all files.

    Returns:
        tuple: ``(data_list, node_dict, rel_dict)``. ``data_list`` holds one
        ``Data(edge_index, edge_type)`` per file, in ``raw_file_names`` order,
        each with ``num_nodes`` set to the total number of entities.
        ``node_dict`` maps entity string -> integer id and ``rel_dict`` maps
        relation string -> integer id.

    Raises:
        ValueError: if a non-empty line does not consist of exactly three
            tab-separated fields.
    """
    data_list_, node_dict_, rel_dict_ = [], {}, {}
    for file_name in raw_file_names:
        file_name_path = raw_dir + '/' + file_name
        sources, targets, relations = [], [], []
        with open(file_name_path, 'r', encoding='utf-8') as f:
            # Iterate line by line rather than slicing off the last element of
            # a split: a final triple without a trailing newline is kept and
            # blank lines are skipped instead of breaking the unpacking.
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue
                src, rel, dst = line.split('\t')
                # len() is evaluated before setdefault inserts, so an unseen
                # key receives the next free id (source first, then target).
                sources.append(node_dict_.setdefault(src, len(node_dict_)))
                targets.append(node_dict_.setdefault(dst, len(node_dict_)))
                relations.append(rel_dict_.setdefault(rel, len(rel_dict_)))

        # Build the tensors in one go instead of writing element by element.
        edge_index = torch.tensor([sources, targets], dtype=torch.long)
        edge_type = torch.tensor(relations, dtype=torch.long)
        data_list_.append(Data(edge_index=edge_index, edge_type=edge_type))

    # The entity count is only known once every file has been read.
    for data in data_list_:
        data.num_nodes = len(node_dict_)

    return data_list_, node_dict_, rel_dict_

data_list, node_dict, rel_dict = process()

In [None]:
# IF NOT EXISTS keeps this cell re-runnable: without it a second execution
# fails because the `entity_id` constraint is already present.
gds.run_cypher("CREATE CONSTRAINT entity_id IF NOT EXISTS FOR (e:Entity) REQUIRE e.id IS UNIQUE")

In [None]:
# Inverse of `rel_dict`: integer relation id -> relation text. A comprehension
# keeps its loop variables local, so no module-level `id` is left behind
# shadowing the builtin.
rel_id_to_text_dict = {rel_id: rel_text for rel_text, rel_id in rel_dict.items()}

In [None]:
def write_chunk(chunk_dict):
    """Create one ``:Entity`` node per item of ``chunk_dict``.

    The items are sent as ``(key, value)`` pairs; in the query ``node[0]``
    (the key) becomes the node's ``value`` property and ``node[1]`` (the
    value) becomes its ``id`` property.
    """
    query = "UNWIND $nodes AS node CREATE (n:Entity {id: node[1], value: node[0]})"
    node_pairs = list(chunk_dict.items())
    gds.run_cypher(query, params={"nodes": node_pairs})

# Upload all entities of `node_dict` in batches of `chunk_size`.
chunk_size = 1000
chunk_dict = {}
idx = 0
for idx, (entity_text, entity_id) in enumerate(node_dict.items(), start=1):
    chunk_dict[entity_text] = entity_id
    if len(chunk_dict) == chunk_size:
        write_chunk(chunk_dict)
        chunk_dict = {}
# Flush whatever is left after the last full batch.
if chunk_dict:
    write_chunk(chunk_dict)
print(f"TOTAL records: {idx} from {len(node_dict)}")

In [None]:
def write_rel_chunk(ll:list, label):
    """Create one relationship per entry of ``ll``.

    The query reads ``l.source``, ``l.target``, ``l.id`` and ``l.text`` from
    each entry: both endpoints are matched by ``Entity.id`` and the new
    relationship stores ``rel_id`` and ``text``. ``label`` is spliced verbatim
    into the relationship pattern; callers in this notebook pass it with a
    leading colon, e.g. ``":TRAIN"``.
    """
    match_part = "UNWIND $list AS l MATCH (e_s:Entity {id: l.source}), (e_t:Entity {id: l.target}) "
    create_part = "CREATE (e_s)-["+label+" { rel_id: l.id, text: l.text }]->(e_t)"
    cypher = match_part + create_part
    gds.run_cypher(cypher, params={"list": ll})

def create_rels(data:Data, label:str):
    """Write every edge of ``data`` to the database under ``label``.

    Each edge becomes a dict with the keys ``source``, ``target``, ``id`` and
    ``text`` (the text is looked up in ``rel_id_to_text_dict``). The dicts are
    handed to ``write_rel_chunk`` in batches of 1000, followed by one final
    partial batch if any edges remain.
    """
    batch_limit = 1000
    print("Writing " + label + " relationships")
    pending = []
    written = 0
    for edge_pos in range(data.num_edges):
        rel_type_id = data.edge_type[edge_pos].item()
        pending.append({
            "source": data.edge_index[0, edge_pos].item(),
            "target": data.edge_index[1, edge_pos].item(),
            "id": rel_type_id,
            "text": rel_id_to_text_dict[rel_type_id],
        })
        written += 1
        if len(pending) == batch_limit:
            write_rel_chunk(pending, label)
            pending = []
    # Flush the remainder that did not fill a whole batch.
    if pending:
        write_rel_chunk(pending, label)
    print(f"TOTAL records: {written} from {data.num_edges}")

# Store each split under its own relationship type; `data_list` follows the
# train / valid / test order of `raw_file_names`.
for split_data, split_label in zip(data_list, (":TRAIN", ":VAL", ":TEST")):
    create_rels(split_data, split_label)

In [None]:
# Node: (:Entity {id:int, value:str})
# Edge: [:(TRAIN|TEST|VAL) {rel_id:int, text:str}]

In [None]:
def print_graph_info(G):
    """Print the node count, node labels, relationship types and relationship
    count of ``G``, one line each, prefixed with the graph's name."""
    summary_rows = (
        ("node count", G.node_count),
        ("node labels", G.node_labels),
        ("relationship types", G.relationship_types),
        ("relationship count", G.relationship_count),
    )
    for caption, getter in summary_rows:
        print(f"Graph '{G.name()}' {caption}: {getter()}")

def get_data_from_db(edge_label):
    """Project ``:Entity`` nodes and the ``edge_label`` relationships into GDS.

    The projection is named ``"fb15k-graph-tt" + edge_label``, keeps the node
    property ``id`` and the relationship property ``rel_id``, and uses NATURAL
    orientation. A projection with the same name that is already in the graph
    catalog is dropped first, so the cell can be re-run.

    Args:
        edge_label: relationship type to project (e.g. ``"TRAIN"``).

    Returns:
        The projected graph object returned by ``gds.graph.project``.
    """
    graph_name = "fb15k-graph-tt"+edge_label
    node_projection = {"Entity": {"properties": "id"}}
    relationship_projection = {edge_label : {"orientation": "NATURAL", "properties": "rel_id"}}

    # The graph catalog lives on the server and outlives the kernel, so a
    # second run would otherwise fail because the name is already taken.
    if gds.graph.exists(graph_name)["exists"]:
        gds.graph.drop(gds.graph.get(graph_name))

    G, result = gds.graph.project(
        graph_name,
        node_projection,
        relationship_projection,
    )
    print(f"The projection took {result['projectMillis']} ms")
    print_graph_info(G)

    return G

def get_whole_dataset():
    """Project ``:Entity`` nodes with all three split relationship types.

    The projection is named ``"fb15k-graph-whole"`` and contains the ``TRAIN``,
    ``TEST`` and ``VAL`` relationships (NATURAL orientation, property
    ``rel_id``) together with the node property ``id``. A projection with the
    same name that is already in the graph catalog is dropped first, so the
    cell can be re-run.

    Returns:
        The projected graph object returned by ``gds.graph.project``.
    """
    graph_name = "fb15k-graph-whole"
    node_projection = {"Entity": {"properties": "id"}}
    relationship_projection = {
        "TRAIN" : {"orientation": "NATURAL", "properties": "rel_id"},
        "TEST" : {"orientation": "NATURAL", "properties": "rel_id"},
        "VAL" : {"orientation": "NATURAL", "properties": "rel_id"},
    }

    # The graph catalog lives on the server and outlives the kernel, so a
    # second run would otherwise fail because the name is already taken.
    if gds.graph.exists(graph_name)["exists"]:
        gds.graph.drop(gds.graph.get(graph_name))

    G, result = gds.graph.project(
        graph_name,
        node_projection,
        relationship_projection,
    )
    print(f"The projection took {result['projectMillis']} ms")
    print_graph_info(G)

    return G

In [None]:
# One projection per split, then one containing all three relationship types.
train_db_data_G, test_db_data_G, val_db_data_G = [
    get_data_from_db(split_type) for split_type in ("TRAIN", "TEST", "VAL")
]
db_data_G = get_whole_dataset()

In [None]:
# Stream the `id` property of every node in the combined projection.
streamed_property_names = ["id"]
node_properties = gds.graph.nodeProperties.stream(
    db_data_G, streamed_property_names, separate_property_columns=True
)
print(node_properties)

In [None]:
# Two-way lookup between the streamed `nodeId` column and the `id` property.
nodeId_to_id = {}
id_to_nodeId = {}
for gds_node_id, entity_id in zip(node_properties.nodeId, node_properties.id):
    nodeId_to_id[gds_node_id] = entity_id
    id_to_nodeId[entity_id] = gds_node_id

In [None]:
def create_tensor(graph):
    """Turn a projected graph into a PyG ``Data`` object.

    Streams the ``rel_id`` property of every relationship in ``graph``,
    translates the source/target node ids through ``nodeId_to_id`` and packs
    them into ``edge_index`` (row 0 = sources, row 1 = targets) plus
    ``edge_type``. ``num_nodes`` is set to ``len(nodeId_to_id)``. The result
    is displayed and returned.
    """
    rel_frame = gds.graph.relationshipProperties.stream(graph, ["rel_id"], separate_property_columns=True)
    heads = [nodeId_to_id[node] for node in rel_frame.sourceNodeId]
    tails = [nodeId_to_id[node] for node in rel_frame.targetNodeId]
    edge_index = torch.tensor([heads, tails], dtype=torch.long)
    # rel_id is cast to int before the tensor is built.
    edge_type = torch.tensor(rel_frame.rel_id.astype(int), dtype=torch.long)
    data = Data(edge_index=edge_index, edge_type=edge_type)
    data.num_nodes = len(nodeId_to_id)
    display(data)
    return data

# Convert the three split projections, in train / test / val order.
train_tensor, test_tensor, val_tensor = [
    create_tensor(split_graph)
    for split_graph in (train_db_data_G, test_db_data_G, val_db_data_G)
]

In [None]:
# Use the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# TransE model sized from the training split.
model = TransE(
    num_nodes=train_tensor.num_nodes,
    num_relations=train_tensor.num_edge_types,
    hidden_channels=50,
)
model = model.to(device)

# Shuffled mini-batches of 1000 (head, relation, tail) training triples.
train_heads = train_tensor.edge_index[0]
train_tails = train_tensor.edge_index[1]
loader = model.loader(
    head_index=train_heads,
    rel_type=train_tensor.edge_type,
    tail_index=train_tails,
    batch_size=1000,
    shuffle=True,
)

optimizer = optim.Adam(model.parameters(), lr=0.01)


def train():
    """Run one optimisation pass over ``loader``.

    Returns:
        float: the epoch loss, i.e. the batch losses averaged with each batch
        weighted by its number of triples.
    """
    model.train()
    total_loss = total_examples = 0
    for head_index, rel_type, tail_index in loader:
        # The loader was built from CPU tensors; move each batch next to the
        # model so the step also works when the model lives on the GPU.
        head_index = head_index.to(device)
        rel_type = rel_type.to(device)
        tail_index = tail_index.to(device)

        optimizer.zero_grad()
        loss = model.loss(head_index, rel_type, tail_index)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * head_index.numel()
        total_examples += head_index.numel()
    return total_loss / total_examples


@torch.no_grad()
def test(data):
    """Evaluate the model on the triples of ``data``.

    Args:
        data: object exposing ``edge_index`` (row 0 = heads, row 1 = tails)
            and ``edge_type``.

    Returns:
        Whatever ``model.test`` returns for ``batch_size=20000`` and ``k=10``.
    """
    model.eval()
    # Move the evaluation triples to the model's device; they are created on
    # the CPU and would otherwise clash with a model placed on the GPU.
    return model.test(
        head_index=data.edge_index[0].to(device),
        rel_type=data.edge_type.to(device),
        tail_index=data.edge_index[1].to(device),
        batch_size=20000,
        k=10,
    )


# Train for 500 epochs and validate every 75th epoch.
for epoch in range(1, 501):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    if epoch % 75 == 0:
        # Index instead of unpacking: depending on the PyG release,
        # `model.test` returns (mean_rank, hits) or (mean_rank, mrr, hits).
        val_metrics = test(val_tensor)
        rank, hits = val_metrics[0], val_metrics[-1]
        print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, '
              f'Val Hits@10: {hits:.4f}')

print(model)
# Build the lookup index on the model's device; a CPU index fails once the
# embedding table has been moved to the GPU.
idx = torch.tensor([1], dtype=torch.long, device=device)
print(model.rel_emb(idx))
test_metrics = test(test_tensor)
rank, hits_at_10 = test_metrics[0], test_metrics[-1]
print(f'Test Mean Rank: {rank:.2f}, Test Hits@10: {hits_at_10:.4f}')

In [None]:
# Serialise the complete model object to disk.
torch.save(obj=model, f="./model_501.pt")

In [None]:
# Save only the model's state dict.
trained_state = model.state_dict()
torch.save(trained_state, "./model_501_st_dict")

In [None]:
# The file holds a fully pickled model object rather than plain tensors, so it
# needs `weights_only=False`: PyTorch 2.6+ defaults to `weights_only=True` and
# refuses to unpickle it. Unpickling can execute arbitrary code, so only load
# files written by this notebook.
model = torch.load("./model_501.pt", weights_only=False)

In [None]:
# Write the learned node embeddings to the graph in batches: one UNWIND query
# per batch instead of one round trip per node.
emb_batch_size = 1000
# Row i of the embedding table is written to the entity whose `id` is i.
node_embeddings = model.node_emb.weight.detach().cpu().tolist()
num_entities = len(nodeId_to_id)
for batch_start in range(0, num_entities, emb_batch_size):
    print(f"Node embeddings uploading: {batch_start} of {num_entities}", end="\r")
    batch_end = min(batch_start + emb_batch_size, num_entities)
    rows = [
        {"id": entity_id, "emb": node_embeddings[entity_id]}
        for entity_id in range(batch_start, batch_end)
    ]
    gds.run_cypher(
        "UNWIND $rows AS row MATCH (n:Entity {id: row.id}) SET n.emb = row.emb",
        params={"rows": rows},
    )

In [None]:
# prediction stage

In [None]:
# 1. Choose the relation id to predict and derive its relationship type name
rel_id_to_predict = 17
rel_label_to_predict = "REL_" + str(rel_id_to_predict)

In [None]:
# 2. Put the corresponding relationships into the graph
def write_target_rel_chunk(ll:list, label):
    """Create one relationship per entry of ``ll``, storing only ``text``.

    Both endpoints are matched by ``Entity.id`` using ``l.source`` and
    ``l.target``; ``label`` is spliced verbatim into the relationship pattern.
    The function has its own name so that it does not replace the
    ``write_rel_chunk`` helper that ``create_rels`` relies on (that one also
    writes ``rel_id``).
    """
    gds.run_cypher(
            "UNWIND $list AS l MATCH (e_s:Entity {id: l.source}), (e_t:Entity {id: l.target}) "+
            "CREATE (e_s)-["+label+"{ text: l.text }]->(e_t)",
            params={"list": ll},
        )

data = test_tensor
list_of_target_rels = []
for i in range(data.num_edges):
    rel_type_id = data.edge_type[i].item()
    # Skip every edge that is not of the relation under test; a bare `pass`
    # here would let all test edges through.
    if rel_type_id != rel_id_to_predict:
        continue
    list_of_target_rels.append({
        "source": data.edge_index[0, i].item(),
        "target": data.edge_index[1, i].item(),
        "text": rel_id_to_text_dict[rel_type_id],
    })
write_target_rel_chunk(list_of_target_rels, ":"+rel_label_to_predict)

In [None]:
# 3. Get the embedding of that relation from the model. Relation vectors live
# in `model.rel_emb`; indexing `model.node_emb` with a relation id would
# return the embedding of an unrelated entity instead.
target_emb = model.rel_emb.weight[rel_id_to_predict].tolist()

In [None]:
# 4. Project graph to test
node_projection_test = {"Entity": {"properties": ["id", "emb"] }}
relationship_projection_test = {rel_label_to_predict : {"orientation": "NATURAL"}}
# The graph catalog lives on the server and outlives the kernel; drop a
# leftover projection of the same name so the cell can be re-run.
if gds.graph.exists("graph_to_test1")["exists"]:
    gds.graph.drop(gds.graph.get("graph_to_test1"))
G_test, result = gds.graph.project(
        "graph_to_test1",
        node_projection_test,
        relationship_projection_test,
    )
print_graph_info(G_test)

In [None]:
# 5. Create the TransE model on the projected graph and stream predictions
relation_embeddings = {rel_label_to_predict: target_emb}
transe_model = gds.model.transe.create(G_test, "emb", relation_embeddings)

# Source nodes: the entities whose `id` property is 5 and 10, translated to
# the node ids used by the projection.
prediction_sources = [id_to_nodeId[5], id_to_nodeId[10]]
result = transe_model.predict_stream(
    source_node_filter=prediction_sources,
    target_node_filter="Entity",
    relationship_type=rel_label_to_predict,
    top_k=3,
    concurrency=4
)
print(result)