In [None]:
import os
import csv
import pandas as pd 
import numpy as np 
import pickle
from tqdm import tqdm
import glob
import json

In [None]:
# Locations used throughout the notebook (all relative to the working directory).
kg_folder = "Data/iBKH/"         # knowledge-graph root; holds the Entity/ and Relation/ sub-folders
triplet_path = "Data/triplets/"  # folder containing triplet_whole.csv
dict_path = "Data/dicts/"        # output folder for the pickled lookup dictionaries

In [None]:
# Merge every per-type vocabulary file (*_vocab.csv) into a single entity table.
entity_folder = os.path.join(kg_folder, 'Entity')

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so the
# merged file -- and any "first occurrence wins" de-duplication performed on it
# later -- is reproducible across machines and runs.
entity_files = sorted(glob.glob(os.path.join(entity_folder, '*_vocab.csv')))
if not entity_files:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No '*_vocab.csv' files found in {entity_folder!r}")

# ignore_index gives the merged frame a clean 0..n-1 index instead of repeating
# each source file's own row numbers.
all_ents = pd.concat([pd.read_csv(file) for file in entity_files], ignore_index=True)

# Only the id ('primary') and display name ('name') columns are kept in all_ent.csv.
all_ents.to_csv(os.path.join(entity_folder, 'all_ent.csv'), columns=['primary', 'name'], index=False)

In [None]:
from langchain_core.documents import Document

# Build one Document per distinct entity id, plus id <-> name lookup tables.
docs = []    # Document(page_content=<name>, metadata={"id": <id>})
ids = []     # entity ids, aligned index-by-index with `docs`
names = []   # entity names, aligned index-by-index with `docs`

id2label = {}
label2id = {}

# all_ent.csv is written by pandas (UTF-8 by default), so read it as UTF-8
# explicitly rather than relying on the platform locale. newline='' is what the
# csv module requires so that quoted fields containing line breaks parse correctly.
all_ent_csv = os.path.join(kg_folder, 'Entity', 'all_ent.csv')
with open(all_ent_csv, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file
    # Named `ent_id` rather than `id` so the builtin id() is not shadowed for
    # the rest of the notebook.
    for ent_id, name in reader:
        if ent_id in id2label:
            # An id that appears more than once keeps its first name.
            continue
        docs.append(Document(page_content=name, metadata={"id": ent_id}))
        ids.append(ent_id)
        names.append(name)
        id2label[ent_id] = name
        # Distinct ids that share a name overwrite each other here: the last id wins.
        label2id[name] = ent_id

# Quick sanity check of the last entry; guarded so an empty table does not raise IndexError.
if docs:
    print(len(docs), docs[-1], ids[-1], names[-1], id2label[ids[-1]], label2id[names[-1]])
else:
    print("No entities loaded from", all_ent_csv)

In [None]:
import pickle

id2labelfile = 'id2label.pkl'
label2idfile = 'label2id.pkl'

# Persist both lookup tables inside the Entity folder, one pickle file per mapping.
for pkl_name, mapping in ((id2labelfile, id2label), (label2idfile, label2id)):
    with open(kg_folder + "Entity/" + pkl_name, 'wb') as out:
        pickle.dump(mapping, out)


In [None]:
# Load the full triplet table, then collect the distinct head and tail entities,
# each in order of first appearance.
triplet = pd.read_csv(triplet_path + 'triplet_whole.csv')
head_list = triplet['Head'].drop_duplicates(keep='first').tolist()
tail_list = triplet['Tail'].drop_duplicates(keep='first').tolist()

In [None]:
# Distinct relation types, in order of first appearance, saved as a JSON array.
relations = list(pd.unique(triplet['Relation']))

relations_json = kg_folder + "Relation/" + 'relations.json'
with open(relations_json, 'w') as out:
    json.dump(relations, out)

In [None]:
# head -> list of tails
#
# Built in a single pass over the de-duplicated (Head, Tail) pairs. Filtering the
# whole DataFrame once per head costs O(#heads * #rows); this yields the same
# mapping: keys in first-appearance order of each head, and each value listing
# that head's distinct tails in first-appearance order.
#
# Keys are computed from `triplet` itself so the result does not depend on the
# current value of the `head_list` global.
head2tails_dict = {head: [] for head in triplet['Head'].drop_duplicates()}

# Rows whose Head is missing never match an equality filter, so they contribute
# no tails (a missing head, if present, keeps an empty list).
head_tail_pairs = triplet[['Head', 'Tail']].dropna(subset=['Head']).drop_duplicates()
for head, tail in zip(head_tail_pairs['Head'], head_tail_pairs['Tail']):
    head2tails_dict[head].append(tail)

with open(dict_path + 'head2tails_dict.pkl', 'wb') as f:
    pickle.dump(head2tails_dict, f)

In [None]:
# tail -> list of heads
#
# Built in a single pass over the de-duplicated (Tail, Head) pairs. Filtering the
# whole DataFrame once per tail costs O(#tails * #rows); this yields the same
# mapping: keys in first-appearance order of each tail, and each value listing
# that tail's distinct heads in first-appearance order.
#
# Keys are computed from `triplet` itself so the result does not depend on the
# current value of the `tail_list` global.
tail2heads_dict = {tail: [] for tail in triplet['Tail'].drop_duplicates()}

# Rows whose Tail is missing never match an equality filter, so they contribute
# no heads (a missing tail, if present, keeps an empty list).
tail_head_pairs = triplet[['Tail', 'Head']].dropna(subset=['Tail']).drop_duplicates()
for tail, head in zip(tail_head_pairs['Tail'], tail_head_pairs['Head']):
    tail2heads_dict[tail].append(head)

with open(dict_path + 'tail2heads_dict.pkl', 'wb') as f:
    pickle.dump(tail2heads_dict, f)

In [None]:
# head -> list of rels
#
# Built in a single pass over the de-duplicated (Head, Relation) pairs. Filtering
# the whole DataFrame once per head costs O(#heads * #rows); this yields the same
# mapping: keys in first-appearance order of each head, and each value listing
# that head's distinct relations in first-appearance order.
#
# Keys are computed from `triplet` itself so the result does not depend on the
# current value of the `head_list` global.
head2rels_dict = {head: [] for head in triplet['Head'].drop_duplicates()}

# Rows whose Head is missing never match an equality filter, so they contribute
# no relations (a missing head, if present, keeps an empty list).
head_rel_pairs = triplet[['Head', 'Relation']].dropna(subset=['Head']).drop_duplicates()
for head, relation in zip(head_rel_pairs['Head'], head_rel_pairs['Relation']):
    head2rels_dict[head].append(relation)

with open(dict_path + 'head2rels_dict.pkl', 'wb') as f:
    pickle.dump(head2rels_dict, f)

In [None]:
# tail -> list of rels
#
# Built in a single pass over the de-duplicated (Tail, Relation) pairs. Filtering
# the whole DataFrame once per tail costs O(#tails * #rows); this yields the same
# mapping: keys in first-appearance order of each tail, and each value listing
# that tail's distinct relations in first-appearance order.
#
# Keys are computed from `triplet` itself so the result does not depend on the
# current value of the `tail_list` global.
tail2rels_dict = {tail: [] for tail in triplet['Tail'].drop_duplicates()}

# Rows whose Tail is missing never match an equality filter, so they contribute
# no relations (a missing tail, if present, keeps an empty list).
tail_rel_pairs = triplet[['Tail', 'Relation']].dropna(subset=['Tail']).drop_duplicates()
for tail, relation in tqdm(zip(tail_rel_pairs['Tail'], tail_rel_pairs['Relation']),
                           total=len(tail_rel_pairs)):
    tail2rels_dict[tail].append(relation)

with open(dict_path + 'tail2rels_dict.pkl', 'wb') as f:
    pickle.dump(tail2rels_dict, f)

In [None]:
# head@rel -> list of tails
# Keys have the form "<head>@<relation>"; each value lists the distinct tails of
# that (head, relation) pair in first-appearance order.
head_rel2tails_dict = {}
for (head, relation), group in tqdm(triplet.groupby(['Head', 'Relation']), desc="Processing groups"):
    key = f'{head}@{relation}'
    # Deliberately NOT named `tail_list`: that global holds every distinct tail in
    # the table, and rebinding it here would make any cell that iterates over it
    # produce wrong results when re-run after this one.
    tails_for_pair = list(group['Tail'].unique())
    head_rel2tails_dict[key] = tails_for_pair

with open(dict_path + 'head@rel2tails_dict.pkl', 'wb') as f:
    pickle.dump(head_rel2tails_dict, f)

In [None]:
# tail@rel -> list of heads
# Keys have the form "<tail>@<relation>"; each value lists the distinct heads of
# that (tail, relation) pair in first-appearance order.
tail_rel2heads_dict = {}
for (tail, relation), group in tqdm(triplet.groupby(['Tail', 'Relation']), desc="Processing groups"):
    key = f'{tail}@{relation}'
    # Deliberately NOT named `head_list`: that global holds every distinct head in
    # the table, and rebinding it here would make any cell that iterates over it
    # produce wrong results when re-run after this one.
    heads_for_pair = list(group['Head'].unique())
    tail_rel2heads_dict[key] = heads_for_pair

with open(dict_path + 'tail@rel2heads_dict.pkl', 'wb') as f:
    pickle.dump(tail_rel2heads_dict, f)

In [None]:
# rel@tail -> list of heads
# Keys have the form "<relation>@<tail>"; each value lists the distinct heads of
# that (tail, relation) group.
tail_rel_groups = tqdm(triplet.groupby(['Tail', 'Relation']), desc="Processing groups")
rel_tail2heads_dict = {
    f'{relation}@{tail}': list(rows['Head'].unique())
    for (tail, relation), rows in tail_rel_groups
}

with open(dict_path + 'rel@tail2heads_dict.pkl', 'wb') as out:
    pickle.dump(rel_tail2heads_dict, out)

In [None]:
def examine_pkl(path=dict_path, filename='tail2heads_dict.pkl', limit=5):
    """Print the first few entries of a pickled dictionary as a quick sanity check.

    Args:
        path: Directory that contains the pickle file.
        filename: Name of the pickle file inside ``path``.
        limit: Maximum number of ``key: value`` entries to print. Defaults to 5,
            which matches the previously hard-coded cut-off.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising; only use
        this on files produced by this notebook or another trusted source.
    """
    # os.path.join works whether or not `path` ends with a separator.
    with open(os.path.join(path, filename), 'rb') as file:
        data = pickle.load(file)

    if not isinstance(data, dict):
        # Report the mismatch instead of silently printing nothing.
        print(f"{filename}: expected a dict, found {type(data).__name__}")
        return

    for i, (key, value) in enumerate(data.items()):
        if i >= limit:
            break
        print(f"{key}: {value}")


examine_pkl(filename='rel@tail2heads_dict.pkl')

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformer checkpoint used as the embedding function of the vector store.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
from langchain_chroma import Chroma

# Chroma collection for the entity names, configured for cosine distance.
chroma_collection = "ibkh_collection_cosine"
chroma_dir = "./chroma_langchain_db_cosine"  # where the data is saved locally; drop persist_directory if not necessary

vector_store = Chroma(
    collection_name=chroma_collection,
    collection_metadata={"hnsw:space": "cosine"},
    embedding_function=embeddings_model,
    persist_directory=chroma_dir,
)

In [None]:
# Index the entity documents in fixed-size batches.
#
# Each document is stored under its own entity id (`ids[k]` belongs to `docs[k]`;
# both lists are filled together and the ids are unique). Without explicit ids
# the store assigns fresh random ones, so every re-run of this cell would append
# another full copy of the entities to the persistent collection.
# NOTE(review): langchain-chroma is expected to upsert on an existing id -- confirm
# against the installed version. Entries indexed earlier without ids are not
# replaced; rebuild the collection once if it was already populated that way.
batch_size = 5000  # NOTE(review): presumably kept below Chroma's maximum batch size -- confirm
for start in tqdm(range(0, len(docs), batch_size), desc="Indexing entities"):
    stop = start + batch_size
    vector_store.add_documents(documents=docs[start:stop], ids=ids[start:stop])

In [None]:
# Spot check: the 4 nearest entity names for a free-text query, with the raw
# score returned by the store.
te = 'water-induced urticaria'
results = vector_store.similarity_search_with_score(te, k=4)
print(results)
for hit, hit_score in results:
    print(hit.page_content, hit.metadata['id'], hit_score)

In [None]:
# Spot check: the 4 nearest entity names for a free-text query, this time using
# the relevance-score variant of the search.
te = 'fever'
results = vector_store.similarity_search_with_relevance_scores(te, k=4)
print(results)
for hit, hit_score in results:
    print(hit.page_content, hit.metadata['id'], hit_score)