In [5]:
import os
from openai import AzureOpenAI
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
from collections import deque

from prompts import IS_SAME_ENTITY_PROMPT

load_dotenv()

True

In [6]:
# Azure OpenAI settings, read from environment variables
# (load_dotenv() is called in the import cell above).
openai_api_key = os.environ.get("OPENAI_API_KEY")
azure_endpoint = os.environ.get("OPENAI_ENDPOINT")
openai_api_version = os.environ.get("OPENAI_API_VERSION")
openai_model = os.environ.get("OPENAI_MODEL")

# Client object used by process_gpt for chat-completion requests.
client = AzureOpenAI(
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    api_version=openai_api_version,
)

In [None]:
# Function to call OpenAi
def process_gpt(file_prompt, system_msg):
    """Send one system/user message pair to the chat model and return the reply.

    Args:
        file_prompt: Content of the user message.
        system_msg: Content of the system message.

    Returns:
        The ``content`` of the first choice of the completion.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": file_prompt},
    ]
    response = client.chat.completions.create(
        model=openai_model,
        max_tokens=15000,
        temperature=0,
        messages=conversation,
    )
    return response.choices[0].message.content

# Function to check if is the same entity
def is_same_entity(label, entity1, entity2) -> bool:
    """Ask the model whether two extracted entities refer to the same thing.

    Args:
        label: Dict with ``"name"`` and ``"description"`` of the label both
            entities belong to.
        entity1: Dict with the entity ``"name"`` and the ``"chunkText"`` it
            was extracted from.
        entity2: Same structure as ``entity1``.

    Returns:
        ``True`` if the model answered ``1``, ``False`` if it answered ``0``.

    Raises:
        Exception: If the reply is anything other than ``0`` or ``1``
            (surrounding whitespace is ignored).
    """
    system_msg = "You are a helpful IT-project that decides wether or not two entities are the same and return 0 to indicate no match and 1 to indicate match."

    file_prompt = Template(IS_SAME_ENTITY_PROMPT).substitute(
        e1=entity1["name"],
        e2=entity2["name"],
        c1=entity1["chunkText"],
        c2=entity2["chunkText"],
        labelName=label["name"],
        labelDescription=label["description"],
    )

    result = process_gpt(file_prompt, system_msg)

    # The reply may carry a trailing newline / padding, or be None when the
    # model returns no content; normalise before comparing so a valid "1\n"
    # is not rejected.
    verdict = (result or "").strip()

    if verdict == "0":
        return False
    if verdict == "1":
        return True
    # Include the raw reply so an unexpected answer can be diagnosed.
    raise Exception(f"OpenAi did not return 0 or 1: {result!r}")


def merge_from_table(merge_table, entities):
    """Group entities into clusters of linked duplicates.

    Args:
        merge_table: ``n x n`` matrix (list of lists) where a ``1`` at
            ``[i][j]`` marks ``entities[i]`` and ``entities[j]`` as the same
            entity. Links are treated as undirected: a ``1`` at either
            ``[i][j]`` or ``[j][i]`` connects the pair, so a table that only
            fills the upper triangle is grouped correctly.
        entities: List of entity dicts, each with a ``"name"`` key.

    Returns:
        A list with one dict per cluster: ``"name"`` is the member names
        joined with ``"/"`` (in breadth-first visiting order) and
        ``"entities"`` is the list of member dicts. Entities without any
        link form a cluster of their own.
    """
    n = len(entities)
    visited = [False] * n
    merged = []

    def linked(a, b):
        # A match recorded in either direction connects the two entities.
        return merge_table[a][b] == 1 or merge_table[b][a] == 1

    def bfs_helper(start):
        # Collect every entity reachable from `start` through match links.
        queue = deque([start])
        same_entities = []
        visited[start] = True

        while queue:
            node = queue.popleft()
            same_entities.append(entities[node])
            for neighbor in range(n):
                if not visited[neighbor] and linked(node, neighbor):
                    visited[neighbor] = True
                    queue.append(neighbor)
        return same_entities

    for i in range(n):
        if visited[i]:
            continue
        same_entities = bfs_helper(i)
        merged.append({
            "name": "/".join(entity["name"] for entity in same_entities),
            "entities": same_entities,
        })

    return merged


# Function to merge the entities that are the same entities
def merge_entities(labels: list[dict], extractions: list[dict]):
    """Merge entities that denote the same thing and rename them in relationships.

    For every label, the entities carrying that label are collected from all
    extractions, compared pairwise with ``is_same_entity`` (one model call per
    pair), clustered with ``merge_from_table`` and stored on the label. Each
    relationship string of the originating chunk is then rewritten so that a
    head/tail equal to an entity name uses the merged cluster name instead.

    Args:
        labels: Label dicts with at least ``"name"`` and ``"description"``.
            Each gets an ``"entities"`` and a ``"merged"`` key (in place).
        extractions: Extraction dicts with ``"chunkId"``, ``"chunkText"``,
            ``"entities"`` (dicts with ``"name"`` and ``"label"``) and
            ``"relationships"`` (strings of the form ``head|type|tail``).
            The relationship lists are updated in place.

    Returns:
        The same ``labels`` and ``extractions`` objects as a tuple.

    Raises:
        Exception: If an entity's chunk id has no matching extraction.
    """
    # group entities from extractions under their respective labels
    for label in labels:
        label["entities"] = [
            {
                "chunkId": extraction["chunkId"],
                "chunkText": extraction["chunkText"],
                "name": entity["name"],
            }
            for extraction in extractions
            for entity in extraction["entities"]
            if label["name"] == entity["label"]
        ]

    # chunkId -> extraction lookup; setdefault keeps the first extraction when
    # a chunk id occurs more than once.
    extraction_by_chunk = {}
    for extraction in extractions:
        extraction_by_chunk.setdefault(extraction["chunkId"], extraction)

    # relationships that could not be parsed as "head|type|tail"
    malformed = []

    # group entities that are semantically the same within their respective label
    for label in labels:
        entities = label["entities"]
        n = len(entities)

        # create table that knows which entities are the same
        merge_table = [[0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                print(f'compare {entities[i]["name"]} with {entities[j]["name"]}')
                start_compare = timer()
                is_same = is_same_entity(label, entities[i], entities[j])
                end_compare = timer()
                print(is_same, f" {end_compare-start_compare}s")
                if is_same:
                    # A match is symmetric; record it in both directions so
                    # the grouping step sees the link from either side.
                    merge_table[i][j] = 1
                    merge_table[j][i] = 1

        # group the entities with knowledge from the merge_table
        label["merged"] = merge_from_table(merge_table, entities)

        # update the relationships
        for merged_entity in label["merged"]:
            for entity in merged_entity["entities"]:
                extraction = extraction_by_chunk.get(entity["chunkId"])
                if extraction is None:
                    raise Exception("Extraction not found")

                relationships = extraction["relationships"]
                for idx, rel in enumerate(relationships):
                    try:
                        head, rel_type, tail = rel.split("|")
                    except (ValueError, AttributeError):
                        # Not a "head|type|tail" string: leave it untouched.
                        if rel not in malformed:
                            malformed.append(rel)
                        continue

                    if head == entity["name"]:
                        head = merged_entity["name"]
                    if tail == entity["name"]:
                        tail = merged_entity["name"]

                    # Store the rewritten relationship back into the
                    # extraction; rebinding the loop variable alone would
                    # discard the update.
                    relationships[idx] = f"{head}|{rel_type}|{tail}"

    if malformed:
        print(f"skipped {len(malformed)} malformed relationship(s): {malformed}")

    return labels, extractions

                        



SyntaxError: f-string: unmatched '[' (407540145.py, line 96)

In [10]:
# Flatten the "labels" lists of all entries in labels_rels.json into one list.
# The encoding is given explicitly so the JSON is decoded the same way on
# every platform instead of depending on the locale default.
labels = []
with open("data/bob/labels_rels.json", encoding="utf-8") as f:
    questions = json.load(f)
    for q in questions:
        labels += q["labels"]

# Extractions loaded from entities_rels.json, passed to merge_entities below.
with open("data/bob/entities_rels.json", encoding="utf-8") as f:
    extractions = json.load(f)

# --- Manual test fixtures -------------------------------------------------
# Hand-written label and two entities, only referenced by the commented-out
# is_same_entity call at the end of this block; the merge run below does not
# use them.
label = {
    "name": "Person",
    "description": "Individuals"
}

# Entity "Rolf" together with the chunk text it is paired with.
rolf = {
    "chunkId": "chunk_y39pavhm2478ccqfp6k9oiqs",
    "chunkText": "<|document|>Rolf's Dog.pdf<|/document|>\nRolf took his dog, Max, on their usual walk to the Pond of Wondering one crisp autumn morning. The path was covered in golden leaves, crunching softly beneath Rolf\u2019s boots. He liked walking Max; only then could he forget about his regal duties. Max trotted ahead, tail wagging, nose to the ground as he sniAed out hidden treasures. When they reached the pond, Max barked excitedly and splashed into the cool water. Rolf laughed and tossed a stick, watching Max paddle eagerly after it. Suddenly, Max stopped and stared at the water, ears perked. Rolf walked closer and saw a small turtle struggling to reach the shore. Without hesitation, Max nudged it gently with his nose, helping it to safety.\n\nThe turtle blinked slowly and slipped into the reeds, disappearing. Max wagged his tail proudly, and Rolf knelt to scratch behind his ears. As they walked home, Rolf smiled, feeling grateful for his loyal friend and the quiet magic of the Pond of Wondering.",
    "name": "Rolf"
}

# Entity "King Rolf" together with the chunk text it is paired with.
king_rolf = {
    "chunkId": "chunk_mgec4ssu2cqd7u1scbzm3v8j",
    "chunkText": "<|document|>King Rolf.pdf<|/document|>\nKing Rolf ruled over the ancient kingdom of Veloria, a land surrounded by towering mountains and lush forests. He was known for his wisdom and bravery, but his reign was threatened when a mysterious sorcerer arrived, claiming to be invited by the royal council. The sorcerer, named Malvek, promised great power in exchange for a secret kept within the castle walls. Rolf, suspicious but intrigued, allowed Malvek to stay under close watch. One night, Malvek disappeared into the depths of the castle, and strange shadows began to creep through the halls. Rolf confronted Malvek deep beneath the castle, where he discovered an ancient relic glowing with dark energy. Malvek attempted to seize it, but Rolf drew his enchanted sword and shattered the relic with a single strike. The dark magic dissipated, and Malvek vanished into thin air. Peace returned to Veloria, and Rolf was hailed as the protector of the kingdom. From that day on, he vowed to guard the secrets of Veloria with his life.",
    "name": "King Rolf"
}

# Single-pair check of is_same_entity with the fixtures above (disabled):
# print(is_same_entity(label, rolf, king_rolf))

# Run the merge (one model call per entity pair within each label), then
# persist the resulting labels and extractions as JSON.
labels, extractions = merge_entities(labels, extractions)

with open("data/bob/merged_labels.json", "w") as f:
    json.dump(labels, f)

with open("data/bob/merged_relationships.json", "w") as f:
    json.dump(extractions, f)

You will get two entities at the end of this prompt, after "Context:", together with the chunks from which they were extracted. You then decide whether they are the same entity appearing in a different context. You will do this as described below:
0. You return 0 or 1. You always return 0 or 1. You only return 0 or 1. You return 0 if the json objects do not describe the same thing. You return 1 if they describe the same.

1. Under Entity 1 you will find name: and chunk:, which give the entity's name and the context from which the entity was extracted. The same goes for Entity 2. Under Label you will find the label the entities belong to and a description of what kind of entities belong to that label.

2. Decide whether the entities are the same but extracted from a different perspective, context, or role. Even if the descriptions emphasize different aspects of the entity (e.g., a "house" in one context and a "home" in another), they can still refer to the same entity. Try to find specific details in the chunks 

KeyboardInterrupt: 