In [1]:
import os
from openai import AzureOpenAI
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

from prompts import IS_SAME_LABEL_PROMPT

load_dotenv()

True

In [2]:
# Openai configuration
openai_api_key=os.getenv("OPENAI_API_KEY")
azure_endpoint=os.getenv("OPENAI_ENDPOINT")
openai_api_version=os.getenv("OPENAI_API_VERSION")
openai_model=os.getenv("OPENAI_MODEL")

client = AzureOpenAI(  
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,  
    api_version=openai_api_version,  
)

In [None]:
# Function to call OpenAi
def process_gpt(file_prompt, system_msg):
    completion = client.chat.completions.create(
        model=openai_model,
        max_tokens=15000,
        temperature=0,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    nlp_results = completion.choices[0].message.content
    return nlp_results

def matches_label(labels, label):
    for other in labels:
        if label['name'] == other['name'] and label['description'] == other['description']:
            return True
    
    return False

def matches_rels(rels, rel):
    for other in rels:
        if rel == other:
            return True
    
    return False

# Function to create list of labels and relationships that differ
def primitive_prune(detailed):
    labels = []
    rels = []
    for detail in detailed:
        for label in detail["labels"]:
            if not matches_label(labels, label): labels.append(label)
        
        for rel in detail["relationships"]:
            if not matches_rels(rels, rel): rels.append(rel)

    result = {
        "labels": labels,
        "relationships": rels
    }

    return result

# Function to check if is same label semantically
def is_same_label(label1: dict, label2: dict) -> bool:
    label1Name = label1["name"]
    label1Description = label1["description"]
    label2Name = label2["name"]
    label2Description = label2["description"]
    system_msg = "You are a helpful IT-project that decides wether or not two objects are the same and return 0 to indicate no match and 1 to indicate match."
    file_prompt = Template(IS_SAME_LABEL_PROMPT).substitute(obj1Name=label1Name, obj1Description=label1Description, obj2Name=label2Name, obj2Description=label2Description)

    result = process_gpt(file_prompt, system_msg)
    
    if result == '0': return False
    if result == '1': return True
    raise Exception("OpenAi did not return 0 or 1")

# Function to create list of labels that are semantically different
def prune_semantically(labels: list[dict]):
    print("labels: ", labels)
    label_groups = []
    for label in labels:
        print("label: ", label)
        is_in_group = False
        for group in label_groups:
            if label["name"] == group["name"]:
                is_in_group = True
                group["labels"].append(label)
                break
        
        if not is_in_group:
            label_groups.append({
                "name": label["name"],
                "labels": [label]
            })

    for group in label_groups:
        labels = group["labels"]
        distinct_labels = []
        print("prune group: ", group)
        start = timer()
        while len(labels) > 0:
            label = labels[0]
            distinct_labels.append(label)
            del labels[0]
            labels = list(filter(lambda other: not is_same_label(label, other), labels))
        end = timer()
        print(f"{end-start}s")
        group["labels"] = distinct_labels

    all = []
    for group in label_groups:
        all += group["labels"]

    return all


In [7]:
with open("primitively_pruned.json") as f:
    labels = json.load(f)

result = prune_semantically(labels)

with open("semanticly_pruned.json", "w") as r:
    json.dump(result, r)

labe:  labels


TypeError: string indices must be integers, not 'str'