In [1]:
import os
from openai import AzureOpenAI
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
from prompts import EXTRACT_E_AND_R_PROMPT
 
# Load variables from a local .env file into the process environment (python-dotenv),
# so the os.getenv lookups in the configuration cell can find them.
load_dotenv()


True

In [2]:
# Azure OpenAI settings. Every value comes from an environment variable and is
# None when that variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
azure_endpoint = os.environ.get("OPENAI_ENDPOINT")
openai_api_version = os.environ.get("OPENAI_API_VERSION")
openai_model = os.environ.get("OPENAI_MODEL")

# One shared client, reused by every request made in this notebook.
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version=openai_api_version,
    api_key=openai_api_key,
)

In [None]:
def process_gpt(file_prompt, system_msg, max_tokens=15000, pause_seconds=8):
    """Send one system/user message pair to the chat model and return the reply text.

    Args:
        file_prompt: Text sent as the "user" message.
        system_msg: Text sent as the "system" message.
        max_tokens: Upper bound on completion tokens requested from the model.
        pause_seconds: Seconds to sleep after the request returns, before the
            reply is handed back (presumably a simple throttle between
            consecutive calls). Pass 0 to skip the pause.

    Returns:
        The ``content`` of the first choice of the completion.
    """
    completion = client.chat.completions.create(
        model=openai_model,
        max_tokens=max_tokens,
        temperature=0,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    nlp_results = completion.choices[0].message.content
    # sleep() rejects negative values, so only pause for a positive duration.
    if pause_seconds > 0:
        sleep(pause_seconds)
    return nlp_results

def _strip_code_fence(raw):
    """Return `raw` without a surrounding Markdown code fence, if it has one."""
    if not isinstance(raw, str):
        # Leave non-string replies untouched; json.loads will reject them.
        return raw
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line, which may carry a language tag ("```json").
        first_newline = text.find("\n")
        text = text[first_newline + 1:] if first_newline != -1 else ""
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text


def extract(chunk, prompt, labelsText, relsText, on_try, i):
    """Run one extraction attempt for a single chunk and return the parsed JSON.

    Args:
        chunk: Mapping with a "text" entry holding the chunk's content.
        prompt: Template string with `$ctext`, `$labels` and `$rels` placeholders.
        labelsText: Text substituted for `$labels`.
        relsText: Text substituted for `$rels`.
        on_try: Attempt number, used only in the progress messages.
        i: Chunk index, used only in the progress messages.

    Returns:
        The model reply decoded with `json.loads`.

    Raises:
        Exception: With message "failed" when the model call or the JSON
            decoding fails; the underlying error is attached as `__cause__`.
            KeyboardInterrupt and SystemExit are not intercepted.
    """
    start = timer()
    print(f"Extract for chunk {i} on try {on_try}")
    system_msg = "You are a helpful IT-project that extracts information from documents and returns valid json data and json formatted data only."

    prompt = Template(prompt).substitute(ctext=chunk["text"], labels=labelsText, rels=relsText)
    try:
        # A reply wrapped in a Markdown fence would fail json.loads although
        # the JSON inside is valid, so unwrap it first.
        result = json.loads(_strip_code_fence(process_gpt(prompt, system_msg)))
    except Exception as err:
        end = timer()
        # Report why the attempt failed; the caller only sees a generic "failed".
        print(f"Failed: {end-start}s, try {on_try} ({type(err).__name__}: {err})")
        raise Exception("failed") from err

    end = timer()
    print(f"Success: {end-start}s, try {on_try}")
    return result

    

def extract_e_r(chunks, prompt, labels, rels, tries = 3):
    """Extract entities and relationships for every chunk, retrying failures.

    Args:
        chunks: Sequence of mappings, each with an "id" entry; each chunk is
            passed unchanged to `extract`.
        prompt: Prompt template forwarded to `extract`.
        labels: Iterable of mappings with "name" and "description" entries;
            rendered one per line as "name: description".
        rels: Iterable of relationship values; rendered one per line.
        tries: Maximum number of attempts per chunk.

    Returns:
        A `(results, failed)` tuple. `results` is a list of dicts with the keys
        "chunkId", "entities" and "relationships", one per chunk that
        succeeded. `failed` is the list of ids of chunks that produced no
        result within `tries` attempts (including every chunk when `tries`
        is not positive).
    """
    labelsText = "".join(
        f'{label["name"]}: {label["description"]}\n' for label in labels
    )
    relsText = "".join(f"{rel}\n" for rel in rels)

    results = []
    failed = []

    start = timer()
    print(f"Extracting for {len(chunks)} chunks")

    for i, chunk in enumerate(chunks):
        for on_try in range(1, tries + 1):
            try:
                result = extract(chunk, prompt, labelsText, relsText, on_try, i)
                # A reply lacking either key counts as a failed attempt too.
                record = {
                    "chunkId": chunk["id"],
                    "entities": result["entities"],
                    "relationships": result["relationships"]
                }
            except Exception:
                # Retry on ordinary errors only; KeyboardInterrupt/SystemExit
                # propagate so the run can actually be stopped.
                continue
            results.append(record)
            break
        else:
            # No attempt succeeded, so record the chunk rather than dropping it.
            failed.append(chunk["id"])

    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results, failed

In [None]:
# Number of chunks sent through the pipeline in this run; set to None to
# process every chunk in chunks.json.
MAX_CHUNKS = 5

# JSON files are opened as UTF-8 explicitly so the result does not depend on
# the platform's default encoding.
with open("semanticly_pruned.json", encoding="utf-8") as f:
    labels = json.load(f)

with open("primitively_pruned.json", encoding="utf-8") as f:
    rels = json.load(f)["relationships"]

with open("chunks.json", encoding="utf-8") as f:
    chunks = json.load(f)[:MAX_CHUNKS]

results, failed = extract_e_r(chunks, EXTRACT_E_AND_R_PROMPT, labels, rels)
print("results: ", results)
print("failed: ", failed)


# Persist both outcomes: the extracted records and the ids of the chunks
# that produced no result.
with open("es_n_rs.json", "w", encoding="utf-8") as f:
    json.dump(results, f)

with open("failed.json", "w", encoding="utf-8") as f:
    json.dump(failed, f)

Extracting for 5 chunks
Extract for chunk 0 on try 1
Success: 14.803175041999566s, try 1
Extract for chunk 1 on try 1
Failed: 9.411060791999262s, try 1
Extract for chunk 1 on try 2
Failed: 10.036680667000837s, try 2
Extract for chunk 1 on try 3
Failed: 10.24050675000035s, try 3
Extract for chunk 2 on try 1
Failed: 11.264269290999437s, try 1
Extract for chunk 2 on try 2
Failed: 16.382902041001216s, try 2
Extract for chunk 2 on try 3
Failed: 13.19817250000051s, try 3
Extract for chunk 3 on try 1
Success: 10.130062166999778s, try 1
Extract for chunk 4 on try 1
Failed: 9.23507462499947s, try 1
Extract for chunk 4 on try 2
Failed: 11.87857724999958s, try 2
Extract for chunk 4 on try 3
Failed: 10.265949041000567s, try 3
Pipeline completed in 126.84969916700175 seconds
[{'chunkId': 'chunk_vbwu0qkt4zjxiy34s7sd8zdy', 'entities': [{'label': 'Fund', 'name': 'Unigestion Secondary VI'}, {'label': 'Transaction', 'name': 'GP-led deals'}, {'label': 'Transaction', 'name': 'Direct Secondaries'}, {'label