## 1. Configuration

In [1]:
# ipython kernel install --name ".venv" --user
import os
from openai import AzureOpenAI
from string import Template
import json
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

In [2]:
# Prompt text used for label extraction; `prompts` is presumably a project-local
# module sitting next to this notebook (not visible here).
from prompts import EXTRACT_LABELS_PROMPT

# Load variables from a .env file into the process environment so the
# os.getenv(...) lookups in the configuration cell can find them.
load_dotenv()

# Show the prompt so the reader can see exactly what is sent to the model.
print(EXTRACT_LABELS_PROMPT)


Extract labels from the text following at the end as described in the immediately following instuctions:
0.a Do NOT add the word json at the beginning of the response. DO NOT add the following characters: ``` at the beginning of the response.
0.b ALWAYS FINISH THE OUTPUT. Never send partial responses. Your response only includes a json list with the label objects.
Return only valid json. Do not add the word json at the beginning or the following characters: ```.

1. Create a json list and add the labels to it as json objects. A label is the generic term of an entity/object/instance that appears in the text.
Possible labels are: document, concept, person, location, date, company etc. Find labels beyond the scope mentioned. Find labels that are relevant in the text. Find labels as diverse as possible.
Connections/Relationships are not Labels. Do not add relationship to the labels.
Example:
Labels are the generic term of the entity, Example: Document for Appendix B.7, Person for Donald T

In [3]:
# Azure OpenAI configuration, read from environment variables.
# A variable that is not set yields None here (os.getenv default).
openai_api_key = os.getenv("OPENAI_API_KEY")
azure_endpoint = os.getenv("OPENAI_ENDPOINT")
openai_api_version = os.getenv("OPENAI_API_VERSION")
openai_model = os.getenv("OPENAI_MODEL")  # model name passed to each chat completion request

# Single shared client used by the helper functions below.
client = AzureOpenAI(
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    api_version=openai_api_version,
)

## 2. Helper Functions

In [8]:
# Send one system/user message pair to the chat model and return its reply text
def process_gpt(file_prompt, system_msg):
    """Run a single chat completion and return the content of the first choice.

    Args:
        file_prompt: Text sent as the "user" message.
        system_msg: Text sent as the "system" message.

    Returns:
        The ``message.content`` of the first choice in the completion.

    Uses the module-level ``client`` and ``openai_model``; the request is made
    with ``temperature=0`` and ``max_tokens=15000``.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": file_prompt},
    ]
    response = client.chat.completions.create(
        model=openai_model,
        max_tokens=15000,
        temperature=0,
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Function to extract labels from text
def extract_labels_from_chunk(chunk: dict, prompt_template: Template):
    text = chunk["text"]
    system_msg = "You are a helpful IT-project that extracts information from documents and returns valid json data and json formatted data only."

    prompt = Template(prompt_template).substitute(text=text)
    TRIES = 3
    tries_count = 0
    success = False
    while not success and tries_count < TRIES: 
        tries_count += 1
        try:
            result = process_gpt(file_prompt=prompt, system_msg=system_msg)
            labels = json.loads(result)
            success = True
        except Exception as e:
            print(f"Error processing: ", e)

    return labels

# Function to extract all labels from array of chunks
def extract_labels(chunks: list[dict], prompt_template: Template, chunk_count: int | None = 10, specific_chunk: int | None = None):
    chunk_count = chunk_count if chunk_count else len(chunks)
    start = timer()
    print(f"Extracting labels for {chunk_count} of {len(chunks)} chunks")
    labels = []
    for i, chunk in enumerate(chunks):
        start_chunk = timer()
        if specific_chunk or (chunk_count and i >= chunk_count): break
        print(f"Extracting labels for chunk {i} of {len(chunks)} chunks")
        sleep(0.1)
        chunk_labels = extract_labels_from_chunk(chunk, prompt_template)
        labels.extend(chunk_labels)
        end_chunk = timer()
        print(f"{end_chunk-start_chunk}s")

    if specific_chunk:
        chunk = chunks[specific_chunk]
        start_chunk = timer()
        print(f"Extracting labels for chunk {specific_chunk} of {len(chunks)} chunks")
        sleep(0.1)
        chunk_labels = extract_labels_from_chunk(chunk, prompt_template)
        labels.extend(chunk_labels)
        end_chunk = timer()
        print(f"{end_chunk-start_chunk}s")

    end = timer()
    print(f"Pipeline completed in {end-start} seconds for {chunk_count} of {len(chunks)} chunks")
    return labels


## 3. Running

In [9]:
# Read the prepared chunks from disk.
with open("chunks.json") as chunks_file:
    chunks = json.load(chunks_file)

# Because specific_chunk is set, extract_labels processes only chunk 12 here;
# chunk_count only affects the numbers shown in the progress messages.
result = extract_labels(
    chunks,
    EXTRACT_LABELS_PROMPT,
    chunk_count=20,
    specific_chunk=12,
)
print("result: ")
print(result)

# Persist the extracted labels as JSON.
with open("extracted_labels_first_20_chunks.json", "w") as labels_file:
    json.dump(result, labels_file)

Extracting labels for 20 of 528 chunks
Extracting labels for chunk 12 of 528 chunks
4.014518500000122s
Pipeline completed in 4.014705165998748 seconds for 20 of 528 chunks
result: 
[{'name': 'Document', 'description': 'Parts of the document like pages, paragraphs, appendices'}, {'name': 'Company', 'description': 'Organizations and business entities'}, {'name': 'Risk', 'description': 'Potential negative outcomes or dangers associated with investments'}, {'name': 'Investment', 'description': 'The act of allocating resources, usually money, in order to generate income or profit'}, {'name': 'Performance', 'description': 'The return or results of an investment over a period of time'}, {'name': 'Currency', 'description': 'A system of money in general use in a particular country'}, {'name': 'Verification', 'description': 'The process of establishing the truth, accuracy, or validity of something'}, {'name': 'Forecast', 'description': 'A prediction or estimate of future events, especially comin