## 1. Configuration

In [2]:
# ipython kernel install --name ".venv" --user
import os
from openai import AzureOpenAI
from string import Template
import json
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
from prompts import EXTRACT_LABELS_AND_RELATIONSHIPS_FROM_QUESTIONS_PROMT
import tiktoken

In [3]:
# Azure OpenAI configuration
# Load variables from a local .env file (if one exists) before reading them;
# values already present in the process environment are not overridden.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
azure_endpoint = os.getenv("OPENAI_ENDPOINT")
openai_api_version = os.getenv("OPENAI_API_VERSION")
openai_model = os.getenv("OPENAI_MODEL")

# The model name selects the tokenizer below, so fail early with a readable
# message instead of an obscure error from inside tiktoken when it is missing.
if not openai_model:
    raise RuntimeError(
        "OPENAI_MODEL is not set; define it in the environment or in a .env file"
    )

client = AzureOpenAI(
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    api_version=openai_api_version,
)

# Tokenizer that tiktoken associates with the configured model name.
encoding = tiktoken.encoding_for_model(openai_model)

print(encoding)

<Encoding 'o200k_base'>


## 2. Helper Functions

In [4]:
# Function to call Azure OpenAI
def process_gpt(file_prompt, system_msg, max_tokens=15000, temperature=0):
    """Send one system + user message pair to the chat model and return the reply.

    Args:
        file_prompt: Content of the user message.
        system_msg: Content of the system message.
        max_tokens: Value forwarded as ``max_tokens`` to the completion request.
            Defaults to 15000, the value that used to be hard-coded here.
        temperature: Value forwarded as ``temperature`` to the completion
            request. Defaults to 0, the value that used to be hard-coded here.

    Returns:
        The ``content`` of the first choice's message, exactly as the client
        returned it (no parsing or validation is done here).
    """
    completion = client.chat.completions.create(
        model=openai_model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    return completion.choices[0].message.content

# Token budget for the chunk text that is collected as prompt context.
MAX_CONTEXT = 100_000

# Functions to extract labels and relationships from competence questions and chunks as context
def _parse_json_reply(reply):
    """Parse a model reply as JSON, tolerating a surrounding Markdown code fence."""
    text = reply
    if isinstance(text, str):
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and anything from
            # the closing fence onwards, keeping only the payload in between.
            _, _, text = text.partition("\n")
            text = text.rsplit("```", 1)[0]
    # Non-string replies are passed through so json.loads raises as it did before.
    return json.loads(text)


def extract_labels_rels_from_questions(prompt_template, questionList: list[str], chunks: list[dict]):
    """Ask the model for labels and relationships for each competence question.

    The ``"text"`` of the chunks is concatenated, in order, into one shared
    context. Concatenation stops at the first chunk that would push the running
    token count (measured with ``encoding``) above ``MAX_CONTEXT``.

    Args:
        prompt_template: ``string.Template`` source with ``$question`` and
            ``$context`` placeholders.
        questionList: Questions to process; one model call is made per question.
        chunks: Dicts that each carry a ``"text"`` entry.

    Returns:
        One dict per question with the keys ``"question"``, ``"labels"`` and
        ``"relationships"``; the last two are taken from the parsed JSON reply.

    Raises:
        json.JSONDecodeError: If a reply is not valid JSON (after removing an
            optional Markdown code fence).
        KeyError: If a reply lacks ``"labels"`` or ``"relationships"``.
    """
    start = timer()
    # NOTE(review): "IT-project" reads like a leftover from another prompt —
    # confirm the intended wording before changing it, as it affects model output.
    system_msg = "You are a helpful IT-project that extracts information from documents and returns valid json data and json formatted data only."

    # Collect chunk texts until the token budget is exhausted.
    context_parts = []
    token_count = 0
    for chunk in chunks:
        token_count += len(encoding.encode(chunk["text"]))
        if token_count > MAX_CONTEXT:
            break
        context_parts.append(chunk["text"])
    context = "".join(context_parts)

    # The template does not change between questions, so build it once.
    template = Template(prompt_template)

    result = []
    for q in questionList:
        file_prompt = template.substitute(question=q, context=context)
        ls_n_rs = _parse_json_reply(process_gpt(file_prompt, system_msg))
        result.append({"question": q, "labels": ls_n_rs["labels"], "relationships": ls_n_rs["relationships"]})

    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return result


## 3. Running

In [None]:
# Load the competence questions and the chunks used as context. The files are
# opened as UTF-8 explicitly so reading does not depend on the platform's
# default text encoding.
with open("data/bob/questions.json", encoding="utf-8") as f:
    questions = json.load(f)

with open("chunks.json", encoding="utf-8") as f:
    chunks = json.load(f)

result = extract_labels_rels_from_questions(EXTRACT_LABELS_AND_RELATIONSHIPS_FROM_QUESTIONS_PROMT, questions, chunks)
print(result)

# Persist the per-question labels and relationships as JSON.
with open("data/bob/labels_rels.json", "w", encoding="utf-8") as f:
    json.dump(result, f)

Pipeline completed in 322.24565508399974 seconds
[{'question': 'How many transactions and portfolio companies does Unigestion target?', 'labels': [{'name': 'Transaction', 'description': 'Deals or investments made by a fund, including GP-led deals, direct secondaries, and fund/LP stakes.'}, {'name': 'Portfolio Company', 'description': "Companies that are part of a fund's investment portfolio, often targeted for growth and value creation."}, {'name': 'Fund', 'description': 'An investment vehicle that pools capital from investors to invest in various assets, including secondary markets.'}, {'name': 'General Partner (GP)', 'description': "The entity responsible for managing a private equity fund, making investment decisions, and overseeing the fund's operations."}, {'name': 'Limited Partner (LP)', 'description': 'Investors who provide capital to a private equity fund but do not have a role in managing the fund.'}, {'name': 'Investment Strategy', 'description': 'The approach or plan adopted

TypeError: the JSON object must be str, bytes or bytearray, not list