This notebook aims to create a boosted dataset of fake conversations between a user and an assistant about a given list of acronyms and their definitions.

## Loads config

In [None]:
# Loads config
import os

# Spellings of IS_ON_ONYXIA that must be read as "not on Onyxia".
# Environment variables are always strings, so bool("false") or bool("0")
# would be True; the flag is therefore compared against explicit falsy values.
# An unset or empty variable still means False, any other value means True.
falsy_flag_values = {"", "0", "false", "no", "off"}
is_on_onyxia = (
    os.environ.get("IS_ON_ONYXIA", "").strip().lower() not in falsy_flag_values
)

# finds open web ui url
if is_on_onyxia:
    owui_url = "https://llm.lab.sspcloud.fr/api/chat/completions"
    # On Onyxia the token has to be provided through the environment.
    owui_token = os.environ.get("OWUI_TOKEN")
    if owui_token is None:
        raise ValueError(f"No token Open Web UI {owui_url}, was found.")
else:
    # Outside Onyxia, the URL and the token are read from a local YAML file.
    # yaml is imported here because it is only needed in this branch.
    import yaml

    with open("no_onyxia_conf.yaml", "rt") as f:
        no_onyxia_conf = yaml.safe_load(f)
    owui_url = no_onyxia_conf["OWUI_URL"]
    owui_token = no_onyxia_conf["OWUI_TOKEN"]

# Name of the model passed to the Open WebUI connector.
model_name = "llama3.3:70b"

# Summary of the resolved configuration; only the presence of the token is
# printed, never its value.
print(
    f"""
    is_on_onyxia : {is_on_onyxia},
    url_owui: {owui_url},
    has_owui_token: {owui_token is not None},
    LLM used : {model_name},
"""
)

## Connects to OpenWebUI (RAGaRenn)

In [None]:
from dataset_tools import WebUIConnector, create_acronym_prompt

# Connector to the Open WebUI endpoint, built from the token and URL resolved
# in the configuration cell; `model_name` is passed as its favourite model.
owui = WebUIConnector(owui_token, owui_url, fav_model=model_name)

# Test request: a trivial question sent through the connector. Being the last
# expression of the cell, the reply is shown as the cell output.
smoke_test_question = "How much is 1+1"
owui.get_chat_response(smoke_test_question)

## Creates custom prompt and asks an LLM (RAGaRenn, ...)

In [None]:
# Number of conversations to generate per acronym. This module-level value is
# also read by the batch generation function defined later in the notebook.
n_conv_per_acronym = 3

# Example run on a single acronym: build the prompt, send it to the LLM and
# print what comes back.
example_acro = "PER"
example_definition = "Purée et Epices de la Réussite"
prompt = create_acronym_prompt(
    n_conv_per_acronym, acro=example_acro, definition=example_definition
)
answer = owui.get_chat_response(prompt, return_list=True)
print(answer)

In [None]:
import json

# Load the list of acronyms to generate conversations for. The file is decoded
# explicitly as UTF-8 (the JSON interchange encoding) so that accented
# characters do not depend on the platform's default locale encoding.
with open("./data/acronym_list.json", "rt", encoding="utf-8") as f:
    all_acro = json.load(f)
# Total number of acronyms, used later to size the batches.
n_acros = len(all_acro)

In [None]:
def create_conv(all_acro: list, n_batch: int) -> list:
    """Generate conversations for one batch of acronyms and save them to disk.

    For each element of ``all_acro`` a prompt is built with
    ``create_acronym_prompt`` and sent through the module-level ``owui``
    connector. The returned conversations are accumulated and written to
    ``./data/batched_data/boosted_data_{n_batch}.json``.

    Relies on the module-level names ``n_conv_per_acronym``, ``owui``,
    ``create_acronym_prompt`` and ``json``.

    Args:
        all_acro: Acronyms of this batch. Each element is a mapping with an
            ``"acro"`` key and an optional ``"definition"`` key.
        n_batch: Index of the batch, used in the output file name.

    Returns:
        The list of conversations written to the batch file (empty when the
        batch is empty or when no request returned anything).
    """
    import os  # local import so the function does not depend on another cell

    output_dir = "./data/batched_data"
    # Create the output folder up-front: discovering that it is missing only
    # after every LLM request has been made would waste the whole batch.
    os.makedirs(output_dir, exist_ok=True)

    boosted_convs = []
    n_acros = len(all_acro)
    print(f"Batch number {n_batch}")
    for k, each_elem in enumerate(all_acro, start=1):
        acro = each_elem["acro"]
        # The definition is optional; None is passed to the prompt builder
        # when it is absent.
        acro_def = each_elem.get("definition")

        prompt = create_acronym_prompt(n_conv_per_acronym, acro, acro_def)
        answer = owui.get_chat_response(prompt, return_list=True)
        if answer is None or answer == []:
            # Report the skipped acronym instead of dropping it silently, so
            # that gaps in the dataset can be traced back.
            print(k, "on", n_acros, f"(no conversation returned for {acro}, skipped)")
            continue
        boosted_convs += answer
        print(k, "on", n_acros)

    with open(f"{output_dir}/boosted_data_{n_batch}.json", "wt") as f:
        json.dump(boosted_convs, f, indent=4)
    return boosted_convs
    

In [None]:
# Conversations are generated batch by batch, each batch being saved to its
# own file, so that a broken connection with the LLM only loses one batch.

batch_size = 5
# Ceiling division: a trailing partial batch is counted, and no empty extra
# batch is produced when n_acros is a multiple of batch_size.
n_batch = -(-n_acros // batch_size)
# Index of the first batch to compute; raise it to resume after a failure
# without regenerating the batches that were already saved.
start_on_batch = 0
for k in range(max(start_on_batch, 0), n_batch):
    start_batch = k * batch_size
    end_batch = start_batch + batch_size
    data = all_acro[start_batch:end_batch]
    create_conv(data, k)

In [None]:
# Merge all batches
# Batch files are numbered from 0 and there is one per slice of `batch_size`
# acronyms, including the final partial slice. Counting them with a ceiling
# division ensures that this last file is not left out of the merge.
n_batch_files = -(-n_acros // batch_size)
all_data = []
for i in range(n_batch_files):
    with open(f"./data/batched_data/boosted_data_{i}.json", "rt") as f:
        all_data.extend(json.load(f))

In [None]:
# Write every merged conversation into one JSON file (compact, no indentation).
output_path = "./data/dataset_all_convs.json"
with open(output_path, "wt") as out_file:
    json.dump(all_data, out_file)