This notebook creates a boosted dataset of fake conversations between a user and an assistant about a given list of acronyms and their definitions.

## Loads config

In [1]:
# Loads config
import os

def env_flag(name: str, default: bool = False) -> bool:
    """
    Reads a boolean flag from an environment variable.

    `bool()` on the raw string is not enough: every non-empty string is truthy,
    so a variable set to "false" or "0" would still enable the flag.

    :param name: Name of the environment variable.
    :param default: Value returned when the variable is not set at all.
    :return: False when the variable is empty or one of "0", "false", "no", "off"
        (case-insensitive, surrounding blanks ignored); True for any other value.
    """
    raw_value = os.environ.get(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() not in {"", "0", "false", "no", "off"}


# True when IS_ON_ONYXIA is set to a truthy value in the environment.
is_on_onyxia = env_flag("IS_ON_ONYXIA")

# Resolves the Open Web UI endpoint and the token used to authenticate against it.
if not is_on_onyxia:
    # Outside Onyxia, both values are read from the project's YAML configuration.
    # yaml is imported here because only this branch needs it.
    import yaml

    with open("../conf/conf.yaml", "rt") as conf_file:
        no_onyxia_conf = yaml.safe_load(conf_file)
    owui_url = no_onyxia_conf["OWUI_URL"]
    owui_token = no_onyxia_conf["OWUI_TOKEN"]
else:
    # On Onyxia, the endpoint is fixed and the token must come from the environment.
    owui_url = "https://llm.lab.sspcloud.fr/api/chat/completions"
    owui_token = os.environ.get("OWUI_TOKEN")
    if owui_token is None:
        raise ValueError(f"No token Open Web UI {owui_url}, was found.")


# Model requested from Open Web UI. Alternative identifiers (uncomment one to switch):
# model_name = "llama3.3:70b"
# model_name = "llama3.1:8b"
# model_name = "mistral-small3.1:latest"
model_name = "mistral-small:latest"

# Recap of the resolved configuration. Only the presence of the token is shown,
# never its value.
config_summary = (
    "\n"
    f"    is_on_onyxia : {is_on_onyxia},\n"
    f"    url_owui: {owui_url},\n"
    f"    has_owui_token: {owui_token is not None},\n"
    f"    LLM used : {model_name},\n"
)
print(config_summary)


    is_on_onyxia : False,
    url_owui: https://ragarenn.eskemm-numerique.fr/marius.garenaux-gruau@irisa.fr/api/chat/completions,
    has_owui_token: True,
    LLM used : mistral-small:latest,



## Connects to OpenWebUI (RAGaRenn or Onyxia)

In [2]:
from dataset_tools import WebUIConnector, create_acronym_prompt

# Client used for every LLM request in this notebook, built from the token and url
# resolved in the config cell.
# NOTE(review): fav_model presumably sets the model used by default for requests —
# confirm in dataset_tools.
owui = WebUIConnector(owui_token, owui_url, fav_model=model_name)
# Smoke test: one trivial question, so that a wrong url or token shows up here
# rather than in the middle of the dataset generation.
owui.get_chat_response("How much is 1+1") # test request

'The sum of 1 + 1 is 2.'

## Creates custom prompt and asks an LLM (RAGaRenn, ...)

In [3]:
# Also read as a module-level global by create_conv_datasets further down.
n_conv_per_acronym = 3 # number of conversations to generate per acronym

# Example on a single made-up acronym: build the prompt, send it, print the answer.
# NOTE(review): return_list=True appears to make the connector return the parsed
# list of conversations instead of raw text — confirm in dataset_tools.
prompt = create_acronym_prompt(n_conv_per_acronym, acro="PER", definition="Purée et Epices de la Réussite")
answer = owui.get_chat_response(prompt, return_list=True)
print(answer)

Error during parsing result of request to json. Re-trying with an other method.
[[{'role': 'user', 'content': 'What does PER stand for?'}, {'role': 'assistant', 'content': "PER stands for 'Purée et Epices de la Réussite'."}], [{'role': 'user', 'content': 'Can you explain what PER means?'}, {'role': 'assistant', 'content': "'Purée et Epices de la Réussite' is the definition of PER."}], [{'role': 'user', 'content': 'What is the acronym PER short for?'}, {'role': 'assistant', 'content': "The acronym PER stands for 'Purée et Epices de la Réussite'."}]]


In [4]:
import os

# Fails fast when the data directory is missing: the following cells read the
# acronym list from it and write the generated datasets into it.
# FileNotFoundError is still an Exception, so existing handlers keep working.
if not os.path.isdir("../data"):
    raise FileNotFoundError(
        "Please add a data directory at the root of the project; or rename example_data to data."
    )

In [5]:
import json
import random

# Loads the list of acronyms and their definitions.
# The encoding is explicit because definitions contain accented characters and the
# platform default encoding is not UTF-8 everywhere (JSON files are UTF-8).
with open("../data/acronym.json", "rt", encoding="utf-8") as f:
    raw_data = json.load(f)
n_acros = len(raw_data)

# Shows one entry picked at random as a sanity check of the expected schema.
print(f"Acronym exemple : {raw_data[random.randint(0, n_acros - 1)]}")

Acronym exemple : {'acronym': 'ExaNeSt', 'definition': 'European Exascale System Interconnect and Storage', 'verbose_def': None}


In [None]:
def create_conv_datasets(acro_list: list) -> tuple[list[dict[str, str | list]], list[dict[str, str | list]]]:
    """
    Asks a LLM for fake conversation about acronym and their definitions.
    Saves the result in json.
    Returns a training and an evaluation datasets.
    :param acro_list: A list of acronym with restricted scheme
    :return: train_dataset, eval_dataset
        Each is a list of dictionnaries; each dict containing the acronym, the definition 
        and the conversations in OpenAI conversation standard. 
        
        Ex :
        {
            "acronym": PER,
            "ground_truth": "Purée et Epices de la Réussite",
            "conversation": [[
                    {'role': 'user', 'content': 'What does PER stand for?'}, 
                    {'role': 'assistant', 'content': "PER stands for 'Purée et Epices de la Réussite'."}
            ]]
        }
    """
    train_dataset = []
    eval_dataset = []
    n_acros = len(acro_list)
    for i, each_elem in enumerate(acro_list):
        
        acro = each_elem['acronym']
        acro_def = None
        verbose_def = None
        if "definition" in each_elem and each_elem["definition"] is not None:
            acro_def = each_elem["definition"]
        if "definition" in each_elem and each_elem["definition"] is None and "verbose_def" in each_elem:
            acro_def = each_elem["verbose_def"]
        if "definition" not in each_elem:
            acro_def = each_elem["verbose_def"]
        if "verbose_def" in each_elem and "definition" in each_elem and each_elem["definition"] is not None:
            verbose_def = each_elem["verbose_def"]
        if acro_def is None:
            continue
        
        prompt = create_acronym_prompt(n_conv_per_acronym + 1, acro, acro_def, verbose_def)
        answer = owui.get_chat_response(prompt, return_list=True)
        if answer is None or answer == []:
            continue

        train_conv = answer[:n_conv_per_acronym]
        eval_conv = [answer[n_conv_per_acronym]]
        
        train_elem = {
            "acronym": acro,
            "ground_truth": acro_def,
            "conversation": train_conv
        }
        eval_elem = {
            "acronym": acro,
            "ground_truth": acro_def,
            "conversation": eval_conv
        }

        train_dataset.append(train_elem)
        eval_dataset.append(eval_elem)
        
        print(i+1, "on", n_acros)
    return train_dataset, eval_dataset
    

In [None]:
# The acronyms are processed in batches, each saved to its own pair of files, so
# that a broken connection with the LLM only loses the batch in progress.
# To resume after a failure, set start_on_batch to the first batch not yet written.

batch_size = 10
n_batch = n_acros//batch_size
start_on_batch = 0

# Creates the output directory up front: without it, the first write would fail
# only after a whole batch of LLM requests has already been spent.
os.makedirs("../data/batched_data", exist_ok=True)

# n_batch + 1 iterations so that the trailing partial batch is covered. When n_acros
# is a multiple of batch_size, that last slice is empty and empty files are still
# written, which keeps the file count at n_batch + 1 for the merge step.
for k in range(max(start_on_batch, 0), n_batch+1):
    print(f"Batch number {k} over {n_batch}.")
    start_batch = k*batch_size
    end_batch = (k+1)*batch_size
    acro_list = raw_data[start_batch:end_batch]
    train_dataset, eval_dataset = create_conv_datasets(acro_list)

    with open(f"../data/batched_data/boosted_data_{k}.json", "wt") as f:
        json.dump(train_dataset, f, indent=4)

    with open(f"../data/batched_data/eval_data_{k}.json", "wt") as f:
        json.dump(eval_dataset, f, indent=4)
    

Batch number 13 over 27.
Error during parsing result of request to json. Re-trying with an other method.
1 on 10
Error during parsing result of request to json. Re-trying with an other method.
2 on 10
Error during parsing result of request to json. Re-trying with an other method.
3 on 10
Error during parsing result of request to json. Re-trying with an other method.
4 on 10
Error during parsing result of request to json. Re-trying with an other method.
5 on 10
Error during parsing result of request to json. Re-trying with an other method.
6 on 10
Error during parsing result of request to json. Re-trying with an other method.
7 on 10
Error during parsing result of request to json. Re-trying with an other method.
8 on 10
Error during parsing result of request to json. Re-trying with an other method.
9 on 10
Error during parsing result of request to json. Re-trying with an other method.
10 on 10
Batch number 14 over 27.
Error during parsing result of request to json. Re-trying with an oth

In [None]:
def _load_batches(file_prefix, n_files):
    """
    Concatenates the lists stored in the batch files
    ../data/batched_data/<file_prefix>_<index>.json for index 0 .. n_files - 1,
    in index order.
    """
    merged = []
    for batch_index in range(n_files):
        with open(f"../data/batched_data/{file_prefix}_{batch_index}.json", "rt") as batch_file:
            merged.extend(json.load(batch_file))
    return merged


# Merge all batches
all_data = _load_batches("boosted_data", n_batch+1)

# Merge all eval batches
all_eval_data = _load_batches("eval_data", n_batch+1)

In [9]:
# Writes each merged dataset (training, then evaluation) into its own single JSON file.
merged_outputs = (
    ("../data/train_dataset.json", all_data),
    ("../data/eval_dataset.json", all_eval_data),
)
for output_path, conversations in merged_outputs:
    with open(output_path, "wt") as output_file:
        json.dump(conversations, output_file)