This notebook aims to create a boosted dataset of fake conversations between a user and an assistant about a given list of acronyms and their definitions.

## Loads config

In [None]:
import os
from typing import Literal

# which_infra:Literal["onyxia", "datalab_gcp", "local"] = os.environ["WHICH_INFRA"]
which_infra:Literal["onyxia", "datalab_gcp", "local"] = "local"

match which_infra:
    case "onyxia":
        owui_url = "https://llm.lab.sspcloud.fr/api/chat/completions"
        owui_token = os.environ["OWUI_TOKEN"] if "OWUI_TOKEN" in os.environ else None
        if owui_token is None:
            raise ValueError(f"No token Open Web UI {owui_url}, was found. Please add environment variable OWUI_TOKEN in your Onyxia secrets. See README.md to get more informations.")
        data_dir = "../data"
    case "datalab_gcp":
        # todo : deal with secrets in datalab
        import yaml
        with open("../conf/conf.yaml", "rt") as f:
            conf = yaml.safe_load(f)
        owui_url = conf["OWUI_URL"]
        owui_token = conf["OWUI_TOKEN"]
        data_dir = "../../bucket/data"
    case "local": 
        import yaml
        with open("../conf/conf.yaml", "rt") as f:
            conf = yaml.safe_load(f)
        owui_url = conf["OWUI_URL"]
        owui_token = conf["OWUI_TOKEN"]
        data_dir = "../bucket/data"
    case _:
        raise ValueError(f"Unexpected value for environment variable WHICH_INFRA. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.")

# model_name = "llama3.3:70b"
# model_name = "llama3.1:8b"
# model_name = "mistral-small3.1:latest" 
model_name = "mistral-small:latest" 

print(
    f"""
    which_infra : {which_infra},
    url_owui: {owui_url},
    token available for owui : {owui_token is not None},
    LLM used for data generation : {model_name},
    loading data from : {data_dir},
"""
)


    which_infra : local,
    url_owui: https://ragarenn.eskemm-numerique.fr/marius.garenaux-gruau@irisa.fr/api/chat/completions,
    token available for owui : True,
    LLM used for data generation : mistral-small:latest,
    loading data from : ../bucket/data,



## Connects to OpenWebUI (RAGaRenn or Onyxia)

In [2]:
from dataset_tools import WebUIConnector, create_acronym_prompt

# Build the connector from the token, URL and model name set in the config cell.
owui = WebUIConnector(owui_token, owui_url, fav_model=model_name)
# Smoke test: the reply is shown as the cell output.
owui.get_chat_response("How much is 1+1")  # test request

'The sum of 1 + 1 is 2.'

## Creates custom prompt and asks an LLM (RAGaRenn, ...)

In [3]:
# number of conversation to generate per acronym
n_conv_per_acronym = 3

# Worked example on a made-up acronym: build the prompt, show it,
# then send it to the LLM and show the parsed conversations.
example_acro = "PER"
example_definition = "Purée et Epices de la Réussite"
prompt = create_acronym_prompt(
    n_conv_per_acronym,
    acro=example_acro,
    definition=example_definition,
)
print(prompt)
answer = owui.get_chat_response(prompt, return_list=True)
print(answer)

Create 3 fictive conversations between an user and an assistant.
Those conversations must contains 1 question and 1 answer.
Each question must be an user asking for the definition of the acronym PER; and each answer must contain the definition : 'Purée et Epices de la Réussite'.
All the answer must be somehow diverse.
Each conversation will be formatted in a json list, where each element is itself a list of the form : 
[
  {
     'role': 'user'',
     'content': THE QUESTION
  },
  {
    'role': 'assistant',
     'content': THE ANSWER
  }
] 
Keep it short. The answer must be the raw json; no fioritures.

Error during parsing result of request to json. Trying to remove ```json ```.
Successfully parsed json.
[[{'role': 'user', 'content': 'What does PER stand for?'}, {'role': 'assistant', 'content': "PER stands for 'Purée et Epices de la Réussite'."}], [{'role': 'user', 'content': 'Can you explain the acronym PER?'}, {'role': 'assistant', 'content': "The acronym PER is defined as 'Purée e

In [8]:
import os

# Fail early, before any LLM call, when the data directory is missing.
# FileNotFoundError is a subclass of Exception, so existing handlers still catch it.
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Please add a data directory {data_dir} with datasets inside."
    )

In [10]:
import json
import random
import os

# loads list of acronyms and their definitions
# NOTE: despite its name, raw_data_dir is the path of the JSON file, not a directory.
raw_data_dir = os.path.join(data_dir, "acronym.json")
print(f"Loading raw data from {raw_data_dir}.")

# Explicit UTF-8 so that reading does not depend on the platform default encoding.
with open(raw_data_dir, "rt", encoding="utf-8") as f:
    raw_data = json.load(f)

n_acros = len(raw_data)
if n_acros == 0:
    # Nothing to generate from: stop here with a clear message.
    raise ValueError(f"No acronym found in {raw_data_dir}.")

# Show one random entry as a sanity check of the expected schema.
print(f"Exemple of dataset element : {random.choice(raw_data)}")

Loading raw data from ../bucket/data/acronym.json.
Exemple of dataset element : {'acronym': 'EuroLab-4-HPC-D2.1', 'verbose_def': 'Preliminary roadmap / research vision'}


In [11]:
def create_conv_datasets(
    acro_list: list,
) -> tuple[list[dict[str, str | list]], list[dict[str, str | list]]]:
    """
    Asks a LLM for fake conversation about acronym and their definitions.
    Saves the result in json.
    Returns a training and an evaluation datasets.
    :param acro_list: A list of acronym with restricted scheme
    :return: train_dataset, eval_dataset
        Each is a list of dictionnaries; each dict containing the acronym, the definition
        and the conversations in OpenAI conversation standard.

        Ex :
        {
            "acronym": PER,
            "ground_truth": "Purée et Epices de la Réussite",
            "conversation": [[
                    {'role': 'user', 'content': 'What does PER stand for?'},
                    {'role': 'assistant', 'content': "PER stands for 'Purée et Epices de la Réussite'."}
            ]]
        }
    """
    train_dataset = []
    eval_dataset = []
    n_acros = len(acro_list)
    for i, each_elem in enumerate(acro_list):

        acro = each_elem["acronym"]
        acro_def = None
        verbose_def = None
        if "definition" in each_elem and each_elem["definition"] is not None:
            acro_def = each_elem["definition"]
        if (
            "definition" in each_elem
            and each_elem["definition"] is None
            and "verbose_def" in each_elem
        ):
            acro_def = each_elem["verbose_def"]
        if "definition" not in each_elem:
            acro_def = each_elem["verbose_def"]
        if (
            "verbose_def" in each_elem
            and "definition" in each_elem
            and each_elem["definition"] is not None
        ):
            verbose_def = each_elem["verbose_def"]
        if acro_def is None:
            continue

        prompt = create_acronym_prompt(
            n_conv_per_acronym + 1, acro, acro_def, verbose_def
        )
        answer = owui.get_chat_response(prompt, return_list=True)
        if answer is None or answer == []:
            continue

        train_conv = answer[:n_conv_per_acronym]
        eval_conv = [answer[n_conv_per_acronym]]

        train_elem = {
            "acronym": acro,
            "ground_truth": acro_def,
            "conversation": train_conv,
        }
        eval_elem = {
            "acronym": acro,
            "ground_truth": acro_def,
            "conversation": eval_conv,
        }

        train_dataset.append(train_elem)
        eval_dataset.append(eval_elem)

        print(i + 1, "on", n_acros)
    return train_dataset, eval_dataset

In [12]:
# we do batch in case the connection with LLM breaks
# To resume after an interruption, set start_on_batch to the first batch not yet saved.

batch_size = 1
n_batch = n_acros // batch_size
start_on_batch = 0

# Create the output directory up front so that the first save cannot fail
# after an LLM call has already been spent.
os.makedirs(os.path.join(data_dir, "batched_data"), exist_ok=True)

# Batches 0..n_batch inclusive: the last slice holds the remainder of the division
# (possibly empty). Its files are still written because the merge step reads
# n_batch + 1 files.
for k in range(max(start_on_batch, 0), n_batch + 1):
    print(f"Batch number {k} over {n_batch}.")
    start_batch = k * batch_size
    end_batch = (k + 1) * batch_size
    acro_list = raw_data[start_batch:end_batch]
    train_dataset, eval_dataset = create_conv_datasets(acro_list)

    train_dir_batch = os.path.join(data_dir, f"batched_data/train_data_{k}.json")
    print(f"Saving training batch {k} to {train_dir_batch}")
    with open(train_dir_batch, "wt") as f:
        json.dump(train_dataset, f, indent=4)

    eval_dir_batch = os.path.join(data_dir, f"batched_data/eval_data_{k}.json")
    print(f"Saving eval batch {k} to {eval_dir_batch}")
    with open(eval_dir_batch, "wt") as f:
        json.dump(eval_dataset, f, indent=4)

Batch number 0 over 282.
Error during parsing result of request to json. Trying to remove ```json ```.
Successfully parsed json.
1 on 1
Saving training batch 0 to ../bucket/data/batched_data/train_data_0.json
Saving eval batch 0 to ../bucket/data/batched_data/eval_data_0.json
Batch number 1 over 282.


KeyboardInterrupt: 

In [None]:
def _load_batches(kind: str, label: str) -> list:
    """
    Concatenates every saved batch file of one kind into a single list.

    Reads ``batched_data/{kind}_data_{k}.json`` under ``data_dir`` for
    k = 0..n_batch inclusive. A missing batch file raises FileNotFoundError,
    so an incomplete generation run is not merged silently.

    :param kind: file name prefix, "train" or "eval"
    :param label: word used in the progress message ("training" or "eval")
    :return: the concatenation of all batch contents, in batch order
    """
    merged = []
    for k in range(n_batch + 1):
        batch_path = os.path.join(data_dir, f"batched_data/{kind}_data_{k}.json")
        print(f"Loading {label} batch {k} from {batch_path}")
        with open(batch_path, "rt", encoding="utf-8") as f:
            merged += json.load(f)
    return merged


# Merge all batches
all_data = _load_batches("train", "training")

# Merge all eval batches
all_eval_data = _load_batches("eval", "eval")

In [14]:
import os

# Final output files (full paths, despite the *_dir names).
train_data_dir = os.path.join(data_dir, "train_dataset.json")
eval_data_dir = os.path.join(data_dir, "eval_dataset.json")

# Refuse to clobber a previously generated dataset: both targets are checked
# before anything is written.
for existing_path in (train_data_dir, eval_data_dir):
    if os.path.isfile(existing_path):
        raise FileExistsError(f"An existing file was found at : {existing_path}. Not overwritting it.")

print(f"Saving training data to : {train_data_dir}")
print(f"Saving eval data to : {eval_data_dir}")

# saves into a single json all conversations (training first, then eval)
for out_path, payload in (
    (train_data_dir, all_data),
    (eval_data_dir, all_eval_data),
):
    with open(out_path, "wt") as out_file:
        json.dump(payload, out_file)

FileExistsError: An existing file was found at : ../bucket/data/train_dataset.json. Not overwritting it.