# Model evaluation

Now that the model is trained, we can evaluate it automatically using Natural Language Processing tools such as Cross-Encoders, Bi-Encoders or static embeddings. We can also use an LLM to judge whether the fine-tuned model gives relevant answers or not.

## 0 - Load the model and configuration, along with the test dataset

In [None]:
import os
from typing import Literal
from datetime import datetime
import torch

which_infra:Literal["onyxia", "datalab_gcp", "local"] = os.environ["WHICH_INFRA"] if "WHICH_INFRA" in os.environ else "datalab_gcp"
date = datetime.now().strftime("%m_%d_%Y-%Hh_%Mmin")

# change to the path to the folder were the trained model is located 
# ex : ../bucket/results_05_13_2025-10h_21min/checkpoint-500
model_path: str = "../bucket/model/results_05_20_2025-16h_50min/final_model" 

match which_infra:
    case "onyxia":
        test_dir = "../bucket/test"
        data_dir = "../bucket/data"
        test_dir = os.path.join("../bucket/tests", date)
    case "local":
        data_dir = "../bucket/data"
        test_dir = os.path.join("../bucket/tests", date)
    case "datalab_gcp":
        data_dir = "../../bucket/data"
        test_dir = os.path.join("../../bucket/fine_tuning_acronym/tests", date)
    case _:
        raise ValueError(f"Unexpected value for environment variable WHICH_INFRA : '{which_infra}'. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.")

dtype = torch.bfloat16

print(f"""
    Running on : {which_infra},
    Model will be loaded from : {model_path},
    Datatype: {dtype},
    Tests will be saved at : {test_dir}
    Loads test data from : {data_dir}.
""")

In [None]:
# Loads data for evaluation

import json
import os

# Location of the evaluation dataset inside the data folder.
path_eval_dataset = os.path.join(data_dir, "eval_dataset.json")
print(f"Loading eval data from : {path_eval_dataset}")

# JSON text is UTF-8 by specification: decode it explicitly instead of relying
# on the platform default encoding, which may mangle non-ASCII characters.
with open(path_eval_dataset, "rt", encoding="utf-8") as f:
    eval_dataset = json.load(f)

print(eval_dataset[1]) # example of an element of the dataset

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the model stored at `model_path`,
# loaded with the datatype `dtype`, with sampling enabled.
# NOTE(review): do_sample=True presumably makes generations vary between runs — confirm this is wanted for evaluation.
pl = pipeline("text-generation", model=model_path, torch_dtype=dtype, do_sample=True)


In [None]:
# Smoke test: send one short prompt through the pipeline before the full run.
# The tokenizer's EOS token id is passed as the padding token id.
pl("1+1 ?", pad_token_id=pl.tokenizer.eos_token_id) # test model availability

## 1 - Try the model on the evaluation dataset

For each question in the conversation dataset, we run the fine-tuned model on this question and save the results in an answer dataset.

⚠️⚠️ This needs to be done only once for each fine-tuned model. There is no need to run these cells more than once per model. ⚠️⚠️

In [None]:
from tqdm import tqdm

# Run the fine-tuned model once per evaluation sample and store, for each
# sample, the question asked, the generated answer and the reference data.
answer_dataset = []

for sample in tqdm(eval_dataset): # todo: use transformers pipeline parallelism
    # Only the first exchange of the conversation is used: its first message is
    # sent to the model, its second message is kept as the expected answer.
    first_exchange = sample["conversation"][0]
    user_message = first_exchange[0]
    generation = pl([user_message], pad_token_id=pl.tokenizer.eos_token_id, max_new_tokens=200)
    record = {
        "question": user_message['content'],
        # presumably index 1 of the generated chat is the model's reply — confirm
        "answer": generation[0]['generated_text'][1]['content'],
        "expected_answer": first_exchange[1]['content'],
        "ground_truth": sample["ground_truth"],
        "acronym": sample["acronym"],
    }
    answer_dataset.append(record)

In [None]:
# Display the second record of the answer dataset as a quick sanity check.
answer_dataset[1] # example

In [None]:
# Persist the generated answers so that the evaluation steps can reload them.
save_answer_dataset = os.path.join(test_dir, "answer_dataset.json")

print(f"Saving answer dataset to {save_answer_dataset}.")

# test_dir is a fresh, timestamped folder: create it (and any missing parent)
# first, otherwise opening the file for writing raises FileNotFoundError.
os.makedirs(test_dir, exist_ok=True)

with open(save_answer_dataset, "wt") as f:
    json.dump(answer_dataset, f)


## 2 - Evaluate the model with several methods

We compute different metrics between the text generation of the fine-tuned model and the expected answers from the evaluation dataset.

Once the test data is generated, you can reload the answer dataset and evaluate the model on it; that is, compare the answers produced by the fine-tuned model with the expected answers (either the ground-truth definitions of the acronyms or LLM-generated expected answers).

In [None]:
import random

import pandas as pd

# Reload the answers generated by the fine-tuned model for this test session.
answer_dataset_path = os.path.join(test_dir, "answer_dataset.json")

with open(answer_dataset_path, "rt") as f:
    answer_dataset = json.load(f)

print(answer_dataset[1]) # example

pd.options.display.max_colwidth = 500 # to display full texts

df = pd.DataFrame.from_dict(answer_dataset) # packaging everything in a pandas dataframe

# Pick a few random rows that are displayed again after each metric is added.
# The sample size is capped at the number of rows: random.sample raises
# ValueError when asked for more elements than the population holds.
displayed_examples = random.sample(list(df.index), min(5, len(df)))

display(df.loc[displayed_examples])

### 2.1 - First approach : Static Embeddings (/ ~ Bi-encoder)

Static embeddings are lightweight to use, but may lack accuracy in some use cases.

In [None]:
from wordllama import WordLlama

# Load pre-trained static embeddings (truncate dimension to 64)
wl = WordLlama.load(trunc_dim=64)


def _static_similarity(row):
    """Return the WordLlama similarity between a row's answer and its expected answer."""
    return wl.similarity(row.answer, row.expected_answer)


# compute similarity between static embeddings of fine-tuned answers and expected answers.
df["static_embedding_sim"] = df.apply(_static_similarity, axis="columns")

In [None]:
display(df.loc[displayed_examples])

### 2.2 - Second approach : Cross-Encoder
Using CrossEncoder (https://www.sbert.net/examples/cross_encoder/applications/README.html).

Heavier than static embeddings, but provides more accuracy when it comes to similarity.

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Load the cross-encoder checkpoint used to score (answer, expected answer) pairs.
cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [None]:
# Build the list of [answer, expected_answer] pairs once and score them in a
# single predict call (rather than row by row through the dataframe), so that
# sentence_transformers can process them in parallel.
answer_pairs = df.loc[:, ["answer", "expected_answer"]].to_numpy().tolist()

df["cross_encoder_score"] = cross_encoder.predict(answer_pairs)

In [None]:
display(df.loc[displayed_examples])

### 2.3 - Third approach, using LLM as a judge

Here we ask an instruct LLM whether the corresponding answer seems relevant or not, and to put its verdict inside specific characters.

In [None]:

match which_infra: # loads open web ui url and access token
    case "onyxia":
        owui_url = "https://llm.lab.sspcloud.fr/api/chat/completions"
        owui_token = os.environ["OWUI_TOKEN"] if "OWUI_TOKEN" in os.environ else None
        if owui_token is None:
            raise ValueError(f"No token Open Web UI {owui_url}, was found. Please add environment variable OWUI_TOKEN in your Onyxia secrets. See README.md to get more informations.")
        judge_model_name = os.environ["JUDGE_MODEL_NAME"]
    case "datalab_gcp":
        import yaml
        with open("../conf/conf.yaml", "rt") as f:
            conf = yaml.safe_load(f)
        owui_url = conf["OWUI_URL"]
        owui_token = conf["OWUI_TOKEN"]
        judge_model_name = conf["OWUI_FAV_MODEL"]
    case "local": 
        import yaml
        with open("../conf/conf.yaml", "rt") as f:
            conf = yaml.safe_load(f)
        owui_url = conf["OWUI_URL"]
        owui_token = conf["OWUI_TOKEN"]
        judge_model_name = conf["OWUI_FAV_MODEL"]
    case _:
        raise ValueError(f"Unexpected value for environment variable WHICH_INFRA. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.")

print(
    f"""
    which_infra : {which_infra},
    url_owui: {owui_url},
    token available for owui : {owui_token is not None},
    LLM used for data generation : {judge_model_name}
"""
)

In [None]:
from test_tools import create_judgement_prompt, extract_values
import sys
sys.path.append("../")
from owui_connector.owui import WebUIConnector
from tqdm import tqdm

# Connector to the Open Web UI API.
# NOTE(review): fav_model presumably selects the model that is queried — confirm in owui_connector.
owui = WebUIConnector(owui_token, owui_url, fav_model=judge_model_name)
triplet_list = df[["question", "answer", "expected_answer"]].to_numpy().tolist()

# Submit every (question, answer, expected answer) triplet to the judge and
# keep the extracted result together with its explanation.
all_results = []
for asked, given, reference in tqdm(triplet_list):
    judgement_prompt = create_judgement_prompt(question=asked, answer_to_test=given, definition=reference)
    result, explain = extract_values(owui.get_chat_response(judgement_prompt))
    all_results.append({"result": result, "explain": explain})


In [None]:
# Split the judge outputs into two parallel lists, then attach them to the
# dataframe: the results as an integer column, the explanations as they are.
judge_results = [entry["result"] for entry in all_results]
judge_explanations = [entry["explain"] for entry in all_results]

df["llm_judge_result"] = pd.Series(judge_results, dtype="int")
df["llm_judge_eplain"] = judge_explanations

In [None]:
# Accuracy according to the judge: sum of the judge results divided by the
# number of evaluated rows.
judged_total = df["llm_judge_result"].sum()
judge_accuracy = judged_total / len(df)
print("Accuracy according to LLM judge :", judge_accuracy)

In [None]:
display(df.loc[displayed_examples])

## 3 - Save test results for this model

We save the test results as a .csv file, together with metadata about this session (model, date of the test).

In [None]:
# Write the dataframe holding all computed metrics to a CSV file inside test_dir.
# Despite its name, test_result_dir is the path of the CSV file, not of a folder.
test_result_dir = os.path.join(test_dir, "test_result.csv")
print(f"Saving test results to {test_result_dir}")
df.to_csv(test_result_dir)

In [None]:
# Metadata describing this test session: when it ran, which model was tested,
# which judge was used and the accuracy that judge reported.
metadata_test = dict(
    date=date,
    model_path=model_path,
    judge_model_name=judge_model_name,
    judge_accuracy=judge_accuracy,
    notes="Complete with note about this test",
)

# Stored next to the other outputs of this session, as indented JSON.
metadata_test_path = os.path.join(test_dir, "metadata.json")
with open(metadata_test_path, "wt") as f:
    f.write(json.dumps(metadata_test, indent=4))

It is hard to interpret the raw numbers produced by this test step on their own, but we can compare them across several models (for example, against the untrained model).

See next notebook [2-compare_models.ipynb](2-compare_models.ipynb).