# 2 - Compare models
Using the pre-computed test results of each fine-tuned model, we can compare the models with one another. For example, we can compare a fine-tuned model with the raw (not fine-tuned) model to see whether fine-tuning changed the evaluation scores.

In [None]:
import pandas as pd
import os
from typing import Literal
import json

# Infrastructure this notebook runs on, read from the WHICH_INFRA environment
# variable; falls back to "local" when the variable is not set.
which_infra: Literal["onyxia", "datalab_gcp", "local"] = os.environ.get("WHICH_INFRA", "local")

# Location of the tests directory, relative to the working directory, for each
# supported infrastructure.
_TEST_DIR_BY_INFRA = {
    "onyxia": "../bucket/tests",
    "local": "../bucket/tests",
    "datalab_gcp": "../../bucket/tests",
}

try:
    test_dir = _TEST_DIR_BY_INFRA[which_infra]
except KeyError:
    # Surface an unsupported value as a ValueError with the list of accepted values,
    # without chaining the internal KeyError.
    raise ValueError(f"Unexpected value for environment variable WHICH_INFRA : '{which_infra}'. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.") from None

def load_test_result(test_folder_name: str) -> pd.DataFrame:
    """
    Loads the result table (.csv) of one test run into a pandas dataframe.

    The run's metadata.json is read as well, and the model name and date it records are printed.

    :param test_folder_name: the name of the folder, inside the module-level test_dir, where
        test_result.csv and metadata.json can be found (ex: 05_12_2025-17h_06min)
    :return: the content of test_result.csv, with its first column used as the index
        (presumably one row per element of the evaluation dataset)
    :raises FileNotFoundError: if metadata.json or test_result.csv does not exist
    :raises KeyError: if metadata.json has no 'model_name' or 'date' entry
    """
    test_folder = os.path.join(test_dir, test_folder_name)
    test_file = os.path.join(test_folder, "test_result.csv")
    # JSON is UTF-8 by specification: decode it explicitly rather than relying on the
    # platform default encoding, which is not UTF-8 on every system.
    with open(os.path.join(test_folder, "metadata.json"), "rt", encoding="utf-8") as f:
        metadata = json.load(f)
    print(f"Loading test for model {metadata['model_name']}, made at date {metadata['date']}")
    return pd.read_csv(test_file, index_col=0)

In [None]:
# Load the two test runs to compare; each name is a folder inside test_dir.
# NOTE(review): judging by the folder names, the first run is the model without
# fine-tuning and the second a fine-tuned one — confirm.
first_run_folder = "no_fine_tuning_test"
second_run_folder = "05_13_2025-11h_02min"
t1 = load_test_result(first_run_folder)
t2 = load_test_result(second_run_folder)

In [None]:
# Display the result table of the first loaded run ("no_fine_tuning_test")
t1

In [None]:
# Display the result table of the second loaded run ("05_13_2025-11h_02min")
t2

In [None]:
# Mean of the cross_encoder_score column for each run, shown as a (t1, t2) pair
(
    t1["cross_encoder_score"].mean(),
    t2["cross_encoder_score"].mean(),
)

In [None]:
# Mean of the static_embedding_sim column for each run, shown as a (t1, t2) pair
(
    t1["static_embedding_sim"].mean(),
    t2["static_embedding_sim"].mean(),
)

In [None]:
# Cut-off applied to the static_embedding_sim column when filtering rows
threshold = 0.6
# Widen displayed columns to 500 characters so long cell values are not truncated
pd.set_option("display.max_colwidth", 500)

# Rows of the first run whose static_embedding_sim is strictly above the threshold
t1_above_threshold = t1["static_embedding_sim"] > threshold
t1.loc[t1_above_threshold]

In [None]:
# Rows of the second run whose static_embedding_sim is strictly below the threshold
t2_below_threshold = t2["static_embedding_sim"] < threshold
t2.loc[t2_below_threshold]