# 2 - Compare models
Using the pre-computed tests of each fine-tuned model, we can compare the models with each other. For example, we can compare a fine-tuned model with the raw (not fine-tuned) model to see whether the evaluation results change.

In [None]:
import pandas as pd
import os
from typing import Literal
import json

# The infrastructure is read from the WHICH_INFRA environment variable;
# "datalab_gcp" is used when the variable is not set.
which_infra: Literal["onyxia", "datalab_gcp", "local"] = os.environ.get("WHICH_INFRA", "datalab_gcp")

# Directory holding the pre-computed tests, for each accepted infrastructure.
_TEST_DIR_BY_INFRA = {
    "onyxia": "../bucket/tests",
    "local": "../bucket/tests",
    "datalab_gcp": "../../bucket/fine_tuning_acronym/tests",
}

# Fail early on any value that is not one of the accepted infrastructures.
if which_infra not in _TEST_DIR_BY_INFRA:
    raise ValueError(f"Unexpected value for environment variable WHICH_INFRA : '{which_infra}'. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.")

test_dir = _TEST_DIR_BY_INFRA[which_infra]

def load_test_result(test_folder_name: str) -> pd.DataFrame:
    """
    Loads a test result table in .csv into a pandas dataframe.

    The metadata.json file of the same folder is read as well, and the model
    name and date it contains are printed.

    :param test_folder_name: the name of the folder (inside test_dir) where you can find test_result.csv and metadata.json (ex: 05_12_2025-17h_06min)
    :return: a pandas dataframe where each row is an element of the evaluation dataset; the first column of the .csv is used as index
    :raises FileNotFoundError: if metadata.json or test_result.csv is missing from the folder
    :raises KeyError: if metadata.json has no 'model_name' or 'date' entry
    """
    test_folder = os.path.join(test_dir, test_folder_name)
    # JSON files are UTF-8 encoded: do not rely on the platform default encoding.
    with open(os.path.join(test_folder, "metadata.json"), "rt", encoding="utf-8") as f:
        metadata = json.load(f)
    print(f"Loading test for model {metadata['model_name']}, made at date {metadata['date']}")
    return pd.read_csv(os.path.join(test_folder, "test_result.csv"), index_col=0)

pd.set_option("display.max_colwidth", 500)  # wide enough to display full texts

In [None]:
# Load three test results: t1 comes from the "no-fine-tuning_llama_1B" folder,
# t2 and t3 from two dated test folders.
t1, t2, t3 = (
    load_test_result(folder_name)
    for folder_name in (
        "no-fine-tuning_llama_1B",
        "05_15_2025-11h_42min",
        "05_15_2025-13h_43min",
    )
)

In [None]:
# Display 3 randomly sampled rows of the first test table.
t1.sample(n=3)

In [None]:
# Accuracy of t1: sum of the LLM-judge results divided by the number of rows
# (presumably llm_judge_result is 0/1 - verify against the test generation).
n_sample = len(t1)
t1["llm_judge_result"].sum() / n_sample

In [None]:
# Accuracy of a fine-tuned model. Divide by t2's own row count, so that the
# result stays correct even if t2 does not have as many rows as t1.
t2.llm_judge_result.sum() / len(t2)

In [None]:
# Accuracy of a fine-tuned model. Divide by t3's own row count, so that the
# result stays correct even if t3 does not have as many rows as t1.
t3.llm_judge_result.sum() / len(t3)

In [None]:
# Keep only the rows of t3 whose LLM-judge result is 0.
# NOTE(review): despite the "_pos" suffix, this selects the rows equal to 0 - confirm the intent.
t3_pos = t3[t3["llm_judge_result"] == 0]

In [None]:
# Display 5 randomly sampled rows among the selected t3 rows.
t3_pos.sample(n=5)

## Comparison between LLM-judge result and embedding similarity

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def _threshold_correlations(scores, judge, thresholds):
    """
    Correlation between thresholded scores and the judge result, per threshold.

    For each threshold, a score strictly above it is classified 1, otherwise 0,
    and the correlation of that classification with `judge` is computed.
    An undefined correlation (NaN) is stored as 0.

    :return: a dict mapping each threshold to its correlation, in threshold order
    """
    correlations = {}
    for threshold in thresholds:
        predicted_class = (scores > threshold).astype(int)
        corr = predicted_class.corr(judge)
        correlations[threshold] = 0 if np.isnan(corr) else corr
    return correlations


def plot_correlation_between_sim_and_judge(df):
    """
    Show the correlation with the LLM-judge result for different classification
    thresholds, with static embeddings and cross-encoder.

    50 thresholds evenly spaced between 0 and 1 are tried. The two correlation
    curves are plotted (blue: static embedding, red: cross-encoder) and the best
    threshold of each score is printed.

    :param df: dataframe with the columns static_embedding_sim, cross_encoder_score and llm_judge_result
    :return: a tuple (cross-encoder threshold, static embedding threshold), each being
        the threshold that gives the highest correlation with llm_judge_result
    """
    all_thresholds = np.linspace(0, 1, 50)
    all_corr_static = _threshold_correlations(df.static_embedding_sim, df.llm_judge_result, all_thresholds)
    all_corr_cross_enc = _threshold_correlations(df.cross_encoder_score, df.llm_judge_result, all_thresholds)

    # max() keeps the first maximum, i.e. the lowest threshold in case of ties.
    max_cross_enc = max(all_corr_cross_enc, key=all_corr_cross_enc.get)
    max_static = max(all_corr_static, key=all_corr_static.get)
    # The values printed first (and returned) are thresholds, not correlations.
    print("Threshold maximising correlation with cross-encoder output", max_cross_enc,
          "- correlation", all_corr_cross_enc[max_cross_enc])
    print("Threshold maximising correlation with static embedding similarity", max_static,
          "- correlation", all_corr_static[max_static])

    plt.plot(all_thresholds, list(all_corr_static.values()), color="blue", label="static embedding similarity")
    plt.plot(all_thresholds, list(all_corr_cross_enc.values()), color="red", label="cross-encoder score")
    plt.xlabel("classification threshold")
    plt.ylabel("correlation with LLM-judge result")
    plt.legend()
    plt.show()
    return max_cross_enc, max_static

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

def confusion_matrix_between_sim_and_judge(y_true, y_pred, y_true_name, y_pred_name):
    """
    Draw the confusion matrix between two binary (0/1) classifications as a heatmap.

    :param y_true: reference classes, shown on the rows (ex: llm_judge_result)
    :param y_pred: compared classes, shown on the columns (ex: cross_encoder_classification)
    :param y_true_name: name used in the row tick labels ("<name> 0", "<name> 1")
    :param y_pred_name: name used in the column tick labels ("<name> 0", "<name> 1")
    :return: the object returned by sns.heatmap
    """
    # labels=[0, 1] keeps the matrix 2x2 even when one class is absent from both
    # inputs; otherwise the matrix would be 1x1 and would not match the two tick
    # labels given below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # NOTE(review): center=5 is kept as in the original - confirm it is intended
    # with the sequential "Blues" colormap.
    return sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=[f"{y_pred_name} 0", f"{y_pred_name} 1"],
                yticklabels=[f"{y_true_name} 0", f"{y_true_name} 1"],
                center=5, square=True)

# Observation: there are a lot of false positives (wrong answers that are validated).

In [None]:
# Run the threshold search on each test table, in order. Each call returns the
# pair (cross-encoder threshold, static embedding threshold) for that table.
(
    (t1_cross_enc_threshold, t1_static_threshold),
    (t2_cross_enc_threshold, t2_static_threshold),
    (t3_cross_enc_threshold, t3_static_threshold),
) = map(plot_correlation_between_sim_and_judge, (t1, t2, t3))

In [None]:
# Binarise both scores of each test table with the thresholds found for that
# same table: 1 when the score is strictly above the threshold, otherwise 0.
for frame, static_cutoff, cross_enc_cutoff in (
    (t1, t1_static_threshold, t1_cross_enc_threshold),
    (t2, t2_static_threshold, t2_cross_enc_threshold),
    (t3, t3_static_threshold, t3_cross_enc_threshold),
):
    frame["static_embedding_class"] = (frame["static_embedding_sim"] > static_cutoff).astype("int64")
    frame["cross_encoder_class"] = (frame["cross_encoder_score"] > cross_enc_cutoff).astype("int64")