# Evaluation of Open-Source models

The aim of this notebook is to generate predictions for the Open-Source models trained in the previous notebook (`OpenSourceModelTraining.ipynb`).

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload: load the IPython extension and use mode 2 so that edited modules
# are picked up again without restarting the kernel
%load_ext autoreload
%autoreload 2

# Random seed: seed NumPy's global random generator with a fixed value
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display, Markdown, Latex, clear_output
from tqdm import notebook as tqdm

# Set global log level to INFO for the root logger
import logging
logging.basicConfig(level=logging.INFO)
# NOTE(review): presumably meant to turn off parallelism in the HuggingFace
# tokenizers library - confirm
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository: the repository is searched for
# from the current directory upwards, and its working directory then becomes
# the process working directory (every later path is built from `pwd`)
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
os.chdir(pwd)

In [2]:
# --------------------------- LOAD AND PREPARE DATA -------------------------- #
import os
import pandas as pd

# Each CSV file name is split on its last underscore: the part before it is
# used as the dataset key and the part after it as the split key, so a frame
# ends up at csv_list[<dataset>][<split>].
raw_data_dir = os.path.join(pwd, "data", "raw", "difficulty_estimation")

csv_list = {}
for file in os.listdir(raw_data_dir):
    if not file.endswith(".csv"):
        continue
    stem = os.path.splitext(file)[0]
    dataset_key, _sep, split_key = stem.rpartition("_")
    csv_list.setdefault(dataset_key, {})[split_key] = pd.read_csv(
        os.path.join(raw_data_dir, file)
    )

## Bert

First, we will generate the predictions of the **Bert** model.

In [13]:
# ---------------------------- COMPUTE PREDICTIONS --------------------------- #
def bert_predict(dataset: str, huggingface_token: str, batch_size: int = 32):
    """Predict the labels of the test split of `dataset` with its fine-tuned checkpoint.

    The checkpoint `OloriBern/Lingorank_Bert_<dataset>` is downloaded into a
    local directory named after the dataset, together with a pickled
    `(tokenizer, label_encoder)` pair. The sentences of
    `csv_list[dataset]["test"]` are then classified in mini-batches.

    Args:
        dataset: Key of the dataset in `csv_list`; also the suffix of the
            Hugging Face repository and the local download directory.
        huggingface_token: Token passed to `huggingface_hub.login`.
        batch_size: Number of sentences sent through the model at once.
            Batching keeps memory bounded instead of pushing the whole test
            split through a single forward pass.

    Returns:
        The result of `label_encoder.inverse_transform` applied to the
        predicted class indices, one entry per test sentence.

    Raises:
        ValueError: If `batch_size` is not a positive integer.
    """
    import dill
    import torch
    from transformers import CamembertForSequenceClassification
    from huggingface_hub import snapshot_download, login

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Set huggingface token
    login(token=huggingface_token)

    # Clone model checkpoint
    snapshot_download(
        repo_id=f"OloriBern/Lingorank_Bert_{dataset}",
        local_dir=dataset,
        revision="main",
        repo_type="model",
    )

    # Load tokenizer and label encoder
    # NOTE(security): dill.load can execute arbitrary code while unpickling;
    # only use this with repositories you trust.
    with open(
        os.path.join(
            dataset,
            "train_camembert_tokenizer_label_encoder.pkl",
        ),
        "rb",
    ) as f:
        tokenizer, label_encoder = dill.load(f)

    # Load the model from the downloaded checkpoint and switch to evaluation mode
    model = CamembertForSequenceClassification.from_pretrained(dataset)
    model.eval()

    # Run on the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    sentences = csv_list[dataset]["test"]["sentence"].tolist()

    batch_predictions = []
    # Inference only: no gradients needed
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            inputs = tokenizer(
                sentences[start : start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            # Softmax is monotonic, so the argmax of the logits is the same
            # class as the argmax of the probabilities.
            batch_predictions.append(logits.argmax(dim=-1).cpu().numpy())

    if batch_predictions:
        predictions = np.concatenate(batch_predictions)
    else:
        # Empty test split: nothing to classify
        predictions = np.array([], dtype=int)

    # Apply label encoder
    predictions = label_encoder.inverse_transform(predictions)

    return predictions

In [14]:
# -------------------------- CONNECT TO HUGGINGFACE -------------------------- #
from getpass import getpass
from huggingface_hub import login
import os

# The token is cached in a plain-text file at the repository root.
# NOTE(review): make sure `.huggingface_key` is git-ignored so the token is
# never committed.
token_path = os.path.join(pwd, ".huggingface_key")

connected = False
while not connected:
    try:
        with open(token_path, "r") as f:
            # Strip whitespace: a trailing newline in the file would otherwise
            # become part of the token.
            huggingface_token = f.read().strip()
        login(token=huggingface_token)
        connected = True
    except (OSError, ValueError):
        # Missing/unreadable file, rejected token or failed request: ask for a
        # new token and retry. Only these errors are caught so that
        # KeyboardInterrupt can still break out of the loop.
        huggingface_token = getpass("Enter your HuggingFace token: ").strip()
        with open(token_path, "w") as f:
            f.write(huggingface_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/lopilo24/.cache/huggingface/token
Login successful


In [18]:
# ------------------------- DEFINE SLURMRAY LAUNCHER ------------------------- #
from slurmray.RayLauncher import RayLauncher


def create_launcher(dataset):
    """Return a RayLauncher that runs `bert_predict` for `dataset`.

    The Hugging Face token is read from the notebook-level variable
    `huggingface_token` at call time.
    """
    predict_args = {
        "dataset": dataset,
        "huggingface_token": huggingface_token,
    }
    return RayLauncher(
        project_name="camembert_base_difficulty_estimation",
        func=bert_predict,
        args=predict_args,
        modules=[],
        node_nbr=1,
        use_gpu=True,
        memory=128,
        max_running_time=60,
        server_run=True,
        server_ssh="curnagl.dcsr.unil.ch",
        server_username="hjamet",
    )

In [33]:
# ---------------------- ...AND COMPUTE LJL PREDICTIONS ---------------------- #
# Build the launcher for the "ljl" dataset and call it right away; the value
# it returns is kept as `ljl_predictions`.
ljl_predictions = create_launcher("ljl")()

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h48.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h48_queue.log>
Submitted batch job 35880823
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2023-12-12 18:48:50,491	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See ht

In [21]:
# ------------------- ...AND COMPUTE SENTENCES PREDICTIONS ------------------- #
# Build the launcher for the "sentences" dataset and call it right away; the
# value it returns is kept as `sentences_predictions`.
sentences_predictions = create_launcher("sentences")()

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h29.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h29_queue.log>
Submitted batch job 35880634
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2023-12-12 18:29:40,600	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See ht

In [22]:
# --------------- ...AND COMPUTE FRENCH_DIFFICULTY PREDICTIONS --------------- #
# Build the launcher for the "french_difficulty" dataset and call it right
# away; the value it returns is kept as `french_difficulty_predictions`.
french_difficulty_predictions = create_launcher("french_difficulty")()

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h35.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1212-18h35_queue.log>
Submitted batch job 35880693
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2023-12-12 18:35:33,895	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See ht

In [None]:
# ------------------------------ FIX LJL LABELS ------------------------------ #
# (for some reason, the label encoder did not work properly)
# The values in `ljl_predictions` are therefore mapped to label names by hand.

transformation = {0: "level1", 1: "level2", 2: "level3", 3: "level4"}
# Explicit lookup instead of np.vectorize(transformation.get): an unexpected
# value now raises a KeyError instead of silently becoming the string "None",
# and an empty prediction array no longer raises.
ljl_transformed = np.array(
    [transformation[label] for label in ljl_predictions], dtype=str
)
ljl_transformed

In [None]:
# ----------------------------- SAVE PREDICTIONS ----------------------------- #
import pandas as pd

# Create directory if it does not exist
save_path = os.path.join(pwd, "results", "OpenSourceModelsEvaluation")
os.makedirs(save_path, exist_ok=True)


def _save_predictions(dataset_name, dataset_predictions):
    """Write the test split of `dataset_name` with a `predictions` column appended."""
    predictions_column = pd.DataFrame({"predictions": dataset_predictions})
    combined = pd.concat(
        [csv_list[dataset_name]["test"], predictions_column],
        axis=1,
    )
    combined.to_csv(os.path.join(save_path, f"{dataset_name}.csv"))


# Save predictions
_save_predictions("ljl", ljl_transformed)
_save_predictions("sentences", sentences_predictions)
_save_predictions("french_difficulty", french_difficulty_predictions)

In [4]:
# ------------------------------ COMPUTE METRICS ----------------------------- #
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

save_path = os.path.join(pwd, "results", "OpenSourceModelsEvaluation")

# Load predictions
predictions = {
    dataset_name: pd.read_csv(os.path.join(save_path, f"{dataset_name}.csv"))
    for dataset_name in ("ljl", "sentences", "french_difficulty")
}

# One row per dataset, one column per metric.
# Each entry: (column name, scoring function, extra keyword arguments)
metric_specs = [
    ("accuracy", accuracy_score, {}),
    ("f1", f1_score, {"average": "macro"}),
    ("precision_macro", precision_score, {"average": "macro"}),
    ("precision_micro", precision_score, {"average": "micro"}),
    ("recall_macro", recall_score, {"average": "macro"}),
    ("recall_micro", recall_score, {"average": "micro"}),
]

# Compute metrics
metric_rows = {}
for key, frame in predictions.items():
    y_true = frame["difficulty"].tolist()
    y_pred = frame["predictions"].tolist()
    metric_rows[key] = [
        scorer(y_true, y_pred, **scorer_kwargs)
        for _column, scorer, scorer_kwargs in metric_specs
    ]

metrics = pd.DataFrame.from_dict(
    metric_rows,
    orient="index",
    columns=[column for column, _scorer, _scorer_kwargs in metric_specs],
)

# Sort results by f1 score
metrics = metrics.sort_values(by="f1", ascending=False)

# Save results (unrounded)
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "OpenSourceModelsEvaluation",
    "bert_metrics.csv",
)
os.makedirs(os.path.dirname(path), exist_ok=True)
metrics.to_csv(path)

# Round results for display only
metrics = metrics.round(4)
metrics.style.background_gradient(
    cmap="Blues",
    axis=0,
)

Unnamed: 0,accuracy,f1,precision_macro,precision_micro,recall_macro,recall_micro
sentences,0.8229,0.8212,0.8268,0.8229,0.8229,0.8229
ljl,0.6247,0.6311,0.6267,0.6247,0.6408,0.6247
french_difficulty,0.5229,0.5126,0.5282,0.5229,0.5229,0.5229


## Mistral 7B

In [3]:
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def prepare_mistral_datasets(
    df: pd.DataFrame,
    tokenizer: AutoTokenizer,
    context: str = None,
    split_size: float = 0.2,
):
    """Turn a frame of sentences and difficulty labels into tokenized prompts.

    Every row becomes the prompt
    `[INST] <<SYS>>\\n{context}\\n<</SYS>>\\n\\n{sentence} [/INST] {difficulty}`,
    where `difficulty` is the label-encoded class (or an empty string when
    `split_size` is 0). Sentences are first cut to at most 512 tokens, and the
    prompts are padded/truncated to a common length.

    Args:
        df: Frame with a `sentence` and a `difficulty` column. It is not
            modified; the function works on a copy.
        tokenizer: Tokenizer used both to truncate sentences and to encode the
            prompts.
        context: Optional system prompt; `None` yields an empty system block.
        split_size: Fraction of rows put in the test split. `0` selects
            inference mode: labels are blanked and one dataset is returned.

    Returns:
        The encoded dataset when `split_size` is 0, otherwise the tuple
        `(train_dataset, test_dataset)`.

    Raises:
        ValueError: If `df` has no rows.
    """
    if len(df) == 0:
        raise ValueError("df must contain at least one row")

    # Work on a copy so the caller's frame keeps its original labels (the
    # encoded column used to be written back into the caller's DataFrame).
    df = df.copy()

    # Encode labels, then store them as strings (the result of astype used to
    # be discarded)
    label_encoder = LabelEncoder()
    df["difficulty"] = label_encoder.fit_transform(df["difficulty"])
    df = df.astype({"difficulty": "str"})

    # Create dataset
    dataset = Dataset.from_pandas(df)

    # Remove labels if split_size is 0 (for inference)
    if split_size == 0:
        dataset = dataset.map(
            lambda example: {"sentence": example["sentence"], "difficulty": ""},
            remove_columns=["difficulty"],
        )

    # Add context, label and tokenize
    max_length = 512

    def simple_tokenize(e):
        result = tokenizer(
            e["sentence"],
        )
        return result

    encoded_sentences = dataset.map(simple_tokenize)
    ## Truncate sentences to the first `max_length` token ids
    # NOTE(review): decode() is called without skip_special_tokens, so any
    # special tokens the tokenizer added stay in the sentence text - confirm
    # this matches how the training prompts were built.
    truncated_dataset = encoded_sentences.map(
        lambda e: {
            "sentence": tokenizer.decode(e["input_ids"][: int(max_length)]),
        }
    ).remove_columns(["input_ids", "attention_mask"])

    # Determine context size: token count of the prompt without any sentence,
    # using the first row's label as a stand-in
    system_block = (
        f"[INST] <<SYS>>\n{context if context is not None else ''}\n<</SYS>>\n\n"
    )
    full_context_string = f"{system_block} [/INST] {truncated_dataset['difficulty'][0]}"
    context_length = len(tokenizer(full_context_string)["input_ids"])
    max_length = int(max_length + context_length) + 2

    def tokenize(e):
        formated = f"{system_block}{e['sentence']} [/INST] {e['difficulty']}"
        result = tokenizer(
            formated, padding="max_length", truncation=True, max_length=max_length
        )
        # The labels are a copy of the input ids
        result["labels"] = result["input_ids"].copy()
        return result

    encoded_dataset = truncated_dataset.map(tokenize)

    # Convert the lists to tensors
    encoded_dataset.set_format(
        "torch", columns=["input_ids", "attention_mask", "labels"]
    )

    # Remove unnecessary columns
    encoded_dataset = encoded_dataset.remove_columns(["sentence", "difficulty"])

    # Split into train and test
    if split_size == 0:
        return encoded_dataset
    split_dataset = encoded_dataset.train_test_split(test_size=split_size)
    train_dataset = split_dataset["train"]
    test_dataset = split_dataset["test"]

    return train_dataset, test_dataset


# Tokenizer of the Vigostral chat model, set up for left padding with BOS and
# EOS tokens added to every encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    "bofenghuang/vigostral-7b-chat",
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# split_size=0 selects the inference branch of prepare_mistral_datasets:
# labels are blanked and a single encoded dataset is returned.
ljl_test_dataset = prepare_mistral_datasets(
    csv_list["ljl"]["test"], tokenizer=tokenizer, split_size=0
)

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

In [4]:
# Decode dataset
# Sanity check 1: decode every prompt and count how the prompts end
# (last 11 characters).
decoded_prompts = pd.Series(
    ljl_test_dataset.map(
        lambda e: {"sentence": tokenizer.decode(e["input_ids"])},
        remove_columns=["input_ids", "attention_mask", "labels"],
    )["sentence"]
)
display(decoded_prompts.str[-11:].value_counts())

# Sanity check 2: distribution of the encoded prompt lengths.
prompt_sizes = pd.Series(
    ljl_test_dataset.map(
        lambda e: {"size": len(e["attention_mask"])},
        remove_columns=["input_ids", "attention_mask", "labels"],
    )["size"]
)
prompt_sizes.astype(int).describe()

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

/INST] </s>    413
Name: count, dtype: int64

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

count    413.0
mean     535.0
std        0.0
min      535.0
25%      535.0
50%      535.0
75%      535.0
max      535.0
dtype: float64

In [5]:
# ---------------------------- SLURMRAY EVALUATION --------------------------- #
import os

import dill
import pandas as pd
import ray.train.huggingface
import ray.train.huggingface.transformers
import torch
from peft import PeftModel
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def evaluate_mistral(
    test_set: pd.DataFrame,
    model_name: str = "mistralai/Mistral-7B-v0.1",
    context: str = None,
    pwd: str = ".",
):
    """Generate a short completion for every prompt built from `test_set`.

    Args:
        test_set: Frame handed to `prepare_mistral_datasets` in inference mode
            (`split_size=0`).
        model_name: Name of the tokenizer to load. The model weights are not
            taken from it: they are read from `<pwd>/mistral_trained`.
        context: Optional system prompt forwarded to `prepare_mistral_datasets`.
        pwd: Directory that contains the `mistral_trained` checkpoint.

    Returns:
        A `pd.Series` with one decoded string per prompt (prompt plus up to 3
        newly generated tokens, special tokens removed).
    """
    # Fix partial import bug
    import ray.train.huggingface
    import ray.train.huggingface.transformers

    checkpoint_dir = os.path.join(pwd, "mistral_trained")

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="left",
        add_bos_token=True,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare the dataset
    test_dataset = prepare_mistral_datasets(
        test_set, tokenizer, context=context, split_size=0
    )

    # Load the base model, then the adapter from the same directory
    base_model = AutoModelForCausalLM.from_pretrained(
        checkpoint_dir,
        device_map="auto",
        use_cache=False,
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(base_model, checkpoint_dir)

    # Move everything to GPU
    model.to("cuda")
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Generate predictions
    generated_texts = []
    with torch.no_grad():
        model.eval()
        for batch in test_loader:
            generated_ids = model.generate(
                input_ids=batch["input_ids"].to("cuda"),
                attention_mask=batch["attention_mask"].to("cuda"),
                max_new_tokens=3,
            )
            generated_texts.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in generated_ids
            )

    return pd.Series(generated_texts)

In [6]:
# ------------------------------- RAY LAUNCHER ------------------------------- #
from slurmray.RayLauncher import RayLauncher


def mistral_launcher_builder(dataset, context):
    """Return a RayLauncher that runs `evaluate_mistral` on the test split of `dataset`.

    `context` is forwarded unchanged as the `context` argument of
    `evaluate_mistral`.
    """
    evaluation_args = {
        "test_set": csv_list[dataset]["test"],
        "model_name": "bofenghuang/vigostral-7b-chat",
        "context": context,
        "pwd": "/scratch/hjamet",
    }
    return RayLauncher(
        project_name="mistral_base_difficulty_estimation",
        func=evaluate_mistral,
        args=evaluation_args,
        modules=[],
        node_nbr=1,
        use_gpu=True,
        memory=128,
        max_running_time=60,
        server_run=True,
        server_ssh="curnagl.dcsr.unil.ch",
        server_username="hjamet",
    )

In [7]:
# ---------------------------- EVALUATE MISTRAL... --------------------------- #
from sklearn.preprocessing import LabelEncoder


def evaluate_mistral_local(dataset: str, context: bool = False):
    """Run the Mistral evaluation for `dataset` and store the extracted predictions.

    Args:
        dataset: Key of the dataset in `csv_list`.
        context: When true, the CECRL system prompt is sent with every
            sentence and the result file is tagged `CECRL`; otherwise no
            system prompt is used and the tag is `no-context`.

    Returns:
        A DataFrame with the columns `predictions` (first digit found after
        `[/INST]` in the generation), `labels` (label-encoded difficulty of
        the test split) and `generation` (raw generated text). The same frame
        is written to `results/OpenSourceModelsEvaluation/mistral/`.
    """
    # Define context: pick the system prompt first, then turn `context` into
    # the tag used in the output file name
    context_sentence = (
        "Vous êtes un évaluateur linguistique utilisant le Cadre européen commun de référence pour les langues (CECRL). Votre mission est d'attribuer une note de compétence linguistique à ce texte, en utilisant les niveaux du CECRL, allant de A1 (débutant) à C2 (avancé/natif). Évaluez ce texte et attribuez-lui la note correspondante du CECRL."
        if context
        else None
    )
    context = "CECRL" if context else "no-context"

    # Compute predictions
    predictions = mistral_launcher_builder(dataset, context_sentence)()

    # Extract labels
    extracted = predictions.str.extract(r"\[/INST\].*?(\d)")
    predictions_extracted = extracted.rename(columns={0: "predictions"})
    predictions_extracted["labels"] = csv_list[dataset]["test"]["difficulty"]
    predictions_extracted["generation"] = predictions

    # Encode labels
    predictions_extracted["labels"] = LabelEncoder().fit_transform(
        predictions_extracted["labels"]
    )

    # Save predictions
    path = os.path.join(
        pwd,
        "results",
        "OpenSourceModelsEvaluation",
        "mistral",
        f"{dataset}_{context}.csv",
    )
    ## Create directory if it does not exist
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ## Save predictions
    predictions_extracted.to_csv(path)

    return predictions_extracted

In [15]:
# ------------------------- ...ON LJL (NO CONTEXT) --------------------------- #
# context=False: no system prompt; results are saved under the "no-context" tag.
ljl_mistral_predictions = evaluate_mistral_local("ljl", context=False)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1801-19h52.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1801-19h52_queue.log>
Submitted batch job 37142044
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2024-01-18 19:52:32,371	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [17]:
# ------------------------ ...ON LJL (CECRL CONTEXT) ------------------------- #
# context=True: the CECRL system prompt is used; results are saved under the
# "CECRL" tag. This rebinds `ljl_mistral_predictions`.
ljl_mistral_predictions = evaluate_mistral_local("ljl", context=True)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1801-21h11.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1801-21h11_queue.log>
Submitted batch job 37142945
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2024-01-18 21:11:33,852	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [8]:
# ------------------------ ON SENTENCES (NO CONTEXT) ------------------------- #
# context=False: no system prompt; results are saved under the "no-context" tag.
sentences_mistral_predictions = evaluate_mistral_local("sentences", context=False)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1901-00h02.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1901-00h02_queue.log>
Submitted batch job 37144603
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2024-01-19 00:02:57,579	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [8]:
# ----------------------- ON SENTENCES (CECRL CONTEXT) ----------------------- #
# context=True: the CECRL system prompt is used; results are saved under the
# "CECRL" tag. This rebinds `sentences_mistral_predictions`.
sentences_mistral_predictions = evaluate_mistral_local("sentences", context=True)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1901-11h43.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1901-11h43_queue.log>
Submitted batch job 37153289
IP Head: 10.203.101.88:6379
STARTING HEAD at dnagpu008
2024-01-19 11:58:42,340	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [10]:
# -------------------- ON FRENCH_DIFFICULTY (NO CONTEXT) --------------------- #
# Stored under its own name: this result used to be assigned to
# `sentences_mistral_predictions`, which overwrote the sentences results.
french_difficulty_mistral_predictions = evaluate_mistral_local(
    "french_difficulty", context=False
)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1901-13h30.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1901-13h30_queue.log>
Submitted batch job 37154419
IP Head: 10.203.101.88:6379
STARTING HEAD at dnagpu008
2024-01-19 13:30:30,261	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [21]:
# ------------------- ON FRENCH_DIFFICULTY (CECRL CONTEXT) ------------------- #
# Stored under its own name: this result used to be assigned to
# `sentences_mistral_predictions`, which overwrote the sentences results.
french_difficulty_mistral_predictions = evaluate_mistral_local(
    "french_difficulty", context=True
)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Looking in indexes: https://download.pytorch.org/whl/cu121
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1901-15h45.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1901-15h45_queue.log>
Submitted batch job 37161919
IP Head: 10.203.101.88:6379
STARTING HEAD at dnagpu008
2024-01-19 16:00:02,587	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `r

In [5]:
# ------------------------------ COMPUTE METRICS ----------------------------- #
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from matplotlib import pyplot as plt
import seaborn as sn

# Each entry: (metric name, scoring function, extra keyword arguments)
metric_specs = [
    ("accuracy", accuracy_score, {}),
    ("f1", f1_score, {"average": "macro"}),
    ("precision_macro", precision_score, {"average": "macro"}),
    ("precision_micro", precision_score, {"average": "micro"}),
    ("recall_macro", recall_score, {"average": "macro"}),
    ("recall_micro", recall_score, {"average": "micro"}),
]

mistral_results_dir = os.path.join(
    pwd, "results", "OpenSourceModelsEvaluation", "mistral"
)

# Load predictions and compute one metrics dict per result file
full_metrics = {}
for root, dirs, files in os.walk(mistral_results_dir):
    for file in files:
        if not file.endswith(".csv"):
            continue
        predictions = pd.read_csv(os.path.join(root, file))
        y_true = predictions["labels"].tolist()
        y_pred = predictions["predictions"].tolist()
        full_metrics[file] = {
            metric_name: scorer(y_true, y_pred, **scorer_kwargs)
            for metric_name, scorer, scorer_kwargs in metric_specs
        }

if not full_metrics:
    # Fail with a clear message instead of an opaque error further down
    raise FileNotFoundError(
        f"No prediction CSV found under {mistral_results_dir}"
    )

# Round results: one row per file, one column per metric
metrics_df = pd.DataFrame(full_metrics).T.round(4)

# Create MultiIndex (dataset, context) from "<dataset>_<context>.csv".
# The extension is removed with os.path.splitext (a ".csv" pattern passed to
# str.replace is treated as a regex by older pandas versions), and the name is
# split on its last underscore only, so underscores in a dataset name cannot
# produce extra index levels.
index_tuples = []
for file_name in metrics_df.index:
    stem = os.path.splitext(file_name)[0].replace(
        "french_difficulty", "french-difficulty"
    )
    index_tuples.append(tuple(stem.rsplit("_", 1)))
metrics_df.index = pd.MultiIndex.from_tuples(
    index_tuples,
    names=["dataset", "context"],
)

# Sort results by f1 score
metrics_df = metrics_df.sort_values(by="f1", ascending=False)

# Save results
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "OpenSourceModelsEvaluation",
    "mistral_metrics.csv",
)
os.makedirs(os.path.dirname(path), exist_ok=True)
metrics_df.to_csv(path)

# Display results
metrics_df.style.background_gradient(
    cmap="Blues",
    axis=0,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,precision_macro,precision_micro,recall_macro,recall_micro
dataset,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sentences,CECRL,0.7479,0.7347,0.7739,0.7479,0.7479,0.7479
ljl,CECRL,0.6368,0.6281,0.6805,0.6368,0.6056,0.6368
sentences,no-context,0.6312,0.6279,0.6505,0.6312,0.6312,0.6312
french-difficulty,CECRL,0.5125,0.5151,0.5212,0.5125,0.5125,0.5125
ljl,no-context,0.4722,0.3376,0.4739,0.4722,0.3556,0.4722
french-difficulty,no-context,0.3542,0.3063,0.448,0.3542,0.3542,0.3542
