# Benchmarking [Mistral Medium 3](https://mistral.ai/news/mistral-medium-3)

See the [Mistral models overview](https://docs.mistral.ai/getting-started/models/models_overview/) for the list of Mistral's models.

## Libraries

In [1]:
from mistralai import Mistral
from pathlib import Path
from time import sleep
import os
import pandas as pd
from tqdm.rich import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score
)
import json
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
import warnings

# Enable tqdm's pandas integration, with a fixed progress-bar description.
tqdm.pandas(desc="Fetching moderation scores")
# Shared rich console used for the formatted output in the cells below.
console = Console()
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

## Global variables

In [2]:
# Data layout, resolved relative to the notebook's working directory.
ROOT = Path("../..")
DATA_DIR = ROOT / "data"
# Benchmark read by the "Load dataset" cell, and destination of the scored frame.
BENCHMARK_PATH = DATA_DIR / "benchmark_jigsaw" / "benchmark_jigsaw.csv"
output_path = DATA_DIR / "benchmark_jigsaw" / "eng-mistral_medium.csv"
# Text file holding the Mistral API key.
API_KEY_PATH = DATA_DIR / "confidential" / "MISTRAL_API.txt"
console = Console()
# Model identifier sent with the batch job and used in the batch/answer file names.
model = "mistral-medium-2505"
# Prompt templates sit next to the notebook. Decode them explicitly as UTF-8
# (like every other file access in this notebook) so the result does not
# depend on the platform's default encoding.
system_prompt = (Path(".") / "API_SYSTEM_PROMPT.txt").read_text(encoding="utf-8").strip()
prompt = (Path(".") / "API_PROMPT.txt").read_text(encoding="utf-8").strip()

In [3]:
# Point both the HTTP and HTTPS proxy variables at the same local SOCKS endpoint.
_PROXY_URL = "socks5h://127.0.0.1:1080"
os.environ.update(HTTP_PROXY=_PROXY_URL, HTTPS_PROXY=_PROXY_URL)

In [4]:
# Build the API client from the key stored on disk (surrounding whitespace stripped).
client = Mistral(api_key=API_KEY_PATH.read_text().strip())

## Load dataset

In [5]:
# Read the benchmark, drop rows lacking text or label, and store labels as ints.
df = (
    pd.read_csv(BENCHMARK_PATH, encoding="utf-8")
    .dropna(subset=["content", "label"])
    .astype({"label": int})
)

# One row per label value with its number of occurrences.
label_counts = df["label"].value_counts().rename_axis("label").reset_index(name="count")

# Summary table: total row count first, then one line per label.
table = Table(title="Label Counts", show_lines=True)
for header, colour in (("Label", "cyan"), ("Count", "yellow")):
    table.add_column(header, justify="center", style=colour)
table.add_row("Total Rows", str(len(df)))
for label_value, n_rows in zip(label_counts["label"], label_counts["count"]):
    table.add_row(str(label_value), str(n_rows))
console.print(table, justify="left")

# Reproducible preview of five rows (shown as the cell output).
df.sample(5, random_state=42)

Unnamed: 0,msg_id,original_text,content,label
78,ddb11abeca87bb3c,birth place \n\n YC was not born in Jonesboro...,lieu de naissance \n\nYC n'est pas né à Jones...,0
277,6e9826f166b7d7fe,Nazis like you deserve worse than death. \n\nI...,Des nazis comme toi méritent pire que la mort....,1
250,5bff63ef016af9be,You can rationalize your dishonesty however yo...,Tu peux rationaliser ton malhonnêteté comme tu...,1
55,e8ea81d9eac81735,Deletion\nThis has been tagged for Speedy Dele...,Suppression \nCeci a été marqué pour une supp...,0
391,bf5aae5b17fb0b77,because I am a huge faggot,parce que je suis un énorme pédé,1


## Define prediction function

In [6]:
def create_batch_file(text: str, max_tokens: int = 20):
    """Build the chat-completion request body for a single message.

    The user turn is the global ``prompt`` followed by the message wrapped in
    « » quotes and the closing question; the system turn is the global
    ``system_prompt``.

    Args:
        text: Message to classify.
        max_tokens: Value placed under the request's ``max_tokens`` key.

    Returns:
        A dict with the keys ``messages`` and ``max_tokens``.
    """
    question = f"{prompt}« {text} »\n Is this message toxic ?\n"
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": question}
    return {"messages": [system_turn, user_turn], "max_tokens": max_tokens}

In [7]:
def create_json_batch(df, batch_path):
    """Write the JSONL batch request file for the rows of ``df``.

    One request is emitted per distinct ``original_text`` value: when a text
    occurs several times, only its first row is kept. Each request's
    ``custom_id`` is ``id_<index label>`` of that first row.

    Args:
        df: Frame with an ``original_text`` column.
        batch_path: ``Path`` of the JSONL file to write; missing parent
            directories are created.
    """
    requests_by_text = {}
    for idx, record in df.iterrows():
        message = record["original_text"]
        if message not in requests_by_text:
            requests_by_text[message] = {
                "custom_id": f"id_{idx}",
                "body": create_batch_file(text=message),
            }

    batch_path.parent.mkdir(parents=True, exist_ok=True)
    with batch_path.open("w", encoding="utf-8") as handle:
        # One JSON document per line.
        handle.writelines(json.dumps(request) + "\n" for request in requests_by_text.values())
    console.print(f"Batch file written to {batch_path}")

In [8]:
def upload_batch(batch_path) -> str:
    """Upload a batch request file and return the uploaded file's ID.

    Args:
        batch_path: Path (``str`` or ``Path``) of the JSONL file to upload.

    Returns:
        The ``id`` attribute of the upload response.
    """
    batch_path = Path(batch_path)

    # The context manager closes the handle even if the upload raises.
    with batch_path.open("rb") as file:
        uploaded = client.files.upload(
            file={
                # Send the real file name rather than a placeholder string.
                "file_name": batch_path.name,
                "content": file,
            },
            purpose="batch",
        )

    uploaded_file_id = uploaded.id

    console.print(f"[green]✔ Uploaded batch file. ID: [bold]{uploaded_file_id}[/bold]")

    return uploaded_file_id

In [9]:
def submit_batch(uploaded_file_id):
    """Create a chat-completions batch job for an uploaded request file.

    Args:
        uploaded_file_id: ID of the uploaded batch file, passed as the job's
            only input file.

    Returns:
        The job object returned by ``client.batch.jobs.create`` -- not just
        its ID. ``wait_for_completion`` reads ``.id`` and ``.status`` from it.
    """
    batch_job = client.batch.jobs.create(
        input_files=[uploaded_file_id],
        model=model,
        endpoint="/v1/chat/completions",
    )

    console.print(f"[green]✔ Batch submitted. ID: [bold]{batch_job.id}[/bold]")
    return batch_job

In [10]:
def wait_for_completion(batch_job, poll_interval: int = 60):
    """Poll a batch job until it leaves the ``QUEUED``/``RUNNING`` states.

    Args:
        batch_job: Job object exposing ``id`` and ``status``.
        poll_interval: Seconds to wait between two status checks.

    Returns:
        The most recently fetched job object, or ``batch_job`` itself when it
        was already in another state on entry.
    """
    elapsed = 0
    while batch_job.status in ("QUEUED", "RUNNING"):
        # Sleep before polling: the status in hand is already known to be
        # non-terminal, and this avoids a wasted wait once the job has finished.
        sleep(poll_interval)
        elapsed += poll_interval
        batch_job = client.batch.jobs.get(job_id=batch_job.id)
        console.print(f"[yellow] Batch status (after {elapsed}s): [bold]{batch_job.status}[/bold]")
    print(f"Batch job {batch_job.id} completed with status: {batch_job.status}")
    return batch_job

In [11]:
def download_and_parse_results(batch, output_path) -> pd.DataFrame:
    """Download a batch job's output file (if any) and parse its answers.

    When ``batch.output_file`` is ``None`` nothing is downloaded and the file
    already present at ``output_path`` is parsed instead.

    Args:
        batch: Job object exposing ``output_file``.
        output_path: Path of the local JSONL results file.

    Returns:
        DataFrame with the columns ``id`` (the request's ``custom_id``) and
        ``conversation`` (the stripped content of the first choice). Entries
        carrying an ``error`` are reported on the console and left out.
    """
    output_path = Path(output_path)

    if batch.output_file is not None:
        print(f"Downloading file to {output_path}")
        output_file = client.files.download(file_id=batch.output_file)
        # Write the raw bytes: decoding chunk by chunk can fail when a
        # multi-byte UTF-8 character straddles two chunks.
        with output_path.open("wb") as f:
            for chunk in output_file.stream:
                f.write(chunk)
        print(f"Downloaded file to {output_path}")

    results = []
    # Iterating the file splits on real newlines only; str.splitlines() would
    # also split on characters such as U+2028 that may occur inside JSON strings.
    with output_path.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            data = json.loads(line)
            if data.get("error"):
                console.print(f"[red] Error for {data['custom_id']}: {data['error']}")
                continue
            choice = data["response"]["body"]["choices"][0]["message"]["content"].strip()
            results.append({
                "id": data["custom_id"],
                "conversation": choice,
            })
    # Fixed columns keep the frame usable even when no answer was parsed.
    return pd.DataFrame(results, columns=["id", "conversation"])

In [12]:
def pipeline_for(df):
    """Run the whole batch round-trip for ``df`` and return the parsed answers.

    Steps: write the request file, upload it, submit the job, wait for it to
    finish, then download and parse the results. Both file names are derived
    from the global ``model`` and are relative to the working directory.

    Args:
        df: Frame handed to ``create_json_batch``.

    Returns:
        The DataFrame produced by ``download_and_parse_results``.
    """
    batch_file = Path(f"batch_{model}.jsonl")
    answers_file = Path(f"answers_{model}.jsonl")

    create_json_batch(df, batch_file)
    finished_job = wait_for_completion(submit_batch(upload_batch(batch_file)))
    return download_and_parse_results(finished_job, answers_file)

In [13]:
# Send the whole benchmark through the batch pipeline; blocks until the answers are parsed.
results = pipeline_for(df)

Batch job e19ec2f0-3dc1-46d5-b540-23b86d78dc0a completed with status: SUCCESS
Downloading file to answers_mistral-medium-2505.jsonl
Downloaded file to answers_mistral-medium-2505.jsonl


In [14]:
# `create_json_batch` builds each custom_id from the row's index label, so the
# ids must come from the index too: positions differ from labels as soon as
# `dropna` has removed a row.
df['id'] = [f"id_{i}" for i in df.index]
results.sort_values(by='id', inplace=True)
df.sort_values(by='id', inplace=True)

# Join the answers on id instead of assigning by position, so that a missing
# answer (API error) cannot shift every following row or raise a length error.
answers_by_id = results.drop_duplicates(subset='id').set_index('id')['conversation']
scores = df['id'].map(answers_by_id)

# A duplicated text is submitted only once, under the id of its first row:
# give the other rows with the same text the same answer.
answered = scores.notna()
score_by_text = dict(zip(df.loc[answered, 'original_text'], scores[answered]))
df['toxicity_score'] = scores.fillna(df['original_text'].map(score_by_text))

In [15]:
# Repeats the previous cell's assignment, written so it is safe to re-run:
# answers are joined on id rather than by position, and values already present
# are kept for rows whose own id has no answer in `results`.
_answers_by_id = results.drop_duplicates(subset='id').set_index('id')['conversation']
_rejoined = df['id'].map(_answers_by_id)
if 'toxicity_score' in df.columns:
    _rejoined = _rejoined.fillna(df['toxicity_score'])
df['toxicity_score'] = _rejoined

In [16]:
# Display the frame with the raw model answers attached (cell output).
df

Unnamed: 0,msg_id,original_text,content,label,id,toxicity_score
0,e6865c831da40459,Combining 'Official requests for clemency' sec...,Combiner la section 'Demandes officielles de c...,0,id_0,non-toxic
1,cff2b456e5059b47,Web \n\nIs this her own website? 113.22.84.65,Web\n\nEst-ce son propre site web ? 113.22.84.65,0,id_1,non-toxic
10,547f596e29854539,It looks like the same person to me earoot@yah...,Ça me semble être la même personne à moi earoo...,0,id_10,non-toxic
100,74795a18fe96b5cb,"""\nDude, you keep vandalizing it by adding spe...","Mec, tu continues à le vandaliser en ajoutant ...",0,id_100,non-toxic
101,807168058208dc6a,I think John Milton died a few centuries ago! ...,Je pense que John Milton est mort il y a quelq...,0,id_101,non-toxic
...,...,...,...,...,...,...
95,e7df978258868f5e,hu.... Does it refresh every time I refresh th...,Euh... Est-ce que ça se met à jour chaque fois...,0,id_95,non-toxic
96,909b3bec738614f9,"Please, could you explain your thinking for yo...","S'il te plaît, pourrais-tu expliquer tes réfle...",0,id_96,non-toxic
97,baf86b93cc212702,pdf \n\nI doubt you'll be surprised to learn I...,Je ne doute pas que tu seras surpris d'apprend...,0,id_97,non-toxic
98,63ef41ee3e3f5fd7,"""\n\nI think """"1st Lord Hailes"""" should be add...","Je pense que """"1er Lord Hailes"""" devrait être ...",0,id_98,non-toxic


In [21]:
def _to_prediction(answer) -> int:
    """Turn a raw model answer into a binary label: 1 for "toxic", otherwise 0.

    The comparison is an equality test on the normalised answer, never a
    substring test, because "non-toxic" contains "toxic".
    """
    if not isinstance(answer, str):
        # Missing answer (e.g. NaN): there is no verdict to read.
        return 0
    # Ignore surrounding whitespace, quotes and trailing punctuation ("Toxic.").
    verdict = answer.strip().strip(".!\"'`*«» ").lower()
    return 1 if verdict == "toxic" else 0


df['prediction'] = df['toxicity_score'].apply(_to_prediction)

In [22]:
# Show five reproducibly sampled rows: text, predicted label, ground truth.
for i, row in df.sample(5, random_state=42).iterrows():
    toxicity = f"[yellow]Toxicity Score:[/yellow] [bold]{row['prediction']}[/bold]"
    label = f"[cyan]Label:[/cyan] [bold]{row['label']}[/bold]"

    # Assemble a Text object instead of interpolating into a markup string:
    # the message keeps its bold style, and square brackets inside the message
    # are shown literally instead of being parsed as rich markup.
    body = Text()
    body.append(str(row['content']), style="bold")
    body.append("\n\n")
    body.append_text(Text.from_markup(toxicity))
    body.append("\n")
    body.append_text(Text.from_markup(label))

    panel = Panel.fit(
        body,
        title=f"Exemple {i+1}",
        border_style="magenta"
    )
    console.print(panel)

## Metrics & Report        

| Metric                     | Formula                                           | Interpretation                                                                                                       |
| -------------------------- | ------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| **Precision**              | `TP / (TP + FP)`                                  | Of the samples predicted **toxic**, how many were **actually toxic**? <br>→ High precision = **low false positives** |
| **Recall** *(Sensitivity)* | `TP / (TP + FN)`                                  | Of the **actual toxic** samples, how many did we **correctly identify**? <br>→ High recall = **low false negatives** |
| **F1-score**               | `2 * (Precision * Recall) / (Precision + Recall)` | Harmonic mean of precision and recall. <br>→ Best when **balance** is needed                                         |
| **Accuracy**               | `(TP + TN) / (TP + TN + FP + FN)`                 | Fraction of all predictions (toxic and non-toxic) that are correct. <br>→ Can be misleading on imbalanced data       |
| **ROC AUC**                | Area under the ROC Curve                          | Measures the **ranking ability** of the classifier. <br>→ Higher = better separation of toxic vs. non-toxic          |


In [23]:
# Ground-truth labels and the binary predictions derived from the model answers.
y_true = df["label"]
y_pred = df["prediction"]

In [24]:
# Classification report
report = classification_report(y_true, y_pred, digits=3, output_dict=True)
table = Table(title="Classification Report", show_lines=True)
table.add_column("Classe", style="cyan", justify="center")
table.add_column("Precision", justify="center")
table.add_column("Recall", justify="center")
table.add_column("F1-score", justify="center")
table.add_column("Support", justify="center")

# Per-class rows only: the aggregate entries are skipped here and accuracy is
# appended as its own row below.
for label, metrics in report.items():
    if label in ("accuracy", "macro avg", "weighted avg"):
        continue
    table.add_row(
        str(label),
        f"{metrics['precision']:.3f}",
        f"{metrics['recall']:.3f}",
        f"{metrics['f1-score']:.3f}",
        f"{int(metrics['support'])}"
    )
# Append the accuracy row
table.add_row(
    "[bold yellow]Accuracy[/bold yellow]",
    "-",
    "-",
    f"{report['accuracy']:.3f}",
    "-"
)
console.print(table)

# Confusion matrix
# Pin the label order so the matrix is always 2x2, even when one class is
# absent from both the labels and the predictions.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_table = Table(title="Matrice de Confusion", show_lines=True)
cm_table.add_column(" ", style="bold")
cm_table.add_column("Prédit 0", justify="center")
cm_table.add_column("Prédit 1", justify="center")
cm_table.add_row("Réel 0", str(cm[0, 0]), str(cm[0, 1]))
cm_table.add_row("Réel 1", str(cm[1, 0]), str(cm[1, 1]))
console.print(cm_table)

# ROC AUC
# NOTE: computed on the hard 0/1 predictions, not on continuous scores.
try:
    auc = roc_auc_score(y_true, y_pred)
except ValueError as err:
    # Catch only the metric's own failure and report its reason; anything else
    # is a genuine bug and must propagate.
    console.print(Panel(f"[bold red]Impossible de calculer le ROC AUC.[/bold red]\n{err}", title="ROC AUC"))
else:
    console.print(Panel(f"[bold green]ROC AUC Score : {auc:.3f}[/bold green]", title="ROC AUC"))

## Save the predictions

In [25]:
# Display the final frame, including the binary `prediction` column (cell output).
df

Unnamed: 0,msg_id,original_text,content,label,id,toxicity_score,prediction
0,e6865c831da40459,Combining 'Official requests for clemency' sec...,Combiner la section 'Demandes officielles de c...,0,id_0,non-toxic,0
1,cff2b456e5059b47,Web \n\nIs this her own website? 113.22.84.65,Web\n\nEst-ce son propre site web ? 113.22.84.65,0,id_1,non-toxic,0
10,547f596e29854539,It looks like the same person to me earoot@yah...,Ça me semble être la même personne à moi earoo...,0,id_10,non-toxic,0
100,74795a18fe96b5cb,"""\nDude, you keep vandalizing it by adding spe...","Mec, tu continues à le vandaliser en ajoutant ...",0,id_100,non-toxic,0
101,807168058208dc6a,I think John Milton died a few centuries ago! ...,Je pense que John Milton est mort il y a quelq...,0,id_101,non-toxic,0
...,...,...,...,...,...,...,...
95,e7df978258868f5e,hu.... Does it refresh every time I refresh th...,Euh... Est-ce que ça se met à jour chaque fois...,0,id_95,non-toxic,0
96,909b3bec738614f9,"Please, could you explain your thinking for yo...","S'il te plaît, pourrais-tu expliquer tes réfle...",0,id_96,non-toxic,0
97,baf86b93cc212702,pdf \n\nI doubt you'll be surprised to learn I...,Je ne doute pas que tu seras surpris d'apprend...,0,id_97,non-toxic,0
98,63ef41ee3e3f5fd7,"""\n\nI think """"1st Lord Hailes"""" should be add...","Je pense que """"1er Lord Hailes"""" devrait être ...",0,id_98,non-toxic,0


In [26]:
# Show where the predictions are about to be written (cell output).
output_path

PosixPath('../../data/benchmark_jigsaw/eng-mistral_medium.csv')

In [27]:
# Write the scored frame as UTF-8 CSV, leaving out the DataFrame index.
df.to_csv(output_path, index=False, encoding="utf-8")