# Benchmarking [Gemini 2.0-flash](https://ai.google.dev/gemini-api/docs/quickstart?hl=fr)

## Libraries

In [1]:
from google import genai
from time import sleep 
from pathlib import Path
import os
import pandas as pd
from tqdm.rich import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score
)
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
import warnings
from google.api_core import exceptions as genai_errors
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# Register tqdm with pandas (per tqdm's documentation this enables `progress_apply`),
# using a default label for the progress bar.
tqdm.pandas(desc="Fetching moderation scores")
# Shared rich console used for the tables, panels and log messages printed below.
console = Console()
# Silence every Python warning for the rest of the session.
# NOTE(review): presumably meant to hide the experimental-API warning from `tqdm.rich`;
# it also hides unrelated warnings — consider a narrower filter.
warnings.filterwarnings("ignore")

## Global variables

In [None]:
# All paths are relative to the kernel's working directory.
# NOTE(review): the saved output of the `output_path` cell shows '../data/...', which does
# not match ROOT = '../..' — re-run the notebook so the stored outputs reflect this config.
ROOT = Path('../..')
DATA_DIR = ROOT / "data"
# Input: labelled benchmark messages (columns used below: "content", "label").
BENCHMARK_PATH = DATA_DIR / "benchmark" / "benchmark_balanced_subset.csv"
# Output: the benchmark frame with the model predictions appended.
output_path = DATA_DIR / "benchmark" / "gemini-2.0-flash_simple_prompt.csv"
# File holding the Gemini API key, kept out of the notebook itself.
API_KEY_PATH = DATA_DIR / "confidential" / "GEMINI_API.txt"
# Model identifier passed to `generate_content`.
model = "gemini-2.0-flash"
# Prompt templates live next to the notebook. Decode them explicitly as UTF-8 (the same
# encoding used for the CSV files) so accented characters do not depend on the platform's
# default encoding.
system_prompt = (Path(".") / "API_SYSTEM_PROMPT_simple.txt").read_text(encoding="utf-8").strip()
prompt = (Path(".") / "API_PROMPT_simple.txt").read_text(encoding="utf-8").strip()

In [3]:
# Route both HTTP and HTTPS traffic through the local SOCKS5 proxy.
# NOTE(review): this assumes a proxy is listening on 127.0.0.1:1080 — confirm before running elsewhere.
for _proxy_var in ("HTTP_PROXY", "HTTPS_PROXY"):
    os.environ[_proxy_var] = "socks5h://127.0.0.1:1080"

In [4]:
# Build the Gemini API client; the key is read from API_KEY_PATH and stripped of
# surrounding whitespace so it never appears in the notebook source.
client = genai.Client(api_key=API_KEY_PATH.read_text().strip())

## Load dataset

In [5]:
# Read the benchmark, keep only rows that have both a message and a label,
# and store the label as an integer.
df = (
    pd.read_csv(BENCHMARK_PATH, encoding="utf-8")
    .dropna(subset=["content", "label"])
    .assign(label=lambda frame: frame["label"].astype(int))
)

# Per-label row counts as a two-column frame.
label_counts = df["label"].value_counts().reset_index()
label_counts.columns = ["label", "count"]

# Summary table: the total first, then one row per label.
table = Table(title="Label Counts", show_lines=True)
for header, colour in (("Label", "cyan"), ("Count", "yellow")):
    table.add_column(header, justify="center", style=colour)
table.add_row("Total Rows", str(len(df)))
for label_value, n_rows in zip(label_counts["label"], label_counts["count"]):
    table.add_row(str(label_value), str(n_rows))
console.print(table, justify="left")

# Reproducible peek at five rows (fixed seed).
df.sample(5, random_state=42)

Unnamed: 0,msg_id,content,label
650,anon_msg_5ff0a5549f65,"des bovins mon pote, j'ai jamais vu ça sauf en...",0
208,anon_msg_2ea521264d58,Cet accent du sud dégueulasse,1
427,anon_msg_99e41015a911,"Joffrin, je serais capable de faire des trucs ...",1
677,anon_msg_31edc839b8b8,"D'accord le julf, mais vous n'êtes pas BLANC.",1
1272,anon_msg_15c39b3b7e9a,C'est Siemens Espagne hein... Pas le PDG du gr...,0


## Define prediction function

In [6]:
def predict(text, max_retries=5, delay_base=2):
    """Ask the Gemini model whether ``text`` is toxic.

    The request is a single string: the system prompt, the user prompt, the
    message wrapped in French quotation marks, and a closing question.

    Args:
        text: Message to classify.
        max_retries: Maximum number of API attempts before giving up.
        delay_base: Base of the exponential back-off; after failed attempt ``k``
            (0-based) the function waits ``delay_base ** k`` seconds plus up to
            one second of random jitter.

    Returns:
        1 if the model answers "toxique", 0 if it answers "non-toxique", and
        ``None`` if the answer is empty or unrecognised, or if every attempt
        raised an exception.
    """
    input_text = system_prompt + "\n" + prompt + f"« {text} »" + "\n Ce message est-il toxique ?\n"
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=input_text
            )
            raw = response.text
        except genai_errors.ServiceUnavailable:
            # NOTE(review): `genai_errors` is google.api_core.exceptions; the google-genai
            # client may raise its own error classes instead — confirm this clause can fire.
            failure, colour = f"Model overloaded (attempt {attempt + 1}/{max_retries}).", "yellow"
        except Exception as e:
            failure, colour = f"Error: {e}", "red"
        else:
            if raw is None:
                # No text came back. This is not a transport failure, so
                # retrying with back-off would only waste time.
                console.print("Empty response (no text returned)", style="red")
                return None
            # Tolerate surrounding whitespace, quotes and trailing punctuation
            # ("Toxique." / "« non-toxique »") around the expected one-word answer.
            result = raw.strip().strip(".!\"'«»*`").strip().lower()
            if result == "toxique":
                return 1
            if result == "non-toxique":
                return 0
            # markup=False: the model's answer may contain square brackets that
            # rich would otherwise try to parse as style tags.
            console.print(f"Unexpected response: '{result}'", style="red", markup=False)
            return None

        # Only reached when the API call raised.
        if attempt + 1 < max_retries:
            wait_time = delay_base ** attempt + random.uniform(0, 1)
            console.print(f"{failure} Retrying in {wait_time:.2f}s...", style=colour, markup=False)
            sleep(wait_time)
        else:
            # Last attempt: report the failure but do not sleep before giving up.
            console.print(failure, style=colour, markup=False)
    console.print(f"Max retries reached for: {text}", style="red", markup=False)
    return None

def safe_predict(text):
    """Call ``predict(text)`` and never raise.

    Intended for worker threads: any exception escaping ``predict`` is logged
    and turned into ``None`` so one bad message cannot abort the whole run.

    Args:
        text: Message to classify.

    Returns:
        Whatever ``predict`` returns, or ``None`` if it raised.
    """
    try:
        return predict(text)
    except Exception as e:
        # markup=False: the message or the error text may contain square brackets
        # (e.g. "[/b]") that rich would parse as style tags, which could make this
        # handler raise in turn.
        console.print(f"Error for text: {text} → {e}", style="red", markup=False)
        return None

## Run prediction

In [7]:
# Messages to classify, in DataFrame row order.
texts = df["content"].tolist()

# Pre-sized output list: each prediction is written at its message's position,
# so completion order does not matter.
results = [None] * len(texts)

with ThreadPoolExecutor(max_workers=4) as executor:
    # Remember which row each submitted job belongs to.
    future_to_idx = {}
    for position, message in enumerate(texts):
        future_to_idx[executor.submit(safe_predict, message)] = position
    # Collect predictions as they finish while updating the progress bar.
    for done in tqdm(as_completed(future_to_idx), total=len(texts)):
        results[future_to_idx[done]] = done.result()

Output()

In [9]:
# The model returns a hard 0/1 decision, so the same values are stored both as
# the "score" and as the "prediction".
for _column in ("toxicity_score", "prediction"):
    df[_column] = results

In [10]:
# Display five sampled rows (fixed seed) with the model's decision next to the true label.
for i, row in df.sample(5, random_state=42).iterrows():
    score = row['toxicity_score']
    # A failed API call leaves a missing prediction; int() would raise on it.
    score_text = "N/A" if pd.isna(score) else str(int(score))
    # Assemble a Text object instead of an f-string: the message stays literal
    # (square brackets in it are not parsed as rich markup) and its bold style
    # is actually applied.
    body = Text.assemble(
        (str(row['content']), "bold"),
        "\n\n",
        Text.from_markup(f"[yellow]Toxicity Score:[/yellow] [bold]{score_text}[/bold]"),
        "\n",
        Text.from_markup(f"[cyan]Label:[/cyan] [bold]{row['label']}[/bold]"),
    )
    panel = Panel.fit(
        body,
        title=f"Exemple {i+1}",
        border_style="magenta"
    )
    console.print(panel)

## Metrics & Report        

| Metric                     | Formula                                           | Interpretation                                                                                                       |
| -------------------------- | ------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| **Precision**              | `TP / (TP + FP)`                                  | Of the samples predicted **toxic**, how many were **actually toxic**? <br>→ High precision = **low false positives** |
| **Recall** *(Sensitivity)* | `TP / (TP + FN)`                                  | Of the **actual toxic** samples, how many did we **correctly identify**? <br>→ High recall = **low false negatives** |
| **F1-score**               | `2 * (Precision * Recall) / (Precision + Recall)` | Harmonic mean of precision and recall. <br>→ Best when **balance** is needed                                         |
| **Accuracy**               | `(TP + TN) / (TP + TN + FP + FN)`                 | Fraction of all correct predictions (toxic and non-toxic). <br>→ Can be misleading on imbalanced data                |
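| **ROC AUC**                | Area under the ROC Curve                          | Measures the **ranking ability** of the classifier. <br>→ Higher = better separation of toxic vs. non-toxic. <br>→ Here the model returns hard 0/1 decisions rather than scores, so the value equals the **balanced accuracy** `(TPR + TNR) / 2` |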


In [11]:
# Rows whose API call failed have no prediction, and casting them to int would raise.
# Evaluate only on rows that were actually scored, and say how many were left out.
scored_mask = df["toxicity_score"].notna()
n_unscored = int((~scored_mask).sum())
if n_unscored:
    console.print(
        f"{n_unscored} row(s) without a prediction are excluded from the metrics.",
        style="yellow",
    )
y_true = df.loc[scored_mask, "label"]
y_pred = df.loc[scored_mask, "toxicity_score"].astype(int)

In [12]:
# Classification report: one row per class, then the overall accuracy.
report = classification_report(y_true, y_pred, digits=3, output_dict=True)
table = Table(title="Classification Report", show_lines=True)
table.add_column("Classe", style="cyan", justify="center")
table.add_column("Precision", justify="center")
table.add_column("Recall", justify="center")
table.add_column("F1-score", justify="center")
table.add_column("Support", justify="center")

for label, metrics in report.items():
    # Aggregate entries are not per-class rows; accuracy is added separately below.
    if label in ["accuracy", "macro avg", "weighted avg"]:
        continue
    table.add_row(
        str(label),
        f"{metrics['precision']:.3f}",
        f"{metrics['recall']:.3f}",
        f"{metrics['f1-score']:.3f}",
        f"{int(metrics['support'])}"
    )
# Accuracy row (shown in the F1-score column, as in the text report).
table.add_row(
    "[bold yellow]Accuracy[/bold yellow]",
    "-",
    "-",
    f"{report['accuracy']:.3f}",
    "-"
)
console.print(table)

# Confusion matrix. labels=[0, 1] forces a 2x2 matrix even if one class is absent
# from the labels or the predictions, so the fixed indexing below cannot fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_table = Table(title="Matrice de Confusion", show_lines=True)
cm_table.add_column(" ", style="bold")
cm_table.add_column("Prédit 0", justify="center")
cm_table.add_column("Prédit 1", justify="center")
cm_table.add_row("Réel 0", str(cm[0, 0]), str(cm[0, 1]))
cm_table.add_row("Réel 1", str(cm[1, 0]), str(cm[1, 1]))
console.print(cm_table)

# ROC AUC, computed on the same rows as the other metrics.
# The predictions are hard 0/1 decisions, not probabilities.
try:
    auc = roc_auc_score(y_true, y_pred)
    console.print(Panel(f"[bold green]ROC AUC Score : {auc:.3f}[/bold green]", title="ROC AUC"))
except ValueError as err:
    # Raised e.g. when y_true contains a single class; show the reason instead of hiding it.
    console.print(Panel(Text(f"Impossible de calculer le ROC AUC : {err}", style="bold red"), title="ROC AUC"))

## Save the predictions

In [13]:
# Store the decisions as integers. The nullable "Int64" dtype is used because rows
# whose API call failed hold a missing value, which plain int cannot represent;
# missing values are written as empty cells in the CSV.
df['toxicity_score'] = df['toxicity_score'].astype("Int64")
df['prediction'] = df['prediction'].astype("Int64")
df

Unnamed: 0,msg_id,content,label,toxicity_score,prediction
0,anon_msg_468ffd36870c,AYAAAAAAAAAAAA LES CRACKHEAD ANTI MIGRANT BORD...,1,1,1
1,anon_msg_c30ce8c67bac,Bordel le gars chantait 'FN souffrance' alors ...,1,1,1
2,anon_msg_a7be822bc051,Je vois que ça ddb en masse\nComme en dictatur...,1,1,1
3,anon_msg_8a2c7da3ff85,vous tenez là ? persoent c'est chaud depuis hi...,1,1,1
4,anon_msg_b20f2e8b51ed,Le racisme est devenu le crime ultime en Occid...,0,1,1
...,...,...,...,...,...
1383,anon_msg_a3bb41085bd2,que j’entende plus un média dire le danger c’e...,0,0,0
1384,anon_msg_835f5f8a3e25,il raconte absolument n'importe quoi non stop,0,0,0
1385,anon_msg_5de411f54a1b,mais qu'est ce qu'il raconte saint nikos,0,0,0
1386,anon_msg_48b9b19e3c3b,Oui el famoso 200 bordel\n:rire:\nLE QI bordel...,0,1,1


In [14]:
# Display the destination path as a sanity check before writing the file.
output_path

PosixPath('../data/benchmark/gemini-2.0-flash_simple_prompt.csv')

In [15]:
# Write the benchmark frame with its prediction columns; index=False leaves the
# DataFrame index out of the file.
df.to_csv(output_path, index=False, encoding="utf-8")