# Benchmarking the [OpenAI omni-moderation API](https://platform.openai.com/docs/guides/moderation)

## Libraries

In [1]:
from openai import OpenAI
from pathlib import Path
from time import sleep
import os
import pandas as pd
from tqdm.rich import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score
)
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
import warnings

# Register `progress_apply` on pandas objects, labelled for the moderation run.
tqdm.pandas(
    desc="Fetching moderation scores",
)

# Shared rich console used by every cell that prints tables or panels.
console = Console()

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

## Global variables

In [None]:
# Every path is derived from ROOT, which is relative to the notebook's
# working directory.
ROOT = Path("../..")
DATA_DIR = ROOT / "data"
BENCHMARK_DIR = DATA_DIR / "benchmark"

# Input: labelled benchmark messages (columns used later: content, label).
BENCHMARK_PATH = BENCHMARK_DIR / "benchmark_balanced_subset.csv"
# Output: the benchmark rows plus the moderation verdict.
OUTPUT_PATH = BENCHMARK_DIR / "openai_omni_moderation.csv"
# Text file whose stripped content is used as the OpenAI API key.
API_KEY_PATH = DATA_DIR / "confidential" / "GPT_API.txt"

# Lower-case alias kept so cells that still use the old name keep working.
output_path = OUTPUT_PATH

# The shared `console` is already created in the imports cell, so it is not
# instantiated a second time here.

In [3]:
# Point both proxy variables at the same local SOCKS5 endpoint
# (socks5h = hostnames are resolved by the proxy).
os.environ.update(
    {
        "HTTP_PROXY": "socks5h://127.0.0.1:1080",
        "HTTPS_PROXY": "socks5h://127.0.0.1:1080",
    }
)

In [4]:
# The API key is read from a local file (surrounding whitespace removed)
# rather than being written in the notebook.
client = OpenAI(
    api_key=API_KEY_PATH.read_text().strip(),
)

## Load dataset

In [5]:
# Read the benchmark, keep only rows that have both a message and a label,
# and store the label as an integer.
df = (
    pd.read_csv(BENCHMARK_PATH, encoding="utf-8")
    .dropna(subset=["content", "label"])
    .astype({"label": int})
)

# Class distribution as a two-column frame: label, count.
label_counts = (
    df["label"]
    .value_counts()
    .rename_axis("label")
    .reset_index(name="count")
)

# Summary table: total row count first, then one row per label.
table = Table(title="Label Counts", show_lines=True)
for header, colour in (("Label", "cyan"), ("Count", "yellow")):
    table.add_column(header, justify="center", style=colour)
table.add_row("Total Rows", str(len(df)))
for label_value, n_rows in label_counts.itertuples(index=False, name=None):
    table.add_row(str(label_value), str(n_rows))
console.print(table, justify="left")

# Five reproducible random rows as the cell output.
df.sample(5, random_state=42)

Unnamed: 0,msg_id,content,label
650,anon_msg_5ff0a5549f65,"des bovins mon pote, j'ai jamais vu ça sauf en...",0
208,anon_msg_2ea521264d58,Cet accent du sud dégueulasse,1
427,anon_msg_99e41015a911,"Joffrin, je serais capable de faire des trucs ...",1
677,anon_msg_31edc839b8b8,"D'accord le julf, mais vous n'êtes pas BLANC.",1
1272,anon_msg_15c39b3b7e9a,C'est Siemens Espagne hein... Pas le PDG du gr...,0


## Example

In [6]:
# Single moderation request on one benchmark message, kept in `response`
# so the next cell can render it.
text = "Cet accent du sud dégueulasse"
response = client.moderations.create(
    input=[dict(type="text", text=text)],
    model="omni-moderation-latest",
)


In [7]:
# Render the verdict of the example request stored in `response`.
result = response.results[0]

# Dump both models keyed by the API's own category names. The SDK exposes
# them as field aliases (e.g. "self-harm", "harassment/threatening"), whereas
# the Python attribute names use underscores and cannot be mapped back
# reliably with a plain string replace.
categories = result.categories.model_dump(by_alias=True)
scores = result.category_scores.model_dump(by_alias=True)

# 1. Overall flag status
flag_status = "[bold red]FLAGGED[/bold red]" if result.flagged else "[bold green]NOT FLAGGED[/bold green]"
console.print(Panel(flag_status, title="⚠️ Moderation Result", subtitle="omni-moderation-latest", border_style="bold magenta"))

# 2. Triggered categories (those whose boolean verdict is truthy)
triggered = [cat for cat, val in categories.items() if val]
if triggered:
    trig_table = Table(title="🚩 Triggered Categories", show_lines=True)
    trig_table.add_column("Category", style="bold red")
    trig_table.add_column("Score", justify="right", style="yellow")
    for cat in triggered:
        # Fall back to 0.0 when a category has no (or a null) score.
        score = scores.get(cat) or 0.0
        trig_table.add_row(cat, f"{score:.3f}")
    console.print(trig_table)
else:
    console.print(Panel("✅ No categories were flagged.", title="Clean", border_style="green"))

# 3. All category scores, highest first. Null scores are skipped because
# they can be neither sorted nor formatted as floats.
sorted_scores = sorted(
    ((cat, val) for cat, val in scores.items() if val is not None),
    key=lambda item: item[1],
    reverse=True,
)

score_table = Table(title="📊 All Category Scores", show_lines=False)
score_table.add_column("Category", style="cyan")
score_table.add_column("Score", justify="right", style="green")

for cat, val in sorted_scores:
    score_table.add_row(cat, f"{val:.3f}")

console.print(score_table)


## Define prediction function

In [8]:
def predict(text, model="omni-moderation-latest", delay=0.5):
    """Ask the moderation endpoint whether ``text`` is flagged.

    Args:
        text: Message to classify; sent as a single text input.
        model: Moderation model name passed to the API.
        delay: Seconds to sleep after every request, successful or not
            (presumably a simple client-side throttle — confirm against the
            account's rate limits).

    Returns:
        The ``flagged`` value of the first result, or ``None`` when the
        request failed with a rate-limit, connection/timeout or server
        error. Returning ``None`` keeps one transient failure from aborting
        a whole ``progress_apply`` run; callers are expected to drop missing
        values afterwards.

    Raises:
        Any other API error (e.g. authentication or bad request) propagates,
        since it would affect every subsequent call as well.
    """
    # Local import: the file only imports the `OpenAI` class at top level.
    import openai

    try:
        response = client.moderations.create(
            model=model,
            input=[{"type": "text", "text": text}],
        )
    except (openai.RateLimitError, openai.APIConnectionError, openai.InternalServerError):
        return None
    finally:
        sleep(delay)
    return response.results[0].flagged

## Run prediction

In [9]:
# One API call per message (with a progress bar), then discard rows for which
# no prediction is available.
df = (
    df.assign(toxicity_score=df["content"].progress_apply(predict))
    .dropna(subset=["toxicity_score"])
)

Output()

In [12]:
# Show five reproducible random rows with the model verdict next to the label.
for i, row in df.sample(5, random_state=42).iterrows():
    toxicity = f"[yellow]Toxicity Score:[/yellow] [bold]{int(row['toxicity_score'])}[/bold]"
    label = f"[cyan]Label:[/cyan] [bold]{row['label']}[/bold]"

    # Build the panel body as a Text object. Interpolating a Text into an
    # f-string drops its style and makes rich parse the raw message as
    # markup, so a message containing square brackets could be mangled or
    # raise a markup error.
    content = Text()
    content.append(str(row["content"]), style="bold")
    content.append("\n\n")
    content.append_text(Text.from_markup(f"{toxicity}\n{label}"))

    panel = Panel.fit(
        content,
        title=f"Exemple {i+1}",
        border_style="magenta"
    )
    console.print(panel)

## Metrics & Report

| Metric                     | Formula                                           | Interpretation                                                                                                       |
| -------------------------- | ------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| **Precision**              | `TP / (TP + FP)`                                  | Of the samples predicted **toxic**, how many were **actually toxic**? <br>→ High precision = **low false positives** |
| **Recall** *(Sensitivity)* | `TP / (TP + FN)`                                  | Of the **actual toxic** samples, how many did we **correctly identify**? <br>→ High recall = **low false negatives** |
| **F1-score**               | `2 * (Precision * Recall) / (Precision + Recall)` | Harmonic mean of precision and recall. <br>→ Best when **balance** is needed                                         |
| **Accuracy**               | `(TP + TN) / (TP + TN + FP + FN)`                 | Fraction of all correct predictions (toxic and non-toxic). <br>→ Can be misleading on imbalanced data                |
| **ROC AUC**                | Area under the ROC Curve                          | Measures the **ranking ability** of the classifier. <br>→ Higher = better separation of toxic vs. non-toxic          |


In [14]:
# Gold labels and the model's verdict cast to 0/1.
y_true, y_pred = df["label"], df["toxicity_score"].astype(int)

In [15]:
# Classification report: one table row per class.
report = classification_report(y_true, y_pred, digits=3, output_dict=True)
table = Table(title="Classification Report", show_lines=True)
table.add_column("Classe", style="cyan", justify="center")
table.add_column("Precision", justify="center")
table.add_column("Recall", justify="center")
table.add_column("F1-score", justify="center")
table.add_column("Support", justify="center")

for label, metrics in report.items():
    # Aggregate entries are skipped here; accuracy gets its own row below.
    if label in ("accuracy", "macro avg", "weighted avg"):
        continue
    table.add_row(
        str(label),
        f"{metrics['precision']:.3f}",
        f"{metrics['recall']:.3f}",
        f"{metrics['f1-score']:.3f}",
        f"{int(metrics['support'])}"
    )
# Accuracy row (shown in the F1-score column, other cells left blank)
table.add_row(
    "[bold yellow]Accuracy[/bold yellow]",
    "-",
    "-",
    f"{report['accuracy']:.3f}",
    "-"
)
console.print(table)

# Confusion matrix. `labels=[0, 1]` forces a 2x2 result even when one class
# is missing from the data, so the fixed indexing below cannot go out of range.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_table = Table(title="Matrice de Confusion", show_lines=True)
cm_table.add_column(" ", style="bold")
cm_table.add_column("Prédit 0", justify="center")
cm_table.add_column("Prédit 1", justify="center")
cm_table.add_row("Réel 0", str(cm[0, 0]), str(cm[0, 1]))
cm_table.add_row("Réel 1", str(cm[1, 0]), str(cm[1, 1]))
console.print(cm_table)

# ROC AUC. NOTE: it is computed on the binary 0/1 verdict, not on a
# continuous score, so the curve has a single operating point.
try:
    auc = roc_auc_score(y_true, y_pred)
except ValueError as err:
    # Only the expected failure is caught; the reason is printed as plain
    # text instead of being discarded.
    console.print(Panel("[bold red]Impossible de calculer le ROC AUC.[/bold red]", title="ROC AUC"))
    console.print(f"ROC AUC error: {err}", markup=False)
else:
    console.print(Panel(f"[bold green]ROC AUC Score : {auc:.3f}[/bold green]", title="ROC AUC"))

## Save the predictions

In [19]:
# Store the verdict as 0/1 so it is written as an integer column.
df = df.assign(toxicity_score=df["toxicity_score"].astype(int))
df

Unnamed: 0,msg_id,content,label,toxicity_score
0,anon_msg_468ffd36870c,AYAAAAAAAAAAAA LES CRACKHEAD ANTI MIGRANT BORD...,1,1
1,anon_msg_c30ce8c67bac,Bordel le gars chantait 'FN souffrance' alors ...,1,1
2,anon_msg_a7be822bc051,Je vois que ça ddb en masse\nComme en dictatur...,1,1
3,anon_msg_8a2c7da3ff85,vous tenez là ? persoent c'est chaud depuis hi...,1,1
4,anon_msg_b20f2e8b51ed,Le racisme est devenu le crime ultime en Occid...,0,1
...,...,...,...,...
1383,anon_msg_a3bb41085bd2,que j’entende plus un média dire le danger c’e...,0,0
1384,anon_msg_835f5f8a3e25,il raconte absolument n'importe quoi non stop,0,0
1385,anon_msg_5de411f54a1b,mais qu'est ce qu'il raconte saint nikos,0,0
1386,anon_msg_48b9b19e3c3b,Oui el famoso 200 bordel\n:rire:\nLE QI bordel...,0,0


In [20]:
# Write the benchmark rows with their predictions; the index is not saved.
df.to_csv(
    output_path,
    encoding="utf-8",
    index=False,
)