# Benchmarking dataset

## Libraries

In [9]:
import pandas as pd
from pathlib import Path
from rich.console import Console
from rich.table import Table

## Global variables

In [23]:
# Locations of the benchmark files; every path is relative to the parent of the
# current working directory.
ROOT = Path("..")
DATA_DIR = ROOT.joinpath("data")
BENCHMARK_PATH = DATA_DIR.joinpath("benchmark", "benchmark.csv")
BENCHMARK_SUBSET_PATH = DATA_DIR.joinpath("benchmark", "benchmark_balanced_subset.csv")

# Single rich console used for all table and status output in this notebook.
console = Console()

## Load dataset

In [21]:
# Load the raw benchmark, drop rows that lack text or a label, and store the
# labels as integers. The cleaning steps are chained on the freshly read frame
# instead of assigning a column onto the result of dropna(), a pattern that can
# emit SettingWithCopyWarning on pandas versions without copy-on-write.
df = (
    pd.read_csv(BENCHMARK_PATH, encoding="utf-8")
    .dropna(subset=["content", "label"])
    .astype({"label": int})
)

# Per-label row counts with explicitly named columns, so the frame has the same
# "label"/"count" layout whichever pandas version produced it.
label_counts = (
    df["label"].value_counts().rename_axis("label").reset_index(name="count")
)

# Render the counts as a rich table; the first row carries the overall total.
table = Table(title="Label Counts", show_lines=True)
table.add_column("Label", justify="center", style="cyan")
table.add_column("Count", justify="center", style="yellow")
table.add_row("Total Rows", str(len(df)))
for label, count in zip(label_counts["label"], label_counts["count"]):
    table.add_row(str(label), str(count))
console.print(table, justify="left")

# Seeded five-row preview, displayed as the cell output.
df.sample(5, random_state=42)

Unnamed: 0,msg_id,content,label
11584,anon_msg_21e0eca196fc,Up\nV2 du topic CDM 2022,0
12107,anon_msg_c9c8b8702ee3,Oui c'est vrai tu partages pas mal de pseudos ...,0
16416,anon_msg_e67bfaff2f43,impatient d'être en 2024 que le monde entier a...,0
17614,anon_msg_fba361a91f84,J'aime les traps mais personne ne verra ce mes...,0
5995,anon_msg_0580c26e680a,renseigne toi sur eux il y a même des villes o...,0


In [22]:
# Build a class-balanced subset: 694 toxic (label == 1) and 694 non-toxic
# (label == 0) rows, drawn with a fixed seed so the subset is reproducible.
N_PER_CLASS = 694  # rows kept for each label
SEED = 42          # shared by every sampling step in this cell

toxic = df[df["label"] == 1].sample(N_PER_CLASS, random_state=SEED)
non_toxic = df[df["label"] == 0].sample(N_PER_CLASS, random_state=SEED)

# Concatenate the two draws, shuffle so the classes are interleaved, and
# renumber the rows from zero.
df = (
    pd.concat([toxic, non_toxic])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Name the count columns explicitly: before pandas 2.0,
# value_counts().reset_index() yields columns "index"/"label", so looking up a
# "count" column on that frame would raise KeyError.
label_counts = (
    df["label"].value_counts().rename_axis("label").reset_index(name="count")
)

# Summarise the balanced frame; the first row carries the overall total.
table = Table(title="Balanced Dataset", show_lines=True)
table.add_column("Label", justify="center", style="cyan")
table.add_column("Count", justify="center", style="yellow")
table.add_row("Total Rows", str(len(df)))
for label, count in zip(label_counts["label"], label_counts["count"]):
    table.add_row(str(label), str(count))
console.print(table)

# Seeded five-row preview, displayed as the cell output.
df.sample(5, random_state=SEED)

Unnamed: 0,msg_id,content,label
650,anon_msg_5ff0a5549f65,"des bovins mon pote, j'ai jamais vu ça sauf en...",0
208,anon_msg_2ea521264d58,Cet accent du sud dégueulasse,1
427,anon_msg_99e41015a911,"Joffrin, je serais capable de faire des trucs ...",1
677,anon_msg_31edc839b8b8,"D'accord le julf, mais vous n'êtes pas BLANC.",1
1272,anon_msg_15c39b3b7e9a,C'est Siemens Espagne hein... Pas le PDG du gr...,0


## Save the balanced benchmark subset to a CSV file

In [25]:
# Write the current frame to the subset path without the positional index,
# then report the row count and destination on the console.
saved_message = f"Balanced dataset ({len(df)} rows) saved to {BENCHMARK_SUBSET_PATH}"
df.to_csv(
    BENCHMARK_SUBSET_PATH,
    encoding="utf-8",
    index=False,
)
console.print(saved_message)