In [15]:
# ============================================================
# CELL 2 — ⚙️ IMPORTS AND CONFIGURATION
# ============================================================
import os
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# Reproducibility
# One shared seed is applied to Python's `random`, NumPy's global RNG and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Colab: upload parquet files (hatexplain.parquet, silent_signals.parquet)
# `files.upload()` opens an interactive upload widget; its return value is kept in
# `uploaded` but is not read again in this cell.
from google.colab import files
uploaded = files.upload()

# Create the data directory and move the two uploaded files into it.
# `2>/dev/null || true` discards any error from `mv`, so a missing upload is not
# reported here; it would only surface when the parquet files are read.
!mkdir -p /home/ofer/projects/Agent_civility_2.0/data
!mv -f hatexplain.parquet /home/ofer/projects/Agent_civility_2.0/data/ 2>/dev/null || true
!mv -f silent_signals.parquet /home/ofer/projects/Agent_civility_2.0/data/ 2>/dev/null || true

# NOTE(review): absolute, user-specific path that duplicates the literal used in the
# shell lines above; the two must be kept in sync by hand.
DATA_DIR = "/home/ofer/projects/Agent_civility_2.0/data"


Saving hatexplain.parquet to hatexplain.parquet
Saving silent_signals.parquet to silent_signals.parquet


In [16]:
# ============================================================
# CELL 3 — 🧩 DATA PREPARATION (Binary mapping + 50/50 slices)
# ============================================================
# Load both corpora from DATA_DIR.
hatexplain_path = os.path.join(DATA_DIR, "hatexplain.parquet")
silent_signals_path = os.path.join(DATA_DIR, "silent_signals.parquet")
hatexplain_df = pd.read_parquet(hatexplain_path)
dogwhistle_df = pd.read_parquet(silent_signals_path)

# Keep only HateXplain rows whose label is one of the three expected values.
valid = {"normal", "offensive", "hate"}
known_label_mask = hatexplain_df["label"].isin(valid)
hatexplain_df = hatexplain_df[known_label_mask].copy()

# Binary target:
#   0 = HateXplain "normal" rows plus every row of the dogwhistle corpus
#   1 = HateXplain "offensive" / "hate" rows
is_normal = hatexplain_df["label"] == "normal"
is_harmful = hatexplain_df["label"].isin(["offensive", "hate"])
neutral_df = hatexplain_df[is_normal].assign(class_label=0)
harm_df = hatexplain_df[is_harmful].assign(class_label=1)
dog_df = dogwhistle_df.assign(class_label=0)

# Every frame must expose its content under a 'text' column, coerced to str.
for frame in (neutral_df, harm_df, dog_df):
    if "text" not in frame.columns:
        raise ValueError("Expected a 'text' column in the dataframes.")
    frame["text"] = frame["text"].astype(str)

# Only the text and the binary label are carried forward.
keep_cols = ["text", "class_label"]
class0_df = pd.concat([neutral_df[keep_cols], dog_df[keep_cols]], ignore_index=True)
class1_df = harm_df[keep_cols].copy()

print(f"Class 0 size (neutral+dog): {len(class0_df)}")
print(f"Class 1 size (harmful):     {len(class1_df)}")

# Shuffled pool of all examples; balanced training slices are drawn from its
# training part later on.
pooled_df = pd.concat([class0_df, class1_df], ignore_index=True)
base_df = pooled_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Fixed 20% validation split, stratified on the binary label. Stratification keeps
# the pool's class ratio, so the validation set is NOT 50/50.
train_pool, val_df = train_test_split(
    base_df,
    test_size=0.20,
    stratify=base_df["class_label"],
    random_state=SEED,
)
val_df = val_df.reset_index(drop=True)

print("Validation distribution:\n", val_df["class_label"].value_counts())

def draw_balanced_slice(df, frac=0.20, random_state=None):
    """
    Return a shuffled sample of `df` holding the same number of rows per class.

    The total size is about `frac * len(df)`: each of the two classes
    (`class_label` 0 and 1) contributes `int(len(df) * frac // 2)` rows.
    A class with fewer rows than that is sampled with replacement, so its rows
    can repeat in the result.

    Parameters
    ----------
    df : DataFrame with a `class_label` column containing 0 / 1.
    frac : fraction of `len(df)` the slice should contain in total.
    random_state : seed for the NumPy generator that supplies the three
        sub-seeds (class 0 sample, class 1 sample, final shuffle).

    Returns
    -------
    DataFrame with a fresh 0..n-1 index.
    """
    seed_stream = np.random.default_rng(random_state)
    per_class = int(len(df) * frac // 2)  # half of the slice for each class

    halves = []
    for label in (0, 1):
        members = df[df["class_label"] == label]
        # Fall back to sampling with replacement when the class is too small.
        needs_replacement = len(members) < per_class
        halves.append(
            members.sample(
                n=per_class,
                replace=needs_replacement,
                random_state=int(seed_stream.integers(1e9)),
            )
        )

    combined = pd.concat(halves, ignore_index=True)
    shuffle_seed = int(seed_stream.integers(1e9))
    return combined.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# Initial 50/50 training slice, seeded with SEED. The training loop keeps using
# it until a stage meets its thresholds and a new slice is drawn.
train_slice_df = draw_balanced_slice(df=train_pool, frac=0.20, random_state=SEED)

initial_slice_counts = train_slice_df["class_label"].value_counts()
print("Initial train slice distribution:\n", initial_slice_counts)


Class 0 size (neutral+dog): 22509
Class 1 size (harmful):     9132
Validation distribution:
 class_label
0    4502
1    1827
Name: count, dtype: int64
Initial train slice distribution:
 class_label
1    2531
0    2531
Name: count, dtype: int64


In [17]:
# ============================================================
# CELL 4 — 🔤 TOKENIZER & DATASETS (HateBERT)
# ============================================================
# Checkpoint identifier shared by the tokenizer here and the classification model
# loaded further down in this cell.
MODEL_NAME = "GroNLP/hateBERT"  # base HateBERT (we add a fresh 2-class head)
# Module-level tokenizer: used by `tokenize_batch`, the data collator, the Trainer
# builder and the final save step.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def to_hf_dataset(df):
    """
    Convert a dataframe into a Hugging Face `Dataset` with `text` and `labels`.

    Only the `text` and `class_label` columns of `df` are used; `class_label`
    is renamed to `labels`. The pandas index is not carried over.
    """
    selected = df[["text", "class_label"]]
    renamed = selected.rename(columns={"class_label": "labels"})
    return Dataset.from_pandas(renamed, preserve_index=False)

def tokenize_batch(batch):
    """
    Tokenize the `text` field of `batch` with the module-level `tokenizer`.

    Sequences are truncated to at most 256 tokens; padding is not requested here.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=256)

# Collator built from the same tokenizer; passed to every Trainer created later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits the same way and drop the raw "text" column afterwards.
tokenize_map_kwargs = {"batched": True, "remove_columns": ["text"]}
train_ds = to_hf_dataset(train_slice_df).map(tokenize_batch, **tokenize_map_kwargs)
val_ds = to_hf_dataset(val_df).map(tokenize_batch, **tokenize_map_kwargs)

# Two output classes (class_label 0 and 1).
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)


Map:   0%|          | 0/5062 [00:00<?, ? examples/s]

Map:   0%|          | 0/6329 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OSError: WeightedHateBERT is designed to be instantiated using the `WeightedHateBERT.from_pretrained(pretrained_model_name_or_path)` or `WeightedHateBERT.from_config(config)` methods.

In [None]:
# ============================================================
# CELL 5 — 📏 METRICS + TRAINER BUILDER (Colab-Safe)
# ============================================================
import evaluate
from transformers import TrainingArguments, Trainer

# --- Metrics ---
# Loaded once at module level and reused by `compute_metrics` on every evaluation.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Turn a `(logits, labels)` pair into the metrics reported during evaluation.

    Predictions are the argmax over the last axis of `logits`. Accuracy comes
    from the module-level `accuracy_metric`; precision, recall and F1 are
    computed for the positive class (label 1) only, with undefined ratios
    reported as 0.

    Returns a dict with keys `accuracy`, `precision_1`, `recall_1`, `f1_1`.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    accuracy = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]

    # With average="binary" the reported class is selected by `pos_label`.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels,
        preds,
        pos_label=1,
        average="binary",
        zero_division=0,
    )
    return {
        "accuracy": accuracy,
        "precision_1": precision,
        "recall_1": recall,
        "f1_1": f1,
    }


def mk_trainer(model, train_dataset, eval_dataset, outdir,
               epochs=1, lr=2e-5, batch_size=16, seed=SEED):
    """
    Build a Hugging Face `Trainer` for one training stage.

    Parameters
    ----------
    model : model to train (trained in place by the returned Trainer).
    train_dataset, eval_dataset : tokenized datasets for training / evaluation.
    outdir : output directory; logs go to `{outdir}/logs`.
    epochs, lr, batch_size : training hyper-parameters; `batch_size` is used for
        both the training and the evaluation per-device batch size.
    seed : seed handed to `TrainingArguments`.

    Returns
    -------
    Trainer wired to the module-level `tokenizer`, `data_collator` and
    `compute_metrics`.

    Notes
    -----
    Works across transformers versions: releases that renamed the Trainer's
    `tokenizer` argument to `processing_class` receive the new keyword, older
    releases keep receiving `tokenizer`.
    """
    import inspect  # local import: only needed for the signature check below

    args = TrainingArguments(
        output_dir=outdir,
        num_train_epochs=epochs,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_total_limit=1,
        seed=seed,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        logging_dir=f"{outdir}/logs",
        logging_steps=50,
        save_steps=500,
        do_train=True,
        do_eval=True,
    )

    trainer_kwargs = {
        "model": model,
        "args": args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
        "compute_metrics": compute_metrics,
    }

    # Pick whichever keyword the installed Trainer accepts for the tokenizer, so
    # the call neither triggers the deprecation path nor fails on releases that
    # no longer accept `tokenizer=`.
    accepted_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kw = "processing_class" if "processing_class" in accepted_params else "tokenizer"
    trainer_kwargs[tokenizer_kw] = tokenizer

    return Trainer(**trainer_kwargs)


In [None]:
import os
# Sets WANDB_DISABLED for this process; presumably meant to stop the Trainer from
# reporting to Weights & Biases, so it must run before the training cell.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# ============================================================
# CELL 6 — 🔁 TWO-PHASE PRECISION-FOCUSED TRAINING LOOP (FIXED)
# ============================================================
from copy import deepcopy
import json, random, numpy as np

# === Phase 1 (strict precision) thresholds ===
# stage*  : accuracy / class-1 precision a stage must reach before the training
#           slice is rotated to a newly drawn one.
# final*  : accuracy / class-1 precision that end the phase early.
# max_*   : upper bound on the number of stages in the phase.
stage1_acc_threshold   = 0.90
stage1_prec1_threshold = 0.97
final1_acc_threshold   = 0.95
final1_prec1_threshold = 0.985
max_stage1             = 15

# === Phase 2 (gentler fine-tuning) thresholds ===
# Same meaning as the phase 1 values above.
stage2_acc_threshold   = 0.90
stage2_prec1_threshold = 0.95
final2_acc_threshold   = 0.94
final2_prec1_threshold = 0.97
max_stage2             = 15

# Hyper-parameters shared by every stage of both phases.
epochs_per_stage = 1
learning_rate    = 2e-5
batch_size       = 16

# Bookkeeping mutated by `run_phase`:
# history    - one metrics dict per completed stage (written to JSON when saving)
# best_ckpt  - CPU copy of the best-scoring model so far, or None
# best_score - metrics of that model; starts below any reachable value
history = []
best_ckpt = None
best_score = (-1.0, -1.0)   # (acc, prec1)

# ------------------------------------------------------------
def evaluate_model(trainer):
    """
    Evaluate with `trainer` and return `(accuracy, precision_1, recall_1, f1_1)`.

    Values are read from the `eval_*` keys of `trainer.evaluate()` and converted
    to float; a key that is absent yields 0.0.
    """
    results = trainer.evaluate()
    metric_keys = ("eval_accuracy", "eval_precision_1", "eval_recall_1", "eval_f1_1")
    acc, prec1, rec1, f1_1 = (float(results.get(key, 0.0)) for key in metric_keys)
    return acc, prec1, rec1, f1_1

# Custom weighted scoring — alpha controls precision emphasis
def weighted_score(acc, prec1, rec1, alpha=0.7):
    """
    Blend class-1 precision and accuracy into a single score.

    The result is `alpha * prec1 + (1 - alpha) * acc`. `rec1` is accepted for
    call-site symmetry but does not influence the score.
    """
    precision_term = alpha * prec1
    accuracy_term = (1 - alpha) * acc
    return precision_term + accuracy_term

# ------------------------------------------------------------
def run_phase(phase_id, max_stages, acc_thr_stage, prec_thr_stage,
              acc_thr_final, prec_thr_final, alpha_weight, continue_from_model):
    """
    Train `continue_from_model` for up to `max_stages` stages and return it.

    Each stage builds a fresh Trainer on the current global `train_ds`, trains
    for `epochs_per_stage` epochs, evaluates on `val_ds` and appends a metrics
    record to the global `history`.

    After every stage:
      * if the stage's weighted score beats the best one so far (re-scored with
        this phase's `alpha_weight`), `best_score` is updated and a CPU deep copy
        of the model is stored in `best_ckpt`;
      * if accuracy and class-1 precision reach the *final* thresholds, the
        phase stops;
      * otherwise, if they reach the *stage* thresholds, the global `train_ds`
        is replaced by a newly drawn balanced 20% slice of `train_pool`;
      * otherwise the same slice is used again.

    Parameters
    ----------
    phase_id : label used in log messages, output directory names and to derive
        the phase's slice-rotation seed.
    max_stages : maximum number of stages to run.
    acc_thr_stage, prec_thr_stage : thresholds that trigger a slice rotation.
    acc_thr_final, prec_thr_final : thresholds that end the phase early.
    alpha_weight : precision weight passed to `weighted_score`.
    continue_from_model : model to keep training (modified in place).

    Returns
    -------
    The trained model (same object as `continue_from_model`).
    """
    import zlib  # local import: stable hash for the per-phase rotation seed

    global best_ckpt, best_score, history, train_ds

    print(f"\nüö¶ Starting Phase {phase_id}: "
          f"{max_stages} stages (precision weight Œ±={alpha_weight})")

    # Dedicated generator for slice rotation. The global `random` module must not
    # be used for this: building a Trainer re-seeds the global RNGs with SEED, so
    # a seed drawn from `random` after each stage tends to repeat and every
    # "new" slice would be the same one. Seeding from (SEED, phase_id) keeps runs
    # reproducible while giving each rotation a different slice.
    phase_key = zlib.crc32(str(phase_id).encode("utf-8"))
    slice_rng = np.random.default_rng([SEED, phase_key])

    model_to_train = continue_from_model
    for s in range(1, max_stages + 1):
        print(f"\n===== PHASE {phase_id} | STAGE {s} (epochs={epochs_per_stage}) =====")

        trainer = mk_trainer(
            model=model_to_train,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            outdir=f"./hatebert_phase{phase_id}_stage{s}",
            epochs=epochs_per_stage,
            lr=learning_rate,
            batch_size=batch_size,
            seed=SEED,
        )

        trainer.train()
        acc, prec1, rec1, f1_1 = evaluate_model(trainer)
        stage_score = weighted_score(acc, prec1, rec1, alpha_weight)

        record = {
            "phase": phase_id,
            "stage": s,
            "accuracy": acc,
            "precision_1": prec1,
            "recall_1": rec1,
            "f1_1": f1_1,
            "score": stage_score,
        }
        history.append(record)
        print(json.dumps(record, indent=2))

        # --- Save best so far ---
        # The previous best is re-scored with the current alpha so that stages of
        # different phases are compared on the same scale.
        prev_acc, prev_prec1 = best_score
        prev_score = weighted_score(prev_acc, prev_prec1, rec1, alpha_weight)
        if stage_score > prev_score:
            best_score = (acc, prec1)
            best_ckpt = deepcopy(model_to_train).cpu()

        # --- Early stop for this phase ---
        if acc >= acc_thr_final and prec1 >= prec_thr_final:
            print(f"üéØ Phase {phase_id}: Final thresholds met ‚Äî stopping early.")
            break

        # --- Rotate data slice if thresholds met ---
        if acc >= acc_thr_stage and prec1 >= prec_thr_stage:
            print(f"üöÄ Phase {phase_id}: Thresholds met ‚Äî rotating to new 20% slice.")
            new_slice = draw_balanced_slice(
                train_pool, frac=0.20, random_state=int(slice_rng.integers(10**9))
            )
            train_ds = to_hf_dataset(new_slice).map(
                tokenize_batch, batched=True, remove_columns=["text"]
            )
        else:
            print(f"üîÅ Phase {phase_id}: Thresholds not met ‚Äî keep training same slice.")

    print(f"‚úÖ Phase {phase_id} complete.")
    return model_to_train

# ------------------------------------------------------------
# === Run both phases in order; each continues from the model the previous one returned ===
phase_plan = [
    # Phase 1: strict precision focus (heavy precision bias in the score).
    dict(
        phase_id=1,
        max_stages=max_stage1,
        acc_thr_stage=stage1_acc_threshold,
        prec_thr_stage=stage1_prec1_threshold,
        acc_thr_final=final1_acc_threshold,
        prec_thr_final=final1_prec1_threshold,
        alpha_weight=0.8,
    ),
    # Phase 2: gentler fine-tuning (lighter precision bias in the score).
    dict(
        phase_id=2,
        max_stages=max_stage2,
        acc_thr_stage=stage2_acc_threshold,
        prec_thr_stage=stage2_prec1_threshold,
        acc_thr_final=final2_acc_threshold,
        prec_thr_final=final2_prec1_threshold,
        alpha_weight=0.6,
    ),
]

for phase_settings in phase_plan:
    model = run_phase(continue_from_model=model, **phase_settings)

print("\nüèÅ Two-phase training complete.")
best_acc, best_prec1 = best_score
print("Best so far:", {"accuracy": best_acc, "precision_1": best_prec1})


In [None]:
# ============================================================
# CELL 7 — 💾 SAVE MODEL, TOKENIZER, AND LOGS
# ============================================================
from datetime import datetime
# Timestamped output directory so repeated runs never overwrite each other.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = f"/home/ofer/projects/Agent_civility_2.0/models/layer1_hatebert_binary_{stamp}"
os.makedirs(out_dir, exist_ok=True)

# Prefer the best checkpoint captured during training; otherwise fall back to
# the current model, moved to CPU first.
if best_ckpt is not None:
    save_model = best_ckpt
else:
    save_model = model.cpu()

save_model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

# Per-stage metrics collected by the training loop.
history_path = os.path.join(out_dir, "train_history.json")
with open(history_path, "w") as history_file:
    json.dump(history, history_file, indent=2)

print(f"‚úÖ Saved model + tokenizer to: {out_dir}")
