In [15]:
# ============================================================
# CELL 2 ‚Äî ‚öôÔ∏è IMPORTS AND CONFIGURATION
# ============================================================
import os
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Colab: upload parquet files (hatexplain.parquet, silent_signals.parquet)
from google.colab import files
uploaded = files.upload()

!mkdir -p /home/ofer/projects/Agent_civility_2.0/data
!mv -f hatexplain.parquet /home/ofer/projects/Agent_civility_2.0/data/ 2>/dev/null || true
!mv -f silent_signals.parquet /home/ofer/projects/Agent_civility_2.0/data/ 2>/dev/null || true

DATA_DIR = "/home/ofer/projects/Agent_civility_2.0/data"


Saving hatexplain.parquet to hatexplain.parquet
Saving silent_signals.parquet to silent_signals.parquet


In [16]:
# ============================================================
# CELL 3 ‚Äî üß© DATA PREPARATION (Binary mapping + 50/50 slices)
# ============================================================
hatexplain_df = pd.read_parquet(os.path.join(DATA_DIR, "hatexplain.parquet"))
dogwhistle_df = pd.read_parquet(os.path.join(DATA_DIR, "silent_signals.parquet"))

# Normalize HateXplain labels to {normal, offensive, hate}
valid = {"normal", "offensive", "hate"}
hatexplain_df = hatexplain_df[hatexplain_df["label"].isin(valid)].copy()

# Map to binary:
# 0 = neutral/dogwhistle (normal + dogwhistle)
# 1 = explicit hateful/offensive (hate + offensive)
neutral_df = hatexplain_df[hatexplain_df["label"] == "normal"].copy()
harm_df    = hatexplain_df[hatexplain_df["label"].isin(["offensive", "hate"])].copy()
dog_df     = dogwhistle_df.copy()

neutral_df["class_label"] = 0
dog_df["class_label"]     = 0
harm_df["class_label"]    = 1

# Merge neutral (normal + dogwhistle) vs harm (hate/offensive)
# Ensure texts are under a unified 'text' column
for df_ in (neutral_df, harm_df, dog_df):
    if "text" not in df_.columns:
        raise ValueError("Expected a 'text' column in the dataframes.")
    df_["text"] = df_["text"].astype(str)

class0_df = pd.concat([neutral_df[["text", "class_label"]],
                       dog_df[["text", "class_label"]]], ignore_index=True)
class1_df = harm_df[["text", "class_label"]].copy()

print(f"Class 0 size (neutral+dog): {len(class0_df)}")
print(f"Class 1 size (harmful):     {len(class1_df)}")

# Build a base pool dataframe (we keep it big, we will draw balanced 20% slices from it)
base_df = pd.concat([class0_df, class1_df], ignore_index=True).sample(frac=1, random_state=SEED).reset_index(drop=True)

# Create a fixed validation set (20% of the full, stratified, balanced-ish by randomization)
train_pool, val_df = train_test_split(
    base_df, test_size=0.20, stratify=base_df["class_label"], random_state=SEED
)
val_df = val_df.reset_index(drop=True)

print("Validation distribution:\n", val_df["class_label"].value_counts())

def draw_balanced_slice(df, frac=0.20, random_state=None):
    """
    Draw a stratified ~20% slice that is 50/50 across the two classes (as much as possible).
    """
    rng = np.random.default_rng(random_state)
    c0 = df[df["class_label"] == 0]
    c1 = df[df["class_label"] == 1]
    target = int(len(df) * frac // 2)  # half from each class

    # Sample with replacement if class imbalance prevents exact size
    s0 = c0.sample(n=target, replace=(len(c0) < target), random_state=int(rng.integers(1e9)))
    s1 = c1.sample(n=target, replace=(len(c1) < target), random_state=int(rng.integers(1e9)))

    slice_df = pd.concat([s0, s1], ignore_index=True).sample(frac=1, random_state=int(rng.integers(1e9))).reset_index(drop=True)
    return slice_df

# First training slice (fixed until stage threshold met)
train_slice_df = draw_balanced_slice(train_pool, frac=0.20, random_state=SEED)

print("Initial train slice distribution:\n", train_slice_df["class_label"].value_counts())


Class 0 size (neutral+dog): 22509
Class 1 size (harmful):     9132
Validation distribution:
 class_label
0    4502
1    1827
Name: count, dtype: int64
Initial train slice distribution:
 class_label
1    2531
0    2531
Name: count, dtype: int64


In [18]:
# ============================================================
# CELL 4 ‚Äî üî§ TOKENIZER & DATASETS (Plain HateBERT)
# ============================================================
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset

MODEL_NAME = "GroNLP/hateBERT"  # base HateBERT with standard head
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def to_hf_dataset(df):
    return Dataset.from_pandas(
        df[["text", "class_label"]].rename(columns={"class_label":"labels"}),
        preserve_index=False
    )

def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_ds = to_hf_dataset(train_slice_df).map(tokenize_batch, batched=True, remove_columns=["text"])
val_ds   = to_hf_dataset(val_df).map(tokenize_batch,   batched=True, remove_columns=["text"])

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

print("‚úÖ Loaded HateBERT base model and tokenized datasets.")


Map:   0%|          | 0/5062 [00:00<?, ? examples/s]

Map:   0%|          | 0/6329 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Loaded HateBERT base model and tokenized datasets.


In [23]:
# ============================================================
# CELL 5 ‚Äî üìè METRICS + WEIGHTED TRAINER (class weights + label smoothing)
# ============================================================
import os, torch, evaluate
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support

# Silence W&B popups
os.environ["WANDB_DISABLED"] = "true"

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
    p, r, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=[1], average="binary", zero_division=0
    )
    return {"accuracy": acc, "precision_1": p, "recall_1": r, "f1_1": f1}

# --- Custom Trainer that applies class-weighted CE + label smoothing ---
class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, label_smoothing=0.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = None
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float)
        self.label_smoothing = float(label_smoothing)
        self._loss_fct = None

    # üîß NOTE: Added **kwargs to swallow new HF args like num_items_in_batch
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute weighted + smoothed loss; ignore any extra HF kwargs."""
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Lazily build loss function on the correct device
        if self._loss_fct is None:
            if self.class_weights is not None:
                self.class_weights = self.class_weights.to(logits.device)
            self._loss_fct = torch.nn.CrossEntropyLoss(
                weight=self.class_weights, label_smoothing=self.label_smoothing
            )

        loss = self._loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss


def mk_trainer(model, train_dataset, eval_dataset, outdir,
               epochs=1, lr=2e-5, batch_size=16, seed=SEED,
               class_weights=(1.0, 1.5), label_smoothing=0.1):
    args = TrainingArguments(
        output_dir=outdir,
        num_train_epochs=epochs,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_total_limit=1,
        seed=seed,
        fp16=torch.cuda.is_available(),
        logging_dir=f"{outdir}/logs",
        logging_steps=50,
        save_steps=500,
        do_train=True,
        do_eval=True,
        # No evaluation_strategy here (Colab-safe)
    )
    return WeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights,       # ‚üµ penalize class 1 FP more
        label_smoothing=label_smoothing,   # ‚üµ prevent overconfidence
    )

print("‚úÖ WeightedTrainer ready (class_weights=(1.0, 1.5), label_smoothing=0.1).")


‚úÖ WeightedTrainer ready (class_weights=(1.0, 1.5), label_smoothing=0.1).


In [20]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [24]:
# ============================================================
# CELL 6 ‚Äî üîÅ TWO-PHASE PRECISION-FOCUSED TRAINING LOOP (FINAL)
# ============================================================
from copy import deepcopy
import json, random, numpy as np

# === Phase 1 (strict precision) thresholds ===
stage1_acc_threshold   = 0.90
stage1_prec1_threshold = 0.97
final1_acc_threshold   = 0.95
final1_prec1_threshold = 0.985
max_stage1             = 15

# === Phase 2 (gentler fine-tuning) thresholds ===
stage2_acc_threshold   = 0.90
stage2_prec1_threshold = 0.95
final2_acc_threshold   = 0.94
final2_prec1_threshold = 0.97
max_stage2             = 15

epochs_per_stage = 1
learning_rate    = 2e-5
batch_size       = 16

history = []
best_ckpt = None
best_score = (-1.0, -1.0)   # (acc, prec1)

# ------------------------------------------------------------
def evaluate_model(trainer):
    metrics = trainer.evaluate()
    acc   = float(metrics.get("eval_accuracy", 0.0))
    prec1 = float(metrics.get("eval_precision_1", 0.0))
    rec1  = float(metrics.get("eval_recall_1", 0.0))
    f1_1  = float(metrics.get("eval_f1_1", 0.0))
    return acc, prec1, rec1, f1_1

def weighted_score(acc, prec1, rec1, alpha=0.7):
    """Combine accuracy & precision‚ÇÅ into one score."""
    return alpha * prec1 + (1 - alpha) * acc

# ------------------------------------------------------------
def run_phase(phase_id, max_stages, acc_thr_stage, prec_thr_stage,
              acc_thr_final, prec_thr_final, alpha_weight, continue_from_model):
    global best_ckpt, best_score, history, train_ds

    print(f"\nüö¶ Starting Phase {phase_id}: "
          f"{max_stages} stages (precision weight Œ±={alpha_weight})")

    model_to_train = continue_from_model
    for s in range(1, max_stages + 1):
        print(f"\n===== PHASE {phase_id} | STAGE {s} (epochs={epochs_per_stage}) =====")

        # Build new trainer each stage
        trainer = mk_trainer(
            model=model_to_train,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            outdir=f"./hatebert_phase{phase_id}_stage{s}",
            epochs=epochs_per_stage,
            lr=learning_rate,
            batch_size=batch_size,
            seed=SEED,
            class_weights=(1.0, 1.5 if phase_id == 1 else 1.3),
            label_smoothing=0.1 if phase_id == 1 else 0.05,
        )

        trainer.train()
        acc, prec1, rec1, f1_1 = evaluate_model(trainer)
        stage_score = weighted_score(acc, prec1, rec1, alpha_weight)

        record = {
            "phase": phase_id,
            "stage": s,
            "accuracy": acc,
            "precision_1": prec1,
            "recall_1": rec1,
            "f1_1": f1_1,
            "score": stage_score,
        }
        history.append(record)
        print(json.dumps(record, indent=2))

        # --- Save best so far ---
        prev_acc, prev_prec1 = best_score
        prev_score = weighted_score(prev_acc, prev_prec1, rec1, alpha_weight)
        if stage_score > prev_score:
            best_score = (acc, prec1)
            best_ckpt = deepcopy(model_to_train).cpu()

        # --- Early stop for this phase ---
        if acc >= acc_thr_final and prec1 >= prec_thr_final:
            print(f"üéØ Phase {phase_id}: Final thresholds met ‚Äî stopping early.")
            break

        # --- Rotate data slice if thresholds met ---
        if acc >= acc_thr_stage and prec1 >= prec_thr_stage:
            print(f"üöÄ Phase {phase_id}: Thresholds met ‚Äî rotating to new 20% slice.")
            new_slice = draw_balanced_slice(
                train_pool, frac=0.20, random_state=random.randint(0, 10**9)
            )
            train_ds = to_hf_dataset(new_slice).map(
                tokenize_batch, batched=True, remove_columns=["text"]
            )
        else:
            print(f"üîÅ Phase {phase_id}: Thresholds not met ‚Äî keep training same slice.")

    print(f"‚úÖ Phase {phase_id} complete.")
    return model_to_train

# ------------------------------------------------------------
# === Run Phase 1 (strict precision focus) ===
model = run_phase(
    phase_id=1,
    max_stages=max_stage1,
    acc_thr_stage=stage1_acc_threshold,
    prec_thr_stage=stage1_prec1_threshold,
    acc_thr_final=final1_acc_threshold,
    prec_thr_final=final1_prec1_threshold,
    alpha_weight=0.8,
    continue_from_model=model,
)

# === Run Phase 2 (gentler fine-tuning) ===
model = run_phase(
    phase_id=2,
    max_stages=max_stage2,
    acc_thr_stage=stage2_acc_threshold,
    prec_thr_stage=stage2_prec1_threshold,
    acc_thr_final=final2_acc_threshold,
    prec_thr_final=final2_prec1_threshold,
    alpha_weight=0.6,
    continue_from_model=model,
)

print("\nüèÅ Two-phase training complete.")
print("Best so far:", {"accuracy": best_score[0], "precision_1": best_score[1]})


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



üö¶ Starting Phase 1: 15 stages (precision weight Œ±=0.8)

===== PHASE 1 | STAGE 1 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.573
100,0.4248
150,0.4035
200,0.3856
250,0.3859
300,0.3736


{
  "phase": 1,
  "stage": 1,
  "accuracy": 0.8258808658555854,
  "precision_1": 0.6324442820606504,
  "recall_1": 0.9474548440065681,
  "f1_1": 0.7585451358457493,
  "score": 0.6711315988196374
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 2 (epochs=1) =====


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.3165
100,0.3058
150,0.2884
200,0.2825
250,0.3162
300,0.3525


{
  "phase": 1,
  "stage": 2,
  "accuracy": 0.8486332753989572,
  "precision_1": 0.6742880064179703,
  "recall_1": 0.9200875752599891,
  "f1_1": 0.7782407407407408,
  "score": 0.7091570602141677
}


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 3 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2295
100,0.2181
150,0.2245
200,0.2234
250,0.2408
300,0.3362


{
  "phase": 1,
  "stage": 3,
  "accuracy": 0.8557434033812609,
  "precision_1": 0.6925021061499579,
  "recall_1": 0.8998357963875205,
  "f1_1": 0.7826707926684123,
  "score": 0.7251503655962185
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 4 (epochs=1) =====


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.208
100,0.1973
150,0.2047
200,0.199
250,0.2075
300,0.2906


{
  "phase": 1,
  "stage": 4,
  "accuracy": 0.8617475114552062,
  "precision_1": 0.7044673539518901,
  "recall_1": 0.8976464148877942,
  "f1_1": 0.789410348977136,
  "score": 0.7359233854525533
}


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 5 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2047
100,0.1975
150,0.1975
200,0.1986
250,0.1965
300,0.2548


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{
  "phase": 1,
  "stage": 5,
  "accuracy": 0.8601674830146943,
  "precision_1": 0.703719723183391,
  "recall_1": 0.8905309250136836,
  "f1_1": 0.7861802367721672,
  "score": 0.7350092751496518
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 6 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2072
100,0.1982
150,0.1976
200,0.1967
250,0.1953
300,0.2371


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{
  "phase": 1,
  "stage": 6,
  "accuracy": 0.8622215199873597,
  "precision_1": 0.7147998200629779,
  "recall_1": 0.8697318007662835,
  "f1_1": 0.7846913580246914,
  "score": 0.7442841600478542
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 7 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2046
100,0.194
150,0.1971
200,0.1968
250,0.1964
300,0.2378


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{
  "phase": 1,
  "stage": 7,
  "accuracy": 0.8620635171433085,
  "precision_1": 0.7118117229129662,
  "recall_1": 0.8773946360153256,
  "f1_1": 0.7859769551360628,
  "score": 0.7418620817590347
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 8 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2007
100,0.1942
150,0.1969
200,0.1974
250,0.1972
300,0.2255


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{
  "phase": 1,
  "stage": 8,
  "accuracy": 0.8598514773265918,
  "precision_1": 0.71190261496844,
  "recall_1": 0.8642583470169677,
  "f1_1": 0.7807169344870211,
  "score": 0.7414923874400703
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 9 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2002
100,0.1956
150,0.199
200,0.1964
250,0.1951
300,0.2156


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{
  "phase": 1,
  "stage": 9,
  "accuracy": 0.8592194659503871,
  "precision_1": 0.703125,
  "recall_1": 0.8866995073891626,
  "f1_1": 0.7843137254901961,
  "score": 0.7343438931900774
}
üîÅ Phase 1: Thresholds not met ‚Äî keep training same slice.

===== PHASE 1 | STAGE 10 (epochs=1) =====


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.2038
100,0.1976
150,0.1977


KeyboardInterrupt: 

In [None]:
# ============================================================
# CELL 7 ‚Äî üíæ SAVE MODEL, TOKENIZER, AND LOGS
# ============================================================
from datetime import datetime
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = f"/home/ofer/projects/Agent_civility_2.0/models/layer1_hatebert_binary_{stamp}"

os.makedirs(out_dir, exist_ok=True)

# Save best checkpoint if we captured one, else current model
save_model = best_ckpt if best_ckpt is not None else model.cpu()
save_model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

with open(os.path.join(out_dir, "train_history.json"), "w") as f:
    json.dump(history, f, indent=2)

print(f"‚úÖ Saved model + tokenizer to: {out_dir}")
