In [1]:
# Disable Weights & Biases (W&B) logging to avoid external tracking prompts and to keep runs fully local.

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"


In [2]:
# Install HuggingFace's `evaluate` library (used later for computing metrics in a standardized way).

!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


# Libraries Import and Data Preprocessing

In [3]:
# ============================================
# Roman-Urdu Setup (LOCAL /content)
#  - Saves everything under /content/Thesis_RomanUrdu_SA
#  - Pre-clean (drop nulls, dedupe by text, normalize columns+labels)
#  - Exports: cleaned CSVs + label distribution stats
#
# Purpose of this cell:
#   1) Define a consistent local folder structure.
#   2) Copy raw dataset files into the project directory for reproducibility.
#   3) Standardize dataset column names and sentiment labels across splits.
#   4) Persist cleaned splits + few-shot pack for later phases (Phase 2.5 / Phase 3).
# ============================================

import os, json, math, random, shutil, gc, time
import numpy as np, pandas as pd, torch, torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback,
                          DataCollatorWithPadding, set_seed, pipeline)
from datasets import Dataset

# ---------------- Base paths ----------------
# All artifacts (datasets, model checkpoints, metrics, predictions) are stored under BASE_DIR
# so the entire project can be zipped/downloaded from /content easily.
BASE_DIR       = "/content/Thesis_RomanUrdu_SA"       # <— root for this project
DATA_DIR_RAW   = f"{BASE_DIR}/datasets/rusa19_raw"
DATA_DIR_CLEAN = f"{BASE_DIR}/datasets/rusa19_clean"
OUT_DIR        = f"{BASE_DIR}/outputs/phase1"
CKPT_DIR       = f"{OUT_DIR}/supervised_xlmr_base"

# Create folders if they don't exist (safe to re-run).
for d in [BASE_DIR, DATA_DIR_RAW, DATA_DIR_CLEAN, OUT_DIR, CKPT_DIR]:
    os.makedirs(d, exist_ok=True)

print(f"Saving all artifacts under: {BASE_DIR}")

# ---------------- User paths (expected files already in /content) ----------------
# These are the "incoming" raw files (e.g., uploaded to Colab). We copy them into our project folder
# so later phases can rely on fixed paths under BASE_DIR.
# If your raw files live elsewhere, update these four paths:
SRC_TRAIN = "/content/rusa19_train.csv"
SRC_DEV   = "/content/rusa19_dev.csv"
SRC_TEST  = "/content/rusa19_test.csv"
SRC_FEW   = "/content/rusa19_fewshot64.csv"
SRC_LEX   = "/content/roman_urdu_lexicon.csv"   # not used in Phase 1; for later phases

def _safe_copy(src, dst_dir, fallback_name):
    """
    Copy a file into dst_dir if it exists. If the source doesn't exist, return a fallback path.
    This prevents crashes if a file is missing while still keeping consistent variable names.
    """
    dst = os.path.join(dst_dir, os.path.basename(src)) if os.path.exists(src) else os.path.join(dst_dir, fallback_name)
    if os.path.exists(src):
        shutil.copy(src, dst)
    return dst

# Copy raw splits and lexicon into the project directory for traceability and future phases.
RAW_TRAIN = _safe_copy(SRC_TRAIN, DATA_DIR_RAW, "rusa19_train.csv")
RAW_DEV   = _safe_copy(SRC_DEV,   DATA_DIR_RAW, "rusa19_dev.csv")
RAW_TEST  = _safe_copy(SRC_TEST,  DATA_DIR_RAW, "rusa19_test.csv")
RAW_FEW   = _safe_copy(SRC_FEW,   DATA_DIR_RAW, "rusa19_fewshot64.csv")
RAW_LEX   = _safe_copy(SRC_LEX,   DATA_DIR_RAW, "roman_urdu_lexicon.csv")

# ---------------- Reproducibility ----------------
# Seed all major RNG sources to reduce run-to-run variation.
# NOTE: Full determinism is not always guaranteed on GPU, but this improves stability.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

# Enforce deterministic CUDA kernels where possible (may reduce speed).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

# ---------------- Labels ----------------
# Our unified 3-class sentiment space used throughout the framework (Phase 1–3).
LABEL_LIST = ["negative", "neutral", "positive"]
LABEL2ID = {l: i for i, l in enumerate(LABEL_LIST)}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

def normalize_label(x):
    """
    Normalize label variants into the canonical label set.
    Supports common encodings from different datasets (e.g., -1/0/1 or string labels).
    Any unknown/invalid label becomes NaN and will be dropped during cleaning.
    """
    if pd.isna(x): return np.nan
    s = str(x).strip().lower()
    if s in {"-1","neg","n","negative","2"}: return "negative"
    if s in {"0","neu","neutral"}:           return "neutral"
    if s in {"1","pos","p","positive"}:      return "positive"
    if s in LABEL_LIST:                      return s
    return np.nan

def standardize_columns(df):
    """
    Standardize dataset schema:
      - map common column names -> {"text", "sentiment"}
      - strip/normalize text
      - normalize sentiment labels when available
    This allows later phases to assume a consistent format across all splits.
    """
    colmap = {}
    for c in df.columns:
        cl = c.strip().lower()
        if cl in {"text","review","utterance","cleantext","cleantweet"}:
            colmap[c] = "text"
        elif cl in {"sentiment","label","polarity"}:
            colmap[c] = "sentiment"
    df = df.rename(columns=colmap)

    # We require a text column; sentiment may be absent in unlabeled pools.
    assert "text" in df.columns, f"Missing text column. Found: {df.columns.tolist()}"

    # Normalize sentiment labels to our canonical space when present.
    if "sentiment" in df.columns:
        df["sentiment"] = df["sentiment"].map(normalize_label)

    # Clean up text values: trim whitespace and drop empty strings.
    df["text"] = df["text"].astype(str).str.strip()
    df["text"] = df["text"].replace({"": np.nan})
    return df

def pre_clean_split(path_csv, split_name):
    """
    Pre-clean a labeled split:
      1) load CSV
      2) standardize schema + normalize labels
      3) drop null/empty text
      4) deduplicate by text (within split)
      5) drop invalid labels (if labels exist)
      6) save cleaned CSV to DATA_DIR_CLEAN for reuse in later phases
    """
    df = pd.read_csv(path_csv, encoding="utf-8-sig")
    df = standardize_columns(df)
    before = len(df)

    # Text cleaning (always required).
    df = df.dropna(subset=["text"])                # drop null/empty text
    df = df.drop_duplicates(subset=["text"])       # dedupe by text within split

    # Label cleaning (only for labeled splits).
    if "sentiment" in df.columns:
        df = df.dropna(subset=["sentiment"])
        df = df[df["sentiment"].isin(LABEL_LIST)]

    out_path = os.path.join(DATA_DIR_CLEAN, f"rusa19_{split_name}_clean.csv")
    df.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"[{split_name}] {before} -> {len(df)} after clean. Saved: {out_path}")
    return df

print("=== Pre-clean & save to /content ===")
df_train = pre_clean_split(RAW_TRAIN, "train")
df_dev   = pre_clean_split(RAW_DEV,   "dev")
df_test  = pre_clean_split(RAW_TEST,  "test")

# Few-shot pack (store for Phase-2.5 combiner training and Phase-3 Stage-B refinement).
# This file is the ONLY small gold supervision used by the core framework beyond the held-out TEST set.
if os.path.exists(RAW_FEW):
    df_few = pd.read_csv(RAW_FEW, encoding="utf-8-sig")
    df_few = standardize_columns(df_few).dropna(subset=["text","sentiment"])
    few_out = os.path.join(DATA_DIR_CLEAN, "rusa19_fewshot64_clean.csv")
    df_few.to_csv(few_out, index=False, encoding="utf-8-sig")
    print(f"[fewshot] Saved: {few_out}")
else:
    # If missing, later phases that depend on few-shot will fail fast (by design).
    df_few = pd.DataFrame(columns=["text","sentiment"])

# Label stats (persist)
# We store label distributions to (a) document imbalance and (b) verify cleaning didn't break class balance.
def label_counts(df):
    if "sentiment" not in df.columns: return {"has_labels": False}
    return {"has_labels": True, "n": len(df), **df["sentiment"].value_counts().reindex(LABEL_LIST, fill_value=0).to_dict()}

stats = {"train": label_counts(df_train), "dev": label_counts(df_dev), "test": label_counts(df_test), "few": label_counts(df_few)}
with open(os.path.join(DATA_DIR_CLEAN, "label_stats.json"), "w", encoding="utf-8") as f:
    json.dump(stats, ensure_ascii=False, indent=2, fp=f)
print(json.dumps(stats, indent=2, ensure_ascii=False))


Saving all artifacts under: /content/Thesis_RomanUrdu_SA
Device: cuda
=== Pre-clean & save to /content ===
[train] 6806 -> 6806 after clean. Saved: /content/Thesis_RomanUrdu_SA/datasets/rusa19_clean/rusa19_train_clean.csv
[dev] 971 -> 971 after clean. Saved: /content/Thesis_RomanUrdu_SA/datasets/rusa19_clean/rusa19_dev_clean.csv
[test] 1943 -> 1943 after clean. Saved: /content/Thesis_RomanUrdu_SA/datasets/rusa19_clean/rusa19_test_clean.csv
[fewshot] Saved: /content/Thesis_RomanUrdu_SA/datasets/rusa19_clean/rusa19_fewshot64_clean.csv
{
  "train": {
    "has_labels": true,
    "n": 6806,
    "negative": 2022,
    "neutral": 2202,
    "positive": 2582
  },
  "dev": {
    "has_labels": true,
    "n": 971,
    "negative": 288,
    "neutral": 314,
    "positive": 369
  },
  "test": {
    "has_labels": true,
    "n": 1943,
    "negative": 577,
    "neutral": 628,
    "positive": 738
  },
  "few": {
    "has_labels": true,
    "n": 192,
    "negative": 64,
    "neutral": 64,
    "positive": 64

# Fully Supervised Baseline

In [4]:
# ------------------------------------------------------------------
# This cell establishes two baselines to contextualize the proposed
# weak-supervision + self-training framework:
#
#   (1) Fully supervised fine-tuning of XLM-R (xlm-roberta-base)
#       - Trained on the full labeled TRAIN split
#       - Monitored on DEV split for early stopping / best checkpoint selection
#       - Evaluated on both DEV and TEST
#
#   (2) Zero-shot baseline using XNLI (joeddav/xlm-roberta-large-xnli)
#       - No Roman-Urdu task fine-tuning
#       - Provides a reference point for "out-of-the-box" performance
#
# Outputs (saved under OUT_DIR):
#   - checkpoints/ (CKPT_DIR)
#   - train_args.json
#   - supervised_{dev,test}_preds.csv + report json
#   - zeroshot_{dev,test}_preds.csv + metrics json
#   - phase1_summary.json
# ------------------------------------------------------------------


# ============================================
# Phase 1 — Roman-Urdu Baselines & Setup (LOCAL /content)
#  - Supervised baseline: xlm-roberta-base
#  - Zero-shot baseline: joeddav/xlm-roberta-large-xnli
#  - Exports: checkpoints, metrics, predictions, summary
# ============================================




# ---------------- HF datasets ----------------
# Convert pandas splits into HuggingFace Datasets with integer labels.
# This format is required by HuggingFace Trainer for training/evaluation.
def to_hf_dataset(df):
    d = df[["text","sentiment"]].copy()
    d["label"] = d["sentiment"].map(LABEL2ID)
    return Dataset.from_pandas(d[["text","label"]], preserve_index=False)

ds_train = to_hf_dataset(df_train)
ds_dev   = to_hf_dataset(df_dev)
ds_test  = to_hf_dataset(df_test)

# Supervised baseline model: multilingual encoder with strong cross-lingual transfer.
MODEL_SUP = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_SUP, use_fast=True)

# Tokenization is performed once up-front for efficiency.
# Padding is handled later by DataCollatorWithPadding (dynamic padding per batch).
def tok_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding=False, max_length=256)

ds_train_tok = ds_train.map(tok_fn, batched=True)
ds_dev_tok   = ds_dev.map(tok_fn, batched=True)
ds_test_tok  = ds_test.map(tok_fn, batched=True)

# Dynamic padding makes training faster than padding all samples to max length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---------------- Metrics ----------------
# Compute accuracy + macro-F1 + per-class F1 (NEG/NEU/POS).
# Macro-F1 is the key metric due to class imbalance and to treat classes equally.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.tensor(logits).softmax(-1).numpy()
    preds = probs.argmax(-1)
    acc   = accuracy_score(labels, preds)
    macro = f1_score(labels, preds, average="macro")
    f1s   = f1_score(labels, preds, average=None, labels=[0,1,2])
    return {"accuracy": acc, "macro_f1": macro,
            "f1_neg": float(f1s[0]), "f1_neu": float(f1s[1]), "f1_pos": float(f1s[2])}

# ---------------- Class weights (mild imbalance) ----------------
# Use inverse-frequency weights to reduce bias toward majority classes.
# This is only for the fully-supervised baseline (not used in the weak-supervision pipeline).
y = np.array(ds_train["label"])
freq = np.bincount(y, minlength=len(LABEL_LIST)).astype(np.float32)
inv = 1.0 / np.clip(freq, 1.0, None)
class_weights = torch.tensor(inv / inv.sum() * len(LABEL_LIST), dtype=torch.float32)

# ---------------- Trainer with weighted CE ----------------
# Custom Trainer subclass to inject weighted cross-entropy during training.
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        # Do NOT mutate inputs; drop labels when passing to the model
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss = F.cross_entropy(logits, labels, weight=class_weights.to(logits.device))
        return (loss, outputs) if return_outputs else loss

# Initialize the supervised classifier head for 3-class sentiment.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_SUP, num_labels=3, id2label=ID2LABEL, label2id=LABEL2ID
)

bsz = 16

# TrainingArguments define training schedule, evaluation cadence, checkpointing, and reproducibility.
# Here evaluation/saving/logging are done per epoch to keep reporting interpretable (epoch-wise curves).
args = TrainingArguments(
    output_dir=CKPT_DIR,
    per_device_train_batch_size=bsz,
    per_device_eval_batch_size=bsz,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,          # keep the best checkpoint according to metric_for_best_model
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    gradient_accumulation_steps=1,
    fp16=torch.cuda.is_available(),
    report_to=[],                          # disable external loggers (W&B etc.)
    seed=SEED
)

# Trainer wires together model + datasets + tokenizer/collator + metrics + early stopping.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train_tok,
    eval_dataset=ds_dev_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stop if no dev improvement for 3 evals
)

print("=== Training supervised baseline (xlm-roberta-base) ===")
train_out = trainer.train()
print(train_out)

# Save tokenizer (model already saved by Trainer)
tokenizer.save_pretrained(CKPT_DIR)
with open(os.path.join(OUT_DIR, "train_args.json"), "w") as f:
    json.dump(args.to_dict(), f, indent=2)

# ---------------- Evaluate on DEV & TEST (save CSVs/JSONs) ----------------
# Persist predictions and detailed per-class classification reports for later thesis tables/plots.
def eval_and_dump(ds_tok, df_ref, split_name):
    metrics = trainer.evaluate(ds_tok)
    print(f"\n[{split_name.upper()}] Supervised metrics:\n", json.dumps(metrics, indent=2))
    preds  = trainer.predict(ds_tok)
    logits = preds.predictions
    probs  = torch.tensor(logits).softmax(-1).numpy()
    y_true = preds.label_ids
    y_pred = probs.argmax(-1)
    out_df = pd.DataFrame({
        "text": df_ref["text"].tolist(),
        "gold": [ID2LABEL[int(i)] for i in y_true],
        "pred": [ID2LABEL[int(i)] for i in y_pred],
        "p_negative": probs[:, 0],
        "p_neutral":  probs[:, 1],
        "p_positive": probs[:, 2],
    })
    out_df.to_csv(os.path.join(OUT_DIR, f"supervised_{split_name}_preds.csv"),
                  index=False, encoding="utf-8-sig")
    rep = classification_report(y_true, y_pred, target_names=LABEL_LIST, output_dict=True)
    with open(os.path.join(OUT_DIR, f"supervised_{split_name}_report.json"), "w", encoding="utf-8") as f:
        json.dump({"metrics": metrics, "report": rep}, f, indent=2, ensure_ascii=False)
    return metrics

m_dev_sup  = eval_and_dump(ds_dev_tok,  df_dev,  "dev")
m_test_sup = eval_and_dump(ds_test_tok, df_test, "test")

# ---------------- Zero-shot baseline (XNLI) ----------------
# This baseline uses NLI-based zero-shot classification with the same 3 class names as candidate labels.
# It provides an "off-the-shelf" reference without any training on Roman-Urdu sentiment data.
print("\n=== Zero-shot baseline (joeddav/xlm-roberta-large-xnli) ===")
zsc = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=0 if torch.cuda.is_available() else -1
)
CANDS = LABEL_LIST

def zs_predict(texts, batch_size=16):
    # Returns a 3-dim probability distribution per text (normalized).
    probs_list = []
    for i in range(0, len(texts), batch_size):
        res = zsc(texts[i:i+batch_size], candidate_labels=CANDS, multi_label=False)
        if isinstance(res, dict): res = [res]
        for r in res:
            p = np.zeros(3, dtype=np.float32)
            for lbl, sc in zip(r["labels"], r["scores"]):
                ln = lbl.strip().lower()
                if ln in LABEL2ID: p[LABEL2ID[ln]] = float(sc)
            s = p.sum()
            probs_list.append(p / s if s > 0 else np.ones(3)/3)
    return np.vstack(probs_list)

def zs_eval(df, split_name):
    # Evaluate zero-shot predictions using the same metrics as the supervised baseline.
    texts = df["text"].tolist()
    gold  = df["sentiment"].map(LABEL2ID).to_numpy()
    probs = zs_predict(texts, batch_size=16)
    pred  = probs.argmax(-1)
    acc   = accuracy_score(gold, pred)
    macro = f1_score(gold, pred, average="macro")
    f1s   = f1_score(gold, pred, average=None, labels=[0,1,2])
    metrics = {"accuracy": acc, "macro_f1": macro,
               "f1_neg": float(f1s[0]), "f1_neu": float(f1s[1]), "f1_pos": float(f1s[2])}
    print(f"[{split_name.upper()}] Zero-shot metrics:\n", json.dumps(metrics, indent=2))
    out_df = pd.DataFrame({
        "text": df["text"],
        "gold": df["sentiment"],
        # NOTE: the conditional branch is disabled (if False), so we always use `pred` below.
        "pred": [ID2LABEL[int(i)] for i in prob.argmax(-1)] if False else [ID2LABEL[int(i)] for i in pred],
        "p_negative": probs[:,0], "p_neutral": probs[:,1], "p_positive": probs[:,2],
    })
    out_df.to_csv(os.path.join(OUT_DIR, f"zeroshot_{split_name}_preds.csv"),
                  index=False, encoding="utf-8-sig")
    with open(os.path.join(OUT_DIR, f"zeroshot_{split_name}_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)
    return metrics

m_dev_zs  = zs_eval(df_dev,  "dev")
m_test_zs = zs_eval(df_test, "test")

# ---------------- Summary ----------------
# Save a compact summary of key outputs for quick reporting and reproducibility.
summary = {
    "paths": {
        "base_dir": BASE_DIR,
        "data_clean": DATA_DIR_CLEAN,
        "phase1_out": OUT_DIR,
        "ckpt_dir": CKPT_DIR
    },
    "supervised": {"dev": m_dev_sup, "test": m_test_sup},
    "zeroshot":   {"dev": m_dev_zs,  "test": m_test_zs},
    "notes": [
        "All artifacts saved under /content/Thesis_RomanUrdu_SA (session-local).",
        "Download via Colab Files pane or zip using the helper snippet below."
    ]
}
with open(os.path.join(OUT_DIR, "phase1_summary.json"), "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print("\n=== Phase 1 complete ===")
print(json.dumps(summary, indent=2, ensure_ascii=False))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/6806 [00:00<?, ? examples/s]

Map:   0%|          | 0/971 [00:00<?, ? examples/s]

Map:   0%|          | 0/1943 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


=== Training supervised baseline (xlm-roberta-base) ===


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,1.0698,0.886879,0.624099,0.620089,0.614476,0.518519,0.727273
2,0.8932,0.77104,0.698249,0.690554,0.650794,0.643815,0.777053
3,0.7816,0.65115,0.738414,0.737478,0.721368,0.683432,0.807636
4,0.6964,0.628996,0.747683,0.746541,0.722022,0.691323,0.826277
5,0.6368,0.649445,0.744593,0.740333,0.716469,0.669797,0.834734


TrainOutput(global_step=2130, training_loss=0.8155854847509536, metrics={'train_runtime': 821.1449, 'train_samples_per_second': 41.442, 'train_steps_per_second': 2.594, 'total_flos': 1316893961383308.0, 'train_loss': 0.8155854847509536, 'epoch': 5.0})



[DEV] Supervised metrics:
 {
  "eval_loss": 0.6289963722229004,
  "eval_accuracy": 0.7476828012358393,
  "eval_macro_f1": 0.7465406449206036,
  "eval_f1_neg": 0.7220216606498195,
  "eval_f1_neu": 0.6913229018492176,
  "eval_f1_pos": 0.8262773722627738,
  "eval_runtime": 1.2091,
  "eval_samples_per_second": 803.106,
  "eval_steps_per_second": 50.453,
  "epoch": 5.0
}



[TEST] Supervised metrics:
 {
  "eval_loss": 0.624123215675354,
  "eval_accuracy": 0.7472979927946475,
  "eval_macro_f1": 0.7459384117300433,
  "eval_f1_neg": 0.7325475974614687,
  "eval_f1_neu": 0.6850108616944244,
  "eval_f1_pos": 0.8202567760342369,
  "eval_runtime": 2.4135,
  "eval_samples_per_second": 805.041,
  "eval_steps_per_second": 50.548,
  "epoch": 5.0
}

=== Zero-shot baseline (joeddav/xlm-roberta-large-xnli) ===


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[DEV] Zero-shot metrics:
 {
  "accuracy": 0.5612770339855818,
  "macro_f1": 0.5041316855450778,
  "f1_neg": 0.6137931034482759,
  "f1_neu": 0.2127659574468085,
  "f1_pos": 0.6858359957401491
}
[TEST] Zero-shot metrics:
 {
  "accuracy": 0.56201749871333,
  "macro_f1": 0.5026193100571188,
  "f1_neg": 0.5904255319148937,
  "f1_neu": 0.22327790973871733,
  "f1_pos": 0.6941544885177453
}

=== Phase 1 complete ===
{
  "paths": {
    "base_dir": "/content/Thesis_RomanUrdu_SA",
    "data_clean": "/content/Thesis_RomanUrdu_SA/datasets/rusa19_clean",
    "phase1_out": "/content/Thesis_RomanUrdu_SA/outputs/phase1",
    "ckpt_dir": "/content/Thesis_RomanUrdu_SA/outputs/phase1/supervised_xlmr_base"
  },
  "supervised": {
    "dev": {
      "eval_loss": 0.6289963722229004,
      "eval_accuracy": 0.7476828012358393,
      "eval_macro_f1": 0.7465406449206036,
      "eval_f1_neg": 0.7220216606498195,
      "eval_f1_neu": 0.6913229018492176,
      "eval_f1_pos": 0.8262773722627738,
      "eval_runtime":

# Thesis Framework Pipeline

In [5]:
# ------------------------------------------------------------------
# This cell implements Phase 2 (Weak Supervision) for Roman-Urdu.
#
# Goal:
#   Generate pseudo-labels for the TRAIN pool using an ensemble of 3 weak labelers:
#     (A) XNLI zero-shot classifier (multilingual NLI → sentiment via label prompts)
#     (B) Translation-based sentiment:
#           Roman-Urdu → Urdu (transliteration) → English (MT) → English sentiment model
#         with a strong multilingual fallback if transliteration/MT fails
#     (C) Lexicon heuristic (rule-based sentiment using Roman-Urdu lexicon cues)
#
# Output:
#   - train_probs_{xnli,en,lex}.npy          (3×probability sources, aligned with train_texts.csv)
#   - train_texts.csv                       (pool text order used across phases)
#   - rusa19_train_pseudo.csv               (pseudo_label + confidence + vote agreement + debug columns)
#   - phase2_summary.json                   (weights, transliteration status, vote counts)
#
# Notes:
#   - This is "Phase 2 (simple weighted vote)" — Phase 2.5 later learns a meta-combiner on few-shot gold.
#   - The saved *.npy probability files are reused in Phase 2.5 to avoid recomputation.
# ------------------------------------------------------------------



# ============================================
# Phase 2 — Weak Supervision & Pseudo-Labels (Roman-Urdu)
# ============================================

import os, re, json, numpy as np, pandas as pd, torch
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, set_seed, GenerationConfig
)
from huggingface_hub import snapshot_download

# ---------------- Paths ----------------
# Everything stays under /content/Thesis_RomanUrdu_SA for session-local reproducibility.
BASE_DIR       = "/content/Thesis_RomanUrdu_SA"
DATA_DIR_CLEAN = f"{BASE_DIR}/datasets/rusa19_clean"
OUT_DIR        = f"{BASE_DIR}/outputs/phase2"
os.makedirs(OUT_DIR, exist_ok=True)

# TRAIN is treated as the unlabeled pool for weak labeling.
# In later phases we train using these pseudo-labels rather than the gold (human labeled) labels.
P_TRAIN = f"{DATA_DIR_CLEAN}/rusa19_train_clean.csv"
P_LEX   = f"{BASE_DIR}/datasets/rusa19_raw/roman_urdu_lexicon.csv"

# ---------------- Reproducibility ----------------
SEED = 42
set_seed(SEED)
DEVICE = 0 if torch.cuda.is_available() else -1
print(f"Device: {'cuda' if DEVICE==0 else 'cpu'}")

# ---------------- Label space ----------------
LABEL_LIST = ["negative","neutral","positive"]
LABEL2ID   = {l:i for i,l in enumerate(LABEL_LIST)}
ID2LABEL   = {i:l for l,i in LABEL2ID.items()}

# ---------------- Load pool ----------------
# We only need the text column here; gold labels are ignored in Phase 2.
df_pool = pd.read_csv(P_TRAIN, encoding="utf-8-sig")
texts   = df_pool["text"].astype(str).tolist()

# Persist the exact pool order to avoid misalignment when loading cached probabilities later (Phase 2.5).
df_pool[["text"]].to_csv(f"{OUT_DIR}/train_texts.csv", index=False, encoding="utf-8-sig")


# ---------------- (A) XNLI ----------------
# Weak labeler A:
#   Use NLI-based zero-shot classification where candidate labels are sentiment strings.
#   Produces a 3-dim probability distribution for each Roman-Urdu text.
print("Loading XNLI zero-shot pipeline ...")
zsc = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=DEVICE
)
CANDS = LABEL_LIST

def run_xnli(texts, batch_size=16):
    # Run in batches for speed; normalize to ensure probabilities sum to 1.
    probs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="[A] XNLI"):
        batch = texts[i:i+batch_size]
        res = zsc(batch, candidate_labels=CANDS, multi_label=False)
        if isinstance(res, dict): res = [res]  # HF returns dict for single input
        for r in res:
            p = np.zeros(3, dtype=np.float32)
            for lbl, sc in zip(r["labels"], r["scores"]):
                ln = lbl.strip().lower()
                if ln in LABEL2ID: p[LABEL2ID[ln]] = float(sc)
            s = p.sum()
            probs.append(p/s if s>0 else np.ones(3)/3)
    return np.vstack(probs)

xnli_probs = run_xnli(texts)

# Save probabilities to reuse in Phase 2.5 (stacking) without re-running the pipelines.
np.save(f"{OUT_DIR}/train_probs_xnli.npy", xnli_probs.astype(np.float32))


# ---------------- (B) RUR→UR (Mavkif) → EN MT → EN sentiment ----------------
# Weak labeler B:
#   Roman-Urdu → Urdu transliteration (Mavkif m2m100)
#   Urdu → English translation (Helsinki opus-mt)
#   English sentiment classification (CardiffNLP RoBERTa sentiment)
#
# Why this chain?
#   Roman-Urdu resources are limited; English have stronger translation/sentiment tooling.
#   The pipeline converts Roman-Urdu into a space where sentiment models are more reliable.
#
# Robustness:
#   HF generation configs sometimes contain invalid fields; we patch generation_config.json if needed.
#   If transliteration/MT fails, we fall back to a multilingual sentiment classifier (CardiffNLP XLM-R sentiment).
#
# --- Mavkif transliterator loader (robust) ---
TOK_RUP = "Mavkif/m2m100_rup_tokenizer_both"   # tokenizer with special tokens
MOD_R2U = "Mavkif/m2m100_rup_rur_to_ur"        # Roman-Urdu → Urdu

tok_rup = AutoTokenizer.from_pretrained(TOK_RUP)

def _patch_generation_config_in_place(dir_path: str):
    # Patch generation_config.json to satisfy newer transformers validations (HF>=4.44).
    # This prevents ValueError when early_stopping is invalid / missing.
    gen_cfg_path = os.path.join(dir_path, "generation_config.json")
    g = {}
    if os.path.exists(gen_cfg_path):
        try:
            with open(gen_cfg_path, "r", encoding="utf-8") as f:
                g = json.load(f)
        except Exception:
            g = {}
    # Set safe defaults expected by HF>=4.44
    if g.get("early_stopping", None) not in (True, False, "never"):
        g["early_stopping"] = True
    if g.get("max_new_tokens", None) is None:
        g["max_new_tokens"] = 128
    if g.get("num_beams", None) is None:
        g["num_beams"] = 1
    if g.get("do_sample", None) is None:
        g["do_sample"] = False
    with open(gen_cfg_path, "w", encoding="utf-8") as f:
        json.dump(g, f, ensure_ascii=False, indent=2)

def load_mavkif_transliterator(device=DEVICE):
    # Load Mavkif model; if HF throws config validation errors, download locally and patch.
    from huggingface_hub import snapshot_download
    try:
        mdl = AutoModelForSeq2SeqLM.from_pretrained(MOD_R2U)
    except ValueError as e:
        # If HF validation explodes (e.g., early_stopping=None), patch locally then load
        if "early_stopping" in str(e):
            local_dir = snapshot_download(
                repo_id=MOD_R2U,
                local_dir="/content/mavkif_r2u",
                local_dir_use_symlinks=False
            )
            _patch_generation_config_in_place(local_dir)
            mdl = AutoModelForSeq2SeqLM.from_pretrained(local_dir, local_files_only=True)
        else:
            raise

    # Attach a safe GenerationConfig anyway (acts as a second line of defense).
    gen_cfg = GenerationConfig.from_model_config(mdl.config)
    if getattr(gen_cfg, "early_stopping", None) not in (True, False, "never"):
        gen_cfg.early_stopping = True
    if getattr(gen_cfg, "max_new_tokens", None) is None:
        gen_cfg.max_new_tokens = 128
    if getattr(gen_cfg, "num_beams", None) is None:
        gen_cfg.num_beams = 1
    if getattr(gen_cfg, "do_sample", None) is None:
        gen_cfg.do_sample = False

    # Force target-language token (__ur__) to ensure outputs are Urdu.
    forced_bos_id = tok_rup.convert_tokens_to_ids("__ur__")
    gen_cfg.forced_bos_token_id = forced_bos_id
    if getattr(gen_cfg, "decoder_start_token_id", None) is None:
        gen_cfg.decoder_start_token_id = forced_bos_id

    # Ensure pad token exists (needed for batch generation with padding).
    if tok_rup.pad_token_id is None:
        tok_rup.add_special_tokens({"pad_token": "<pad>"})
        mdl.resize_token_embeddings(len(tok_rup))

    # Ensure model config aligns with tokenizer IDs.
    if getattr(mdl.config, "pad_token_id", None) is None:
        mdl.config.pad_token_id = tok_rup.pad_token_id
    if getattr(mdl.config, "decoder_start_token_id", None) is None:
        mdl.config.decoder_start_token_id = forced_bos_id

    mdl.generation_config = gen_cfg
    return mdl.to("cuda" if device == 0 else "cpu")

mdl_r2u = load_mavkif_transliterator(DEVICE)


def translit_rur_to_ur(batch_texts, max_new_tokens=128):
    # Mavkif tokenizer expects language prefix tokens.
    prefixed = [f"__roman-ur__ {t}" for t in batch_texts]
    inputs = tok_rup(prefixed, return_tensors="pt", padding=True, truncation=True, max_length=256)
    if DEVICE == 0:
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    with torch.no_grad():
        gen = mdl_r2u.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            forced_bos_token_id=mdl_r2u.generation_config.forced_bos_token_id,
            num_beams=1,
            do_sample=False
        )
    out = tok_rup.batch_decode(gen, skip_special_tokens=True)
    # Remove any leftover "__ur__" markers for readability/logging.
    return [o.replace("__ur__", "").strip() for o in out]

# Urdu -> English MT (machine translation)
TOK_UR_EN = "Helsinki-NLP/opus-mt-ur-en"
tok_ur_en = AutoTokenizer.from_pretrained(TOK_UR_EN)
mdl_ur_en = AutoModelForSeq2SeqLM.from_pretrained(TOK_UR_EN).to("cuda" if DEVICE==0 else "cpu")

def ur_to_en(batch_ur, max_new_tokens=128):
    inputs = tok_ur_en(batch_ur, return_tensors="pt", padding=True, truncation=True, max_length=256)
    if DEVICE == 0:
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    with torch.no_grad():
        gen = mdl_ur_en.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok_ur_en.batch_decode(gen, skip_special_tokens=True)

# English sentiment classifier (CardiffNLP Twitter RoBERTa sentiment).
# top_k=None returns all class scores per example in newer Transformers.
EN_CLS = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    top_k=None,   # replaces return_all_scores=True
    device=DEVICE
)

def en_sentiment_probs(batch_en):
    # Convert pipeline output (list of label-score dicts) into a fixed 3-dim vector per text.
    out = []
    for i in range(0, len(batch_en), 32):
        res = EN_CLS(batch_en[i:i+32])
        for scores in res:
            vec = np.zeros(3, dtype=np.float32)
            for s in scores:
                lbl = s["label"].lower()
                # Some models return LABEL_0/LABEL_1/LABEL_2; map to indices.
                if "label_" in lbl:
                    idx = int(lbl.split("_")[-1])
                else:
                    idx = {"negative":0, "neutral":1, "positive":2}.get(lbl, None)
                if idx is not None and 0 <= idx <= 2:
                    vec[idx] = float(s["score"])
            ssum = vec.sum()
            out.append(vec/ssum if ssum>0 else np.ones(3)/3)
    return np.vstack(out)

def run_translit_mt_cls(texts, bs=16):
    # End-to-end batch runner:
    #   Roman-Urdu -> Urdu -> English -> sentiment probs
    ur_list, en_list, probs = [], [], []
    for i in tqdm(range(0, len(texts), bs), desc="[B] RUR→UR→EN→Sent"):
        batch = texts[i:i+bs]
        ur = translit_rur_to_ur(batch)
        en = ur_to_en(ur)
        pr = en_sentiment_probs(en)
        ur_list.extend(ur); en_list.extend(en); probs.append(pr)
    return ur_list, en_list, np.vstack(probs)

try:
    # Preferred path: transliteration + MT + English sentiment
    ur_texts, en_texts, enclf_probs = run_translit_mt_cls(texts, bs=16)
    translit_ok = True
except Exception as e:
    # Robust fallback: direct multilingual sentiment on Roman-Urdu (no translation)
    print("WARNING: Transliteration+MT branch failed; using multilingual sentiment as source B.\n", repr(e))
    # CardiffNLP multilingual sentiment (XLM-R), labels: negative/neutral/positive
    MULTI_CLS = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
        top_k=None,
        device=DEVICE
    )
    def multi_sentiment_probs(batch_texts):
        out = []
        for i in range(0, len(batch_texts), 32):
            res = MULTI_CLS(batch_texts[i:i+32])
            for scores in res:
                vec = np.zeros(3, dtype=np.float32)
                for s in scores:
                    lbl = s["label"].lower()
                    idx = {"negative":0, "neutral":1, "positive":2}.get(lbl, None)
                    if idx is not None: vec[idx] = float(s["score"])
                ssum = vec.sum()
                out.append(vec/ssum if ssum>0 else np.ones(3)/3)
        return np.vstack(out)

    # Keep placeholder translation outputs to maintain consistent column schema in the final CSV.
    ur_texts, en_texts = [""]*len(texts), [""]*len(texts)
    enclf_probs = multi_sentiment_probs(texts)   # strong fallback instead of uniform
    translit_ok = False

# Save source-B probabilities for reuse in Phase 2.5 (stacking).
np.save(f"{OUT_DIR}/train_probs_en.npy", enclf_probs.astype(np.float32))



# ---------------- (C) Lexicon heuristic ----------------
# Weak labeler C:
#   Rule-based scoring from a Roman-Urdu lexicon with polarity words, negators, intensifiers, etc.
#   Returns a soft 3-class distribution using a simple logit→softmax transformation.
def load_lexicon(path):
    if not os.path.exists(path):
        print("WARNING: lexicon file not found, lexicon source will be uniform.")
        return None
    lex = pd.read_csv(path, encoding="utf-8-sig").fillna("")
    lex["word"] = lex["word"].astype(str).str.strip().str.lower()
    lex["sentiment"] = lex["sentiment"].astype(str).str.lower()
    lex["category"] = lex["category"].astype(str).str.lower()
    pol = lex[lex["category"].eq("polarity")]
    # polarity dictionaries: word -> score (separate for pos/neg)
    pol_pos = {w:s for w,s,sen in pol[["word","score","sentiment"]].itertuples(index=False) if sen=="positive"}
    pol_neg = {w:s for w,s,sen in pol[["word","score","sentiment"]].itertuples(index=False) if sen=="negative"}
    # lexical operators
    negators = set(lex.loc[lex["category"].eq("negator"), "word"])
    intens  = set(lex.loc[lex["category"].eq("intensifier"), "word"])
    dimin   = set(lex.loc[lex["category"].eq("diminisher"), "word"])
    return {"pol_pos":pol_pos, "pol_neg":pol_neg, "negators":negators, "intens":intens, "dimin":dimin}

LEX = load_lexicon(P_LEX)
TOKEN_SPLIT = re.compile(r"[^\w']+", flags=re.UNICODE)

def lexicon_score(text):
    # Return uniform distribution if lexicon is missing or text is empty.
    if LEX is None or not text:
        return np.array([1/3,1/3,1/3], dtype=np.float32)

    # Tokenize using a simple regex split (works decently for Roman script).
    t = text.lower()
    toks = [w for w in TOKEN_SPLIT.split(t) if w]

    # Accumulate polarity scores and apply simple heuristics.
    pos_hits = 0.0; neg_hits = 0.0
    intens_count = 0; dimin_count = 0

    for i, w in enumerate(toks):
        if w in LEX["pol_pos"]: pos_hits += float(LEX["pol_pos"][w])
        if w in LEX["pol_neg"]: neg_hits += float(LEX["pol_neg"][w])
        if w in LEX["intens"]:  intens_count += 1
        if w in LEX["dimin"]:   dimin_count  += 1

        # Negation heuristic: flip nearby polarity contributions within a small window.
        if w in LEX["negators"]:
            for j in range(max(0,i-3), min(len(toks), i+4)):
                w2 = toks[j]
                if w2 in LEX["pol_pos"]: pos_hits -= float(LEX["pol_pos"][w2])*0.8
                if w2 in LEX["pol_neg"]: neg_hits -= float(LEX["pol_neg"][w2])*0.8

    # Handle multi-word expressions in the lexicon (phrase matching).
    for w, s in LEX["pol_pos"].items():
        if " " in w and w in t: pos_hits += float(s)
    for w, s in LEX["pol_neg"].items():
        if " " in w and w in t: neg_hits += float(s)

    # Intensifiers/diminishers act as simple scaling on polarity totals.
    pos_hits *= (1.0 + 0.15*intens_count - 0.10*dimin_count)
    neg_hits *= (1.0 + 0.15*intens_count - 0.10*dimin_count)

    # Construct [neg, neu, pos] "logits" and softmax them.
    neg_score, pos_score = float(neg_hits), float(pos_hits)
    neu_score = max(0.0, 0.6 - 0.3*(abs(pos_score-neg_score)))  # heuristic: neutrality when pos≈neg
    vec = np.array([neg_score, neu_score, pos_score], dtype=np.float32)

    # Safety: if scores are invalid, return uniform.
    if not np.isfinite(vec).all() or vec.sum() <= 0:
        return np.array([1/3,1/3,1/3], dtype=np.float32)

    expv = np.exp(vec - vec.max())
    return (expv / expv.sum()).astype(np.float32)

lex_probs = np.vstack([lexicon_score(t) for t in texts])

# Save lexicon probabilities for reuse in Phase 2.5.
np.save(f"{OUT_DIR}/train_probs_lex.npy", lex_probs.astype(np.float32))


# ---------------- Aggregate + votes ----------------
# Phase 2 uses a simple weighted probability average of the three sources.
# Weights reflect trust/strength of each source in this setting (lexicon is lower-weight).
W_XNLI, W_EN, W_LEX = 0.45, 0.45, 0.10
agg = W_XNLI*xnli_probs + W_EN*enclf_probs + W_LEX*lex_probs
agg = agg / np.clip(agg.sum(axis=1, keepdims=True), 1e-8, None)  # renormalize
pred_ids = agg.argmax(axis=1)     # final pseudo label (hard)
conf     = agg.max(axis=1)        # confidence = max prob under aggregated distribution

# votes_agree counts how many weak sources agree with the final aggregated label.
# This becomes a useful curriculum signal in Phase 3 (high/mid/all buckets).
votes = []
for i in range(len(texts)):
    v = 0
    if xnli_probs[i].argmax()==pred_ids[i]: v+=1
    if enclf_probs[i].argmax()==pred_ids[i]: v+=1
    if lex_probs[i].argmax()==pred_ids[i]:  v+=1
    votes.append(v)

# ---------------- Save ----------------
# Save a "pseudo pack" CSV with:
#   - final pseudo_label + confidence
#   - per-source predicted label/conf
#   - translations (if available) for debugging/analysis
out = pd.DataFrame({
    "text": texts,
    "pseudo_label": [ID2LABEL[int(i)] for i in pred_ids],
    "pseudo_confidence": conf,
    "votes_agree": votes,
    "xnli_label": [ID2LABEL[int(i)] for i in xnli_probs.argmax(axis=1)],
    "xnli_conf":  xnli_probs.max(axis=1),
    "en_label":   [ID2LABEL[int(i)] for i in enclf_probs.argmax(axis=1)],
    "en_conf":    enclf_probs.max(axis=1),
    "lex_label":  [ID2LABEL[int(i)] for i in lex_probs.argmax(axis=1)],
    "lex_conf":   lex_probs.max(axis=1),
    "translit_ur": ur_texts,   # Urdu transliteration output (empty strings if fallback path)
    "trans_en":    en_texts,   # English MT output (empty strings if fallback path)
})
OUT_CSV = f"{OUT_DIR}/rusa19_train_pseudo.csv"
out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

# Persist key run metadata for later phases and for writing the thesis methodology section.
summary = {
    "paths": {"phase2_out": OUT_DIR, "pseudo_csv": OUT_CSV},
    "weights": {"xnli": W_XNLI, "en_cls": W_EN, "lex": W_LEX},
    "transliteration_ok": translit_ok,   # indicates whether the preferred transliteration+MT path succeeded
    "counts": {
        "n_rows": len(out),
        "votes_3": int((out["votes_agree"]==3).sum()),
        "votes_2": int((out["votes_agree"]==2).sum()),
        "votes_1": int((out["votes_agree"]==1).sum()),
        "votes_0": int((out["votes_agree"]==0).sum())
    }
}
with open(f"{OUT_DIR}/phase2_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print("\n=== Phase 2 complete ===")
print(json.dumps(summary, indent=2, ensure_ascii=False))


Device: cuda
Loading XNLI zero-shot pipeline ...


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[A] XNLI:   0%|          | 0/426 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/900 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


[B] RUR→UR→EN→Sent:   0%|          | 0/426 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Phase 2 complete ===
{
  "paths": {
    "phase2_out": "/content/Thesis_RomanUrdu_SA/outputs/phase2",
    "pseudo_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase2/rusa19_train_pseudo.csv"
  },
  "weights": {
    "xnli": 0.45,
    "en_cls": 0.45,
    "lex": 0.1
  },
  "transliteration_ok": true,
  "counts": {
    "n_rows": 6806,
    "votes_3": 1137,
    "votes_2": 3475,
    "votes_1": 2165,
    "votes_0": 29
  }
}


In [6]:
# ============================================
# Phase 2.5 — Learned Combiner (Stacking / Meta-model)
# ============================================
# This cell implements the *learned aggregation* step of the thesis framework.
#
# Why Phase 2.5?
#   Phase 2 used a fixed weighted average of three weak sources. In Phase 2.5 we:
#     1) Use a small gold set (here: FEWSHOT 64×3) to learn how to combine those weak sources.
#     2) Train a meta-model (stacker) that maps weak-source outputs → final calibrated probabilities.
#     3) Apply the stacker to the whole TRAIN pool to produce a richer pseudo-label pack.
#
# Weak Sources (same as Phase 2):
#   (A) XNLI zero-shot (joeddav/xlm-roberta-large-xnli)
#   (B) Transliteration + MT + English sentiment (with robust fallback)
#   (C) Lexicon heuristic
#
# Key outputs used downstream:
#   - rusa19_train_pseudo_stacked.csv
#       Contains: pseudo_label, pseudo_confidence, votes_agree
#                + calibrated combiner probabilities (p_negative/p_neutral/p_positive)
#       These combiner probs are used in Phase 3 as *soft targets* (KLDiv training).
#
# Reproducibility / efficiency:
#   - If Phase-2 cached probability files exist (train_probs_*.npy), we reuse them for the pool.
#   - We also store dev_probs_* and pool_probs_* inside outputs/phase2_5 for traceability.
# ============================================


import math, random, shutil, gc, time
import os, re, json, numpy as np, pandas as pd, torch, joblib
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification,
    pipeline, set_seed, GenerationConfig
)

# ---------------- Paths & constants ----------------
# OUT_DIR holds *all* artifacts for Phase 2.5: combiner, features, dev/pool probs, and final stacked CSV.
BASE_DIR       = "/content/Thesis_RomanUrdu_SA"
DATA_DIR_CLEAN = f"{BASE_DIR}/datasets/rusa19_clean"
RAW_DIR        = f"{BASE_DIR}/datasets/rusa19_raw"
OUT_DIR        = f"{BASE_DIR}/outputs/phase2_5"
os.makedirs(OUT_DIR, exist_ok=True)

# Phase-2 cached outputs (if present) — enables skipping the expensive weak labeler runs for the pool.
PH2_DIR   = f"{BASE_DIR}/outputs/phase2"
PH2_XNLI  = f"{PH2_DIR}/train_probs_xnli.npy"
PH2_EN    = f"{PH2_DIR}/train_probs_en.npy"
PH2_LEX   = f"{PH2_DIR}/train_probs_lex.npy"
PH2_TEXTS = f"{PH2_DIR}/train_texts.csv"
PH2_SUMMARY = f"{PH2_DIR}/phase2_summary.json"



P_TRAIN = f"{DATA_DIR_CLEAN}/rusa19_train_clean.csv"  # pool texts (treated as unlabeled for re-labeling)
P_LEX   = f"{RAW_DIR}/roman_urdu_lexicon.csv"
P_FEWSHOT = f"{DATA_DIR_CLEAN}/rusa19_fewshot64_clean.csv"  # 64 each (few-shot gold for combiner learning)



# ---------------- Reproducibility ----------------
SEED = 42
set_seed(SEED)
DEVICE = 0 if torch.cuda.is_available() else -1
print(f"Device: {'cuda' if DEVICE==0 else 'cpu'}")

# ---------------- Label space ----------------
LABEL_LIST = ["negative","neutral","positive"]
LABEL2ID   = {l:i for i,l in enumerate(LABEL_LIST)}
ID2LABEL   = {i:l for l,i in LABEL2ID.items()}

# ---------------- Utility: label normalization ----------------
# Normalizes different dataset label formats into the canonical {negative, neutral, positive}.
def normalize_label(x):
    s=str(x).strip().lower()
    if s in {"-1","neg","n","negative","2"}: return "negative"
    if s in {"0","neu","neutral"}:           return "neutral"
    if s in {"1","pos","p","positive"}:      return "positive"
    return np.nan

def load_split(path):
    # Load a split and standardize to columns: text, label(id), sentiment(str).
    df = pd.read_csv(path, encoding="utf-8-sig").rename(columns={"Text":"text","Sentiment":"sentiment"})
    df["sentiment"] = df["sentiment"].map(normalize_label)
    df = df.dropna(subset=["text","sentiment"]).copy()
    df["label"] = df["sentiment"].map(LABEL2ID)
    return df[["text","label","sentiment"]]

# Pool (TRAIN) is treated as unlabeled; we only keep text and later attach pseudo labels.
df_pool = load_split(P_TRAIN)[["text"]]              # unlabeled pool (we’ll re-label)

# FEWSHOT-only combiner:
# We intentionally use a small gold set to learn the stacking weights/mapping.
if not os.path.exists(P_FEWSHOT):
    raise FileNotFoundError(
        f"Few-shot file not found: {P_FEWSHOT}\n"
        "This Phase-2.5 combiner is FEWSHOT-only by design. "
        "Create/export the few-shot CSV first, then re-run Phase-2.5."
    )

df_dev = load_split(P_FEWSHOT)
print(f"Using FEWSHOT split for combiner: {P_FEWSHOT} (n={len(df_dev)})")

texts_pool = df_pool["text"].astype(str).tolist()
texts_dev  = df_dev["text"].astype(str).tolist()
y_dev      = df_dev["label"].to_numpy()  # gold labels for combiner training/evaluation


# =========================================================
# Weak labelers = same 3 sources as Phase-2
# =========================================================
# IMPORTANT:
#   The combiner learns based on *probabilities* from these sources (not only hard labels).
#   This enables learning patterns like "trust XNLI more when its confidence is high"
#   or "trust lexicon more when negators appear", etc.

# -------- (A) XNLI zero-shot --------
print("Loading XNLI zero-shot pipeline ...")
zsc = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=DEVICE
)
CANDS = LABEL_LIST

def run_xnli(texts, batch_size=16):
    # Returns shape: [N, 3] probabilities aligned to LABEL_LIST order.
    probs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="[A] XNLI"):
        batch = texts[i:i+batch_size]
        res = zsc(batch, candidate_labels=CANDS, multi_label=False)
        if isinstance(res, dict): res = [res]
        for r in res:
            p = np.zeros(3, dtype=np.float32)
            for lbl, sc in zip(r["labels"], r["scores"]):
                ln = lbl.strip().lower()
                if ln in LABEL2ID: p[LABEL2ID[ln]] = float(sc)
            s = p.sum()
            probs.append(p/s if s>0 else np.ones(3)/3)
    return np.vstack(probs).astype(np.float32)

# -------- (B) Transliterator + MT + EN sentiment (robust) --------
# Same robust transliteration pipeline as Phase 2, including:
#   - HF generation_config patching for compatibility with newer Transformers versions
#   - fallback to multilingual sentiment model if transliteration/MT fails
from huggingface_hub import snapshot_download

TOK_RUP = "Mavkif/m2m100_rup_tokenizer_both"
MOD_R2U = "Mavkif/m2m100_rup_rur_to_ur"
tok_rup = AutoTokenizer.from_pretrained(TOK_RUP)

def _patch_generation_config_in_place(dir_path: str):
    # Ensures generation_config.json has valid/safe defaults.
    gen_cfg_path = os.path.join(dir_path, "generation_config.json")
    g = {}
    if os.path.exists(gen_cfg_path):
        try:
            with open(gen_cfg_path, "r", encoding="utf-8") as f:
                g = json.load(f)
        except Exception:
            g = {}
    if g.get("early_stopping", None) not in (True, False, "never"):
        g["early_stopping"] = True
    if g.get("max_new_tokens", None) is None:
        g["max_new_tokens"] = 128
    if g.get("num_beams", None) is None:
        g["num_beams"] = 1
    if g.get("do_sample", None) is None:
        g["do_sample"] = False
    with open(gen_cfg_path, "w", encoding="utf-8") as f:
        json.dump(g, f, ensure_ascii=False, indent=2)

def load_mavkif_transliterator(device=DEVICE):
    # Load transliteration model; patch generation config locally if HF raises validation error.
    try:
        mdl = AutoModelForSeq2SeqLM.from_pretrained(MOD_R2U)
    except ValueError as e:
        if "early_stopping" in str(e):
            local_dir = snapshot_download(
                repo_id=MOD_R2U, local_dir="/content/mavkif_r2u", local_dir_use_symlinks=False
            )
            _patch_generation_config_in_place(local_dir)
            mdl = AutoModelForSeq2SeqLM.from_pretrained(local_dir, local_files_only=True)
        else:
            raise
    # safe generation config + force target lang
    gen_cfg = GenerationConfig.from_model_config(mdl.config)
    if getattr(gen_cfg, "early_stopping", None) not in (True, False, "never"):
        gen_cfg.early_stopping = True
    if getattr(gen_cfg, "max_new_tokens", None) is None:
        gen_cfg.max_new_tokens = 128
    if getattr(gen_cfg, "num_beams", None) is None:
        gen_cfg.num_beams = 1
    if getattr(gen_cfg, "do_sample", None) is None:
        gen_cfg.do_sample = False
    forced_bos_id = tok_rup.convert_tokens_to_ids("__ur__")
    gen_cfg.forced_bos_token_id = forced_bos_id
    if getattr(gen_cfg, "decoder_start_token_id", None) is None:
        gen_cfg.decoder_start_token_id = forced_bos_id
    if tok_rup.pad_token_id is None:
        tok_rup.add_special_tokens({"pad_token": "<pad>"})
        mdl.resize_token_embeddings(len(tok_rup))
    if getattr(mdl.config, "pad_token_id", None) is None:
        mdl.config.pad_token_id = tok_rup.pad_token_id
    if getattr(mdl.config, "decoder_start_token_id", None) is None:
        mdl.config.decoder_start_token_id = forced_bos_id
    mdl.generation_config = gen_cfg
    return mdl.to("cuda" if device == 0 else "cpu")

mdl_r2u = load_mavkif_transliterator(DEVICE)

def translit_rur_to_ur(batch_texts, max_new_tokens=128):
    # Prefix-based generation ensures correct source language tokenization.
    prefixed = [f"__roman-ur__ {t}" for t in batch_texts]
    inputs = tok_rup(prefixed, return_tensors="pt", padding=True, truncation=True, max_length=256)
    if DEVICE == 0:
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    with torch.no_grad():
        gen = mdl_r2u.generate(
            **inputs, max_new_tokens=max_new_tokens,
            forced_bos_token_id=mdl_r2u.generation_config.forced_bos_token_id,
            num_beams=1, do_sample=False
        )
    out = tok_rup.batch_decode(gen, skip_special_tokens=True)
    return [o.replace("__ur__", "").strip() for o in out]

# Urdu -> English MT
TOK_UR_EN = "Helsinki-NLP/opus-mt-ur-en"
tok_ur_en = AutoTokenizer.from_pretrained(TOK_UR_EN)
mdl_ur_en = AutoModelForSeq2SeqLM.from_pretrained(TOK_UR_EN).to("cuda" if DEVICE==0 else "cpu")

def ur_to_en(batch_ur, max_new_tokens=128):
    inputs = tok_ur_en(batch_ur, return_tensors="pt", padding=True, truncation=True, max_length=256)
    if DEVICE == 0:
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    with torch.no_grad():
        gen = mdl_ur_en.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok_ur_en.batch_decode(gen, skip_special_tokens=True)

# English sentiment (CardiffNLP)
EN_CLS = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    top_k=None,
    device=DEVICE
)

def en_sentiment_probs(batch_en):
    # Convert per-example list-of-dicts output into a fixed [neg, neu, pos] vector.
    out = []
    for i in range(0, len(batch_en), 32):
        res = EN_CLS(batch_en[i:i+32])
        for scores in res:
            vec = np.zeros(3, dtype=np.float32)
            for s in scores:
                lbl = s["label"].lower()
                # Some models emit LABEL_0/1/2; map accordingly.
                if "label_" in lbl:
                    idx = int(lbl.split("_")[-1])
                else:
                    idx = {"negative":0, "neutral":1, "positive":2}.get(lbl, None)
                if idx is not None and 0 <= idx <= 2:
                    vec[idx] = float(s["score"])
            ssum = vec.sum()
            out.append(vec/ssum if ssum>0 else np.ones(3)/3)
    return np.vstack(out).astype(np.float32)

def run_translit_mt_cls(texts, bs=16):
    # End-to-end Roman-Urdu → Urdu → English → sentiment probs
    ur_list, en_list, probs = [], [], []
    for i in tqdm(range(0, len(texts), bs), desc="[B] RUR→UR→EN→Sent"):
        batch = texts[i:i+bs]
        ur = translit_rur_to_ur(batch)
        en = ur_to_en(ur)
        pr = en_sentiment_probs(en)
        ur_list.extend(ur); en_list.extend(en); probs.append(pr)
    return ur_list, en_list, np.vstack(probs).astype(np.float32)

# -------- (C) Lexicon heuristic --------
# Rule-based probabilities using Roman-Urdu lexicon signals.
def load_lexicon(path):
    if not os.path.exists(path):
        print("WARNING: lexicon file not found, lexicon source will be uniform.")
        return None
    lex = pd.read_csv(path, encoding="utf-8-sig").fillna("")
    lex["word"] = lex["word"].astype(str).str.strip().str.lower()
    lex["sentiment"] = lex["sentiment"].astype(str).str.lower()
    lex["category"] = lex["category"].astype(str).str.lower()
    pol = lex[lex["category"].eq("polarity")]
    pol_pos = {w:s for w,s,sen in pol[["word","score","sentiment"]].itertuples(index=False) if sen=="positive"}
    pol_neg = {w:s for w,s,sen in pol[["word","score","sentiment"]].itertuples(index=False) if sen=="negative"}
    negators = set(lex.loc[lex["category"].eq("negator"), "word"])
    intens  = set(lex.loc[lex["category"].eq("intensifier"), "word"])
    dimin   = set(lex.loc[lex["category"].eq("diminisher"), "word"])
    return {"pol_pos":pol_pos, "pol_neg":pol_neg, "negators":negators, "intens":intens, "dimin":dimin}

LEX = load_lexicon(P_LEX)
TOKEN_SPLIT = re.compile(r"[^\w']+", flags=re.UNICODE)

def lexicon_score(text):
    # Uniform if lexicon missing or empty text.
    if LEX is None or not text:
        return np.array([1/3,1/3,1/3], dtype=np.float32)
    t = str(text).lower()
    toks = [w for w in TOKEN_SPLIT.split(t) if w]
    pos_hits = 0.0; neg_hits = 0.0
    intens_count = 0; dimin_count = 0
    for i, w in enumerate(toks):
        if w in LEX["pol_pos"]: pos_hits += float(LEX["pol_pos"][w])
        if w in LEX["pol_neg"]: neg_hits += float(LEX["pol_neg"][w])
        if w in LEX["intens"]:  intens_count += 1
        if w in LEX["dimin"]:   dimin_count  += 1
        # Negation heuristic: reduce/flip polarity contributions within a local window.
        if w in LEX["negators"]:
            for j in range(max(0,i-3), min(len(toks), i+4)):
                w2 = toks[j]
                if w2 in LEX["pol_pos"]: pos_hits -= float(LEX["pol_pos"][w2])*0.8
                if w2 in LEX["pol_neg"]: neg_hits -= float(LEX["pol_neg"][w2])*0.8
    # Multi-word expressions (phrase match).
    for w, s in LEX["pol_pos"].items():
        if " " in w and w in t: pos_hits += float(s)
    for w, s in LEX["pol_neg"].items():
        if " " in w and w in t: neg_hits += float(s)
    # Intensifier/diminisher scaling.
    pos_hits *= (1.0 + 0.15*intens_count - 0.10*dimin_count)
    neg_hits *= (1.0 + 0.15*intens_count - 0.10*dimin_count)
    neg_score, pos_score = float(neg_hits), float(pos_hits)
    neu_score = max(0.0, 0.6 - 0.3*(abs(pos_score-neg_score)))
    vec = np.array([neg_score, neu_score, pos_score], dtype=np.float32)
    if not np.isfinite(vec).all() or vec.sum() <= 0:
        return np.array([1/3,1/3,1/3], dtype=np.float32)
    expv = np.exp(vec - vec.max())
    return (expv / expv.sum()).astype(np.float32)

def run_lex(texts):
    return np.vstack([lexicon_score(t) for t in texts]).astype(np.float32)

# =========================================================
# Compute weak-labeler probabilities on DEV (for training combiner)
# and on POOL (for re-labeling)
# =========================================================
# DEV probabilities are *always computed here* because DEV is FEWSHOT-only and not cached in Phase 2.
xnli_dev  = run_xnli(texts_dev)

# Source-B probabilities can fail due to model load / generation config / translation issues.
# We handle this using a multilingual sentiment fallback to keep the framework robust.
try:
    _, _, en_dev = run_translit_mt_cls(texts_dev, bs=16)
    translit_ok_dev = True
except Exception as e:
    print("WARNING: Transliteration+MT failed on DEV; using multilingual sentiment as fallback.\n", repr(e))
    MULTI = pipeline("text-classification", model="cardiffnlp/twitter-xlm-roberta-base-sentiment", top_k=None, device=DEVICE)
    def multi_probs(batch_texts):
        out=[]
        for i in range(0,len(batch_texts),32):
            res = MULTI(batch_texts[i:i+32])
            for scores in res:
                vec=np.zeros(3, dtype=np.float32)
                for s in scores:
                    idx={"negative":0,"neutral":1,"positive":2}.get(s["label"].lower(),None)
                    if idx is not None: vec[idx]=float(s["score"])
                ssum=vec.sum(); out.append(vec/ssum if ssum>0 else np.ones(3)/3)
        return np.vstack(out).astype(np.float32)
    en_dev = multi_probs(texts_dev)
    translit_ok_dev = False

lex_dev   = run_lex(texts_dev)

# Save DEV probs for traceability (so we can later reproduce the combiner training exactly).
np.save(f"{OUT_DIR}/dev_probs_xnli.npy", xnli_dev)
np.save(f"{OUT_DIR}/dev_probs_en.npy",   en_dev)
np.save(f"{OUT_DIR}/dev_probs_lex.npy",  lex_dev)

# ---------------------------------------------------------
# Feature engineering for the stacking model
# ---------------------------------------------------------
# Base features: concatenated probabilities from the 3 weak sources => 9 dims total.
# Additional meta features: confidence/entropy/agreement + simple text/lexicon flags => extra dims.
# This helps logistic regression learn when to trust which source.
def entropy(p):
    p = np.clip(p, 1e-9, 1.0)
    return float(-(p*np.log(p)).sum() / math.log(3))  # normalized [0,1]

def meta_features(texts, p_xnli, p_en, p_lex, lex=LEX):
    feats = []
    for t, a, b, c in zip(texts, p_xnli, p_en, p_lex):
        # max prob = a simple confidence proxy for each source
        maxs = [a.max(), b.max(), c.max()]
        # entropy = uncertainty proxy for each source
        ents = [entropy(a), entropy(b), entropy(c)]
        # strict agreement across all sources (strong indicator of reliability)
        agree = int(a.argmax()==b.argmax()==c.argmax())
        # very lightweight text cues (word count + presence of negators/intensifiers)
        length = len(str(t).split())
        tlow = str(t).lower()
        toks = [w for w in TOKEN_SPLIT.split(tlow) if w]
        has_neg = int(any(w in lex["negators"] for w in toks)) if lex else 0
        has_int = int(any(w in lex["intens"]   for w in toks)) if lex else 0
        feats.append(maxs + ents + [agree, length, has_neg, has_int])
    return np.asarray(feats, dtype=np.float32)

meta_dev  = meta_features(texts_dev,  xnli_dev,  en_dev,  lex_dev,  LEX)
# Final DEV feature matrix = [9 probs] + [meta features] = 19 dims per example.
X_dev     = np.concatenate([xnli_dev, en_dev, lex_dev, meta_dev], axis=1)  # 9 + 10 = 19 dims


# =========================================================
# Train meta-model (multinomial logistic regression + calibration)
# =========================================================
# The combiner learns a mapping: (weak-source probability signals + meta-features) -> final class probabilities.
# We use:
#   - GridSearchCV to tune LR regularization C under stratified CV (macro-F1 objective).
#   - CalibratedClassifierCV (isotonic) to improve probability calibration for downstream soft targets.
print("Training meta-model (stacking) on FEWSHOT ...")
param_grid = {"C": [0.5, 1.0, 2.0, 5.0], "penalty": ["l2"], "solver": ["lbfgs"], "max_iter": [1000], "multi_class": ["multinomial"], "class_weight": ["balanced"]}
base = LogisticRegression()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
gcv = GridSearchCV(base, param_grid=param_grid, scoring="f1_macro", cv=cv, n_jobs=-1, refit=True)
gcv.fit(X_dev, y_dev)
best_lr = gcv.best_estimator_
cv_score = gcv.best_score_
print("Best LR:", best_lr)
print("CV macro-F1 on FEWSHOT (stacking):", round(cv_score, 4))

# Optional calibration (isotonic)
# Calibration matters because Phase 3 uses these probabilities as *soft targets* (KLDiv loss).
cal = CalibratedClassifierCV(best_lr, method="isotonic", cv=5)
cal.fit(X_dev, y_dev)

# Save models & feature info so Phase 2.5 results are fully reproducible.
joblib.dump(cal, os.path.join(OUT_DIR, "combiner.joblib"))
with open(os.path.join(OUT_DIR, "feature_info.json"), "w", encoding="utf-8") as f:
    json.dump({"feature_order":"[xnli_neg,xnli_neu,xnli_pos, en_neg,en_neu,en_pos, lex_neg,lex_neu,lex_pos]",
               "labels": LABEL_LIST}, f, indent=2)

with open(os.path.join(OUT_DIR, "dev_cv_results.json"), "w", encoding="utf-8") as f:
    json.dump({"best_params": gcv.best_params_, "best_cv_macro_f1": float(cv_score)}, f, indent=2)

# =========================================================
# Apply weak labelers to POOL (LOAD from Phase-2 if available)
# =========================================================
# Pool probabilities are usually expensive to compute; if Phase 2 already ran, we reuse the cached .npy files.
if os.path.exists(PH2_XNLI) and os.path.exists(PH2_EN) and os.path.exists(PH2_LEX):
    print("Loading POOL probs from Phase-2 outputs ...")
    xnli_pool = np.load(PH2_XNLI).astype(np.float32)
    en_pool   = np.load(PH2_EN).astype(np.float32)
    lex_pool  = np.load(PH2_LEX).astype(np.float32)

    # Basic shape sanity: ensures each probability row corresponds to one pool example.
    if not (xnli_pool.shape[0] == en_pool.shape[0] == lex_pool.shape[0] == len(texts_pool)):
        raise ValueError(
            f"Phase-2 cache size mismatch: "
            f"xnli={xnli_pool.shape[0]}, en={en_pool.shape[0]}, lex={lex_pool.shape[0]}, pool_texts={len(texts_pool)}"
        )

    # Optional: verify text order matches cached Phase-2 train_texts.csv.
    # This protects against silent misalignment (wrong probabilities attached to wrong texts).
    if os.path.exists(PH2_TEXTS):
        cached_texts = pd.read_csv(PH2_TEXTS, encoding="utf-8-sig")["text"].astype(str).tolist()
        if len(cached_texts) == len(texts_pool):
            idxs = [0, 1, 2, len(texts_pool)//2, len(texts_pool)-1]
            if any(cached_texts[i] != texts_pool[i] for i in idxs):
                print("WARNING: text order differs from Phase-2 train_texts.csv. Results may misalign!")
        else:
            print("WARNING: cached train_texts.csv length mismatch; skipping order check.")

    # Track whether transliteration path was used in Phase 2 (for reporting in summary JSON).
    translit_ok_pool = "loaded"
    if os.path.exists(PH2_SUMMARY):
        try:
            with open(PH2_SUMMARY, "r", encoding="utf-8") as f:
                translit_ok_pool = json.load(f).get("transliteration_ok", "loaded")
        except Exception:
            translit_ok_pool = "loaded"


else:
    # If Phase-2 cache is missing, recompute pool probabilities (slower but fully self-contained).
    print("Phase-2 cache not found. Recomputing POOL probs ...")
    xnli_pool = run_xnli(texts_pool)
    try:
        _, _, en_pool = run_translit_mt_cls(texts_pool, bs=16)
        translit_ok_pool = True
    except Exception as e:
        print("WARNING: Transliteration+MT failed on POOL; using multilingual sentiment as fallback.\n", repr(e))
        MULTI = pipeline("text-classification", model="cardiffnlp/twitter-xlm-roberta-base-sentiment", top_k=None, device=DEVICE)
        def multi_probs(batch_texts):
            out=[]
            for i in range(0,len(batch_texts),32):
                res = MULTI(batch_texts[i:i+32])
                for scores in res:
                    vec=np.zeros(3, dtype=np.float32)
                    for s in scores:
                        idx={"negative":0,"neutral":1,"positive":2}.get(s["label"].lower(),None)
                        if idx is not None: vec[idx]=float(s["score"])
                    ssum=vec.sum(); out.append(vec/ssum if ssum>0 else np.ones(3)/3)
            return np.vstack(out).astype(np.float32)
        en_pool = multi_probs(texts_pool)
        translit_ok_pool = False

    lex_pool  = run_lex(texts_pool)


# Save pool probabilities for traceability and for potential reuse without rerunning weak labelers.
np.save(f"{OUT_DIR}/pool_probs_xnli.npy", xnli_pool)
np.save(f"{OUT_DIR}/pool_probs_en.npy",   en_pool)
np.save(f"{OUT_DIR}/pool_probs_lex.npy",  lex_pool)

# Build POOL feature matrix with the same schema as DEV (must match training feature dimension/order).
meta_pool = meta_features(texts_pool, xnli_pool, en_pool, lex_pool, LEX)
X_pool    = np.concatenate([xnli_pool, en_pool, lex_pool, meta_pool], axis=1)


# ---------------------------------------------------------
# Combiner predictions (calibrated probabilities)
# ---------------------------------------------------------
# The calibrated probabilities are the *main artifact*:
#   They serve as soft targets in Phase 3 (KLDiv), not just hard pseudo labels.
comb_probs = cal.predict_proba(X_pool).astype(np.float32)
pred_ids   = comb_probs.argmax(axis=1)
conf       = comb_probs.max(axis=1)

# save calibrated combiner probabilities for Phase-3 soft targets ---
p_neg, p_neu, p_pos = comb_probs[:, 0], comb_probs[:, 1], comb_probs[:, 2]


# For transparency/debugging:
#   compute each source's argmax & confidence and how many sources match the combiner label (votes_agree).
xnli_arg = xnli_pool.argmax(1); xnli_max = xnli_pool.max(1)
en_arg   = en_pool.argmax(1);   en_max   = en_pool.max(1)
lex_arg  = lex_pool.argmax(1);  lex_max  = lex_pool.max(1)
votes = (xnli_arg==pred_ids).astype(int) + (en_arg==pred_ids).astype(int) + (lex_arg==pred_ids).astype(int)

# Save stacked pseudo pack (richer columns)
# This file is the Phase-3 input (self-training curriculum + soft-target training).
out = pd.DataFrame({
    "text": texts_pool,
    "pseudo_label": [ID2LABEL[int(i)] for i in pred_ids],
    "pseudo_confidence": conf,
    "votes_agree": votes,
    # per-source summary
    "xnli_label": [ID2LABEL[int(i)] for i in xnli_arg],
    "xnli_conf":  xnli_max,
    "en_label":   [ID2LABEL[int(i)] for i in en_arg],
    "en_conf":    en_max,
    "lex_label":  [ID2LABEL[int(i)] for i in lex_arg],
    "lex_conf":   lex_max,
    # combiner probs (calibrated) ---
    "p_negative": p_neg,
    "p_neutral":  p_neu,
    "p_positive": p_pos,
    # (optional) per-source probabilities for future analysis
    "xnli_p_neg": xnli_pool[:,0], "xnli_p_neu": xnli_pool[:,1], "xnli_p_pos": xnli_pool[:,2],
    "en_p_neg":   en_pool[:,0],   "en_p_neu":   en_pool[:,1],   "en_p_pos":   en_pool[:,2],
    "lex_p_neg":  lex_pool[:,0],  "lex_p_neu":  lex_pool[:,1],  "lex_p_pos":  lex_pool[:,2],
})
OUT_CSV = f"{OUT_DIR}/rusa19_train_pseudo_stacked.csv"
out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

# ---------------------------------------------------------
# Sanity check: DEV fit (in-sample)
# ---------------------------------------------------------
# This is NOT the CV score; it measures how well the calibrated combiner fits the FEWSHOT set after training.
# For reporting, the CV macro-F1 is more meaningful, but dev-fit is still a useful quick check.
yhat_dev = cal.predict(X_dev)
probs_dev= cal.predict_proba(X_dev)
m_dev = dict(
    accuracy = float((yhat_dev==y_dev).mean()),
    macro_f1 = float(f1_score(y_dev, yhat_dev, average="macro")),
    f1_neg   = float(f1_score(y_dev, yhat_dev, average=None, labels=[0,1,2])[0]),
    f1_neu   = float(f1_score(y_dev, yhat_dev, average=None, labels=[0,1,2])[1]),
    f1_pos   = float(f1_score(y_dev, yhat_dev, average=None, labels=[0,1,2])[2]),
)

# Run summary (saved for reproducibility and thesis reporting).
summary = {
    "paths": {"phase2_5_out": OUT_DIR, "pseudo_csv": OUT_CSV},
    "combiner": {"type": "LogisticRegression + isotonic calibration", "cv_macro_f1_dev": float(cv_score)},
    "dev_fit_metrics": m_dev,
    "transliteration_ok": {"dev": translit_ok_dev, "pool": translit_ok_pool},
    "notes": [
        "Use the stacked CSV as Phase-3 input to measure value of learned combination.",
        "All features/probabilities saved under phase2_5 for reuse."
    ]
}
with open(f"{OUT_DIR}/phase2_5_summary.json","w",encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print("\n=== Phase 2.5 complete ===")
print(json.dumps(summary, indent=2, ensure_ascii=False))


Device: cuda
Using FEWSHOT split for combiner: /content/Thesis_RomanUrdu_SA/datasets/rusa19_clean/rusa19_fewshot64_clean.csv (n=192)
Loading XNLI zero-shot pipeline ...


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaFor

[A] XNLI:   0%|          | 0/12 [00:00<?, ?it/s]

[B] RUR→UR→EN→Sent:   0%|          | 0/12 [00:00<?, ?it/s]

Training meta-model (stacking) on FEWSHOT ...




Best LR: LogisticRegression(C=0.5, class_weight='balanced', max_iter=1000,
                   multi_class='multinomial')
CV macro-F1 on FEWSHOT (stacking): 0.6695




Loading POOL probs from Phase-2 outputs ...

=== Phase 2.5 complete ===
{
  "paths": {
    "phase2_5_out": "/content/Thesis_RomanUrdu_SA/outputs/phase2_5",
    "pseudo_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase2_5/rusa19_train_pseudo_stacked.csv"
  },
  "combiner": {
    "type": "LogisticRegression + isotonic calibration",
    "cv_macro_f1_dev": 0.6695057832497936
  },
  "dev_fit_metrics": {
    "accuracy": 0.734375,
    "macro_f1": 0.7336471522460767,
    "f1_neg": 0.732824427480916,
    "f1_neu": 0.6774193548387096,
    "f1_pos": 0.7906976744186046
  },
  "transliteration_ok": {
    "dev": true,
    "pool": true
  },
  "notes": [
    "Use the stacked CSV as Phase-3 input to measure value of learned combination.",
    "All features/probabilities saved under phase2_5 for reuse."
  ]
}


In [1]:
# NOTE (Colab memory management):
# If RAM/GPU memory runs low, you can restart the Colab runtime before Phase 3 to free resources.
# This is safe because Phase 2.5 writes all required inputs to /content/Thesis_RomanUrdu_SA,
# so Phase 3 can be resumed by re-running this cell after a restart (no need to rerun Phase 2/2.5).

In [2]:

# ============================================
# Phase 3 — Self-Training + Few-shot Refine (Curriculum + Soft Targets)
# ============================================
# This phase is the core "training" stage of the framework:
#
#   Stage-A (Self-training / curriculum):
#     - Train a student model using pseudo-labeled TRAIN pool (from Phase 2.5).
#     - Use a curriculum schedule (HIGH → MID → ALL) instead of dropping examples:
#         HIGH: most reliable pseudo-labels
#         MID : moderately reliable pseudo-labels
#         ALL : all pseudo-labels (keep-all mode)
#     - Train using *soft targets* when available (combiner probabilities) via KLDiv,
#       and optionally fall back to hard CE if soft targets are missing.
#
#   Stage-B (Few-shot refinement):
#     - Initialize from Stage-A checkpoint.
#     - Fine-tune on a small gold FEWSHOT set (64 per class).
#
# Inputs:
#   - Phase 2.5 stacked pseudo CSV (includes pseudo_label/confidence + optional combiner probs)
#   - Gold TEST split (for final evaluation)
#   - FEWSHOT pack (used for dev evaluation in this implementation and also Stage-B training)
#
# Outputs:
#   - /outputs/phase3/stageA_selftrain/ : Stage-A model + predictions/reports
#   - /outputs/phase3/stageB_fewshot_refine/ : Stage-B refined model + predictions/reports
#   - curriculum_high/mid/all.csv for traceability of what was trained at each step
# ============================================

import os, json, math, random, gc
import numpy as np, pandas as pd, torch, torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback,
                          DataCollatorWithPadding, set_seed)
from datasets import Dataset

# ---------------- Paths ----------------
# Keep everything session-local under /content/Thesis_RomanUrdu_SA for easy download + resumability.
BASE_DIR       = "/content/Thesis_RomanUrdu_SA"
DATA_DIR_CLEAN = f"{BASE_DIR}/datasets/rusa19_clean"

# Phase-1 checkpoint path (used only if warm-starting Stage-A; COLD_START=True keeps it unused).
PH1_CKPT       = f"{BASE_DIR}/outputs/phase1/supervised_xlmr_base"

# Phase-2.5 stacked pseudo-label pack produced by the learned combiner.
PH2_PSEUDO     = f"{BASE_DIR}/outputs/phase2_5/rusa19_train_pseudo_stacked.csv"

# Phase-3 outputs: Stage-A and Stage-B directories.
OUT_DIR        = f"{BASE_DIR}/outputs/phase3"
A_DIR          = f"{OUT_DIR}/stageA_selftrain"
B_DIR          = f"{OUT_DIR}/stageB_fewshot_refine"
os.makedirs(OUT_DIR, exist_ok=True); os.makedirs(A_DIR, exist_ok=True); os.makedirs(B_DIR, exist_ok=True)

# ---------------- Reproducibility ----------------
# Seed everything so results are as stable as possible across runs (GPU nondeterminism can still cause tiny drift).
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
set_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# ---------------- Label space ----------------
LABEL_LIST = ["negative","neutral","positive"]
LABEL2ID   = {l:i for i,l in enumerate(LABEL_LIST)}
ID2LABEL   = {i:l for l,i in LABEL2ID.items()}

# ---------------- Load stacked pseudo pack (Phase 2.5) ----------------
# This CSV contains the pool texts + pseudo label info. If combiner probabilities were saved,
# we can use them directly as soft targets (preferred).
df_pseudo = pd.read_csv(PH2_PSEUDO, encoding="utf-8-sig")
assert {"text","pseudo_label","pseudo_confidence","votes_agree"}.issubset(df_pseudo.columns)

# If soft probabilities (combiner probs) exist, Phase 3 trains with KLDiv soft targets.
# Otherwise, we synthesize a soft distribution from (hard label + confidence).
HAS_SOFT = set(["p_negative","p_neutral","p_positive"]).issubset(df_pseudo.columns)

# ---------------- Gold DEV/TEST + FEWSHOT (Stage-B) ----------------
# NOTE: In this Roman-Urdu variant, FEWSHOT is reused as the "dev" set for evaluation/early stopping.
# The gold TEST split is used only for final reporting.
def normalize_label(x):
    s = str(x).strip().lower()
    if s in {"-1","neg","n","negative","2"}: return "negative"
    if s in {"0","neu","neutral"}:           return "neutral"
    if s in {"1","pos","p","positive"}:      return "positive"
    return np.nan

def load_split(path):
    # Standardize to: text, label(id), sentiment(str)
    df = pd.read_csv(path, encoding="utf-8-sig").rename(columns={"Text":"text","Sentiment":"sentiment"})
    df["sentiment"] = df["sentiment"].map(normalize_label)
    df = df.dropna(subset=["text","sentiment"])
    df["label"] = df["sentiment"].map(LABEL2ID)
    return df[["text","label","sentiment"]]

# Gold TEST split (held-out evaluation).
df_test = load_split(f"{DATA_DIR_CLEAN}/rusa19_test_clean.csv")

# FEWSHOT pack is required for Stage-B training and (here) for dev evaluation.
few_path = f"{DATA_DIR_CLEAN}/rusa19_fewshot64_clean.csv"
if not os.path.exists(few_path):
    raise FileNotFoundError(f"Few-shot file required for Phase 3 but not found: {few_path}")

df_few = load_split(few_path)  # expected: 64 examples per class

# Use FEWSHOT as eval split (acts as "dev" for early stopping + model selection).
df_dev = df_few.copy()


# ---------------- Tokenizer ----------------
# Student model tokenizer: we use xlm-roberta-base for self-training and refinement.
MODEL_NAME = "xlm-roberta-base"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tok_fn(batch):
    # Truncate to 256 tokens to balance quality vs memory/runtime.
    return tokenizer(batch["text"], truncation=True, padding=False, max_length=256)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---------------- Metrics ----------------
# Macro-F1 is the key model-selection metric because the dataset can be class-imbalanced.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).softmax(-1).argmax(-1).numpy()
    acc   = accuracy_score(labels, preds)
    macro = f1_score(labels, preds, average="macro")
    f1s   = f1_score(labels, preds, average=None, labels=[0,1,2])
    return {"accuracy": acc, "macro_f1": macro,
            "f1_neg": float(f1s[0]), "f1_neu": float(f1s[1]), "f1_pos": float(f1s[2])}

# ---------------- Weights & soft-targets helpers ----------------
# Each pseudo-labeled example gets a training weight based on confidence and vote agreement.
# This reduces the harm of noisier pseudo labels without discarding them (keep-all philosophy).
def weight_row(conf, votes):
    """Confidence-weighted importance; 'keep-all' mode spreads weights wider.
       Conf∈[0,1], votes∈{0,1,2,3}."""
    base  = 0.2 + 1.2 * float(conf)                # maps confidence to ~[0.2, 1.4]
    bonus = {3:0.15, 2:0.06, 1:0.00, 0:-0.10}.get(int(votes), 0.0)  # agreement bonus/penalty
    return float(np.clip(base + bonus, 0.1, 1.6))

def attach_weights_labels_soft(df):
    """
    Convert Phase-2.5 pseudo CSV into a training table that includes:
      - label: integer id for Trainer API
      - weight: per-example importance weight
      - st_*: soft target distribution (neg/neu/pos) used for KLDiv training
    """
    out = df.copy()
    out["label"]  = out["pseudo_label"].map(LABEL2ID)
    out["weight"] = [weight_row(c, v) for c, v in zip(out["pseudo_confidence"], out["votes_agree"])]

    if HAS_SOFT:
        # Preferred: use combiner-calibrated probabilities saved in Phase 2.5.
        p = out[["p_negative","p_neutral","p_positive"]].astype(float).to_numpy()
        p = np.clip(p, 1e-9, None); p = p / p.sum(axis=1, keepdims=True)
    else:
        # Fallback: construct a "soft" distribution from (hard label + pseudo_confidence).
        # The predicted class gets probability=confidence; remaining mass is split equally.
        lab  = out["label"].astype(int).to_numpy()
        conf = out["pseudo_confidence"].astype(float).to_numpy()
        p = np.full((len(out), 3), 0.0, dtype=np.float32)
        for i, (y, c) in enumerate(zip(lab, conf)):
            rest = (1.0 - float(c)) / 2.0
            p[i, :] = rest
            p[i, int(y)] = float(c)

    out["st_neg"] = p[:,0]; out["st_neu"] = p[:,1]; out["st_pos"] = p[:,2]
    return out

# ---- KEEP-ALL (no-drop) base table (we will do curriculum instead of dropping)
df_all  = attach_weights_labels_soft(df_pseudo)

# ---- Curriculum slices (progressively expand training data reliability → coverage)
hi  = df_all[(df_all.votes_agree==3) | ((df_all.votes_agree==2) & (df_all.pseudo_confidence>=0.70))].copy()
mid = df_all[(df_all.votes_agree>=2) & (df_all.pseudo_confidence>=0.50)].copy()
# 'all' uses every pseudo example with weights & soft targets
allp = df_all.copy()

# (Optional) save these for traceability (helps thesis reporting + reproducibility)
hi.to_csv(f"{OUT_DIR}/curriculum_high.csv", index=False, encoding="utf-8-sig")
mid.to_csv(f"{OUT_DIR}/curriculum_mid.csv",  index=False, encoding="utf-8-sig")
allp.to_csv(f"{OUT_DIR}/curriculum_all.csv", index=False, encoding="utf-8-sig")

print(f"Curriculum sizes — high:{len(hi)}  mid:{len(mid)}  all:{len(allp)}")

# ---------------- HF datasets (include weights + soft targets) ----------------
# Convert pandas → HF Dataset with the extra fields needed by the custom Trainer loss.
def to_ds(df, use_weight=True, use_soft=True):
    d = {"text": df["text"].tolist(), "label": df["label"].astype(int).tolist()}
    if use_weight: d["weights"] = df["weight"].astype(float).tolist()
    if use_soft:
        st = np.stack([df["st_neg"].to_numpy(float),
                       df["st_neu"].to_numpy(float),
                       df["st_pos"].to_numpy(float)], axis=1)
        d["soft_targets"] = st.tolist()
    return Dataset.from_dict(d)

ds_hi   = to_ds(hi)
ds_mid  = to_ds(mid)
ds_all  = to_ds(allp)

# Gold eval datasets (dev=fewshot in this setup, test=held-out).
ds_dev  = Dataset.from_dict({"text": df_dev["text"].tolist(),"label": df_dev["label"].tolist()})
ds_test = Dataset.from_dict({"text": df_test["text"].tolist(),"label": df_test["label"].tolist()})

# Few-shot dataset used as Stage-B training set (same content as df_dev here).
ds_few  = Dataset.from_dict({"text": df_few["text"].tolist(),"label": df_few["label"].tolist()})

# Tokenize all HF datasets (batched for speed).
ds_hi_tok   = ds_hi.map(tok_fn, batched=True)
ds_mid_tok  = ds_mid.map(tok_fn, batched=True)
ds_all_tok  = ds_all.map(tok_fn, batched=True)
ds_dev_tok  = ds_dev.map(tok_fn, batched=True)
ds_test_tok = ds_test.map(tok_fn, batched=True)
dsB_train_tok = ds_few.map(tok_fn, batched=True)

# ---------------- Weighted Trainer (supports soft targets) ----------------
# This custom Trainer implements two training modes:
#   1) Soft mode (preferred): KLDiv between model distribution and soft_targets (from combiner probs).
#   2) Hard fallback: cross-entropy with global class weights if soft_targets are not present.
# Additionally, it applies per-example weights to down-weight noisy pseudo labels.
class SoftWeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels  = inputs.get("labels")
        weights = inputs.get("weights", None)
        soft    = inputs.get("soft_targets", None)

        # Remove extra keys not supported by model.forward()
        model_inputs = {k:v for k,v in inputs.items() if k not in {"labels","weights","soft_targets"}}
        outputs = model(**model_inputs)
        logits  = outputs.get("logits")  # [B,3]

        if soft is not None:
            # Soft-target training: KLDiv to (calibrated or synthesized) soft targets.
            # NOTE: class weights are typically for hard CE; we intentionally skip them in KL mode.
            logp = F.log_softmax(logits, dim=-1)
            targ = soft.to(logp.dtype)
            loss_vec = F.kl_div(logp, targ, reduction="none").sum(dim=-1)  # [B]
        else:
            # Hard-label fallback: weighted CE (rarely used if Phase-2.5 probs are present).
            loss_vec = F.cross_entropy(
                logits, labels,
                weight=GLOBAL_CLASS_WEIGHTS.to(logits.device),
                reduction="none"
            )

        # Apply per-example weights (confidence/vote-based) if present.
        if weights is not None:
            w = weights.to(logits.device).float()
            loss = (loss_vec * w).mean()
        else:
            loss = loss_vec.mean()

        return (loss, outputs) if return_outputs else loss

# ---------------- Class weights (used only in hard-CE fallback) ----------------
# Computed from the pseudo-label distribution of the full pool to reduce imbalance impact if CE mode is used.
freq_all = np.bincount(allp["label"].astype(int).to_numpy(), minlength=3).astype(np.float32)
inv = 1.0 / np.clip(freq_all, 1.0, None)
GLOBAL_CLASS_WEIGHTS = torch.tensor(inv / inv.sum() * len(LABEL_LIST), dtype=torch.float32)  # scaled so sum≈3

# ---------------- Utility: evaluate + dump ----------------
# Saves both raw predictions and a classification_report JSON for reproducible reporting.
def eval_and_dump(tr, ds_tok, ref_df, split, odir):
    m = tr.evaluate(ds_tok)
    print(f"\n[{split.upper()}] metrics:\n", json.dumps(m, indent=2))
    pr = tr.predict(ds_tok)
    probs = torch.tensor(pr.predictions).softmax(-1).numpy()
    y = pr.label_ids; yhat = probs.argmax(-1)

    out = pd.DataFrame({
        "text": ref_df["text"].tolist(),
        "gold": [ID2LABEL[int(i)] for i in y],
        "pred": [ID2LABEL[int(i)] for i in yhat],
        "p_negative": probs[:,0], "p_neutral": probs[:,1], "p_positive": probs[:,2]
    })
    os.makedirs(odir, exist_ok=True)
    out.to_csv(f"{odir}/preds_{split}.csv", index=False, encoding="utf-8-sig")

    rep = classification_report(y, yhat, target_names=LABEL_LIST, output_dict=True)
    with open(f"{odir}/report_{split}.json","w",encoding="utf-8") as f:
        json.dump({"metrics": m, "report": rep}, f, indent=2, ensure_ascii=False)
    return m

# ---------------- Stage-A: COLD start  ----------------
# COLD_START=True starts from the base pretrained model (no supervised warm-start),
# helping quantify the value of pseudo labels alone.
COLD_START = True  # set False to warm-start from Phase-1 best

def resolve_best_checkpoint(root_dir: str, fallback_model: str = "xlm-roberta-base"):
    """
    Utility to resolve a best checkpoint path from a Trainer output directory.
    If no checkpoint is found, falls back to the base model name.
    """
    for fn in ("pytorch_model.bin", "model.safetensors"):
        if os.path.exists(os.path.join(root_dir, fn)):
            return root_dir
    ts = os.path.join(root_dir, "trainer_state.json")
    if os.path.exists(ts):
        try:
            with open(ts, "r", encoding="utf-8") as f: state = json.load(f)
            best = state.get("best_model_checkpoint")
            if best and os.path.isdir(best): return best
        except Exception:
            pass
    if os.path.isdir(root_dir):
        cands = [os.path.join(root_dir, d) for d in os.listdir(root_dir)
                 if d.startswith("checkpoint-") and os.path.isdir(os.path.join(root_dir, d))]
        if cands:
            import re
            def step_of(p):
                m = re.search(r"checkpoint-(\d+)", p)
                return int(m.group(1)) if m else -1
            cands.sort(key=step_of, reverse=True)
            return cands[0]
    return fallback_model

# Choose Stage-A initialization checkpoint.
init_path = MODEL_NAME if COLD_START else resolve_best_checkpoint(PH1_CKPT, fallback_model=MODEL_NAME)
print("Stage-A init from:", init_path)

modelA = AutoModelForSequenceClassification.from_pretrained(
    init_path, num_labels=3, id2label=ID2LABEL, label2id=LABEL2ID
)

# ---------------- Stage-A Curriculum: hi -> mid -> all ----------------
# TrainingArguments factory for Stage-A curriculum runs.
# NOTE: Here evaluation/saving/logging are per-epoch to report epoch-wise progress.
def args_stageA(lr, epochs):
    return TrainingArguments(
        output_dir=A_DIR,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,  # effective batch size = 16*2
        num_train_epochs=epochs,
        learning_rate=lr,
        warmup_ratio=0.1,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_macro_f1",
        greater_is_better=True,
        label_smoothing_factor=0.0,  # soft targets already provide smoothing
        fp16=torch.cuda.is_available(),
        report_to=[],
        seed=SEED,
    )

def make_trainer(model, args, train_ds):
    # Builds a SoftWeightedTrainer for a given curriculum subset.
    return SoftWeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=ds_dev_tok,         # eval on FEWSHOT dev
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

# === HIGH ===
print("\n=== Stage-A [curriculum: HIGH] ===")
trainer_hi = make_trainer(modelA, args_stageA(lr=2e-5, epochs=2), ds_hi_tok)
trainer_hi.train()
modelA = trainer_hi.model  # carry weights forward to next curriculum stage
del trainer_hi; gc.collect(); torch.cuda.empty_cache()

# === MID ===
print("\n=== Stage-A [curriculum: MID] ===")
trainer_mid = make_trainer(modelA, args_stageA(lr=2e-5, epochs=2), ds_mid_tok)
trainer_mid.train()
modelA = trainer_mid.model
del trainer_mid; gc.collect(); torch.cuda.empty_cache()

# === ALL ===
print("\n=== Stage-A [curriculum: ALL] ===")
trainer_all = make_trainer(modelA, args_stageA(lr=1e-5, epochs=2), ds_all_tok)
trainer_all.train()
modelA = trainer_all.model

# Keep a handle named trainerA for downstream eval/save (avoid changing later code structure).
trainerA = trainer_all

# Save Stage-A final/best checkpoint at A_DIR (used as init for Stage-B).
trainerA.save_model(A_DIR)
tokenizer.save_pretrained(A_DIR)

# ---- Eval + dumps (dev=fewshot, test=held-out)
mA_dev  = eval_and_dump(trainerA, ds_dev_tok,  df_dev,  "dev",  A_DIR)
mA_test = eval_and_dump(trainerA, ds_test_tok, df_test, "test", A_DIR)


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Curriculum sizes — high:2633  mid:3481  all:6806


Map:   0%|          | 0/2633 [00:00<?, ? examples/s]

Map:   0%|          | 0/3481 [00:00<?, ? examples/s]

Map:   0%|          | 0/6806 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Map:   0%|          | 0/1942 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Stage-A init from: xlm-roberta-base


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Stage-A [curriculum: HIGH] ===


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,1.025,1.036826,0.53125,0.42523,0.0,0.630952,0.644737
2,0.814,0.936787,0.515625,0.416246,0.0,0.568182,0.680556



=== Stage-A [curriculum: MID] ===


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.828,0.9293,0.604167,0.594583,0.485437,0.621622,0.676692
2,0.6389,0.912959,0.645833,0.641274,0.574074,0.652778,0.69697



=== Stage-A [curriculum: ALL] ===


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.8313,0.744434,0.703125,0.704617,0.676692,0.666667,0.770492
2,0.7491,0.705259,0.723958,0.724077,0.70229,0.68254,0.787402



[DEV] metrics:
 {
  "eval_loss": 0.7052589058876038,
  "eval_accuracy": 0.7239583333333334,
  "eval_macro_f1": 0.7240771112262366,
  "eval_f1_neg": 0.7022900763358778,
  "eval_f1_neu": 0.6825396825396826,
  "eval_f1_pos": 0.7874015748031497,
  "eval_runtime": 0.4548,
  "eval_samples_per_second": 422.152,
  "eval_steps_per_second": 13.192,
  "epoch": 2.0
}



[TEST] metrics:
 {
  "eval_loss": 0.7415622472763062,
  "eval_accuracy": 0.7075180226570545,
  "eval_macro_f1": 0.7046154808217854,
  "eval_f1_neg": 0.7119796091758709,
  "eval_f1_neu": 0.6282557221783741,
  "eval_f1_pos": 0.7736111111111111,
  "eval_runtime": 1.9075,
  "eval_samples_per_second": 1018.108,
  "eval_steps_per_second": 31.98,
  "epoch": 2.0
}


In [3]:
# ---------------- Stage-B: few-shot refine (optional) ----------------
# Stage-B is the final refinement step of the framework:
#   - Initialize from the Stage-A self-trained student checkpoint (A_DIR).
#   - Fine-tune on a small gold FEWSHOT set (64 per class).
#   - Evaluate on the same "dev" (FEWSHOT) used for early stopping + also report on held-out TEST.
#
# Why Stage-B may help:
#   Stage-A learns from noisy pseudo supervision (even with soft targets + weights).
#   Stage-B uses clean gold labels to correct systematic noise/collapse learned in pseudo training.

print("\nStage-B init from:", A_DIR)

# Load Stage-A trained checkpoint as the starting point for refinement.
modelB = AutoModelForSequenceClassification.from_pretrained(
    A_DIR, num_labels=3, id2label=ID2LABEL, label2id=LABEL2ID
)

# Training config for Stage-B:
# - epoch-based reporting (eval/save/log each epoch) for easier thesis reporting.
# - Early stopping prevents overfitting on the tiny few-shot set.
# - label_smoothing provides mild regularization for small-data fine-tuning.
argsB = TrainingArguments(
    output_dir=B_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",  # select best checkpoint by macro-F1
    greater_is_better=True,
    label_smoothing_factor=0.05,           # mild smoothing for tiny gold set
    fp16=torch.cuda.is_available(),
    report_to=[],
    seed=SEED
)

# Reuse the same custom Trainer class:
# - In Stage-B, the training dataset (dsB_train_tok) contains only hard labels (no soft_targets/weights),
#   so SoftWeightedTrainer naturally falls back to standard CE (with label smoothing via TrainingArguments).
# - eval_dataset remains ds_dev_tok (FEWSHOT) to keep the selection criterion consistent.
trainerB = SoftWeightedTrainer(
    model=modelB,
    args=argsB,
    train_dataset=dsB_train_tok,   # FEWSHOT training set (gold)
    eval_dataset=ds_dev_tok,       # FEWSHOT dev (gold) for early stopping / model selection
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("\n=== Stage-B training (Refine on few-shot 64×3) ===")
train_out_B = trainerB.train()
print(train_out_B)

# Save & evaluate Stage-B:
# - Model + tokenizer saved for downstream inference/reproducibility.
# - preds_*.csv and report_*.json exported for thesis reporting.
trainerB.save_model(B_DIR); tokenizer.save_pretrained(B_DIR)
mB_dev  = eval_and_dump(trainerB, ds_dev_tok,  df_dev,  "dev",  B_DIR)
mB_test = eval_and_dump(trainerB, ds_test_tok, df_test, "test", B_DIR)

# ---------------- Summary ----------------
# Persist a single JSON summary so Phase-3 outputs are self-describing:
# - file paths
# - curriculum sizes
# - Stage-A and Stage-B results (dev/test)
summary = {
    "paths": {"phase3_out": OUT_DIR, "stageA": A_DIR, "stageB": B_DIR,
              "pseudo_all_csv": PH2_PSEUDO,
              "curriculum": {"high_csv": f"{OUT_DIR}/curriculum_high.csv",
                             "mid_csv":  f"{OUT_DIR}/curriculum_mid.csv",
                             "all_csv":  f"{OUT_DIR}/curriculum_all.csv"}},
    "sizes": {"high": len(hi), "mid": len(mid), "all": len(allp)},
    "stageA": {"dev": mA_dev, "test": mA_test, "init": init_path},
    "stageB": {"dev": mB_dev, "test": mB_test, "init": A_DIR},
    "notes": ["Keep-all pseudo via curriculum + soft targets (KLDiv).",
              "Class weights are used only in CE fallback (not active in soft mode)."]
}
with open(f"{OUT_DIR}/phase3_summary.json","w",encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print("\n=== Phase 3 complete ===")
print(json.dumps(summary, indent=2, ensure_ascii=False))



Stage-B init from: /content/Thesis_RomanUrdu_SA/outputs/phase3/stageA_selftrain

=== Stage-B training (Refine on few-shot 64×3) ===


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.8578,0.619906,0.765625,0.766863,0.748092,0.732824,0.819672
2,0.7249,0.51776,0.817708,0.814339,0.844444,0.747826,0.850746
3,0.6496,0.386129,0.901042,0.900933,0.883721,0.896,0.923077
4,0.5215,0.366688,0.901042,0.900933,0.883721,0.896,0.923077
5,0.469,0.35687,0.901042,0.900933,0.883721,0.896,0.923077


TrainOutput(global_step=60, training_loss=0.6445666710535686, metrics={'train_runtime': 744.1456, 'train_samples_per_second': 1.29, 'train_steps_per_second': 0.081, 'total_flos': 41407474114944.0, 'train_loss': 0.6445666710535686, 'epoch': 5.0})



[DEV] metrics:
 {
  "eval_loss": 0.38612890243530273,
  "eval_accuracy": 0.9010416666666666,
  "eval_macro_f1": 0.900932617769827,
  "eval_f1_neg": 0.8837209302325582,
  "eval_f1_neu": 0.896,
  "eval_f1_pos": 0.9230769230769231,
  "eval_runtime": 0.4521,
  "eval_samples_per_second": 424.723,
  "eval_steps_per_second": 13.273,
  "epoch": 5.0
}



[TEST] metrics:
 {
  "eval_loss": 0.7673312425613403,
  "eval_accuracy": 0.7018537590113285,
  "eval_macro_f1": 0.6980489288191302,
  "eval_f1_neg": 0.7074164629176855,
  "eval_f1_neu": 0.6166394779771615,
  "eval_f1_pos": 0.7700908455625437,
  "eval_runtime": 2.6728,
  "eval_samples_per_second": 726.577,
  "eval_steps_per_second": 22.822,
  "epoch": 5.0
}

=== Phase 3 complete ===
{
  "paths": {
    "phase3_out": "/content/Thesis_RomanUrdu_SA/outputs/phase3",
    "stageA": "/content/Thesis_RomanUrdu_SA/outputs/phase3/stageA_selftrain",
    "stageB": "/content/Thesis_RomanUrdu_SA/outputs/phase3/stageB_fewshot_refine",
    "pseudo_all_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase2_5/rusa19_train_pseudo_stacked.csv",
    "curriculum": {
      "high_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase3/curriculum_high.csv",
      "mid_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase3/curriculum_mid.csv",
      "all_csv": "/content/Thesis_RomanUrdu_SA/outputs/phase3/curriculum_all.csv"


In [None]:
# ============================================================
# NOTE — Running the same pipeline with other backbones
# ============================================================
# This notebook is written with XLM-R as the default backbone:
#   MODEL_NAME = "xlm-roberta-base"
#
# To run the *exact same framework* with a different pretrained model,
# you only need to replace the model name wherever the backbone is defined
# (i.e., the string passed to AutoTokenizer.from_pretrained(...) and
#  AutoModelForSequenceClassification.from_pretrained(...)).
#
# Recommended backbone strings (HuggingFace model IDs):
#   1) XLM-T (Twitter XLM-R variant):        "cardiffnlp/twitter-xlm-roberta-base"
#      - Use when you want a model more tuned to social-media / short-text style.
#
#   2) mDeBERTa-v3 (Microsoft):              "microsoft/mdeberta-v3-base"
#      - Strong multilingual encoder; often good transfer and robustness.
#
#   3) MuRIL (Google, Indic + Urdu focus):   "google/muril-base-cased"
#      - Often strong for Urdu/Hindi scripts and related languages.
#
#   4) mBERT (Multilingual BERT):            "bert-base-multilingual-cased"
#      - Classic baseline multilingual encoder (older but widely used).
#
# Minimal changes you typically make:
#   - Update MODEL_NAME in the "Tokenizer" section:
#       MODEL_NAME = "<one of the model IDs above>"
#
#   - If warm-starting (COLD_START=False), ensure PH1_CKPT points to a checkpoint
#     trained with the *same* backbone (do not mix checkpoints across backbones).
#
# the rest of the pipeline (pseudo labels, curriculum, soft targets,
# trainers, evaluation, saving) stays the same.
# ============================================================
