In [1]:
# Disable Weights & Biases (W&B) logging to avoid external tracking prompts and to keep runs fully local.

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"


In [2]:
# Install HuggingFace's `evaluate` library (used later for computing metrics in a standardized way).

!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
# ==========================================================
# OUTPUTS BOOTSTRAP — single source of truth for all paths
# ==========================================================
import os, json, time
from pathlib import Path
import pandas as pd

# ----------------------------------------------------------
# Define ONE root directory for ALL artifacts produced by the
# thesis framework (data splits, pseudo-labels, models, logs).
# Why: keeps the whole pipeline reproducible and avoids
# scattered files across /content.
# ----------------------------------------------------------
OUT_ROOT = Path("/content/outputs")

# Folder structure for each phase/component of the pipeline.
# - data:    cached train/test/fewshot (and optionally dev) splits
# - phase1:  baseline outputs (if Phase-1 is executed)
# - phase2:  weak supervision outputs (raw pseudo labels)
# - phase2_5:stacking + calibration outputs (combiner + calibrated pseudo-probs)
# - phase3:  Stage-A training outputs (curriculum + weighted soft targets)
# - phase3b: Stage-A+B outputs (few-shot refinement checkpoint + metrics)
OUT_PATHS = {
    "data":            OUT_ROOT / "data",
    "phase1":          OUT_ROOT / "phase1",
    "phase2":          OUT_ROOT / "phase2",
    "phase2_5":        OUT_ROOT / "phase2_5_stacked_cal",
    "phase3":          OUT_ROOT / "phase3_noextra",
    "phase3b":         OUT_ROOT / "phase3_noextra_refine",
}

# Create all directories up-front so later cells can safely write files.
for p in OUT_PATHS.values():
    p.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------
# Central registry of file paths used across cells.
# Why: downstream phases reference OUT_FILES[...] instead of
# hardcoding paths, preventing mismatches when reorganizing.
# ----------------------------------------------------------
OUT_FILES = {
    "splits": {
        # Canonical split files used by the framework pipeline
        "train":   OUT_PATHS["data"] / "train.csv",
        "dev":     OUT_PATHS["data"] / "dev.csv",       # exist but used only in Fully Supervised Baseline
        "test":    OUT_PATHS["data"] / "test.csv",
        "fewshot": OUT_PATHS["data"] / "fewshot.csv",   # required for calibration / refine steps
        "manifest":OUT_PATHS["data"] / "manifest.json",
    },
    "phase2": {
        # Output of weak supervision voting/aggregation
        "pseudo_csv": OUT_PATHS["phase2"] / "sau18_train_pseudo.csv",
        "manifest":   OUT_PATHS["phase2"] / "manifest.json",
    },
    "phase2_5": {
        # Output of calibrated stacking combiner (LR + isotonic)
        # Produces calibrated class probabilities used as soft targets in Phase-3.
        "out_csv":   OUT_PATHS["phase2_5"] / "train_pseudo_stacked_cal.csv",
        "combiner":  OUT_PATHS["phase2_5"] / "combiner.joblib",
        "meta":      OUT_PATHS["phase2_5"] / "meta.json",
        "manifest":  OUT_PATHS["phase2_5"] / "manifest.json",
    },
    "phase3": {
        # Final Stage-A checkpoint path (best checkpoint after curriculum training)
        "final_ckpt": OUT_PATHS["phase3"] / "stageA_cal_soft" / "all" / "checkpoint-best",
        "manifest":   OUT_PATHS["phase3"] / "manifest.json",
    },
    "phase3b": {
        # Final Stage-A+B checkpoint (few-shot refinement)
        "final_ckpt": OUT_PATHS["phase3b"] / "checkpoint-best",
        "manifest":   OUT_PATHS["phase3b"] / "manifest.json",
    }
}

# ----------------------------------------------------------
# Utility to write JSON with consistent formatting.
# Used for manifests/metadata so results are easy to inspect.
# ----------------------------------------------------------
def save_json(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

# Quick visibility into where everything will be saved.
print("[OUT] Root:", OUT_ROOT)
for k, v in OUT_PATHS.items():
    print(f" - {k}: {v}")


[OUT] Root: /content/outputs
 - data: /content/outputs/data
 - phase1: /content/outputs/phase1
 - phase2: /content/outputs/phase2
 - phase2_5: /content/outputs/phase2_5_stacked_cal
 - phase3: /content/outputs/phase3_noextra
 - phase3b: /content/outputs/phase3_noextra_refine


# Libraries import and Data Preprocessing

In [4]:
# ------------------------------------------------------------
# Phase 1 (Baselines) — Imports + Reproducibility + Data Loader
# ------------------------------------------------------------
# This cell sets up:
# 1) libraries used for baseline experiments (zero-shot + supervised fine-tune),
# 2) deterministic-ish behavior via seeding,
# 3) file paths and a robust CSV loader that normalizes column names/labels.
# NOTE: Phase-1 baseline is separate from the thesis weak-supervision pipeline.
# ------------------------------------------------------------

import os, sys, math, json, time, random, gc, shutil
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from dataclasses import dataclass
from typing import Dict, List, Any, Optional

import torch
from torch import nn

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# HuggingFace evaluate is used to compute standard metrics consistently
import evaluate

# Transformers utilities:
# - pipeline: convenient wrapper for zero-shot inference and translation/sentiment pipelines (Phase-1 baseline use)
# - Trainer / TrainingArguments: standard fine-tuning loop for the supervised baseline
# - AutoConfig/AutoTokenizer/AutoModel*: load models/tokenizers by name from HuggingFace hub
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding,
    Trainer, TrainingArguments, EarlyStoppingCallback, pipeline, AutoConfig
)

# ---- 1) Reproducibility ----
# Set seeds for Python, NumPy, and PyTorch.
# This reduces randomness in training/inference; however, exact reproducibility on GPU
# can still vary slightly due to non-deterministic CUDA kernels in some ops.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Force deterministic behavior where possible (may reduce speed).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ---- 2) Paths & I/O helpers ----
# Baseline section reads raw CSVs directly from /content and writes outputs to:
#   /content/outputs/phase1
# (The thesis pipeline later uses OUT_FILES/OUT_PATHS as its centralized path registry.)
BASE_DIR = "/content"
IN_DIR   = BASE_DIR
OUT_DIR  = os.path.join(BASE_DIR, "outputs", "phase1")
os.makedirs(OUT_DIR, exist_ok=True)

# Expected dataset files (Urdu SAU-18 style splits + few-shot pack)
TRAIN_CSV = os.path.join(IN_DIR, "sau18_train.csv")
DEV_CSV   = os.path.join(IN_DIR, "sau18_dev.csv")
TEST_CSV  = os.path.join(IN_DIR, "sau18_test.csv")
FS_CSV    = os.path.join(IN_DIR, "sau18_fewshot_64_per_class.csv")  # not used in Phase-1 training; only loaded for availability

# If any required CSV is missing, prompt user to upload via Colab file picker.
missing = [p for p in [TRAIN_CSV, DEV_CSV, TEST_CSV, FS_CSV] if not os.path.exists(p)]
if missing:
    print("⚠️ Missing files:", missing)
    print("Use the file picker to upload the four CSVs now.")
    from google.colab import files
    uploaded = files.upload()  # upload the four CSVs
    # Re-check after upload
    missing = [p for p in [TRAIN_CSV, DEV_CSV, TEST_CSV, FS_CSV] if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(f"Still missing after upload: {missing}")

# ---- 3) Load data ----
# Normalize any label variants (pos/neg/neu, numeric encodings, +/-) into the canonical
# 3-class label space used throughout the thesis work.
def normalize_sentiment(x: str):
    if pd.isna(x): return np.nan
    s = str(x).strip().lower()
    mapping = {
        "pos": "positive", "positive": "positive", "+": "positive", "1": "positive",
        "neg": "negative", "negative": "negative", "-": "negative", "-1": "negative",
        "neu": "neutral", "neutral": "neutral", "0": "neutral"
    }
    return mapping.get(s, s)

def load_csv(path):
    # Read CSV with utf-8-sig to safely handle BOM and Urdu text in Colab exports.
    df = pd.read_csv(path, encoding="utf-8-sig")

    # Standardize column names.
    # The code tries to "guess" common alternatives so the notebook works even if
    # column names slightly differ across dataset exports.
    # Expected (preferred): text, sentiment, sourcesheet, unanimous
    cols = {c.lower(): c for c in df.columns}

    def pick(*names):
        # Helper: return the best matching column name (case/space-insensitive)
        for n in names:
            for k, v in cols.items():
                if k == n.lower() or k.replace(" ", "") == n.lower().replace(" ", ""):
                    return v
        return None

    text_col = pick("text","review","utterance")
    sent_col = pick("sentiment","label")
    src_col  = pick("sourcesheet","domain","category")
    uni_col  = pick("unanimous")

    # Hard requirement: at least text + sentiment must exist for baseline evaluation.
    if text_col is None or sent_col is None:
        raise ValueError(f"CSV {path} must contain text & sentiment columns. Found: {df.columns.tolist()}")

    # Rename into a consistent schema: text + sentiment (plus optional metadata columns).
    df = df.rename(columns={text_col:"text", sent_col:"sentiment"})
    if src_col and src_col != "sourcesheet": df = df.rename(columns={src_col:"sourcesheet"})
    if uni_col and uni_col != "unanimous":   df = df.rename(columns={uni_col:"unanimous"})

    # Normalize labels + remove invalid rows
    df["sentiment"] = df["sentiment"].apply(normalize_sentiment)
    df = df.dropna(subset=["text","sentiment"]).reset_index(drop=True)
    return df

# Load baseline splits into memory (used by Phase-1 only).
train_df = load_csv(TRAIN_CSV)
dev_df   = load_csv(DEV_CSV)
test_df  = load_csv(TEST_CSV)
fewshot_df = load_csv(FS_CSV)

# Quick sanity checks: sizes + class balance in train split.
print("Loaded rows:", { "train":len(train_df), "dev":len(dev_df), "test":len(test_df), "fewshot":len(fewshot_df) })
print(train_df["sentiment"].value_counts().to_string(), "\n")

# ---- 4) Label mapping ----
# Fixed label order is critical: it ensures metrics, probabilities, and IDs remain stable
# across runs and across phases (and matches later pipeline assumptions).
LABEL_LIST = ["negative","neutral","positive"]  # fixed order for stability
LABEL2ID = {l:i for i,l in enumerate(LABEL_LIST)}
ID2LABEL = {i:l for l,i in LABEL2ID.items()}

# Convert string labels -> integer IDs for model training/evaluation.
def to_ids(series):
    return series.map(LABEL2ID).astype(int)


Loaded rows: {'train': 7372, 'dev': 819, 'test': 1509, 'fewshot': 192}
sentiment
neutral     2768
positive    2663
negative    1941 



# Fully Supervised Baseline for Comparision

In [5]:
# ============================================
# Phase 1 — Baselines (Urdu sentiment)
# Zero-shot (XNLI) + Fully-supervised fine-tune (XLM-R base)
# Outputs: metrics & predictions for DEV/TEST
#
# Purpose of this phase:
# - Provide two reference baselines that use FULL gold supervision (supervised) or NO supervision (zero-shot)
# - These are used for comparison against the thesis framework pipeline (weak supervision + self-training)
#
# Saves (into /content/outputs/phase1):
# - dev_zero_shot_predictions.csv, test_zero_shot_predictions.csv
# - dev_zero_shot_metrics.json,      test_zero_shot_metrics.json
# - dev_xlmr_predictions.csv,        test_xlmr_predictions.csv
# - dev_xlmr_metrics.json,           test_xlmr_metrics.json
# ============================================





# ---- 1) ZERO-SHOT baseline (XNLI) ----
# Uses multilingual NLI as a zero-shot classifier:
# - We treat each sentiment class as a candidate "hypothesis label"
# - The model returns the most likely label + a score (confidence-like)
ZS_MODEL = "joeddav/xlm-roberta-large-xnli"  # large XLM-R fine-tuned on XNLI; strong multilingual zero-shot
device = 0 if torch.cuda.is_available() else -1  # GPU if available, otherwise CPU
zs_pipe = pipeline("zero-shot-classification", model=ZS_MODEL, device=device)

def zero_shot_predict(texts: List[str], batch_size: int = 16) -> List[str]:
    # Runs XNLI over all texts and returns the top predicted label for each text
    preds = []
    cand_labels = LABEL_LIST  # fixed candidate label order: ["negative","neutral","positive"]
    for i in tqdm(range(0, len(texts), batch_size), desc="Zero-shot (XNLI) inference"):
        batch = texts[i:i+batch_size]
        # multi_label=False enforces a single label prediction among candidates
        outs = zs_pipe(batch, candidate_labels=cand_labels, multi_label=False)
        # pipeline returns dict for single item; list for batch
        if isinstance(outs, dict): outs = [outs]
        for out in outs:
            # out["labels"] are sorted descending by score; take the top-1 label
            preds.append(out["labels"][0])
    return preds

def evaluate_and_save(df: pd.DataFrame, preds: List[str], prefix: str):
    # Evaluates predictions against gold labels and writes:
    # - predictions CSV (original rows + "pred")
    # - metrics JSON (accuracy, macro-F1, per-class F1)
    gold = df["sentiment"].tolist()
    acc = accuracy_score(gold, preds)
    macro = f1_score(gold, preds, average="macro", labels=LABEL_LIST)
    per = f1_score(gold, preds, average=None, labels=LABEL_LIST)
    print(f"\n[{prefix}] Zero-shot — Accuracy: {acc:.4f}  Macro-F1: {macro:.4f}")
    print("Per-class F1 (neg, neu, pos):", [f"{x:.4f}" for x in per])
    print(classification_report(gold, preds, labels=LABEL_LIST, digits=4))
    # Save predictions
    out_pred = df.copy()
    out_pred["pred"] = preds
    out_pred.to_csv(os.path.join(OUT_DIR, f"{prefix}_zero_shot_predictions.csv"), index=False, encoding="utf-8-sig")
    # Save metrics
    metrics = {
        "accuracy": acc,
        "macro_f1": macro,
        "f1_negative": float(per[0]),
        "f1_neutral": float(per[1]),
        "f1_positive": float(per[2]),
        "n": len(df)
    }
    with open(os.path.join(OUT_DIR, f"{prefix}_zero_shot_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)
    return metrics

print("▶ Running ZERO-SHOT baseline on DEV and TEST ...")
# Evaluate zero-shot on DEV (for reference) and TEST (final baseline report)
zs_dev_preds  = zero_shot_predict(dev_df["text"].tolist(), batch_size=16)
zs_test_preds = zero_shot_predict(test_df["text"].tolist(), batch_size=16)
zs_dev_metrics  = evaluate_and_save(dev_df, zs_dev_preds, "dev")
zs_test_metrics = evaluate_and_save(test_df, zs_test_preds, "test")

# ---- 2) Fully-supervised fine-tuning (XLM-R base) ----
# This is a strong upper-bound style baseline:
# - Uses ALL gold-labeled TRAIN data
# - Tunes on DEV and reports results on DEV and TEST
# - Not part of the weak-supervision thesis pipeline; only for comparison
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_fn(batch):
    # Tokenize text with truncation to MAX_LEN=256 (consistent with later phases)
    return tokenizer(batch["text"], truncation=True, max_length=256)

# Build HuggingFace Datasets from pandas DataFrames.
# We convert label strings -> integer IDs using to_ids() defined in the previous cell.
from datasets import Dataset, DatasetDict
train_hf = Dataset.from_pandas(pd.DataFrame({"text": train_df["text"], "label": to_ids(train_df["sentiment"])}))
dev_hf   = Dataset.from_pandas(pd.DataFrame({"text": dev_df["text"],   "label": to_ids(dev_df["sentiment"])}))
test_hf  = Dataset.from_pandas(pd.DataFrame({"text": test_df["text"],  "label": to_ids(test_df["sentiment"])}))
ds = DatasetDict({"train":train_hf, "dev":dev_hf, "test":test_hf})

# Apply tokenizer; remove original "text" because the model expects token IDs + attention masks
ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # dynamic padding per batch

# Config & model
# id2label/label2id help produce readable outputs and keep label ordering correct.
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=len(LABEL_LIST), id2label=ID2LABEL, label2id=LABEL2ID)
model  = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Optional: class weights (helpful if the training set is imbalanced)
# Here we use inverse-frequency weights (normalized to mean 1.0).
counts = train_df["sentiment"].value_counts()
class_weights = torch.tensor([1.0 / counts.get(lbl, 1.0) for lbl in LABEL_LIST], dtype=torch.float)
class_weights = class_weights / class_weights.mean()  # normalize around 1.0
class_weights = class_weights.to("cuda" if torch.cuda.is_available() else "cpu")

class WeightedCELossTrainer(Trainer):
    # Custom Trainer to apply weighted CrossEntropyLoss instead of default unweighted loss.
    # This affects TRAINING only (loss), not evaluation metric computation.
    def __init__(self, *args, **kwargs):
        self.class_weights = kwargs.pop("class_weights", None)
        super().__init__(*args, **kwargs)
        self.loss_fct = nn.CrossEntropyLoss(weight=self.class_weights) if self.class_weights is not None else nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Metrics (via HuggingFace evaluate)
acc_metric = evaluate.load("accuracy")
f1_metric  = evaluate.load("f1")

def compute_metrics(eval_pred):
    # eval_pred = (logits, labels)
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # Accuracy + macro-F1 + per-class F1 (aligned with LABEL2ID mapping)
    acc = acc_metric.compute(references=labels, predictions=preds)["accuracy"]
    macro = f1_metric.compute(references=labels, predictions=preds, average="macro")["f1"]
    per = f1_metric.compute(references=labels, predictions=preds, average=None)["f1"]
    return {
        "accuracy": acc,
        "macro_f1": macro,
        "f1_negative": float(per[LABEL2ID["negative"]]),
        "f1_neutral": float(per[LABEL2ID["neutral"]]),
        "f1_positive": float(per[LABEL2ID["positive"]]),
    }

# Training args
BATCH = 16
args = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, "xlmr_base"),  # supervised baseline checkpoints/logs
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",          # evaluate at end of each epoch on DEV
    save_strategy="epoch",          # save a checkpoint each epoch
    load_best_model_at_end=True,    # restore best checkpoint based on metric_for_best_model
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    save_total_limit=2,             # keep only a small number of checkpoints
    logging_steps=50,
    seed=SEED,
    fp16=torch.cuda.is_available(), # enable mixed precision on GPU for speed/memory
    report_to="none"                # disable external experiment tracking
)

trainer = WeightedCELossTrainer(
    model=model,
    args=args,
    train_dataset=ds["train"],      # gold TRAIN
    eval_dataset=ds["dev"],         # gold DEV for early-stopping/selection
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights
)

print("\n▶ Fine-tuning XLM-R base on TRAIN; early-stopping on DEV ...")
# Early stopping watches the evaluation metric across epochs and stops if it stops improving.
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
trainer.add_callback(callbacks[0])

train_out = trainer.train()
print("Best checkpoint:", trainer.state.best_model_checkpoint)

# DEV/TEST evaluation helper:
# - runs trainer.predict()
# - computes metrics again (for printing + saving)
# - writes a predictions CSV aligned with original text rows
def eval_and_save_split(split_name: str, dataset):
    preds = trainer.predict(dataset)
    y_true = preds.label_ids
    # convert logits -> probabilities -> argmax label IDs
    y_prob = torch.tensor(preds.predictions).softmax(-1).numpy()
    y_pred = y_prob.argmax(axis=1)

    gold = [ID2LABEL[i] for i in y_true]
    pred = [ID2LABEL[i] for i in y_pred]
    acc = accuracy_score(gold, pred)
    macro = f1_score(gold, pred, average="macro", labels=LABEL_LIST)
    per = f1_score(gold, pred, average=None, labels=LABEL_LIST)

    print(f"\n[{split_name}] Supervised XLM-R — Accuracy: {acc:.4f}  Macro-F1: {macro:.4f}")
    print("Per-class F1 (neg, neu, pos):", [f"{x:.4f}" for x in per])
    print(classification_report(gold, pred, labels=LABEL_LIST, digits=4))

    out_dir = OUT_DIR
    # Save predictions CSV (attach original text by selecting the matching source DF)
    src_df = dev_df if split_name=="dev" else test_df
    out_df = src_df.copy()
    out_df["pred"] = pred
    out_df.to_csv(os.path.join(out_dir, f"{split_name}_xlmr_predictions.csv"), index=False, encoding="utf-8-sig")

    # Save metrics JSON
    metrics = {
        "accuracy": acc,
        "macro_f1": macro,
        "f1_negative": float(per[0]),
        "f1_neutral": float(per[1]),
        "f1_positive": float(per[2]),
        "n": len(src_df)
    }
    with open(os.path.join(out_dir, f"{split_name}_xlmr_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)
    return metrics

# Report baseline results
xlmr_dev_metrics  = eval_and_save_split("dev", ds["dev"])
xlmr_test_metrics = eval_and_save_split("test", ds["test"])

print("\n✅ Phase-1 complete. Files saved to:", OUT_DIR)
print("  - dev_zero_shot_metrics.json / test_zero_shot_metrics.json")
print("  - dev_xlmr_metrics.json / test_xlmr_metrics.json")
print("  - *_predictions.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


▶ Running ZERO-SHOT baseline on DEV and TEST ...


Zero-shot (XNLI) inference:   0%|          | 0/52 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Zero-shot (XNLI) inference:   0%|          | 0/95 [00:00<?, ?it/s]


[dev] Zero-shot — Accuracy: 0.5263  Macro-F1: 0.4810
Per-class F1 (neg, neu, pos): ['0.5446', '0.2414', '0.6570']
              precision    recall  f1-score   support

    negative     0.5789    0.5140    0.5446       214
     neutral     0.5000    0.1591    0.2414       308
    positive     0.5122    0.9158    0.6570       297

    accuracy                         0.5263       819
   macro avg     0.5304    0.5296    0.4810       819
weighted avg     0.5251    0.5263    0.4713       819


[test] Zero-shot — Accuracy: 0.5368  Macro-F1: 0.4917
Per-class F1 (neg, neu, pos): ['0.5548', '0.2510', '0.6693']
              precision    recall  f1-score   support

    negative     0.5801    0.5316    0.5548       395
     neutral     0.4677    0.1715    0.2510       548
    positive     0.5349    0.8940    0.6693       566

    accuracy                         0.5368      1509
   macro avg     0.5276    0.5324    0.4917      1509
weighted avg     0.5223    0.5368    0.4874      1509



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/7372 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/1509 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  super().__init__(*args, **kwargs)



▶ Fine-tuning XLM-R base on TRAIN; early-stopping on DEV ...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Negative,F1 Neutral,F1 Positive
1,0.7072,0.612454,0.746032,0.747319,0.741419,0.696343,0.804196
2,0.6046,0.54283,0.794872,0.795274,0.788671,0.752475,0.844677
3,0.4919,0.534112,0.791209,0.792646,0.794457,0.751592,0.831889


Best checkpoint: /content/outputs/phase1/xlmr_base/checkpoint-922



[dev] Supervised XLM-R — Accuracy: 0.7949  Macro-F1: 0.7953
Per-class F1 (neg, neu, pos): ['0.7887', '0.7525', '0.8447']
              precision    recall  f1-score   support

    negative     0.7388    0.8458    0.7887       214
     neutral     0.7651    0.7403    0.7525       308
    positive     0.8768    0.8148    0.8447       297

    accuracy                         0.7949       819
   macro avg     0.7936    0.8003    0.7953       819
weighted avg     0.7987    0.7949    0.7954       819




[test] Supervised XLM-R — Accuracy: 0.8197  Macro-F1: 0.8180
Per-class F1 (neg, neu, pos): ['0.8117', '0.7799', '0.8625']
              precision    recall  f1-score   support

    negative     0.7543    0.8785    0.8117       395
     neutral     0.8279    0.7372    0.7799       548
    positive     0.8663    0.8587    0.8625       566

    accuracy                         0.8197      1509
   macro avg     0.8162    0.8248    0.8180      1509
weighted avg     0.8230    0.8197    0.8192      1509


✅ Phase-1 complete. Files saved to: /content/outputs/phase1
  - dev_zero_shot_metrics.json / test_zero_shot_metrics.json
  - dev_xlmr_metrics.json / test_xlmr_metrics.json
  - *_predictions.csv


# Thesis Framework Pipeline

In [6]:
# ==========================================================
# SPLITS RESOLVER — cache or link into /content/outputs/data and reload
# ==========================================================
# Goal of this cell:
# - Standardize where later phases read data from (single canonical location: /content/outputs/data)
# - Avoid duplicated copies of datasets in memory / disk across notebook restarts
# - Enforce the thesis framework "no-dev" rule by requiring FEWSHOT explicitly (no dev fallback)
#
# Inputs expected (typical Colab locations):
# - /content/sau18_train.csv
# - /content/sau18_test.csv
# - /content/sau18_fewshot_64_per_class.csv   (REQUIRED for the framework pipeline)
#
# Outputs written/linked (canonical paths):
# - /content/outputs/data/train.csv
# - /content/outputs/data/test.csv
# - /content/outputs/data/fewshot.csv
#
# After this cell runs, later phases should use:
# - df_train, df_test, df_fewshot
# ==========================================================
import os
from pathlib import Path
import pandas as pd

# If Phase-1 already produced CSVs in /content, link them; otherwise save in-memory DFs.
# NOTE: We intentionally do NOT include DEV here to keep the framework consistent:
#       the pipeline should only rely on TRAIN (unlabeled pool) + FEWSHOT (small gold set) + TEST (final evaluation).
ROOT_TRAIN   = Path("/content/sau18_train.csv")
ROOT_TEST    = Path("/content/sau18_test.csv")
ROOT_FEWSHOT = Path("/content/sau18_fewshot_64_per_class.csv")  # few-shot gold set (required for pipeline)

def _ensure_link_or_save(df: pd.DataFrame, src_path: Path, dst_path: Path, name: str):
    """
    Create the canonical split file at dst_path via:
    1) Symlink to src_path if src exists (fast, no duplication), else
    2) Save the in-memory dataframe df to dst_path (when running from RAM)
    If dst_path already exists, do nothing (idempotent).
    """
    if dst_path.exists(): return
    if src_path is not None and src_path.exists():
        try:
            # Prefer symlinks so we don't duplicate large CSVs
            os.symlink(src_path, dst_path)
            print(f"[SPLITS] linked {name}: {dst_path} -> {src_path}")
            return
        except Exception:
            # Some environments disallow symlinks; in that case we fall back to saving df if provided
            pass
    if df is not None:
        df.to_csv(dst_path, index=False)
        print(f"[SPLITS] wrote {name}: {dst_path} (rows={len(df)})")

# Save/link splits into /content/outputs/data
try:
    # If Phase-1 ran earlier, train_df/test_df/fewshot_df exist in RAM; we can write/link from them.
    _ensure_link_or_save(train_df,   ROOT_TRAIN,   OUT_FILES["splits"]["train"], "train")
    _ensure_link_or_save(test_df,    ROOT_TEST,    OUT_FILES["splits"]["test"], "test")

    # FEWSHOT is required for the thesis pipeline:
    # - Prefer in-memory fewshot_df if available
    # - Else link from /content/sau18_fewshot_64_per_class.csv
    # - Else raise an error (no dev fallback allowed)
    if "fewshot_df" in globals():
        _ensure_link_or_save(fewshot_df, ROOT_FEWSHOT, OUT_FILES["splits"]["fewshot"], "fewshot")
    else:
        if ROOT_FEWSHOT.exists():
            _ensure_link_or_save(None, ROOT_FEWSHOT, OUT_FILES["splits"]["fewshot"], "fewshot")
        else:
            raise FileNotFoundError(
                "Fewshot file is required for thesis framework pipeline (no dev fallback). "
                "Upload /content/sau18_fewshot_64_per_class.csv or ensure outputs/data/fewshot.csv exists."
            )

except NameError:
    # If Phase-1 did NOT run, train_df/test_df/fewshot_df may not exist in RAM.
    # In that case, only attempt to link from the expected /content/*.csv files.
    _ensure_link_or_save(None, ROOT_TRAIN, OUT_FILES["splits"]["train"], "train")
    _ensure_link_or_save(None, ROOT_TEST,  OUT_FILES["splits"]["test"],  "test")
    if ROOT_FEWSHOT.exists():
        _ensure_link_or_save(None, ROOT_FEWSHOT, OUT_FILES["splits"]["fewshot"], "fewshot")
    else:
        raise FileNotFoundError(
            "Fewshot file is required for thesis framework pipeline (no dev fallback). "
            "Upload /content/sau18_fewshot_64_per_class.csv or ensure outputs/data/fewshot.csv exists."
        )

# Reload to RAM for all later phases (single place to load from)
def load_splits_from_outputs():
    """
    Load canonical splits from /content/outputs/data into memory.

    IMPORTANT:
    - This pipeline enforces FEWSHOT as mandatory.
    - If fewshot.csv is missing, we raise immediately so later phases don't silently use DEV.
    """
    req = OUT_FILES["splits"]
    if not req["fewshot"].exists():
        raise FileNotFoundError(
            "outputs/data/fewshot.csv is required for thesis framework pipeline (no dev fallback)."
        )
    return (pd.read_csv(req["train"]),
            pd.read_csv(req["test"]),
            pd.read_csv(req["fewshot"]))

# These aliases are used by Phase 2 / 2.5 / 3 / 3B
df_train, df_test, df_fewshot = load_splits_from_outputs()
print("[SPLITS] df_train/test/fewshot sizes:", tuple(len(x) for x in (df_train, df_test, df_fewshot)))


[SPLITS] linked train: /content/outputs/data/train.csv -> /content/sau18_train.csv
[SPLITS] linked test: /content/outputs/data/test.csv -> /content/sau18_test.csv
[SPLITS] linked fewshot: /content/outputs/data/fewshot.csv -> /content/sau18_fewshot_64_per_class.csv
[SPLITS] df_train/test/fewshot sizes: (7372, 1509, 192)


In [7]:
# ==========================================================
# DATA ALIASES — guaranteed from /content/outputs/data
# ==========================================================
# Purpose of this cell:
# - Provide a clean, consistent set of dataframe aliases that ALL later phases use.
# - Read ONLY from the canonical output paths created by the Splits Resolver
#   (/content/outputs/data/*), so the notebook is restart-safe and reproducible.
# - Enforce the framework constraint: FEWSHOT must exist (no DEV fallback).
#
# Outputs in RAM:
# - df_train   : unlabeled pool used for weak supervision (Phase 2/2.5)
# - df_test    : held-out test set used ONLY for final evaluation/reporting
# - df_fewshot : small gold calibration set used for stacking calibration + early stopping/refine
# ==========================================================
import pandas as pd

# Canonical split loads (always read from outputs/data to avoid drift)
df_train   = pd.read_csv(OUT_FILES["splits"]["train"])
df_test    = pd.read_csv(OUT_FILES["splits"]["test"])

# FEWSHOT is mandatory for the thesis framework pipeline:
# - used as the only gold supervision source for calibration/early stopping/refinement
# - we intentionally do NOT fall back to dev here
if not OUT_FILES["splits"]["fewshot"].exists():
    raise FileNotFoundError(
        "Fewshot is required for thesis framework pipeline (no dev fallback). "
        "Ensure outputs/data/fewshot.csv exists."
    )
df_fewshot = pd.read_csv(OUT_FILES["splits"]["fewshot"])

def _norm_len(name, df):
    """Small helper to print dataset sizes safely."""
    try:
        return f"{name}={len(df)}"
    except Exception:
        return f"{name}=<??>"

# Quick sanity log: confirms the pipeline is reading the intended splits
print("[ALIASES]", ", ".join([
    _norm_len("df_train", df_train),
    _norm_len("df_test", df_test),
    _norm_len("df_fewshot", df_fewshot),
]))


[ALIASES] df_train=7372, df_test=1509, df_fewshot=192


In [8]:
# ============================================
# Phase 2 — Weak Supervision & Pseudo-Labels
# Labelers: (A) XNLI zero-shot, (B) Translate->EN 3-class, (C) Lexicon heuristic
# Aggregation: weighted vote + confidence
# Output: /content/outputs/phase2/sau18_train_pseudo.csv
#
# What this phase does (high level):
# - Takes the TRAIN split as an "unlabeled pool" (even if it contains a gold column, we DO NOT use it for training).
# - Runs 3 independent weak labelers to get (label, confidence) for each text.
# - Aggregates the 3 sources into a single pseudo label + pseudo confidence per row.
# - Writes a pseudo-labeled training CSV that later phases (2.5/3) will consume.
# ============================================

import os, json, math, gc, random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter, defaultdict

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM  # AutoTokenizer/AutoModel kept for extensibility

# ---------- Config ----------
# Always read the canonical TRAIN pool created by Splits Resolver (restart-safe).
IN_CSV  = str(OUT_FILES["splits"]["train"])
OUT_DIR = str(OUT_PATHS["phase2"]); os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "sau18_train_pseudo.csv")

# Fixed 3-class label space (kept consistent across all phases).
LABELS = ["negative","neutral","positive"]
LABEL2ID = {l:i for i,l in enumerate(LABELS)}

# Batch sizes control GPU/CPU memory use for pipelines.
BATCH_XNLI  = 16
BATCH_TRANS = 16

# Optional subsampling of the unlabeled pool for quick experiments (None = use full train pool).
PSEUDO_MAX = None

# Reproducibility for sampling/ordering where applicable.
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# Use GPU if available; otherwise run pipelines on CPU.
device = 0 if torch.cuda.is_available() else -1

# ---------- Load & prep data ----------
def normalize_sentiment(x):
    """
    Normalizes any gold label strings/ids to our canonical label names.
    IMPORTANT: In this phase, gold labels are never used for training.
    We only keep them optionally for later analysis/debugging.
    """
    if pd.isna(x): return np.nan
    s = str(x).strip().lower()
    mp = {"pos":"positive","positive":"positive","+":"positive","1":"positive",
          "neg":"negative","negative":"negative","-":"negative","-1":"negative",
          "neu":"neutral","neutral":"neutral","0":"neutral"}
    return mp.get(s, s)

# Load TRAIN pool (may include extra columns like sentiment/sourcesheet; we standardize names).
df = pd.read_csv(IN_CSV, encoding="utf-8-sig")

# Standardize columns and DO NOT use the 'sentiment' column for training; keep only for analysis if present.
cols = {c.lower(): c for c in df.columns}
def pick(*names):
    """Best-effort column name resolver across slightly different CSV schemas."""
    for n in names:
        for k,v in cols.items():
            if k == n.lower() or k.replace(" ","")==n.lower().replace(" ",""):
                return v
    return None

text_col = pick("text","review","utterance")                      # required
src_col  = pick("sourcesheet","domain","category")               # optional (domain/source tracking)
sent_col = pick("sentiment","label")                             # optional (gold, analysis only)

if text_col is None:
    raise ValueError(f"Couldn't find a text column in {IN_CSV}. Found: {list(df.columns)}")

# Rename to canonical names used throughout the pipeline.
df = df.rename(columns={text_col:"text"})
if src_col and src_col != "sourcesheet": df = df.rename(columns={src_col:"sourcesheet"})
if sent_col and sent_col != "sentiment":  df = df.rename(columns={sent_col:"sentiment"})

# If a gold sentiment exists, normalize it (still not used for training in this phase).
if "sentiment" in df.columns:
    df["sentiment"] = df["sentiment"].apply(normalize_sentiment)

# Drop rows without text (cannot label).
df = df.dropna(subset=["text"]).reset_index(drop=True)

# Optional downsample for faster runs; keeps behavior reproducible via SEED.
if PSEUDO_MAX is not None:
    df = df.sample(n=min(PSEUDO_MAX, len(df)), random_state=SEED).reset_index(drop=True)

print("Unlabeled pool size (using TRAIN texts):", len(df))
print(df.head(3)[["text"] + ([ "sentiment" ] if "sentiment" in df.columns else [])])

# ---------- (A) Weak labeler 1: XNLI zero-shot ----------
# Uses multilingual NLI as a zero-shot classifier over our 3 sentiment labels.
print("\nLoading zero-shot (XNLI) pipeline ...")
zs = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli", device=device)

def run_xnli(texts, batch_size=BATCH_XNLI):
    """
    Returns:
    - preds: predicted label per text (argmax over LABELS)
    - confs: confidence score of the chosen label (pipeline 'scores'[0])
    """
    preds, confs = [], []
    for i in tqdm(range(0, len(texts), batch_size), desc="XNLI labeling"):
        batch = texts[i:i+batch_size]
        outs = zs(batch, candidate_labels=LABELS, multi_label=False)
        # pipeline returns dict for single item; list for batch
        if isinstance(outs, dict): outs = [outs]
        for o in outs:
            # 'labels' sorted by score desc -> top-1 becomes pseudo label for this source
            label = o["labels"][0]
            score = float(o["scores"][0])
            preds.append(label); confs.append(score)
    return preds, confs

# ---------- (B) Weak labeler 2: Urdu -> English translation -> English sentiment ----------
# Step 1: translate Urdu text to English (MT)
# Step 2: run an English 3-class sentiment model on translated text
# Step 3: map English labels back to our Urdu label space (same label names here)
print("\nLoading Urdu->English translator ...")
trans_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-ur-en", device=device, max_length=256)

print("Loading English 3-class sentiment model ...")
en_sent_pipe = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True,              # gives scores for all classes so we can take argmax + confidence
    device=device,
    truncation=True,
    max_length=256
)

EN2UR_LABEL = {"negative":"negative","neutral":"neutral","positive":"positive"}  # mapping kept explicit for clarity

def run_translate_en_sent(texts, b_trans=BATCH_TRANS, b_cls=32):
    """
    Returns:
    - preds: mapped sentiment label for each original Urdu text
    - confs: confidence of the chosen English class (max score)
    - translated: English translations (useful for debugging/analysis)
    """
    preds, confs = [], []

    # 1) Translate Urdu -> English
    translated = []
    for i in tqdm(range(0, len(texts), b_trans), desc="Translating ur->en"):
        batch = texts[i:i+b_trans]
        out = trans_pipe(batch)
        translated.extend([o["translation_text"] for o in out])

    # 2) Classify translated English with an English sentiment model
    for i in tqdm(range(0, len(translated), b_cls), desc="EN sentiment"):
        batch_en = translated[i:i+b_cls]
        outs = en_sent_pipe(batch_en)
        # outs: list of length batch; each item is list of dicts [{label, score}, ...]
        for scores in outs:
            scores_map = {d["label"].lower(): float(d["score"]) for d in scores}
            label = max(scores_map.items(), key=lambda kv: kv[1])[0]
            conf = scores_map[label]
            preds.append(EN2UR_LABEL[label])
            confs.append(conf)

    return preds, confs, translated

# ---------- (C) Weak labeler 3: Lexicon heuristic ----------
# A very lightweight rule-based labeler:
# - counts occurrences of positive/negative seed words
# - resolves to neutral when no cues or a tie
# - confidence is a simple bounded function of the gap
LEXICON_CSV = "/content/urdu_lexicon.csv"  # optional external lexicon (if available)
LEX_POS = set(["بہترین","عمدہ","بہت اچھا","زبردست","خوش","مزہ","پسند","محبت","حیران کن","لاجواب"])
LEX_NEG = set(["برا","بدترین","خراب","نفرت","ناکام","مایوس","غصہ","بکواس","گھٹیا","فضول"])

def lexicon_score(text):
    """
    Simple lexicon scoring:
    - p = count of positive cues
    - n = count of negative cues
    - returns (label, confidence)
    NOTE: This is intentionally simple; its main role is providing a diverse third signal.
    """
    p = sum(1 for w in LEX_POS if w in text)
    n = sum(1 for w in LEX_NEG if w in text)
    if p==0 and n==0:
        return "neutral", 0.0
    if abs(p-n) <= 0:  # tie
        return "neutral", 0.2
    if p > n:
        return "positive", min(0.9, 0.5 + 0.1*(p-n))
    else:
        return "negative", min(0.9, 0.5 + 0.1*(n-p))

# Optional: load a richer lexicon from disk and override the seed sets.
if LEXICON_CSV and os.path.exists(LEXICON_CSV):
    lex_df = pd.read_csv(LEXICON_CSV)
    lex_df["word"] = lex_df["word"].astype(str)
    pos_words = set(lex_df.loc[lex_df["sentiment"].str.lower().eq("positive"), "word"].tolist())
    neg_words = set(lex_df.loc[lex_df["sentiment"].str.lower().eq("negative"), "word"].tolist())
    if len(pos_words) > 0: LEX_POS = pos_words
    if len(neg_words) > 0: LEX_NEG = neg_words
    print(f"Loaded lexicon: pos={len(LEX_POS)} neg={len(LEX_NEG)}")

def run_lexicon(texts):
    """Runs lexicon_score over all texts and returns (preds, confs)."""
    preds, confs = [], []
    for t in texts:
        lab, sc = lexicon_score(t if isinstance(t,str) else str(t))
        preds.append(lab); confs.append(sc)
    return preds, confs

# ---------- Run all three labelers ----------
texts = df["text"].tolist()

print("\n>>> (A) XNLI zero-shot labeling ...")
xnli_pred, xnli_conf = run_xnli(texts)

print("\n>>> (B) Translate->EN -> Sentiment labeling ...")
en_pred, en_conf, en_text = run_translate_en_sent(texts)

print("\n>>> (C) Lexicon heuristic labeling ...")
lex_pred, lex_conf = run_lexicon(texts)

# ---------- Aggregation ----------
# Aggregation strategy:
# - Each source contributes only to its chosen label using (weight * confidence).
# - We also track "votes_agree" = max number of sources that picked the same label.
# - Final pseudo label = argmax over weighted scores; ties broken using vote counts.
W = {"xnli": 1.0, "en": 1.0, "lex": 0.6}  # lex is down-weighted (simple heuristic)

agg_labels, agg_conf, votes_agree = [], [], []
for i in range(len(texts)):
    # Accumulate weighted scores per label.
    scores = defaultdict(float)
    scores[xnli_pred[i]] += W["xnli"] * float(xnli_conf[i])
    scores[en_pred[i]]   += W["en"]   * float(en_conf[i])
    scores[lex_pred[i]]  += W["lex"]  * float(lex_conf[i])

    # Track raw vote agreement across the 3 sources (ignores confidence magnitude).
    votes = [xnli_pred[i], en_pred[i], lex_pred[i]]
    cnt = Counter(votes)
    majority_label, majority_votes = cnt.most_common(1)[0]

    # Choose label with highest weighted score; break ties by agreement count.
    best_label = max(scores.items(), key=lambda kv: (kv[1], cnt[kv[0]]))[0]

    # If two labels are extremely close, prefer the simple majority label for stability.
    if best_label != majority_label and abs(scores[best_label] - scores[majority_label]) < 1e-3:
        best_label = majority_label

    # Pseudo-confidence: normalized score of the chosen label (kept in [0,1] roughly).
    total_w = sum(W.values())
    conf = scores[best_label] / total_w

    agg_labels.append(best_label)
    agg_conf.append(float(conf))
    votes_agree.append(int(majority_votes))

# ---------- Build output frame ----------
# This CSV becomes the "Phase 2 artifact" consumed by Phase 2.5 (stacking calibration).
out = pd.DataFrame({
    "text": texts,
    "pseudo_label": agg_labels,
    "pseudo_confidence": agg_conf,
    "votes_agree": votes_agree,

    # keep per-source outputs for analysis + Phase 2.5 feature building
    "xnli_label": xnli_pred,
    "xnli_conf": xnli_conf,

    "en_translated": en_text,
    "en_label": en_pred,
    "en_conf": en_conf,

    "lex_label": lex_pred,
    "lex_conf": lex_conf,
})

# Optionally attach original columns for analysis only (still not used as supervision here).
if "sourcesheet" in df.columns: out["sourcesheet"] = df["sourcesheet"]
if "sentiment"   in df.columns: out["gold_sentiment_FOR_ANALYSIS_ONLY"] = df["sentiment"]

# Save pseudo-labeled dataset for downstream phases.
out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"\n✅ Saved pseudo-labeled training set to: {OUT_CSV}")

# ---------- Quick summary ----------
def dist(series):
    """Prints class distribution (count + fraction) for quick sanity checking."""
    c = series.value_counts()
    return c.to_frame("count").assign(frac=lambda d: (d["count"]/len(series)).round(4))

print("\nPseudo label distribution:")
print(dist(out["pseudo_label"]).to_string())

print("\nVotes agreement (max votes among 3 sources):")
print(out["votes_agree"].value_counts().sort_index().rename("count").to_frame().to_string())

print("\nConfidence stats:")
print(out["pseudo_confidence"].describe().to_string())


Unlabeled pool size (using TRAIN texts): 7372
                                                text sentiment
0                                  گئ بھینس پانی میں   neutral
1  انشاءاللہ بھائی تم جیت یا ہمیں تم سے پیار ہےبہ...  positive
2  گنگولی کے دیس میں ٹیلنٹ بہت ہے ہمارے ہاں ایسا ...  positive

Loading zero-shot (XNLI) pipeline ...


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



Loading Urdu->English translator ...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Loading English 3-class sentiment model ...


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Loaded lexicon: pos=128 neg=146

>>> (A) XNLI zero-shot labeling ...




XNLI labeling:   0%|          | 0/461 [00:00<?, ?it/s]


>>> (B) Translate->EN -> Sentiment labeling ...


Translating ur->en:   0%|          | 0/461 [00:00<?, ?it/s]

EN sentiment:   0%|          | 0/231 [00:00<?, ?it/s]


>>> (C) Lexicon heuristic labeling ...

✅ Saved pseudo-labeled training set to: /content/outputs/phase2/sau18_train_pseudo.csv

Pseudo label distribution:
              count    frac
pseudo_label               
positive       2925  0.3968
neutral        2876  0.3901
negative       1571  0.2131

Votes agreement (max votes among 3 sources):
             count
votes_agree       
1              583
2             5142
3             1647

Confidence stats:
count    7372.000000
mean        0.451367
std         0.191628
min         0.164947
25%         0.307590
50%         0.371959
75%         0.571981
max         0.963171


In [9]:
# =========================
# CONFIG: No-Extra-Data Mode + Weight Policy for Phase 3/3C
#
# Purpose of this cell:
# - Enforces the thesis constraint that we do NOT add new training rows (NO_EXTRA_DATA).
# - Defines a lightweight, deterministic "row-weighting" policy used later in Phase 3
#   to emphasize more reliable pseudo-labels without changing dataset size.
# - Uses simple Urdu cues (negators, intensifiers) as bounded multipliers so weights
#   remain stable and do not dominate training.
# =========================
from dataclasses import dataclass
import re
import numpy as np

# --- 1) Global switch: enforce no extra data anywhere ---
# If True, later phases should avoid any augmentation that increases row count
# (e.g., back-translation, synthetic neutral mining, duplication strategies).
NO_EXTRA_DATA: bool = True  # <== keep the training row count fixed

# --- 2) Urdu cue lexicons (minimal, extend if you have richer lists) ---
# These are *heuristic* cues used only for weighting, not for labeling.
NEGATORS_UR = [
    "نہیں", "مت", "ہرگز نہیں", "ہرگز", "کبھی نہیں", "نا", "نہ"  # add variants if needed
]
INTENSIFIERS_UR = [
    "بہت", "انتہائی", "بےحد", "زیادہ", "خاصا", "کافی"
]

# Precompile regex for fast substring matching of the cue lists.
NEG_RE = re.compile("|".join(map(re.escape, NEGATORS_UR)))
INT_RE = re.compile("|".join(map(re.escape, INTENSIFIERS_UR)))

# --- 3) Policy dataclass for weights ---
# Encapsulates all hyperparameters controlling per-row weights.
@dataclass
class WeightPolicy:
    # base weight range clamps: prevents extremely small/large weights
    min_w: float = 0.10
    max_w: float = 1.60

    # agreement bonuses: reward cases where multiple weak labelers agree
    bonus_votes: dict = None

    # optional neutral boost: slightly up-weight confident neutral predictions
    neutral_conf_boost: float = 0.65
    neutral_boost: float = 0.10  # +10%

    # negation/intensifier multipliers (bounded later)
    negation_mult: float = 0.90      # -10% if negation cue present
    intensifier_mult: float = 1.10   # +10% if intensifier cue present

    def __post_init__(self):
        # Default vote-based bonuses/penalties (3 labelers in Phase 2):
        # votes_agree=3 => strong agreement, votes_agree=0 => no agreement.
        if self.bonus_votes is None:
            self.bonus_votes = {3: 0.15, 2: 0.06, 1: 0.00, 0: -0.10}

WP = WeightPolicy()

# --- 4) Core helpers: compute row weights without adding rows ---
def base_conf_weight(confidence: float, votes_agree: int, wp: WeightPolicy = WP) -> float:
    """
    Base weight is driven by:
    - pseudo-label confidence (from Phase 2 / Phase 2.5)
    - agreement count across weak labelers (votes_agree)
    Output is clipped to [min_w, max_w] to avoid instability.
    """
    c = float(confidence)
    w = 0.20 + 1.20 * c + wp.bonus_votes.get(int(votes_agree), 0.0)
    return float(np.clip(w, wp.min_w, wp.max_w))

def neutral_boost_weight(pseudo_label: str, pseudo_confidence: float, wp: WeightPolicy = WP) -> float:
    """
    Optional neutral emphasis:
    - If the pseudo label is neutral and the confidence is high enough,
      slightly increase weight (no duplication / no new rows).
    """
    if (pseudo_label == "neutral") and (float(pseudo_confidence) >= wp.neutral_conf_boost):
        return 1.0 + wp.neutral_boost
    return 1.0

def neg_int_weight_from_text(text: str, wp: WeightPolicy = WP) -> float:
    """
    Lightweight text-aware multiplier using Urdu cues:
    - Negation cue present => slightly down-weight (can flip sentiment meaning)
    - Intensifier cue present => slightly up-weight (often stronger sentiment)
    We intentionally keep this heuristic simple and tightly bounded.
    """
    if not isinstance(text, str) or len(text) == 0:
        return 1.0
    has_neg = bool(NEG_RE.search(text))
    has_int = bool(INT_RE.search(text))
    mult = 1.0
    if has_neg:
        mult *= wp.negation_mult
    if has_int:
        mult *= wp.intensifier_mult
    # cap overall multiplier inside a tight range (prevents extreme effects)
    return float(np.clip(mult, 0.80, 1.20))

def compute_row_weight(pseudo_label: str,
                       pseudo_confidence: float,
                       votes_agree: int,
                       text: str,
                       wp: WeightPolicy = WP) -> float:
    """
    Final row weight = base(confidence + agreement) * neutral boost * cue multiplier,
    then clipped to [min_w, max_w].
    This weight is later applied inside the loss (Phase 3) so that training
    emphasizes higher-quality pseudo labels without changing dataset size.
    """
    w = base_conf_weight(pseudo_confidence, votes_agree, wp)
    w *= neutral_boost_weight(pseudo_label, pseudo_confidence, wp)
    w *= neg_int_weight_from_text(text, wp)
    return float(np.clip(w, wp.min_w, wp.max_w))

# --- 5) Policy switch used by later cells in Phase 3C ---
def no_extra_data_active() -> bool:
    """Convenience helper used by later phases to gate augmentation features."""
    return bool(NO_EXTRA_DATA)

print(f"[NO-EXTRA-DATA MODE] Active = {NO_EXTRA_DATA}. "
      "Back-translation and neutral re-mining that add rows should be DISABLED. "
      "Use weight policy only (see helpers above).")


[NO-EXTRA-DATA MODE] Active = True. Back-translation and neutral re-mining that add rows should be DISABLED. Use weight policy only (see helpers above).


In [10]:
# ============================================
# Phase 2.5 — Calibrated Stacking Combiner (No-Extra-Data, robust)
#
# Purpose of this cell:
# - Takes the *raw* Phase-2 weak labels (from multiple labelers: XNLI / Translate->EN / Lexicon)
#   and learns a small supervised "combiner" using a *few-shot* labeled set only.
# - The combiner is a multinomial Logistic Regression (stacking) over per-labeler probability
#   features, followed by *isotonic calibration* to improve probability reliability.
# - Outputs a new pseudo-label dataset where each TRAIN row gets:
#     (1) calibrated pseudo_label
#     (2) calibrated pseudo_confidence
#     (3) calibrated soft probabilities pseudo_prob_negative/neutral/positive
# - Important: NO_EXTRA_DATA principle is respected: we re-score the *same* TRAIN rows;
#   we do NOT generate new rows and do NOT use the DEV set as calibration fallback.
# ============================================
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
from typing import Tuple, List
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

# Canonical label space used everywhere in the pipeline.
LABELS = ["negative", "neutral", "positive"]
lab2id = {l:i for i,l in enumerate(LABELS)}

def norm_label_to_str(x):
    """
    Normalize labels into the canonical strings: negative/neutral/positive.
    Handles:
    - common string variants (neg/neu/pos, +/-)
    - integer encodings seen in some datasets (-1/0/1 or 0/1/2 or 1/2/3)
    """
    if isinstance(x, str):
        xx = x.strip().lower()
        if xx in {"neg","negative","-1","-"}: return "negative"
        if xx in {"neu","neutral","0"}:       return "neutral"
        if xx in {"pos","positive","1","+"}:  return "positive"
        return xx
    if isinstance(x, (int, np.integer)):
        if x in (-1,0,1): return {-1:"negative",0:"neutral",1:"positive"}[int(x)]
        if x in (0,1,2):  return { 0:"negative",1:"neutral",2:"positive"}[int(x)]
        if x in (1,2,3):  return { 1:"negative",2:"neutral",3:"positive"}[int(x)]
    return x

# --------------- locate Phase-2 CSV (outputs-only) ---------------
# We strictly read from the pipeline outputs folder (reproducibility / restart safety).
# Candidates list exists for robustness across runs / paths.
CANDIDATES = [
    str(OUT_FILES["phase2"]["pseudo_csv"]),
    "/content/outputs/phase2/sau18_train_pseudo.csv",
    "outputs/phase2/sau18_train_pseudo.csv",
]
PSEUDO_VOTE_CSV = next((p for p in CANDIDATES if Path(p).exists()), None)
if PSEUDO_VOTE_CSV is None:
    raise RuntimeError("Phase 2.5: Phase-2 CSV not found at outputs. Run Phase-2 first.")
print(f"[Phase2.5] Using Phase-2 CSV: {PSEUDO_VOTE_CSV}")

# --------------- load Phase-2 pseudo CSV ---------------
# Must contain "text" so we can align with the gold few-shot examples by exact text match.
df_phase2 = pd.read_csv(PSEUDO_VOTE_CSV)
if "text" not in df_phase2.columns:
    raise RuntimeError("Phase 2.5: Phase-2 CSV must contain a 'text' column.")

# --------------- ensure gold df (fewshot ONLY) ---------------
# Calibration/stacking is trained ONLY on a small few-shot labeled set to match the thesis constraint.
# No DEV fallback is allowed in this pipeline mode.
def _load_gold():
    # 1) Prefer in-memory fewshot DF if already loaded earlier in notebook
    if 'df_fewshot' in globals():
        return df_fewshot.copy(), "df_fewshot"
    # 2) Else load from outputs/data/fewshot.csv
    fs = OUT_FILES["splits"]["fewshot"]
    if fs.exists():
        return pd.read_csv(fs), str(fs)
    # 3) Else fail loudly (few-shot is required for Phase 2.5)
    raise FileNotFoundError(
        "Phase 2.5 requires fewshot gold for calibration (no dev fallback). "
        "Ensure outputs/data/fewshot.csv exists."
    )

df_gold_src_raw, gold_src_name = _load_gold()
print(f"[Phase2.5] Gold source: {gold_src_name} (n={len(df_gold_src_raw)})")

def _pick_text_col(df):
    """Heuristic column detector for the text field."""
    for c in ["text","review","sentence","content","tweet","message","msg","utterance"]:
        if c in df.columns: return c
    raise RuntimeError("Gold DF: no text column found.")

def _pick_label_col(df):
    """
    Heuristic column detector for the gold label.
    Supports either:
    - a single label column (string or numeric), or
    - one-hot triplet columns: negative/neutral/positive.
    """
    lower = {c.lower(): c for c in df.columns}
    for k in ["label_str","label","gold","y","target","label_id","labels","class","category","sentiment","polarity","stance"]:
        if k in lower: return lower[k]
    if all(k in lower for k in ["negative","neutral","positive"]): return "__triplet__"
    raise RuntimeError(f"Gold DF: no label column found. Columns: {list(df.columns)[:10]}...")

def _ensure_gold_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the gold few-shot dataframe into a standardized schema:
      - text
      - gold_str (negative/neutral/positive)
      - gold_id  (0/1/2)
    """
    tcol = _pick_text_col(df); lcol = _pick_label_col(df)
    out = df[[tcol]].rename(columns={tcol:"text"}).copy()

    # Case 1: triplet probability/one-hot columns
    if lcol == "__triplet__":
        trip = df[["negative","neutral","positive"]].to_numpy()
        yid = trip.argmax(axis=1).astype(int)
        out["gold_id"]  = yid
        out["gold_str"] = [LABELS[i] for i in yid]
        return out

    # Case 2: single label column
    vals = df[lcol]
    if pd.api.types.is_numeric_dtype(vals):
        out["gold_str"] = [norm_label_to_str(int(v)) for v in vals]
    else:
        out["gold_str"] = [norm_label_to_str(str(v)) for v in vals]

    out = out.dropna(subset=["text","gold_str"])
    out = out[out["gold_str"].isin(LABELS)].copy()
    out["gold_id"] = out["gold_str"].map(lab2id)

    if len(out) == 0:
        raise RuntimeError("Gold DF: normalization produced no valid rows.")
    return out.reset_index(drop=True)

df_gold_small = _ensure_gold_df(df_gold_src_raw)

# --------------- features from sources (xnli/en/lex labelers) ---------------
# Each weak labeler contributes a 3-dim probability vector => total features = 3 labelers * 3 classes = 9 features.
# Phase-2 may store either:
#  - explicit prob columns (prefix_neg/neu/pos), OR
#  - (prefix_label, prefix_conf) which we convert into a pseudo-prob distribution.
SRC_PREFIXES = ["xnli", "en", "lex"]
def _has_probs_df(df, pfx):      return all(c in df.columns for c in [f"{pfx}_neg", f"{pfx}_neu", f"{pfx}_pos"])
def _has_label_conf_df(df, pfx): return all(c in df.columns for c in [f"{pfx}_label", f"{pfx}_conf"])

def _row_probs_from_label_conf(row: pd.Series, prefix: str) -> np.ndarray:
    """
    Convert (label, confidence) into a 3-class probability vector:
      - conf assigned to predicted label
      - remaining mass split evenly across the other 2 labels
    If prob columns already exist, they are used directly.
    """
    keys = [f"{prefix}_neg", f"{prefix}_neu", f"{prefix}_pos"]
    if all(k in row.index for k in keys):
        v = np.array([row[keys[0]], row[keys[1]], row[keys[2]]], dtype=np.float32)
        s = float(v.sum())
        return v/s if s>0 else np.array([1/3,1/3,1/3], dtype=np.float32)

    lab_key, conf_key = f"{prefix}_label", f"{prefix}_conf"
    if lab_key in row.index and conf_key in row.index:
        lab = norm_label_to_str(row[lab_key])
        conf = float(row[conf_key])
        v = np.full(3, (1.0 - conf)/2.0, dtype=np.float32)
        if lab in lab2id:
            v[lab2id[lab]] = conf
        return v

    # If nothing is available, fall back to uniform probabilities.
    return np.array([1/3,1/3,1/3], dtype=np.float32)

def _build_features(df_in: pd.DataFrame) -> Tuple[np.ndarray, List[str]]:
    """
    Build a feature matrix X where each row concatenates:
      [xnli_probs(3), en_probs(3), lex_probs(3)]  => shape (N, 9)
    Also returns feature names for debugging/traceability.
    """
    feats = []
    for _, r in df_in.iterrows():
        parts = []
        for pfx in SRC_PREFIXES:
            if _has_probs_df(df_in, pfx):
                parts.append(np.array([r[f"{pfx}_neg"], r[f"{pfx}_neu"], r[f"{pfx}_pos"]], dtype=np.float32))
            elif _has_label_conf_df(df_in, pfx):
                parts.append(_row_probs_from_label_conf(r, pfx))
            else:
                parts.append(np.array([1/3,1/3,1/3], dtype=np.float32))
        feats.append(np.concatenate(parts, axis=0))
    X = np.stack(feats, axis=0)
    names = [f"{p}_{c}" for p in SRC_PREFIXES for c in LABELS]
    return X, names

# --------------- fit LR + isotonic on gold ---------------
# We align gold few-shot rows to Phase-2 rows by exact "text" match.
# The "how=right" keeps only the gold rows (and pulls in Phase-2 labeler outputs as features).
X_gold, feat_names = _build_features(df_phase2.merge(df_gold_small[["text","gold_id"]], on="text", how="right"))
y_gold = df_gold_small["gold_id"].to_numpy(dtype=np.int64)

# Grid-search the LR regularization strength C using log-loss (probabilistic quality metric).
param_grid = {"C": [0.5, 1.0, 2.0]}
base = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)
gs = GridSearchCV(base, param_grid=param_grid, scoring="neg_log_loss", cv=5, n_jobs=-1, verbose=0).fit(X_gold, y_gold)
best_lr = gs.best_estimator_

# Calibrate the LR probabilities using isotonic regression (still trained only on few-shot gold).
# Calibration improves probability "honesty" (better confidence estimates).
cal = CalibratedClassifierCV(best_lr, method="isotonic", cv=5).fit(X_gold, y_gold)
print(
    f"[Phase2.5] Best LR C={gs.best_params_['C']} | "
    f"NLL before: {-gs.best_score_:.4f} | "
    f"after: {log_loss(y_gold, cal.predict_proba(X_gold)):.4f}"
)

# --------------- relabel all Phase-2 rows ---------------
# After training the combiner, we re-score every TRAIN row from Phase-2 using calibrated probabilities.
X_train, _ = _build_features(df_phase2)
P_train = cal.predict_proba(X_train)      # calibrated class probabilities for each row
yhat    = P_train.argmax(axis=1)          # final pseudo label ids
conf    = P_train.max(axis=1)             # final pseudo confidence (max prob)

df_out = df_phase2.copy()
df_out["pseudo_label"]      = [LABELS[i] for i in yhat]
df_out["pseudo_confidence"] = conf.astype(np.float32)

# If Phase-2 did not carry votes_agree, derive it as "how many sources match the final label".
if "votes_agree" not in df_out.columns:
    def _derive_votes_agree(df):
        vals = []
        for _, r in df.iterrows():
            lbl = r["pseudo_label"]
            hits = 0
            for p in SRC_PREFIXES:
                if f"{p}_label" in r.index and isinstance(r[f"{p}_label"], str) and norm_label_to_str(r[f"{p}_label"]) == lbl:
                    hits += 1
            vals.append(hits)
        return np.array(vals, dtype=np.int32)
    df_out["votes_agree"] = _derive_votes_agree(df_out)

# Store calibrated soft targets to be used in Phase-3 KL training (soft-label learning).
for j,c in enumerate(LABELS):
    df_out[f"pseudo_prob_{c}"] = P_train[:,j].astype(np.float32)

# --------------- save to outputs ---------------
# Save:
# - combiner.joblib: trained calibrated stacker
# - meta.json: key metadata (C, n_gold, etc.)
# - train_pseudo_stacked_cal.csv: relabeled pseudo dataset with soft targets
OUT_PATHS["phase2_5"].mkdir(parents=True, exist_ok=True)
joblib.dump(cal, OUT_FILES["phase2_5"]["combiner"])
with open(OUT_FILES["phase2_5"]["meta"], "w") as f:
    json.dump(
        {
            "labels": LABELS,
            "best_lr_C": float(gs.best_params_["C"]),
            "n_gold": int(len(df_gold_small)),
            "n_train_relabeled": int(len(df_out))
        },
        f,
        indent=2
    )
df_out.to_csv(OUT_FILES["phase2_5"]["out_csv"], index=False)
save_json(
    {"out_csv": str(OUT_FILES["phase2_5"]["out_csv"]), "rows": int(len(df_out)), "source_phase2": str(PSEUDO_VOTE_CSV)},
    OUT_FILES["phase2_5"]["manifest"]
)
print(f"[Phase2.5] Saved -> {OUT_FILES['phase2_5']['out_csv']} (rows={len(df_out)})")


[Phase2.5] Using Phase-2 CSV: /content/outputs/phase2/sau18_train_pseudo.csv
[Phase2.5] Gold source: df_fewshot (n=192)




[Phase2.5] Best LR C=2.0 | NLL before: 0.7253 | after: 0.6436
[Phase2.5] Saved -> /content/outputs/phase2_5_stacked_cal/train_pseudo_stacked_cal.csv (rows=7372)


In [11]:
# ==========================================================
# Phase 3 — RESTART GUARD (ensure splits & weight helpers)
#
# Why this cell exists:
# - Colab sessions frequently restart or cells are run out-of-order.
# - Phase-3 depends on (1) cached dataset splits under outputs/data and
#   (2) the row-weighting helpers used to down/up-weight noisy pseudo-labels.
# - This "guard" ensures all prerequisites are present *before* training starts,
#   and defines minimal defaults if the weight policy cell was skipped.
# ==========================================================
import re, numpy as np, pandas as pd
from dataclasses import dataclass

# -----------------------------
# 1) Ensure required splits exist (NO DEV in this pipeline mode)
# -----------------------------
# This thesis pipeline mode uses:
# - train.csv   : unlabeled pool that will be pseudo-labeled (Phase-2/2.5) and trained on (Phase-3)
# - test.csv    : held-out evaluation set (gold) for final reporting
# - fewshot.csv : small labeled set used for calibration (Phase-2.5) and for early stopping/monitoring (Phase-3/3B)
for k in ["train","test","fewshot"]:
    if not OUT_FILES["splits"][k].exists():
        # Clean error message that tells exactly what is missing and where it should be.
        raise RuntimeError(
            f"[Restart Guard] Missing split: {OUT_FILES['splits'][k]}. "
            "Ensure train/test/fewshot exist in outputs/data."
        )

# Reload splits from disk to guarantee the rest of the notebook is using the canonical versions.
df_train   = pd.read_csv(OUT_FILES["splits"]["train"])
df_test    = pd.read_csv(OUT_FILES["splits"]["test"])
df_fewshot = pd.read_csv(OUT_FILES["splits"]["fewshot"])

print("[Restart Guard] Loaded splits:",
      tuple(len(x) for x in (df_train, df_test, df_fewshot)))

# -----------------------------
# 2) Ensure weighting helpers exist (fallback definition)
# -----------------------------
# Phase-3 uses per-row weights to reduce the impact of low-confidence or disagreement cases.
# If the more complete "No-Extra-Data Mode + Weight Policy" cell was not run,
# we define a minimal (safe) default here so Phase-3 does not break.
if "compute_row_weight" not in globals():
    print("[Restart Guard] Defining minimal weight policy now.")

    # Very small Urdu cue lists:
    # - NEGATORS_UR: words that can invert polarity (e.g., "not")
    # - INTENSIFIERS_UR: words that often strengthen sentiment strength (e.g., "very")
    NEGATORS_UR = ["نہیں","مت","ہرگز نہیں","ہرگز","کبھی نہیں","نا","نہ"]
    INTENSIFIERS_UR = ["بہت","انتہائی","بےحد","زیادہ","خاصا","کافی"]

    # Precompiled regex for fast matching (simple substring-style matching for Urdu).
    NEG_RE = re.compile("|".join(map(re.escape, NEGATORS_UR)))
    INT_RE = re.compile("|".join(map(re.escape, INTENSIFIERS_UR)))

    @dataclass
    class WeightPolicy:
        """
        Minimal weight policy:
        - base weight increases with pseudo_confidence
        - small bonus if more sources agree (votes_agree)
        - optional neutral boost for confident neutral cases
        - small bounded text-based multipliers (negation/intensifier)
        """
        min_w: float = 0.10
        max_w: float = 1.60
        bonus_votes: dict = None
        neutral_conf_boost: float = 0.65
        neutral_boost: float = 0.10
        negation_mult: float = 0.90
        intensifier_mult: float = 1.10
        def __post_init__(self):
            if self.bonus_votes is None:
                # +0.15 for 3-way agreement, +0.06 for 2-way, slight penalty if no agreement
                self.bonus_votes = {3:0.15, 2:0.06, 1:0.00, 0:-0.10}

    WP = WeightPolicy()

    def base_conf_weight(c, v, wp=WP):
        """
        Map pseudo_confidence + votes_agree into a bounded scalar weight.
        Higher confidence and higher agreement => higher training weight.
        """
        import numpy as _np
        w = 0.20 + 1.20*float(c) + wp.bonus_votes.get(int(v), 0.0)
        return float(_np.clip(w, wp.min_w, wp.max_w))

    def neutral_boost_weight(lbl, c, wp=WP):
        """
        Small extra emphasis for confident neutral cases (optional).
        This does NOT add data; it only changes the contribution in the loss.
        """
        return (1.0 + wp.neutral_boost) if (lbl=="neutral" and float(c)>=wp.neutral_conf_boost) else 1.0

    def neg_int_weight_from_text(text, wp=WP):
        """
        Lightweight text cue multiplier:
        - negation slightly down-weights (to reduce risk of polarity flips)
        - intensifier slightly up-weights
        Bounded to avoid instability.
        """
        if not isinstance(text, str): return 1.0
        mult = 1.0
        if NEG_RE.search(text): mult *= wp.negation_mult
        if INT_RE.search(text): mult *= wp.intensifier_mult
        import numpy as _np
        return float(_np.clip(mult, 0.80, 1.20))

    def compute_row_weight(pseudo_label, pseudo_confidence, votes_agree, text, wp=WP):
        """
        Final per-row weight used in Phase-3 training loss:
        weight = base(confidence, agreement) * neutral_boost * cue_multiplier
        """
        import numpy as _np
        w = base_conf_weight(pseudo_confidence, votes_agree, wp)
        w *= neutral_boost_weight(pseudo_label, pseudo_confidence, wp)
        w *= neg_int_weight_from_text(text, wp)
        return float(_np.clip(w, wp.min_w, wp.max_w))
else:
    # If the weight policy cell was already run, we do not redefine anything.
    print("[Restart Guard] Weight helpers already available.")


[Restart Guard] Loaded splits: (7372, 1509, 192)
[Restart Guard] Weight helpers already available.


In [12]:
# ==========================================================
# Phase 3 — Curriculum + Soft Targets + Weighted KL Trainer
# (No-Extra-Data; uses calibrated pseudo-probs from Phase 2.5)
#
# Goal of Phase 3:
# - Train a student model (XLM-R base) using ONLY pseudo-labeled TRAIN data
#   produced by Phase-2/2.5 (weak supervision + calibration).
# - Do NOT add any extra rows (NO_EXTRA_DATA). We only change the *loss weight*
#   of each pseudo-labeled example to control noise.
#
# Key ideas implemented here:
# 1) Soft targets (probability supervision):
#    - If Phase-2.5 produced calibrated class probabilities (pseudo_prob_*),
#      we train using those distributions (better than hard labels).
# 2) Per-example weighting:
#    - Each row gets a weight derived from confidence + agreement + Urdu cues.
#    - High-quality pseudo labels contribute more to the loss.
# 3) Curriculum training:
#    - Train in stages (HI -> MID -> ALL) from cleaner to noisier subsets.
# 4) Early stopping & model selection:
#    - We evaluate on FEWSHOT gold only (no DEV usage in this pipeline mode).
#    - This helps prevent overfitting to noisy pseudo labels.
# 5) Final reporting:
#    - Evaluate once on the held-out TEST set and save predictions + manifest.
# ==========================================================
import os, json, math, random
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Dict, Any, Optional, List

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, set_seed
)
from sklearn.metrics import accuracy_score, f1_score

# -----------------------------
# 0) Config (outputs-only)
# -----------------------------
# OUT_DIR: where Phase-3 artifacts (curriculum CSVs, checkpoints, predictions, manifest) are saved.
OUT_DIR = OUT_PATHS["phase3"]; OUT_DIR.mkdir(parents=True, exist_ok=True)

# Student model backbone (can be swapped with other HF models if needed).
MODEL_NAME = "xlm-roberta-base"

# Standard training hyperparameters (kept small to fit Colab runtime).
MAX_LEN, BATCH_SIZE, SEED = 256, 16, 42
EPOCHS_HI, EPOCHS_MID, EPOCHS_ALL = 2, 2, 2
LR, WD, WARMUP = 2e-5, 0.01, 0.0

# If True and pseudo_prob_* columns exist, we use calibrated soft targets from Phase-2.5.
USE_CAL_SOFT = True

# Fixed 3-class label set for consistency across all phases.
LABELS = ["negative", "neutral", "positive"]
lab2id = {l:i for i,l in enumerate(LABELS)}
id2lab = {i:l for l,i in lab2id.items()}

# -----------------------------
# 1) Helpers: robust gold detection
# -----------------------------
# These utilities make the code resilient to small differences in split CSV schemas.
# The fewshot/test CSVs might store labels as strings ("positive") or ints (0/1/2).
def norm_label_to_str(x):
    """Normalize label formats (string/int variants) into {negative, neutral, positive}."""
    if isinstance(x, str):
        xx = x.strip().lower()
        if xx in {"neg","negative","-1","-"}: return "negative"
        if xx in {"neu","neutral","0"}:       return "neutral"
        if xx in {"pos","positive","1","+"}:  return "positive"
        return xx
    if isinstance(x, (int, np.integer)):
        # Support common integer encodings: {-1,0,1} or {0,1,2} or {1,2,3}
        if x in (-1,0,1): return {-1:"negative",0:"neutral",1:"positive"}[int(x)]
        if x in (0,1,2):  return { 0:"negative",1:"neutral",2:"positive"}[int(x)]
        if x in (1,2,3):  return { 1:"negative",2:"neutral",3:"positive"}[int(x)]
    return x

def pick_text_col(df: pd.DataFrame) -> str:
    """Find the text column in a split CSV (supports a few common names)."""
    for c in ["text","review","sentence","content","tweet","message","msg","utterance"]:
        if c in df.columns: return c
    # Error message already updated for this mode: only FEWSHOT/TEST are expected here.
    raise RuntimeError("Fewshot/Test: no text column found.")

def pick_label_col(df: pd.DataFrame) -> str:
    """Find the label column in a split CSV (supports common names + one-hot triplet)."""
    lower = {c.lower(): c for c in df.columns}
    for k in ["label_str","label","gold","y","target","label_id","labels","class","category","sentiment","polarity","stance"]:
        if k in lower: return lower[k]
    # Support one-hot style: columns named negative/neutral/positive
    if all(k in lower for k in ["negative","neutral","positive"]): return "__triplet__"
    raise RuntimeError(f"Fewshot/Test: could not detect a label column. Columns: {list(df.columns)[:10]} ...")

def ensure_gold_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert FEWSHOT/TEST split into a canonical format:
    - text      : input text
    - label_id  : {0,1,2}
    - label_str : {negative, neutral, positive}
    """
    tcol = pick_text_col(df); lcol = pick_label_col(df)
    out = df[[tcol]].rename(columns={tcol:"text"}).copy()

    # One-hot triplet -> label_id by argmax
    if lcol == "__triplet__":
        trip = df[["negative","neutral","positive"]].to_numpy()
        yid = trip.argmax(axis=1).astype(int)
        out["label_id"]  = yid
        out["label_str"] = [LABELS[i] for i in yid]
        return out.reset_index(drop=True)

    # Single label column -> normalize then map
    vals = df[lcol]
    if pd.api.types.is_numeric_dtype(vals):
        out["label_str"] = [norm_label_to_str(int(v)) for v in vals]
    else:
        out["label_str"] = [norm_label_to_str(str(v)) for v in vals]

    out = out.dropna(subset=["text","label_str"])
    out = out[out["label_str"].isin(LABELS)].copy()
    if len(out) == 0:
        # Note: message still says "Dev/Test" in original, but logically this triggers for fewshot/test too.
        raise RuntimeError("Dev/Test: normalization produced no valid rows.")
    out["label_id"] = out["label_str"].map(lab2id)
    return out.reset_index(drop=True)

# -----------------------------
# 2) Load pseudo CSV + attach soft targets & weights
# -----------------------------
# This is the calibrated pseudo-labeled training set produced by Phase-2.5.
csv_path = OUT_FILES["phase2_5"]["out_csv"]
if not csv_path.exists():
    raise RuntimeError(f"Expected pseudo CSV not found: {csv_path}")

df_pseudo = pd.read_csv(csv_path)
assert "text" in df_pseudo.columns and "pseudo_label" in df_pseudo.columns, "CSV missing required columns."

# Hard pseudo label (string) -> label_id for book-keeping / fallback supervision.
df_pseudo["pseudo_label"] = df_pseudo["pseudo_label"].apply(norm_label_to_str)
df_pseudo["label_id"] = df_pseudo["pseudo_label"].map(lab2id)

# Soft targets (ST):
# - Preferred: calibrated combiner probabilities from Phase-2.5 (pseudo_prob_negative/neutral/positive).
# - Fallback: build a simple 3-way distribution from pseudo_confidence (winner gets conf, others share remaining mass).
if USE_CAL_SOFT and set([f"pseudo_prob_{c}" for c in LABELS]).issubset(df_pseudo.columns):
    # Using Phase-2.5 combiner outputs (this answers your earlier question: YES, Phase-3 uses combiner probabilities).
    ST = df_pseudo[[f"pseudo_prob_{c}" for c in LABELS]].to_numpy(dtype=np.float32)
    ST = ST / np.clip(ST.sum(axis=1, keepdims=True), 1e-8, None)
else:
    # Fallback when soft probs are not present (e.g., if Phase-2.5 not run).
    c = df_pseudo["pseudo_confidence"].astype(float).to_numpy()
    y = df_pseudo["label_id"].astype(int).to_numpy()
    ST = np.full((len(df_pseudo),3), 0.0, dtype=np.float32)
    ST[:] = (1.0 - c)[:,None] / 2.0
    ST[np.arange(len(df_pseudo)), y] = c.astype(np.float32)

# Store soft targets explicitly in the dataframe for later dataset construction.
df_pseudo["st_neg"] = ST[:,0]; df_pseudo["st_neu"] = ST[:,1]; df_pseudo["st_pos"] = ST[:,2]

# votes_agree:
# - If Phase-2 produced it, we keep it (it measures agreement among the 3 weak labelers).
# - If missing, we derive a coarse proxy from how peaked the soft target distribution is.
if "votes_agree" not in df_pseudo.columns:
    vg = (ST.max(axis=1) >= 0.6).astype(int) + (ST.max(axis=1) >= 0.8).astype(int)
    df_pseudo["votes_agree"] = vg

# Row weights:
# - compute_row_weight comes from the weight policy cell (or restart-guard fallback).
# - Higher confidence / agreement => higher weight, low confidence => smaller weight.
def _row_weight_fn(r) -> float:
    return compute_row_weight(
        pseudo_label=r["pseudo_label"],
        pseudo_confidence=float(r.get("pseudo_confidence", float(max(r["st_neg"], r["st_neu"], r["st_pos"])))),
        votes_agree=int(r["votes_agree"]),
        text=r["text"],
    )

df_pseudo["weight"] = [ _row_weight_fn(r) for _,r in df_pseudo.iterrows() ]
print(f"[Phase3] Pseudo rows: {len(df_pseudo)} | soft-targets attached | weights in "
      f"[{df_pseudo['weight'].min():.2f}, {df_pseudo['weight'].max():.2f}]")

# -----------------------------
# 3) Curriculum splits (same pool; different emphasis)
# -----------------------------
# Curriculum is built from the SAME pseudo-labeled pool, but we train in stages:
# - HI:   most reliable pseudo labels (high agreement and/or high confidence)
# - MID:  moderately reliable pseudo labels
# - ALL:  full pseudo-labeled set (includes noisier cases)
if "pseudo_confidence" not in df_pseudo.columns:
    df_pseudo["pseudo_confidence"] = ST.max(axis=1)

def build_curriculum(df: pd.DataFrame):
    hi  = df[(df.votes_agree==3) | ((df.votes_agree==2) & (df.pseudo_confidence>=0.70))].copy()
    mid = df[(df.votes_agree>=2) & (df.pseudo_confidence>=0.50)].copy()
    allp= df.copy()
    return hi.reset_index(drop=True), mid.reset_index(drop=True), allp.reset_index(drop=True)

df_hi, df_mid, df_all = build_curriculum(df_pseudo)
print(f"[Curriculum] hi={len(df_hi)} | mid={len(df_mid)} | all={len(df_all)}")

# Save curriculum splits for transparency / debugging / thesis reproducibility.
df_hi.to_csv(OUT_DIR / "curriculum_hi.csv",  index=False, encoding="utf-8-sig")
df_mid.to_csv(OUT_DIR / "curriculum_mid.csv", index=False, encoding="utf-8-sig")
df_all.to_csv(OUT_DIR / "curriculum_all.csv", index=False, encoding="utf-8-sig")
print("[SAVE] Curriculum CSVs ->", OUT_DIR)

# -----------------------------
# 4) Tokenizer & datasets
# -----------------------------
# Tokenization is done once per dataset (stored as tensors in memory for speed).
set_seed(SEED)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class PseudoDataset(Dataset):
    """
    Dataset for pseudo-labeled training:
    - labels       : hard pseudo label id (used only for bookkeeping / fallback)
    - soft_targets : 3-class distribution for KL training (main supervision signal)
    - weights      : per-example scalar weight for loss reweighting
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        enc = tokenizer(self.df["text"].tolist(), padding=True, truncation=True,
                        max_length=MAX_LEN, return_tensors="pt")
        self.input_ids = enc["input_ids"]; self.att_mask = enc["attention_mask"]
        self.labels = torch.tensor(self.df["label_id"].tolist(), dtype=torch.long)
        st = self.df[["st_neg","st_neu","st_pos"]].to_numpy(dtype=np.float32)
        self.soft = torch.tensor(st, dtype=torch.float32)
        self.weights = torch.tensor(self.df["weight"].to_numpy(dtype=np.float32))
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        return {
            "input_ids": self.input_ids[i],
            "attention_mask": self.att_mask[i],
            "labels": self.labels[i],
            "soft_targets": self.soft[i],
            "weights": self.weights[i]
        }

class GoldDataset(Dataset):
    """
    Dataset for gold evaluation (fewshot/test):
    - Only hard labels are needed (standard cross-entropy evaluation).
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        enc = tokenizer(self.df["text"].tolist(), padding=True, truncation=True,
                        max_length=MAX_LEN, return_tensors="pt")
        self.input_ids = enc["input_ids"]; self.att_mask = enc["attention_mask"]
        self.labels = torch.tensor(self.df["label_id"].tolist(), dtype=torch.long)
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        return {"input_ids": self.input_ids[i], "attention_mask": self.att_mask[i], "labels": self.labels[i]}

# -----------------------------
# 5) Custom Trainer (KL + weights) — HF >= 4.33 safe
# -----------------------------
# We override compute_loss to:
# - Use KL-divergence between model distribution and soft_targets during training
# - Multiply each sample's loss by its weight (confidence/agreement-aware)
# - Fall back to standard cross-entropy when evaluating on gold data
class SoftWeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels  = inputs.pop("labels", None)
        soft    = inputs.pop("soft_targets", None)
        weights = inputs.pop("weights", None)

        outputs = model(**inputs)
        logits  = outputs.get("logits")

        # Training: KL(model || soft_target) per example (weighted)
        if self.model.training and soft is not None:
            logp = F.log_softmax(logits, dim=-1)
            targ = soft.to(logp.dtype)
            loss_vec = F.kl_div(logp, targ, reduction="none").sum(dim=-1)
            if weights is not None:
                loss_vec = loss_vec * weights.to(loss_vec.dtype)

        # Evaluation: standard cross-entropy using gold hard labels
        else:
            if labels is None:
                raise ValueError("Labels must be provided for evaluation.")
            loss_vec = F.cross_entropy(logits, labels, reduction="none")

        loss = loss_vec.mean()
        return (loss, outputs) if return_outputs else loss

# Metrics reported on FEWSHOT during training and on TEST at the end.
def compute_metrics_fn(eval_pred):
    logits, labels = eval_pred
    yhat = logits.argmax(-1)
    macro = f1_score(labels, yhat, average="macro", labels=[0,1,2])
    per   = f1_score(labels, yhat, average=None, labels=[0,1,2])
    return {
        "accuracy":  accuracy_score(labels, yhat),
        "macro_f1":  macro,
        "f1_neg":    per[0],
        "f1_neu":    per[1],
        "f1_pos":    per[2],
    }

# -----------------------------
# 6) Build fewshot/test datasets
# -----------------------------
# IMPORTANT: In this pipeline mode, FEWSHOT is the ONLY gold data used for:
# - early stopping
# - checkpoint selection (best model)
# TEST remains completely held out for final reporting.
df_fs_gold   = ensure_gold_df(df_fewshot.copy())
df_test_gold = ensure_gold_df(df_test.copy())
dsv_fs, dsv_test = GoldDataset(df_fs_gold), GoldDataset(df_test_gold)

# -----------------------------
# 7) Initialize model
# -----------------------------
# Fresh student initialization (Stage-A training will update this model in-place across curriculum stages).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3, id2label=id2lab, label2id=lab2id
)

# -----------------------------
# 8) Training loop: HI -> MID -> ALL
# -----------------------------
def train_one_stage(stage_name: str, train_df: pd.DataFrame, num_epochs: int, output_dir: Path):
    """
    Train one curriculum stage on pseudo-labeled data while monitoring FEWSHOT gold.
    - load_best_model_at_end=True selects the checkpoint with best FEWSHOT macro-F1 for this stage.
    - EarlyStoppingCallback stops if FEWSHOT macro-F1 does not improve for 2 eval steps.
    """
    ds_train = PseudoDataset(train_df)

    args = TrainingArguments(
        output_dir=str(output_dir),
        num_train_epochs=num_epochs,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WD,
        warmup_ratio=WARMUP,

        # Evaluate & save every epoch (small datasets / short training).
        eval_strategy="epoch",
        save_strategy="epoch",

        # Always keep the best checkpoint according to FEWSHOT macro-F1.
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",

        logging_steps=50,
        save_total_limit=2,
        seed=SEED,
        report_to=[],
        remove_unused_columns=False
    )

    trainer = SoftWeightedTrainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=dsv_fs,  # <-- FEWSHOT used as the "dev" monitor in this pipeline mode
        compute_metrics=compute_metrics_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print(f"\n[TRAIN] Stage {stage_name}: n={len(train_df)} epochs={num_epochs}")
    trainer.train()

    fs_metrics = trainer.evaluate(dsv_fs)
    print(f"[FEWSHOT] {stage_name}:", {k: round(v,4) for k,v in fs_metrics.items()})

    # Save a convenient alias "checkpoint-best" for later loading.
    trainer.save_model(str(output_dir / "checkpoint-best"))
    return fs_metrics

# Curriculum training directory structure:
# outputs/phase3_noextra/stageA_cal_soft/{hi,mid,all}/checkpoint-best
stage_dir = OUT_DIR / "stageA_cal_soft"; stage_dir.mkdir(parents=True, exist_ok=True)
m1 = train_one_stage("HI",  df_hi,  EPOCHS_HI,  stage_dir / "hi")
m2 = train_one_stage("MID", df_mid, EPOCHS_MID, stage_dir / "mid")
m3 = train_one_stage("ALL", df_all, EPOCHS_ALL, stage_dir / "all")

# -----------------------------
# 9) Final evaluation on TEST
# -----------------------------
# We reload the best checkpoint from the final curriculum stage ("ALL") and evaluate on TEST once.
final_ckpt = str((stage_dir / "all" / "checkpoint-best"))
model = AutoModelForSequenceClassification.from_pretrained(
    final_ckpt, num_labels=3, id2label=id2lab, label2id=lab2id
)

trainer_final = SoftWeightedTrainer(
    model=model,
    args=TrainingArguments(
        output_dir=str(OUT_DIR / "final_eval"),
        per_device_eval_batch_size=BATCH_SIZE,
        report_to=[]
    ),
    eval_dataset=dsv_test,
    compute_metrics=compute_metrics_fn
)

test_metrics = trainer_final.evaluate()
print("\n[TEST] FINAL:", {k: round(v,4) for k,v in test_metrics.items()})

# -----------------------------
# 10) Save TEST predictions (text + gold + predicted probs)
# -----------------------------
def _softmax_np(x):
    """Numpy softmax for turning logits -> probabilities."""
    x = x - x.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / np.clip(e.sum(axis=1, keepdims=True), 1e-8, None)

pred = trainer_final.predict(dsv_test)
logits = pred.predictions
probs  = _softmax_np(logits)
yhat   = probs.argmax(axis=1)
conf   = probs.max(axis=1)

# Save a clean prediction file that can be used for:
# - confusion matrix plotting
# - error analysis
# - reporting metrics in thesis tables
out = df_test_gold[["text", "label_str", "label_id"]].copy()
out["pred_id"] = yhat
out["pred_str"] = [LABELS[i] for i in yhat]
out["pred_confidence"] = conf.astype(np.float32)
for j, l in enumerate(LABELS):
    out[f"pred_prob_{l}"] = probs[:, j].astype(np.float32)

PRED_DIR = OUT_DIR / "final_eval"
PRED_DIR.mkdir(parents=True, exist_ok=True)
out.to_csv(PRED_DIR / "preds_test.csv", index=False, encoding="utf-8-sig")
print(f"[SAVE] Stage-A test preds -> {PRED_DIR / 'preds_test.csv'}")

# -----------------------------
# 11) Save manifest for reproducibility
# -----------------------------
# Manifest keeps all key hyperparams and final test metrics in one JSON file.
with open(OUT_FILES["phase3"]["manifest"],"w") as f:
    json.dump({
        "model_name": MODEL_NAME, "seed": SEED, "max_len": MAX_LEN, "batch_size": BATCH_SIZE,
        "epochs": {"hi": EPOCHS_HI, "mid": EPOCHS_MID, "all": EPOCHS_ALL},
        "learning_rate": LR,
        "weights_range": [float(df_pseudo['weight'].min()), float(df_pseudo['weight'].max())],
        "csv_used": str(csv_path),
        "final_test": {k: float(v) for k,v in test_metrics.items()}
    }, f, indent=2)

print(f"[DONE] Saved manifest to {OUT_FILES['phase3']['manifest']}")


[Phase3] Pseudo rows: 7372 | soft-targets attached | weights in [0.56, 1.60]
[Curriculum] hi=4157 | mid=6316 | all=7372
[SAVE] Curriculum CSVs -> /content/outputs/phase3_noextra


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[TRAIN] Stage HI: n=4157 epochs=2


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.307,0.724654,0.703125,0.705743,0.716418,0.631579,0.769231
2,0.2541,0.694331,0.729167,0.723556,0.769231,0.614035,0.787402


[FEWSHOT] HI: {'eval_loss': 0.6943, 'eval_accuracy': 0.7292, 'eval_macro_f1': 0.7236, 'eval_f1_neg': 0.7692, 'eval_f1_neu': 0.614, 'eval_f1_pos': 0.7874, 'eval_runtime': 0.5016, 'eval_samples_per_second': 382.754, 'eval_steps_per_second': 23.922, 'epoch': 2.0}

[TRAIN] Stage MID: n=6316 epochs=2


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.2486,0.69763,0.71875,0.721194,0.746032,0.647059,0.770492
2,0.1929,0.66614,0.765625,0.764992,0.80916,0.698413,0.787402


[FEWSHOT] MID: {'eval_loss': 0.6661, 'eval_accuracy': 0.7656, 'eval_macro_f1': 0.765, 'eval_f1_neg': 0.8092, 'eval_f1_neu': 0.6984, 'eval_f1_pos': 0.7874, 'eval_runtime': 0.4954, 'eval_samples_per_second': 387.589, 'eval_steps_per_second': 24.224, 'epoch': 2.0}

[TRAIN] Stage ALL: n=7372 epochs=2


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,0.2133,0.658626,0.776042,0.775965,0.83871,0.704,0.785185
2,0.1729,0.65532,0.760417,0.760327,0.806202,0.703125,0.771654


[FEWSHOT] ALL: {'eval_loss': 0.6586, 'eval_accuracy': 0.776, 'eval_macro_f1': 0.776, 'eval_f1_neg': 0.8387, 'eval_f1_neu': 0.704, 'eval_f1_pos': 0.7852, 'eval_runtime': 0.5004, 'eval_samples_per_second': 383.664, 'eval_steps_per_second': 23.979, 'epoch': 2.0}



[TEST] FINAL: {'eval_loss': 0.6668, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.7488, 'eval_macro_f1': 0.7439, 'eval_f1_neg': 0.727, 'eval_f1_neu': 0.6963, 'eval_f1_pos': 0.8083, 'eval_runtime': 4.713, 'eval_samples_per_second': 320.179, 'eval_steps_per_second': 20.157}
[SAVE] Stage-A test preds -> /content/outputs/phase3_noextra/final_eval/preds_test.csv
[DONE] Saved manifest to /content/outputs/phase3_noextra/manifest.json


In [13]:
# ==========================================================
# Phase 3B (Optional) — Few-shot Gold Refine (No-Extra-Data)
#
# Goal of Phase 3B:
# - Take the Stage-A checkpoint produced in Phase 3 (trained on pseudo labels)
#   and do a light *gold* fine-tune using ONLY the few-shot labeled set.
# - Still respects "No-Extra-Data": we do not add new training data beyond the
#   already-approved few-shot pack (and we still keep TEST held out).
#
# Why this phase exists:
# - Stage-A learns general decision boundaries from large pseudo-labeled data,
#   but can inherit noise/bias from weak supervision.
# - A short refinement on a small, trusted gold set often:
#     (1) corrects label noise,
#     (2) improves calibration,
#     (3) slightly boosts macro-F1, especially on minority classes.
#
# Important note about evaluation/early stopping here:
# - We do NOT use a DEV split.
# - We reuse the SAME few-shot set as "eval_dataset" only for early stopping and
#   best-checkpoint selection.
# - TEST is never used for training/selection; it remains final evaluation only.
# ==========================================================
import os, json
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, set_seed
)
from sklearn.metrics import accuracy_score, f1_score

# --- config ---
# Output directory for Phase-3B artifacts (refined checkpoint, test predictions, manifest).
REF_OUT = OUT_PATHS["phase3"] / "fewshot_refine"; REF_OUT.mkdir(parents=True, exist_ok=True)

# Input checkpoint = best Stage-A checkpoint from Phase 3 (trained with pseudo-label curriculum).
CKPT_IN = OUT_FILES["phase3"]["final_ckpt"]
assert CKPT_IN.exists(), f"Phase 3B needs Phase 3 checkpoint: {CKPT_IN}"

# Refinement hyperparameters:
# - small LR + few epochs because dataset is tiny (fewshot) and can overfit easily.
REF_EPOCHS, REF_LR, REF_BS, SEED, MAX_LEN = 5, 1e-5, 16, 42, 256

# Fixed label set (must match earlier phases).
LABELS = ["negative","neutral","positive"]
lab2id = {l:i for i,l in enumerate(LABELS)}
id2lab = {i:l for l,i in lab2id.items()}

def norm_label(x):
    """Normalize label strings/ints into canonical strings (negative/neutral/positive)."""
    if isinstance(x, str):
        xx = x.strip().lower()
        if xx in {"neg","negative","-1"}: return "negative"
        if xx in {"neu","neutral","0"}:   return "neutral"
        if xx in {"pos","positive","1"}:  return "positive"
        return xx
    if isinstance(x, (int, np.integer)):
        # Assumes 0/1/2 encoding when integer labels are used.
        return LABELS[int(x)]
    return x

def ensure_gold(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a dataframe containing gold labels into the canonical format:
    - text
    - label_id (0/1/2)
    Supports:
    - a single label column (label_str/label/gold/...)
    - one-hot triplet columns (negative/neutral/positive)
    """
    col = None
    for c in ["label_str","label","gold","y","target","sentiment","polarity","class","category"]:
        if c in df.columns:
            col = c
            break

    # One-hot case: columns negative/neutral/positive exist
    if col is None and all(k in df.columns for k in ["negative","neutral","positive"]):
        trip = df[["negative","neutral","positive"]].to_numpy()
        yid = trip.argmax(axis=1).astype(int)
        return pd.DataFrame({"text": df["text"], "label_id": yid})

    if col is None:
        raise RuntimeError("Gold column not found for few-shot refine.")

    out = df[["text", col]].copy()
    out["label_str"] = out[col].apply(norm_label)
    out = out.dropna(subset=["text","label_str"])

    # Map string labels -> ids
    out["label_id"] = out["label_str"].map(lab2id)
    return out[["text","label_id"]].reset_index(drop=True)

# ----------------------------------------------------------
# 1) Load FEWSHOT gold (ONLY source of training data here)
# ----------------------------------------------------------
# Prefer in-RAM df_fewshot if already loaded earlier, else load from outputs/data/fewshot.csv.
if 'df_fewshot' in globals():
    df_gold_small = ensure_gold(df_fewshot.copy()); src = "df_fewshot"
else:
    fs_path = OUT_FILES["splits"]["fewshot"]
    if not fs_path.exists():
        raise RuntimeError("[Refine] fewshot split is required (no dev fallback). Create/upload OUT_FILES['splits']['fewshot'] first.")
    df_gold_small = ensure_gold(pd.read_csv(fs_path)); src = str(fs_path)

print(f"[Refine] Using gold from {src}: n={len(df_gold_small)}")

# ----------------------------------------------------------
# 2) Build datasets (tokenize once into tensors)
# ----------------------------------------------------------
set_seed(SEED)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

class GoldDataset(Dataset):
    """Simple gold dataset for supervised fine-tuning/evaluation."""
    def __init__(self, df):
        enc = tokenizer(df["text"].tolist(), padding=True, truncation=True,
                        max_length=MAX_LEN, return_tensors="pt")
        self.input_ids = enc["input_ids"]
        self.att_mask  = enc["attention_mask"]
        self.labels    = torch.tensor(df["label_id"].tolist(), dtype=torch.long)
    def __len__(self): return len(self.labels)
    def __getitem__(self, i):
        return {
            "input_ids": self.input_ids[i],
            "attention_mask": self.att_mask[i],
            "labels": self.labels[i]
        }

# Training dataset for refinement = FEWSHOT gold
dsv_train = GoldDataset(df_gold_small)

# (Legacy helper kept for flexibility; not used in this simplified "fewshot-only eval" mode)
def _ensure_eval_gold(df):
    g = df.copy()
    if "label_id" not in g.columns:
        col = next((c for c in ["label_str","label","gold","y","target","sentiment","polarity","class","category"] if c in g.columns), None)
        g["label_str"] = g[col].apply(norm_label)
        g["label_id"] = g["label_str"].map(lab2id)
    return GoldDataset(g[["text","label_id"]])

# ----------------------------------------------------------
# 3) Evaluation strategy for Stage-B
# ----------------------------------------------------------
# Evaluation set for Stage-B: reuse the same fewshot set for early stopping / best checkpoint.
# This avoids using dev entirely, but may slightly bias selection toward the fewshot set.
dsv_dev = dsv_train

# Test set remains gold test (held-out final evaluation only)
df_test_df = ensure_gold(pd.read_csv(OUT_FILES["splits"]["test"]))
df_test_df["label_str"] = df_test_df["label_id"].map(id2lab)  # helpful for later saved CSV
dsv_test = GoldDataset(df_test_df)

# ----------------------------------------------------------
# 4) Load Stage-A checkpoint and fine-tune on FEWSHOT
# ----------------------------------------------------------
# Start from Phase-3 "Stage-A" model weights and refine with standard supervised training.
model = AutoModelForSequenceClassification.from_pretrained(
    str(CKPT_IN), num_labels=3, id2label=id2lab, label2id=lab2id
)

args = TrainingArguments(
    output_dir=str(REF_OUT),
    num_train_epochs=REF_EPOCHS,
    per_device_train_batch_size=REF_BS,
    per_device_eval_batch_size=REF_BS,
    learning_rate=REF_LR,

    # Evaluate every epoch (dataset is small; overhead is minimal)
    eval_strategy="epoch",
    save_strategy="epoch",

    # Keep best checkpoint using FEWSHOT macro-F1
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",

    logging_steps=50,
    save_total_limit=2,
    seed=SEED,
    report_to=[],
)

def metrics_fn(eval_pred):
    """Compute accuracy + macro-F1 + per-class F1 on the evaluation set."""
    logits, labels = eval_pred
    yhat = logits.argmax(-1)
    macro = f1_score(labels, yhat, average="macro", labels=[0,1,2])
    per   = f1_score(labels, yhat, average=None, labels=[0,1,2])
    return {
        "accuracy":  accuracy_score(labels, yhat),
        "macro_f1":  macro,
        "f1_neg":    per[0],
        "f1_neu":    per[1],
        "f1_pos":    per[2],
    }

# Trainer uses early stopping to reduce overfitting on tiny fewshot data.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dsv_train,
    eval_dataset=dsv_dev,
    compute_metrics=metrics_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print(f"[Refine] Starting few-shot refine from {CKPT_IN} for {REF_EPOCHS} epoch(s)")
trainer.train()

# FEWSHOT metrics are reported as "DEV" here only because HF Trainer expects an eval set.
dev_metrics  = trainer.evaluate(dsv_dev)
test_metrics = trainer.evaluate(dsv_test)
print("[Refine][FEWSHOT] ", {k: round(v,4) for k,v in dev_metrics.items()})
print("[Refine][TEST]", {k: round(v,4) for k,v in test_metrics.items()})

# ----------------------------------------------------------
# 5) Save Stage-A+B TEST predictions (for thesis tables/analysis)
# ----------------------------------------------------------
def _softmax_np(x):
    """Numpy softmax for turning logits -> probabilities."""
    x = x - x.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / np.clip(e.sum(axis=1, keepdims=True), 1e-8, None)

# Reload test split as a DF (ensures we have clean "text" + gold labels for saving)
df_test_df = ensure_gold(pd.read_csv(OUT_FILES["splits"]["test"]))
df_test_df["label_str"] = df_test_df["label_id"].map(id2lab)

pred = trainer.predict(dsv_test)
logits = pred.predictions
probs  = _softmax_np(logits)
yhat   = probs.argmax(axis=1)
conf   = probs.max(axis=1)

# Output CSV includes:
# - gold label (label_str/label_id)
# - predicted label + confidence + probabilities (useful for error analysis)
out = df_test_df[["text","label_str","label_id"]].copy()
out["pred_id"] = yhat
out["pred_str"] = [LABELS[i] for i in yhat]
out["pred_confidence"] = conf.astype(np.float32)
for j,l in enumerate(LABELS):
    out[f"pred_prob_{l}"] = probs[:, j].astype(np.float32)

OUT_PATHS["phase3b"].mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATHS["phase3b"] / "preds_test.csv", index=False, encoding="utf-8-sig")
print(f"[SAVE] Stage-A+B test preds -> {OUT_PATHS['phase3b'] / 'preds_test.csv'}")

# ----------------------------------------------------------
# 6) Save refined checkpoint + manifest
# ----------------------------------------------------------
# Save best refined model checkpoint and a small manifest for reproducibility.
trainer.save_model(str(OUT_FILES["phase3b"]["final_ckpt"]))
with open(OUT_FILES["phase3b"]["manifest"],"w") as f:
    json.dump({
        # "final_dev" is FEWSHOT metrics (used for early stopping/selection)
        "final_dev":  {k: float(v) for k,v in dev_metrics.items()},
        "final_test": {k: float(v) for k,v in test_metrics.items()}
    }, f, indent=2)

print(f"[Refine] Saved refined checkpoint to {OUT_FILES['phase3b']['final_ckpt']}")


[Refine] Using gold from df_fewshot: n=192
[Refine] Starting few-shot refine from /content/outputs/phase3_noextra/stageA_cal_soft/all/checkpoint-best for 5 epoch(s)


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Neg,F1 Neu,F1 Pos
1,No log,0.562447,0.807292,0.80752,0.852713,0.753846,0.816
2,No log,0.492626,0.822917,0.823977,0.88189,0.77037,0.819672
3,No log,0.423726,0.869792,0.869408,0.888889,0.825397,0.893939
4,No log,0.386685,0.875,0.87466,0.904762,0.832,0.887218
5,0.564200,0.374784,0.875,0.87466,0.904762,0.832,0.887218


[Refine][FEWSHOT]  {'eval_loss': 0.3867, 'eval_accuracy': 0.875, 'eval_macro_f1': 0.8747, 'eval_f1_neg': 0.9048, 'eval_f1_neu': 0.832, 'eval_f1_pos': 0.8872, 'eval_runtime': 0.4938, 'eval_samples_per_second': 388.812, 'eval_steps_per_second': 24.301, 'epoch': 5.0}
[Refine][TEST] {'eval_loss': 0.6302, 'eval_accuracy': 0.7594, 'eval_macro_f1': 0.7569, 'eval_f1_neg': 0.7533, 'eval_f1_neu': 0.701, 'eval_f1_pos': 0.8164, 'eval_runtime': 4.7543, 'eval_samples_per_second': 317.399, 'eval_steps_per_second': 19.982, 'epoch': 5.0}
[SAVE] Stage-A+B test preds -> /content/outputs/phase3_noextra_refine/preds_test.csv
[Refine] Saved refined checkpoint to /content/outputs/phase3_noextra_refine/checkpoint-best


In [14]:
# ==========================================================
# NOTE: Running the same framework with other backbones
# ==========================================================
# This notebook is written with XLM-R as the default backbone:
#   MODEL_NAME = "xlm-roberta-base"
#
# To run the *same pipeline* (Phase-1/2/3/3B) with a different multilingual model,
# you generally only need to replace the HuggingFace model identifier wherever
# MODEL_NAME (and tokenizer/model .from_pretrained) is used.
#
# Recommended alternatives (HuggingFace model names):
#   1) XLM-T:
#      MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"   # (XLM-T base)
#
#   2) mDeBERTa v3 base:
#      MODEL_NAME = "microsoft/mdeberta-v3-base"
#
#   3) MuRIL (Urdu/Hindi focused):
#      MODEL_NAME = "google/muril-base-cased"
#
#   4) Multilingual BERT:
#      MODEL_NAME = "bert-base-multilingual-cased"
#
# What to update (minimal):
#   - Replace MODEL_NAME in the config section(s)
#   - Ensure tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#   - Ensure model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, ...)
#
# Notes:
#   - Keep num_labels=3 and the same LABEL mappings.
#   - If a model has a shorter max length or you face GPU OOM, reduce MAX_LEN or BATCH_SIZE.
#   - Phase-2 XNLI teacher remains the same (joeddav/xlm-roberta-large-xnli) unless you intentionally change it.
