In [None]:
!pip -q install "transformers==4.41.2" "accelerate==0.31.0" "datasets==2.19.0"

import os, json, random
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m122.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.23.0+cu126 requires torch==2.8.0, but you have torch 2.5.1+cpu which is incompati

In [None]:
BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LABELING = BASE / "labeling"
MODELS = BASE / "models"

# EESA (جاهزة بسplits)
FP_EESA_TR = DATA / "eesa_train.jsonl"
FP_EESA_DE = DATA / "eesa_dev.jsonl"
FP_EESA_TE = DATA / "eesa_test.jsonl"

# AMG و MR (final labels بعد majority vote)
# ملاحظة: لو عندك AMG بصيغة jsonl استخدم .jsonl، ولو CSV استخدم read_csv تحت
FP_AMG_FINAL = DATA / "amg_cs_final_adjudicated.jsonl"   # غيّرها لو عندك CSV
FP_MR_FINAL  = LABELING / "mr_cs_final_adjudicated.csv"   # غالباً CSV


In [None]:
LABELS = ["pos","neu","neg"]
label_fix = {"positive":"pos","negative":"neg","negitive":"neg","neutral":"neu"}

def read_jsonl(fp: Path):
    rows=[]
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line: continue
            rows.append(json.loads(line))
    return rows

def normalize_row(obj, text_keys=("text","sentence","utterance"),
                  label_keys=("label","final_label","sentiment","gold","target")):
    # التكســت
    text = None
    for k in text_keys:
        if k in obj and isinstance(obj[k], str) and obj[k].strip():
            text = obj[k].strip()
            break
    assert text is not None, f"no text field in: {obj.keys()}"

    # الليبــل
    lab = None
    for k in label_keys:
        if k in obj and isinstance(obj[k], str) and obj[k].strip():
            lab = obj[k].strip().lower()
            lab = label_fix.get(lab, lab)
            break
    assert lab in LABELS, f"unknown label: {lab}"

    return {"text": text, "label": lab}

def show_dist(name, rows):
    c = Counter([r["label"] for r in rows])
    print(f"{name} → {len(rows)} samples")
    print(c); print("-"*40)


In [None]:
# 1) EESA (جاهزة)
eesa_tr = [normalize_row(r) for r in read_jsonl(FP_EESA_TR)]
eesa_de = [normalize_row(r) for r in read_jsonl(FP_EESA_DE)]
eesa_te = [normalize_row(r) for r in read_jsonl(FP_EESA_TE)]

show_dist("EESA train", eesa_tr)
show_dist("EESA dev",   eesa_de)
show_dist("EESA test",  eesa_te)

# 2) AMG (حاول نقرأ JSONL أولاً، لو فشل جرّب CSV)
amg_rows = None
try:
    amg_rows = [normalize_row(r) for r in read_jsonl(FP_AMG_FINAL)]
except Exception as e:
    print("AMG jsonl read failed, trying CSV...", e)
    df_amg = pd.read_csv(FP_AMG_FINAL)  # لو الملف فعلاً CSV غيّر الامتداد فوق
    # حدد أعمدة text/label تلقائي
    tx_col = "text" if "text" in df_amg.columns else [c for c in df_amg.columns if "text" in c.lower()][0]
    lb_col = "final_label" if "final_label" in df_amg.columns else [c for c in df_amg.columns if "label" in c.lower()][0]
    df_amg = df_amg[[tx_col, lb_col]].rename(columns={tx_col:"text", lb_col:"label"})
    df_amg["label"] = df_amg["label"].map(lambda x: label_fix.get(str(x).lower().strip(), str(x).lower().strip()))
    df_amg = df_amg[df_amg["label"].isin(LABELS)].dropna(subset=["text","label"])
    amg_rows = df_amg.to_dict(orient="records")

# 80/20 stratified
amg_df = pd.DataFrame(amg_rows)
amg_tr_df, amg_de_df = train_test_split(
    amg_df, test_size=0.20, random_state=SEED, stratify=amg_df["label"]
)
amg_tr = amg_tr_df.to_dict(orient="records")
amg_de = amg_de_df.to_dict(orient="records")

show_dist("AMG train", amg_tr)
show_dist("AMG dev",   amg_de)

# 3) MR (غالباً CSV نهائي)
mr_df_raw = pd.read_csv(FP_MR_FINAL)
# اكتشاف تلقائي للأعمدة
mr_tx = "text" if "text" in mr_df_raw.columns else [c for c in mr_df_raw.columns if "text" in c.lower()][0]
mr_lb = "final_label" if "final_label" in mr_df_raw.columns else [c for c in mr_df_raw.columns if "label" in c.lower()][0]

mr_df = mr_df_raw[[mr_tx, mr_lb]].rename(columns={mr_tx:"text", mr_lb:"label"})
mr_df["label"] = mr_df["label"].map(lambda x: label_fix.get(str(x).lower().strip(), str(x).lower().strip()))
mr_df = mr_df[mr_df["label"].isin(LABELS)].dropna(subset=["text","label"]).reset_index(drop=True)

print("MR all (cleaned) →", len(mr_df), Counter(mr_df["label"]))
print("-"*40)

mr_tr_df, mr_de_df = train_test_split(
    mr_df, test_size=0.20, random_state=SEED, stratify=mr_df["label"]
)
mr_tr = mr_tr_df.to_dict(orient="records")
mr_de = mr_de_df.to_dict(orient="records")

print("MR train →", len(mr_tr), Counter([r["label"] for r in mr_tr]))
print("MR dev   →", len(mr_de), Counter([r["label"] for r in mr_de]))
print("-"*40)


EESA train → 2463 samples
Counter({'pos': 1092, 'neu': 778, 'neg': 593})
----------------------------------------
EESA dev → 817 samples
Counter({'pos': 363, 'neu': 258, 'neg': 196})
----------------------------------------
EESA test → 817 samples
Counter({'pos': 362, 'neu': 258, 'neg': 197})
----------------------------------------
AMG train → 298 samples
Counter({'neu': 131, 'pos': 85, 'neg': 82})
----------------------------------------
AMG dev → 75 samples
Counter({'neu': 33, 'pos': 21, 'neg': 21})
----------------------------------------
MR all (cleaned) → 1000 Counter({'neu': 578, 'pos': 244, 'neg': 178})
----------------------------------------
MR train → 800 Counter({'neu': 462, 'pos': 195, 'neg': 143})
MR dev   → 200 Counter({'neu': 116, 'pos': 49, 'neg': 35})
----------------------------------------


In [None]:
def to_jsonl(rows, fp: Path):
    with open(fp, "w", encoding="utf-8") as f:
        for r in rows: f.write(json.dumps(r, ensure_ascii=False) + "\n")

mix_train = eesa_tr + amg_tr + mr_tr
mix_dev   = eesa_de + amg_de + mr_de
mix_test  = eesa_te  # ثابت للتقييم العادل

show_dist("MIXED train", mix_train)
show_dist("MIXED dev",   mix_dev)
show_dist("TEST (EESA)", mix_test)

OUT_MIX_TR = DATA / "mixed_train.jsonl"
OUT_MIX_DE = DATA / "mixed_dev.jsonl"
to_jsonl(mix_train, OUT_MIX_TR)
to_jsonl(mix_dev,   OUT_MIX_DE)
print("Saved mixed:", OUT_MIX_TR, "and", OUT_MIX_DE)


MIXED train → 3561 samples
Counter({'pos': 1372, 'neu': 1371, 'neg': 818})
----------------------------------------
MIXED dev → 1092 samples
Counter({'pos': 433, 'neu': 407, 'neg': 252})
----------------------------------------
TEST (EESA) → 817 samples
Counter({'pos': 362, 'neu': 258, 'neg': 197})
----------------------------------------
Saved mixed: /content/drive/MyDrive/cs-senti/data/mixed_train.jsonl and /content/drive/MyDrive/cs-senti/data/mixed_dev.jsonl


In [None]:
LABELS = ["pos","neu","neg"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

class JsonlDS(Dataset):
    def __init__(self, rows, tok, max_len=160):
        self.rows = rows; self.tok = tok; self.max_len=max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, i):
        x = self.rows[i]
        enc = self.tok(x["text"], max_length=self.max_len, truncation=True, padding="max_length")
        enc["labels"] = label2id[x["label"]]
        return {k: torch.tensor(v) for k,v in enc.items()}

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3, id2label=id2label, label2id=label2id
).to(device)

ds_tr = JsonlDS(mix_train, tok, 160)
ds_de = JsonlDS(mix_dev,   tok, 160)
ds_te = JsonlDS(mix_test,  tok, 160)

dl_tr = DataLoader(ds_tr, batch_size=16, shuffle=True)
dl_de = DataLoader(ds_de, batch_size=32, shuffle=False)
dl_te = DataLoader(ds_te, batch_size=32, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW

EPOCHS = 3
optim = AdamW(model.parameters(), lr=2e-5)
best_f1 = -1.0
best_state = None

def eval_loop(dataloader):
    model.eval()
    preds=[]; gold=[]
    with torch.no_grad():
        for batch in dataloader:
            labels = batch["labels"].numpy().tolist()
            batch = {k:v.to(device) for k,v in batch.items()}
            logits = model(**batch).logits.detach().cpu().numpy()
            preds.extend(logits.argmax(axis=1).tolist())
            gold.extend(labels)
    return gold, preds, f1_score(gold, preds, average="macro")

for ep in range(1, EPOCHS+1):
    model.train()
    for batch in dl_tr:
        batch = {k:v.to(device) for k,v in batch.items()}
        out = model(**batch)
        out.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step(); optim.zero_grad()

    g,p,f1 = eval_loop(dl_de)
    print(f"Epoch {ep} → dev macro-F1 = {f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_state = model.state_dict().copy()

# load best and save
model.load_state_dict(best_state)
out_dir = MODELS / "xlmr_sentiment_eesa_amg_mr"
out_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(out_dir.as_posix())
tok.save_pretrained(out_dir.as_posix())
print("✅ Saved best to:", out_dir)


Epoch 1 → dev macro-F1 = 0.7435
Epoch 2 → dev macro-F1 = 0.7723
Epoch 3 → dev macro-F1 = 0.7950
✅ Saved best to: /content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg_mr


In [None]:
gold, preds, macro = eval_loop(dl_te)
print("\n=== XLM-R (EESA+AMG+MR) on EESA TEST ===")
print(classification_report(gold, preds, target_names=LABELS, digits=4))
print("Macro-F1:", macro)



=== XLM-R (EESA+AMG+MR) on EESA TEST ===
              precision    recall  f1-score   support

         pos     0.8582    0.9365    0.8956       362
         neu     0.8039    0.7946    0.7992       258
         neg     0.8503    0.7208    0.7802       197

    accuracy                         0.8397       817
   macro avg     0.8375    0.8173    0.8250       817
weighted avg     0.8392    0.8397    0.8374       817

Macro-F1: 0.8250269133487748


In [None]:
from random import sample

# choose a small portion (~10%) from each domain dev set
amg_sample = sample(amg_de, min(30, len(amg_de)))  # 30 examples or fewer
mr_sample = sample(mr_de, min(50, len(mr_de)))     # 50 examples or fewer
eesa_sample = sample(eesa_de, min(100, len(eesa_de)))  # small slice for variety

mixed_test = amg_sample + mr_sample + eesa_sample
show_dist("MIXED test (cross-domain)", mixed_test)

OUT_MIX_TEST = DATA / "mixed_test.jsonl"
with open(OUT_MIX_TEST, "w", encoding="utf-8") as f:
    for r in mixed_test:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")


MIXED test (cross-domain) → 180 samples
Counter({'neu': 74, 'pos': 69, 'neg': 37})
----------------------------------------


In [None]:
ds_mix = JsonlDS(mixed_test, tok, 160)
dl_mix = DataLoader(ds_mix, batch_size=32, shuffle=False)

gold, preds, macro = eval_loop(dl_mix)
print("\n=== XLM-R (EESA+AMG+MR) on MIXED TEST ===")
print(classification_report(gold, preds, target_names=LABELS, digits=4))
print("Macro-F1:", macro)



=== XLM-R (EESA+AMG+MR) on MIXED TEST ===
              precision    recall  f1-score   support

         pos     0.8261    0.8261    0.8261        69
         neu     0.7561    0.8378    0.7949        74
         neg     0.8966    0.7027    0.7879        37

    accuracy                         0.8056       180
   macro avg     0.8262    0.7889    0.8029       180
weighted avg     0.8118    0.8056    0.8054       180

Macro-F1: 0.8029458464241074


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json, torch, pathlib

SRC = "/content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg_mr"
DST = "/content/drive/MyDrive/cs-senti/models/sa_mixed_v3_frozen"
pathlib.Path(DST).mkdir(parents=True, exist_ok=True)

tok = AutoTokenizer.from_pretrained(SRC)
mdl = AutoModelForSequenceClassification.from_pretrained(SRC)
tok.save_pretrained(DST); mdl.save_pretrained(DST)

# persist label map used everywhere in GAN code
json.dump({"labels":["pos","neg","neu"]}, open(f"{DST}/label_map.json","w"))


In [None]:
# /content/drive/MyDrive/cs-senti/utils/sa_reward.py
import torch, json
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class SentimentOracle:
    def __init__(self, model_dir):
        self.tok  = AutoTokenizer.from_pretrained(model_dir)
        self.mdl  = AutoModelForSequenceClassification.from_pretrained(model_dir).eval().cuda()
        self.map  = json.load(open(f"{model_dir}/label_map.json"))["labels"]
        self.idx  = {l:i for i,l in enumerate(self.map)}
    @torch.no_grad()
    def score(self, texts, targets):
        enc = self.tok(texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(self.mdl.device)
        probs = self.mdl(**enc).logits.softmax(-1)
        ids   = torch.tensor([self.idx[t] for t in targets], device=probs.device)
        return probs[torch.arange(len(texts), device=probs.device), ids].detach().cpu().numpy()


LEXICAL REPLACEMENTS

In [None]:
# Fallback: downgrade Torch to a version before the new default
!pip -q install "torch==2.5.1" --index-url https://download.pytorch.org/whl/cpu
import os, sys; print(torch.__version__)
# Then Runtime -> Restart and run the cell again (no need for the add_safe_globals hack)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.6/174.6 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m123.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.23.0+cu126 requires torch==2.8.0, but you have torch 2.5.1+cpu which is incompatible.
torchaudio 2.8.0+cu126 requires torch==2.8.0, but you have torch 2.5.1+cpu which is incompatible.[0m[31m
[0m2.8.0+cu126


In [None]:
# --- FIX for PyTorch 2.6 "weights_only" UnpicklingError with Stanza ---
import numpy as np
import torch
import torch.serialization as ts

# Allow-list NumPy globals used by older checkpoints
ts.add_safe_globals([
    np.core.multiarray._reconstruct,
    np.dtype,
    np.ufunc,
    np.ndarray,
])

import stanza, spacy

# (Re)download Arabic models if needed (safe to call repeatedly)
stanza.download('ar')

# Build Stanza Arabic pipeline (CPU is fine; set use_gpu=True if CUDA OK)
ar_nlp = stanza.Pipeline(
    'ar',
    processors='tokenize,mwt,pos,lemma,ner',
    tokenize_pretokenized=False,
    verbose=False,
    use_gpu=False  # change to True if your session has CUDA and works
)

# English spaCy (you already installed en_core_web_sm in Cell 1)
nlp_en = spacy.load("en_core_web_sm", disable=["ner","parser","lemmatizer"])

print("✅ Pipelines ready: Stanza(ar) + spaCy(en)")

# quick sanity check
doc = ar_nlp("انا بحب القهوة من Starbucks")
for s in doc.sentences:
    print([(w.text, w.upos, w.lemma) for w in s.words])


  np.core.multiarray._reconstruct,


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ar (Arabic) ...
INFO:stanza:File exists: /root/stanza_resources/ar/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  checkpoint = torch.load(filename, lambda storage, loc: storage)


✅ Pipelines ready: Stanza(ar) + spaCy(en)
[('انا', 'X', 'انا'), ('بحب', 'X', 'بحب'), ('القهوة', 'NOUN', 'قُهوَة'), ('من', 'ADP', 'مِن'), ('Star', 'X', 'Star'), ('bucks', 'X', 'bucks')]


In [None]:
# === Cell: POS + Morph feature extraction and switch-candidate tagging ===
import re, json, math, random
from pathlib import Path
from collections import Counter, defaultdict

# --- simple language id per token
_ar_re = re.compile(r"[\u0600-\u06FF]")   # Arabic block
_en_re = re.compile(r"[A-Za-z]")

def token_lang(t: str) -> str:
    has_ar = bool(_ar_re.search(t))
    has_en = bool(_en_re.search(t))
    if has_ar and not has_en: return "ar"
    if has_en and not has_ar: return "en"
    if has_ar and has_en:     return "mixed"
    return "other"

# --- useful sets
CONTENT_UPOS = {"NOUN","VERB","ADJ","PROPN","ADV"}
PUNCT_UPOS   = {"PUNCT","SYM"}
AR_STOPLIKE  = set(["و","في","من","على","عن","أن","إن","كان","كانت","هو","هي","هم","يا"])  # quick heuristic

def stanza_ar_annot(text: str):
    """
    Returns per-token dicts for Arabic using stanza:
    [{'text': tok, 'upos': UPOS, 'lemma': lemma, 'feats': 'Feat=Val|...','ner': 'B-PER' or 'O'}...]
    If the line contains English/mixed tokens, they will still appear in Stanza tokens (unknown UPOS sometimes).
    """
    out = []
    doc = ar_nlp(text)
    for s in doc.sentences:
        for w in s.words:
            out.append({
                "text": w.text,
                "upos": w.upos or "X",
                "lemma": w.lemma or w.text,
                "feats": w.feats or "",
                "ner":  "O",  # we’ll fill from token-level NER result below (stanza has sentence.ents for NER spans)
            })
    # overlay NER spans (token-level tag, rough)
    # stanza stores NER on sentence.ents with character offsets; for simplicity, mark tokens that exactly match span text
    # (good enough for protecting obvious names like 'Cairo', 'Starbucks')
    try:
        idx = 0
        for s in doc.sentences:
            tok_texts = [w.text for w in s.words]
            tok_marks = ["O"] * len(tok_texts)
            for ent in s.ents:
                span = ent.text
                # naive exact match window search
                for i in range(len(tok_texts)):
                    if tok_texts[i] == span:
                        tok_marks[i] = f"B-{ent.type}"
            # write back into 'out'
            for i in range(len(tok_texts)):
                out[idx]["ner"] = tok_marks[i]
                idx += 1
    except Exception:
        pass
    return out

def spacy_en_annot(text: str):
    """
    Returns per-token dicts for English using spaCy:
    [{'text': tok, 'pos': coarse, 'lemma': lemma}...]
    """
    out = []
    doc = nlp_en(text)
    for t in doc:
        out.append({
            "text": t.text,
            "pos": t.pos_,       # coarse POS
            "lemma": t.lemma_ if t.lemma_ != "-PRON-" else t.text,
        })
    return out

def build_features_line(text: str):
    """
    Combine: language id per token + Arabic POS/morph (if ar) + English POS (if en).
    Also compute a 'switch_candidate' boolean: Arabic content word, not NER, not punctuation/stoplike.
    """
    # base tokens by simple whitespace split to align across tools reasonably
    raw_tokens = text.split()
    # stanza token stream (may differ; we’ll align by greedy scan)
    ar_ann = stanza_ar_annot(text)
    en_ann = spacy_en_annot(text)

    # make fast lookup by exact token for spaCy coarse POS (fallback only)
    en_pos_map = defaultdict(list)
    for d in en_ann:
        en_pos_map[d["text"]].append(d["pos"])

    enriched = []
    for tok in raw_tokens:
        lang = token_lang(tok)
        # find closest stanza match (first unused matching token)
        match_idx = None
        for i, d in enumerate(ar_ann):
            if d is None: continue
            if d["text"] == tok:
                match_idx = i; break
        ar_upos = "X"; ar_feats = ""; ar_lemma = tok; ar_ner = "O"
        if match_idx is not None:
            ar_upos  = ar_ann[match_idx]["upos"]
            ar_feats = ar_ann[match_idx]["feats"]
            ar_lemma = ar_ann[match_idx]["lemma"]
            ar_ner   = ar_ann[match_idx]["ner"]
            ar_ann[match_idx] = None  # consume

        # english POS fallback
        en_pos = en_pos_map[tok][0] if en_pos_map.get(tok) else ""

        # candidate rule (Arabic only): content words, not NER, not punctuation/symbol, not stoplike, reasonable length
        is_candidate = False
        if lang == "ar":
            if (ar_upos in CONTENT_UPOS) and (ar_ner == "O") and (ar_upos not in PUNCT_UPOS):
                if (tok not in AR_STOPLIKE) and (len(tok) >= 2):
                    is_candidate = True

        enriched.append({
            "tok": tok,
            "lang": lang,
            "ar_upos": ar_upos,
            "ar_feats": ar_feats,
            "ar_lemma": ar_lemma,
            "ar_ner": ar_ner,
            "en_pos": en_pos,
            "switch_candidate": bool(is_candidate),
        })
    return {
        "text": text,
        "tokens": enriched,
        "num_candidates": sum(1 for t in enriched if t["switch_candidate"]),
    }

def annotate_rows(rows, out_path: Path, limit: int = 500):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    stats = Counter()
    with open(out_path, "w", encoding="utf-8") as f:
        for i, r in enumerate(rows[:limit]):
            rec = build_features_line(r["text"])
            rec["label"] = r["label"]
            json.dump(rec, f, ensure_ascii=False)
            f.write("\n")
            stats["lines"] += 1
            stats["candidates"] += rec["num_candidates"]
    print(f"✅ wrote {stats['lines']} lines to {out_path} | total candidates={stats['candidates']} | avg per line={stats['candidates']/max(1,stats['lines']):.2f}")

# ---- run on small samples to verify (use your already-loaded eesa_de / amg_de / mr_de) ----
LING_DIR = (DATA / "ling"); LING_DIR.mkdir(parents=True, exist_ok=True)

annotate_rows(eesa_de, LING_DIR / "eesa_dev_annot.jsonl", limit=300)
annotate_rows(amg_de,  LING_DIR / "amg_dev_annot.jsonl",  limit=200)
annotate_rows(mr_de,   LING_DIR / "mr_dev_annot.jsonl",   limit=200)

# quick peek
peek_fp = LING_DIR / "eesa_dev_annot.jsonl"
print("Peek one line:")
with open(peek_fp, encoding="utf-8") as f:
    print(f.readline().strip()[:300] + " ...")


✅ wrote 300 lines to /content/drive/MyDrive/cs-senti/data/ling/eesa_dev_annot.jsonl | total candidates=1021 | avg per line=3.40
✅ wrote 75 lines to /content/drive/MyDrive/cs-senti/data/ling/amg_dev_annot.jsonl | total candidates=139 | avg per line=1.85
✅ wrote 200 lines to /content/drive/MyDrive/cs-senti/data/ling/mr_dev_annot.jsonl | total candidates=618 | avg per line=3.09
Peek one line:
{"text": "مطلوب مراجعة مسلسل loki 🙏🏻🙏🏻🙏🏻🙏🏻", "tokens": [{"tok": "مطلوب", "lang": "ar", "ar_upos": "ADJ", "ar_feats": "Case=Nom|Definite=Cons|Gender=Masc|Number=Sing", "ar_lemma": "مَطلُوب", "ar_ner": "O", "en_pos": "PROPN", "switch_candidate": true}, {"tok": "مراجعة", "lang": "ar", "ar_upos": "NOUN" ...


In [None]:
import json
from pathlib import Path
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_E_ANN = LING / "eesa_dev_annot.jsonl"
FP_A_ANN = LING / "amg_dev_annot.jsonl"
FP_M_ANN = LING / "mr_dev_annot.jsonl"

OUT_E_SK = LING / "eesa_dev_skeleton.jsonl"
OUT_A_SK = LING / "amg_dev_skeleton.jsonl"
OUT_M_SK = LING / "mr_dev_skeleton.jsonl"
OUT_ALL  = LING / "host_dev_skeleton_all.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def write_jsonl(rows, fp: Path):
    with open(fp, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def build_skeleton(annot_rows, domain: str):
    """
    Convert annotated lines into 'skeleton' entries:
      - tokens: list of token strings
      - langs: language tag per token
      - cand_indices: positions where we *may* code-switch
      - label: sentiment label
      - text: original sentence
      - domain: source dataset (eesa / amg / mr)
    Skip sentences with 0 candidates.
    """
    out = []
    total_cands = 0

    for obj in annot_rows:
        toks_meta = obj.get("tokens", [])
        if not toks_meta:
            continue

        tokens = [t["tok"] for t in toks_meta]
        langs  = [t.get("lang", "other") for t in toks_meta]

        cand_indices = [
            i for i, t in enumerate(toks_meta)
            if t.get("switch_candidate", False)
        ]

        if not cand_indices:
            # nothing to switch → not useful for GAN
            continue

        total_cands += len(cand_indices)

        sk = {
            "text": obj.get("text", ""),
            "label": obj.get("label", "neu"),  # fallback
            "tokens": tokens,
            "langs": langs,
            "cand_indices": cand_indices,
            "domain": domain
        }
        out.append(sk)

    avg_cands = total_cands / max(len(out), 1)
    print(f"{domain.upper()} skeletons → {len(out)} lines | "
          f"total candidates={total_cands} | avg per line={avg_cands:.2f}")
    return out

# 1) Load annotated dev files
eesa_ann = read_jsonl(FP_E_ANN)
amg_ann  = read_jsonl(FP_A_ANN)
mr_ann   = read_jsonl(FP_M_ANN)

print("Loaded annotated:")
print("  EESA:", len(eesa_ann))
print("  AMG :", len(amg_ann))
print("  MR  :", len(mr_ann))
print("-" * 40)

# 2) Build skeletons per domain
eesa_sk = build_skeleton(eesa_ann, "eesa")
amg_sk  = build_skeleton(amg_ann,  "amg")
mr_sk   = build_skeleton(mr_ann,   "mr")

# 3) Save per-domain skeletons
write_jsonl(eesa_sk, OUT_E_SK)
write_jsonl(amg_sk,  OUT_A_SK)
write_jsonl(mr_sk,   OUT_M_SK)

print("✅ Saved:")
print(" ", OUT_E_SK)
print(" ", OUT_A_SK)
print(" ", OUT_M_SK)

# 4) Merge into one host pool for GAN
host_all = eesa_sk + amg_sk + mr_sk
write_jsonl(host_all, OUT_ALL)

# some quick stats
lbl_counts = Counter([r["label"] for r in host_all])
dom_counts = Counter([r["domain"] for r in host_all])
print("-" * 40)
print("Merged host_dev_skeleton_all:", len(host_all), "lines")
print("Label dist:", lbl_counts)
print("Domain dist:", dom_counts)
print("✅ Saved merged host pool to:", OUT_ALL)


Loaded annotated:
  EESA: 300
  AMG : 75
  MR  : 200
----------------------------------------
EESA skeletons → 248 lines | total candidates=1021 | avg per line=4.12
AMG skeletons → 49 lines | total candidates=139 | avg per line=2.84
MR skeletons → 157 lines | total candidates=618 | avg per line=3.94
✅ Saved:
  /content/drive/MyDrive/cs-senti/data/ling/eesa_dev_skeleton.jsonl
  /content/drive/MyDrive/cs-senti/data/ling/amg_dev_skeleton.jsonl
  /content/drive/MyDrive/cs-senti/data/ling/mr_dev_skeleton.jsonl
----------------------------------------
Merged host_dev_skeleton_all: 454 lines
Label dist: Counter({'neu': 180, 'pos': 169, 'neg': 105})
Domain dist: Counter({'eesa': 248, 'mr': 157, 'amg': 49})
✅ Saved merged host pool to: /content/drive/MyDrive/cs-senti/data/ling/host_dev_skeleton_all.jsonl


In [None]:
import json, random
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_HOST_SK = LING / "host_dev_skeleton_all.jsonl"
OUT_TOY    = LING / "host_dev_switched_toy.jsonl"

random.seed(42)

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def write_jsonl(rows, fp: Path):
    with open(fp, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

# 🔤 Very small toy AR→EN “lexicon” just to see switching working
AR_EN_LEX = {
    "مسلسل": ["series", "show"],
    "فيلم": ["movie", "film"],
    "اغنية": ["song"],
    "أغنية": ["song"],
    "حاجه": ["thing"],
    "حاجة": ["thing"],
    "برنامج": ["program", "show"],
    "مراجعة": ["review"],
    "حلقة": ["episode"],
    "موبايل": ["phone"],
    "تليفون": ["phone"],
    "انترفيو": ["interview"],
    "جامعة": ["university"],
    "ماتش": ["match", "game"],
    # fallback if token not in dict
}

def pick_en_replacement(ar_token: str) -> str:
    # basic normalization
    base = ar_token.strip()
    if base in AR_EN_LEX:
        return random.choice(AR_EN_LEX[base])
    # fallback: generic English word so we still see a switch
    return "thing"

def build_toy_switched(host_rows):
    out_rows = []
    for obj in host_rows:
        tokens = obj["tokens"]
        cand_indices = obj.get("cand_indices", [])
        if not cand_indices:
            continue

        # 1) pick exactly ONE position to switch (toy setup)
        switch_pos = random.choice(cand_indices)

        new_tokens = tokens.copy()
        ar_tok = new_tokens[switch_pos]
        en_tok = pick_en_replacement(ar_tok)
        new_tokens[switch_pos] = en_tok

        switched_text = " ".join(new_tokens)

        out_rows.append({
            "orig_text": obj["text"],
            "switched_text": switched_text,
            "label": obj["label"],
            "domain": obj["domain"],
            "tokens_orig": tokens,
            "tokens_switched": new_tokens,
            "switch_pos": switch_pos,
            "switched_from": ar_tok,
            "switched_to": en_tok
        })

    return out_rows

# 1) Load skeleton pool
host_rows = read_jsonl(FP_HOST_SK)
print("Loaded skeleton rows:", len(host_rows))

# 2) Build toy switched examples
toy_rows = build_toy_switched(host_rows)
print("Toy switched rows:", len(toy_rows))

# 3) Save
write_jsonl(toy_rows, OUT_TOY)
print("✅ Saved toy switched set to:", OUT_TOY)

# 4) Peek a few examples
for ex in toy_rows[:5]:
    print("\n--- EXAMPLE ---")
    print("LABEL  :", ex["label"], "| DOMAIN:", ex["domain"])
    print("ORIG   :", ex["orig_text"])
    print("SWITCH :", ex["switched_text"])
    print("switched token:", ex["switched_from"], "→", ex["switched_to"],
          "@pos", ex["switch_pos"])


Loaded skeleton rows: 454
Toy switched rows: 454
✅ Saved toy switched set to: /content/drive/MyDrive/cs-senti/data/ling/host_dev_switched_toy.jsonl

--- EXAMPLE ---
LABEL  : neu | DOMAIN: eesa
ORIG   : مطلوب مراجعة مسلسل loki 🙏🏻🙏🏻🙏🏻🙏🏻
SWITCH : مطلوب مراجعة series loki 🙏🏻🙏🏻🙏🏻🙏🏻
switched token: مسلسل → series @pos 2

--- EXAMPLE ---
LABEL  : pos | DOMAIN: eesa
ORIG   : دنيا سمير غانم شخصية محترمة اوي و انا بجد عارفة ان في مسلسل بدل الحدوتة ٣ كان دور Bella و Lahfa و لولي هي بجد ممثلتي المفضلة
SWITCH : دنيا سمير غانم thing محترمة اوي و انا بجد عارفة ان في مسلسل بدل الحدوتة ٣ كان دور Bella و Lahfa و لولي هي بجد ممثلتي المفضلة
switched token: شخصية → thing @pos 3

--- EXAMPLE ---
LABEL  : neu | DOMAIN: eesa
ORIG   : فين مراجعة venom
SWITCH : فين review venom
switched token: مراجعة → review @pos 1

--- EXAMPLE ---
LABEL  : pos | DOMAIN: eesa
ORIG   : انتي عسل وربنا انا بحبك ILove you
SWITCH : انتي thing وربنا انا بحبك ILove you
switched token: عسل → thing @pos 1

--- EXAMPLE ---
LABEL  : pos 

In [None]:
import json, random
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti")
LING = (BASE / "data" / "ling")
FP_HOST_SK = LING / "host_dev_skeleton_all.jsonl"
OUT_TOY    = LING / "host_dev_switched_toy_clean.jsonl"

random.seed(42)

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def write_jsonl(rows, fp: Path):
    with open(fp, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

# 💡 Slightly more reasonable toy lexicon
AR_EN_LEX = {
    "مسلسل": ["series", "show"],
    "فيلم": ["movie", "film"],
    "برنامج": ["program", "show"],
    "مراجعة": ["review"],
    "حلقة": ["episode"],
    "جامعة": ["university"],
    "انترفيو": ["interview"],
    "موبايل": ["phone"],
    "تليفون": ["phone"],
    "اغنية": ["song"],
    "أغنية": ["song"],
    "ماتش": ["match", "game"],
}

def pick_en_replacement(ar_token: str):
    base = ar_token.strip()
    if base in AR_EN_LEX:
        return random.choice(AR_EN_LEX[base])
    # ❌ no fallback → return None so we can skip this switch
    return None

def build_toy_switched_clean(host_rows):
    out_rows = []
    skipped = 0
    for obj in host_rows:
        tokens = obj["tokens"]
        cand_indices = obj.get("cand_indices", [])
        if not cand_indices:
            continue

        # shuffle candidate indices to try a few until we find one with a lexicon match
        idxs = cand_indices[:]
        random.shuffle(idxs)
        switched = False

        for switch_pos in idxs:
            ar_tok = tokens[switch_pos]
            en_tok = pick_en_replacement(ar_tok)
            if en_tok is None:
                continue  # try another candidate
            new_tokens = tokens.copy()
            new_tokens[switch_pos] = en_tok
            switched_text = " ".join(new_tokens)
            out_rows.append({
                "orig_text": obj["text"],
                "switched_text": switched_text,
                "label": obj["label"],
                "domain": obj["domain"],
                "tokens_orig": tokens,
                "tokens_switched": new_tokens,
                "switch_pos": switch_pos,
                "switched_from": ar_tok,
                "switched_to": en_tok
            })
            switched = True
            break

        if not switched:
            skipped += 1

    print(f"Total rows in skeleton: {len(host_rows)}")
    print(f"Kept with at least one lexicon switch: {len(out_rows)}")
    print(f"Skipped (no lexicon match): {skipped}")
    return out_rows

# 1) Load skeleton pool
host_rows = read_jsonl(FP_HOST_SK)
print("Loaded skeleton rows:", len(host_rows))

# 2) Build cleaner toy switched examples
toy_rows = build_toy_switched_clean(host_rows)

# 3) Save
write_jsonl(toy_rows, OUT_TOY)
print("✅ Saved CLEAN toy switched set to:", OUT_TOY)

# 4) Peek
for ex in toy_rows[:5]:
    print("\n--- EXAMPLE ---")
    print("LABEL  :", ex["label"], "| DOMAIN:", ex["domain"])
    print("ORIG   :", ex["orig_text"])
    print("SWITCH :", ex["switched_text"])
    print("switched token:", ex["switched_from"], "→", ex["switched_to"],
          "@pos", ex["switch_pos"])


Loaded skeleton rows: 454
Total rows in skeleton: 454
Kept with at least one lexicon switch: 39
Skipped (no lexicon match): 415
✅ Saved CLEAN toy switched set to: /content/drive/MyDrive/cs-senti/data/ling/host_dev_switched_toy_clean.jsonl

--- EXAMPLE ---
LABEL  : neu | DOMAIN: eesa
ORIG   : مطلوب مراجعة مسلسل loki 🙏🏻🙏🏻🙏🏻🙏🏻
SWITCH : مطلوب review مسلسل loki 🙏🏻🙏🏻🙏🏻🙏🏻
switched token: مراجعة → review @pos 1

--- EXAMPLE ---
LABEL  : pos | DOMAIN: eesa
ORIG   : دنيا سمير غانم شخصية محترمة اوي و انا بجد عارفة ان في مسلسل بدل الحدوتة ٣ كان دور Bella و Lahfa و لولي هي بجد ممثلتي المفضلة
SWITCH : دنيا سمير غانم شخصية محترمة اوي و انا بجد عارفة ان في series بدل الحدوتة ٣ كان دور Bella و Lahfa و لولي هي بجد ممثلتي المفضلة
switched token: مسلسل → series @pos 12

--- EXAMPLE ---
LABEL  : neu | DOMAIN: eesa
ORIG   : فين مراجعة venom
SWITCH : فين review venom
switched token: مراجعة → review @pos 1

--- EXAMPLE ---
LABEL  : pos | DOMAIN: eesa
ORIG   : دي احسن اغنية في العالم و اروع اغنية 
اغنية 3 دقات

In [None]:
import json
from pathlib import Path

# ------------------------------
# 1) annotate_one()
# ------------------------------
def annotate_one(text, label, domain, ar_nlp, nlp_en):
    """Annotate a single text with tokens + POS + lang + switch candidates."""
    # simple whitespace tokenization
    raw_toks = text.strip().split()
    if not raw_toks:
        return None

    # Arabic pass (Stanza)
    ar_doc = ar_nlp(text)
    ar_tok_map = {}
    for s in ar_doc.sentences:
        for w in s.words:
            ar_tok_map.setdefault(w.text, []).append({
                "lang": "ar",
                "ar_upos": w.upos,
                "ar_lemma": w.lemma,
                "ar_ner": w.ner if hasattr(w, "ner") else "O",
            })

    # English POS tagging (spaCy)
    en_doc = nlp_en(text)
    en_tok_map = {t.text: t.pos_ for t in en_doc}

    annotated = []
    for tok in raw_toks:
        info = {"tok": tok}

        if tok in ar_tok_map:
            # Mark as Arabic
            info.update(ar_tok_map[tok][0])
            info["lang"] = "ar"
        else:
            # Try English fallback
            info["lang"] = "en"
            info["en_pos"] = en_tok_map.get(tok, "X")

        # Decide if switchable
        info["switch_candidate"] = (
            info["lang"] == "ar" and
            info.get("ar_upos") in ["NOUN", "ADJ"]  # switch nouns + adjectives
        )
        annotated.append(info)

    return {
        "text": text,
        "label": label,
        "domain": domain,
        "tokens": annotated,
    }


# ------------------------------
# 2) process_split()
# ------------------------------
def process_split(rows, domain, out_path, split_name="", max_lines=None):
    """Annotate many rows and write jsonl file."""
    count = 0
    total_candidates = 0

    with open(out_path, "w", encoding="utf-8") as f:
        for i, r in enumerate(rows):
            if max_lines and count >= max_lines:
                break

            out = annotate_one(
                text=r["text"],
                label=r["label"],
                domain=domain,
                ar_nlp=ar_nlp,
                nlp_en=nlp_en
            )
            if not out:
                continue

            cand = sum(1 for t in out["tokens"] if t.get("switch_candidate"))
            if cand == 0:
                continue

            total_candidates += cand
            f.write(json.dumps(out, ensure_ascii=False) + "\n")
            count += 1

            if i % 200 == 0:
                print(f"[{split_name}] processed {i} rows...")

    print(f"✅ wrote {count} lines to {out_path}")
    print(f"   total candidates={total_candidates} | avg per line={total_candidates/count if count else 0:.2f}")


In [None]:
from pathlib import Path
import json
from collections import Counter

# --- paths ---
LING = DATA / "ling"
LING.mkdir(parents=True, exist_ok=True)

FP_EESA_TR_ANNOT = LING / "eesa_train_annot.jsonl"
FP_AMG_TR_ANNOT  = LING / "amg_train_annot.jsonl"
FP_MR_TR_ANNOT   = LING / "mr_train_annot.jsonl"

# ---------- 1) annotate TRAIN splits (same as you did for DEV) ----------

# process_split(rows, domain, out_path, max_lines=None)
# we set split_name just for logging inside the function if it uses it

print("🔁 Annotating TRAIN splits...")

process_split(
    rows=eesa_tr,
    domain="eesa",
    out_path=FP_EESA_TR_ANNOT,
    split_name="eesa_train",
    max_lines=None,      # use full train
)

process_split(
    rows=amg_tr,
    domain="amg",
    out_path=FP_AMG_TR_ANNOT,
    split_name="amg_train",
    max_lines=None,
)

process_split(
    rows=mr_tr,
    domain="mr",
    out_path=FP_MR_TR_ANNOT,
    split_name="mr_train",
    max_lines=None,
)

print("✅ Finished annotating TRAIN splits.")
print("Files:")
print(" -", FP_EESA_TR_ANNOT)
print(" -", FP_AMG_TR_ANNOT)
print(" -", FP_MR_TR_ANNOT)

# ---------- 2) Build host skeleton from *_train_annot ----------

def load_annot(fp: Path, domain_hint=None):
    """Load annotated jsonl and (optionally) enforce domain field."""
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # make sure these fields exist
            if "domain" not in obj and domain_hint is not None:
                obj["domain"] = domain_hint
            rows.append(obj)
    return rows

# load all three
eesa_ann = load_annot(FP_EESA_TR_ANNOT, domain_hint="eesa")
amg_ann  = load_annot(FP_AMG_TR_ANNOT,  domain_hint="amg")
mr_ann   = load_annot(FP_MR_TR_ANNOT,   domain_hint="mr")

print("Loaded annotated TRAIN:")
print("  EESA:", len(eesa_ann))
print("  AMG :", len(amg_ann))
print("  MR  :", len(mr_ann))

host_skeleton = []
for src_rows in (eesa_ann, amg_ann, mr_ann):
    for r in src_rows:
        text   = r.get("text", "").strip()
        label  = r.get("label", "neu")
        domain = r.get("domain", "unk")
        toks   = r.get("tokens", [])

        if not text or not toks:
            continue

        # indices of candidate tokens
        cand_idx = [
            i for i, t in enumerate(toks)
            if t.get("switch_candidate", False) is True
        ]
        if not cand_idx:
            continue  # nothing interesting to switch here

        # optional filter: require at least 1 Arabic token
        has_ar = any(t.get("lang") == "ar" for t in toks)
        if not has_ar:
            continue

        host_skeleton.append({
            "text": text,
            "label": label,
            "domain": domain,
            "tokens": [t["tok"] for t in toks],
            "cand_indices": cand_idx,
        })

print("Total host skeleton rows (TRAIN):", len(host_skeleton))
print("Label distribution:", Counter([r["label"] for r in host_skeleton]))
print("Domain distribution:", Counter([r["domain"] for r in host_skeleton]))

FP_HOST_TRAIN = LING / "host_train_skeleton_all.jsonl"
with open(FP_HOST_TRAIN, "w", encoding="utf-8") as f:
    for r in host_skeleton:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("✅ Saved HOST TRAIN skeleton to:", FP_HOST_TRAIN)

# quick peek
if host_skeleton:
    ex = host_skeleton[0]
    print("\n--- EXAMPLE HOST TRAIN ---")
    print("LABEL :", ex["label"], "| DOMAIN:", ex["domain"])
    print("TEXT  :", ex["text"])
    print("TOKENS:", ex["tokens"])
    print("CAND  :", ex["cand_indices"])


🔁 Annotating TRAIN splits...
Annotating eesa_train (eesa) with up to all available lines...
✅ wrote 2463 lines to /content/drive/MyDrive/cs-senti/data/ling/eesa_train_annot.jsonl | total candidates=8812 | avg per line=3.58
Annotating amg_train (amg) with up to all available lines...
✅ wrote 298 lines to /content/drive/MyDrive/cs-senti/data/ling/amg_train_annot.jsonl | total candidates=511 | avg per line=1.71
Annotating mr_train (mr) with up to all available lines...
✅ wrote 800 lines to /content/drive/MyDrive/cs-senti/data/ling/mr_train_annot.jsonl | total candidates=2509 | avg per line=3.14
✅ Finished annotating TRAIN splits.
Files:
 - /content/drive/MyDrive/cs-senti/data/ling/eesa_train_annot.jsonl
 - /content/drive/MyDrive/cs-senti/data/ling/amg_train_annot.jsonl
 - /content/drive/MyDrive/cs-senti/data/ling/mr_train_annot.jsonl
Loaded annotated TRAIN:
  EESA: 2463
  AMG : 298
  MR  : 800
Total host skeleton rows (TRAIN): 2948
Label distribution: Counter({'neu': 1132, 'pos': 1103, 'n

In [None]:
from pathlib import Path
import json, random, math
from collections import Counter

from transformers import MarianMTModel, MarianTokenizer

# ---------------- paths ----------------
LING = DATA / "ling"
FP_HOST_TRAIN = LING / "host_train_skeleton_all.jsonl"
FP_LEXICON    = LING / "ar_en_lex_mt_from_host.json"

# ---------------- load host skeleton ----------------
host_skeleton = []
with open(FP_HOST_TRAIN, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        host_skeleton.append(json.loads(line))

print("Loaded host skeleton rows:", len(host_skeleton))

# ---------------- collect unique candidate forms ----------------
cand_forms = set()
for r in host_skeleton:
    toks = r["tokens"]
    for idx in r["cand_indices"]:
        if 0 <= idx < len(toks):
            tok = toks[idx].strip()
            if tok and any("\u0600" <= ch <= "\u06FF" for ch in tok):  # has Arabic
                cand_forms.add(tok)

cand_forms = sorted(cand_forms)
print("Unique Arabic candidate forms:", len(cand_forms))
print("Sample:", cand_forms[:20])

# ---------------- load AR→EN MT model ----------------
mt_name = "Helsinki-NLP/opus-mt-ar-en"
mt_tok   = MarianTokenizer.from_pretrained(mt_name)
mt_model = MarianMTModel.from_pretrained(mt_name).to(device)
mt_model.eval()

def translate_batch(words):
    """Translate a batch of Arabic tokens to short English words."""
    if not words:
        return []
    batch = [w if w.strip() else "UNK" for w in words]
    enc = mt_tok(batch, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        gen = mt_model.generate(**enc, max_length=6, num_beams=5)
    outs = mt_tok.batch_decode(gen, skip_special_tokens=True)

    cleaned = []
    for out in outs:
        txt = out.strip()
        # Simple cleanup
        txt = txt.replace(" .", "").replace(".", "").strip()
        # Take first content token (avoid long phrases)
        parts = txt.split()
        if not parts:
            cleaned.append("thing")   # fallback
        else:
            # drop articles like "the", "a", "an" if there is a second word
            if parts[0].lower() in {"the", "a", "an"} and len(parts) > 1:
                cleaned.append(parts[1].lower())
            else:
                cleaned.append(parts[0].lower())
    return cleaned

# ---------------- build lexicon in batches ----------------
BATCH = 32
lex = {}
for i in range(0, len(cand_forms), BATCH):
    chunk = cand_forms[i : i + BATCH]
    trans = translate_batch(chunk)
    for src, tgt in zip(chunk, trans):
        lex[src] = tgt
    print(f"Translated {i+len(chunk)}/{len(cand_forms)} candidate forms...", end="\r")

print("\nDone translating candidates.")
print("Sample lexicon entries:")
for k in list(lex.keys())[:15]:
    print(f"  {k}  →  {lex[k]}")

# optionally fix some bad ones manually if you spot them
manual_fixes = {
    # "عسل": "sweetheart",
    # "شخصية": "character",
}
lex.update(manual_fixes)

# ---------------- save lexicon ----------------
with open(FP_LEXICON, "w", encoding="utf-8") as f:
    json.dump(lex, f, ensure_ascii=False, indent=2)

print("✅ Saved lexicon to:", FP_LEXICON, "| size:", len(lex))


Loaded host skeleton rows: 2948
Unique Arabic candidate forms: 4479
Sample: ['#لحم_غزال', '5مليون', 'آخد', 'آخر', 'آخرة', 'آخـــرهـــم', 'آراء', 'آشتركوا', 'آلة', 'أبان', 'أبحاث', 'أبدًا', 'أبطال', 'أبقى', 'أبنك', 'أبني', 'أتحمل', 'أترك', 'أتسائل', 'أتعامل']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated 4479/4479 candidate forms...
Done translating candidates.
Sample lexicon entries:
  #لحم_غزال  →  ♪
  5مليون  →  $5
  آخد  →  take
  آخر  →  another
  آخرة  →  last
  آخـــرهـــم  →  last
  آراء  →  views
  آشتركوا  →  share
  آلة  →  machine
  أبان  →  aban
  أبحاث  →  research
  أبدًا  →  never
  أبطال  →  heroes
  أبقى  →  stay
  أبنك  →  your
✅ Saved lexicon to: /content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_mt_from_host.json | size: 4479


In [None]:
import json, random
from pathlib import Path

FP_LEXICON = DATA / "ling" / "ar_en_lex_mt_from_host.json"

with open(FP_LEXICON, encoding="utf-8") as f:
    lex = json.load(f)

print("Lexicon size:", len(lex))

# show 30 random entries
for src in random.sample(list(lex.keys()), 30):
    print(f"{src:15s} → {lex[src]}")


Lexicon size: 4479
علمت            → i
لقيت            → got
مشاهدا          → watching
شاعر            → poet
القصير          → short
عجيب            → wow
دردشة           → chat
رقصة            → dance
الاغانى         → songs
استماعا         → listen
القاهره         → cairo
ازعاجا          → disturbing
الاسفلت         → asphalt
تهتمش           → you
مختلفة          → different
ماتتصوري        → what
نطق             → pronounce
محظوظين         → lucky
كمّلت           → completed
توزيع           → distribution
بعيدة           → far
ياعسيلى         → oh,
الحوض           → tub
شعار            → logo
مبني            → building
الحضوري         → audience
جماهير          → masses
نعم             → yeah
يسرا            → sira
خيارين          → two


In [None]:
from collections import Counter
from pathlib import Path
import json

FP_HOST_TRAIN = DATA / "ling" / "host_train_skeleton_all.jsonl"
FP_LEXICON    = DATA / "ling" / "ar_en_lex_mt_from_host.json"

# load lexicon
with open(FP_LEXICON, encoding="utf-8") as f:
    lex = json.load(f)

# load host skeleton
host_skeleton = []
with open(FP_HOST_TRAIN, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        host_skeleton.append(json.loads(line))

cand_freq = Counter()
for r in host_skeleton:
    toks = r["tokens"]
    for idx in r["cand_indices"]:
        if 0 <= idx < len(toks):
            tok = toks[idx]
            if tok in lex:
                cand_freq[tok] += 1

print("Unique candidate tokens in lexicon:", len(cand_freq))
print("\nTop 40 by frequency:")
for src, cnt in cand_freq.most_common(40):
    print(f"{cnt:4d}x  {src:15s} → {lex[src]}")


Unique candidate tokens in lexicon: 4479

Top 40 by frequency:
 205x  يعني            → i
 167x  كل              → eat
 125x  الناس           → people
 116x  جدا             → very
  98x  حاجة            → need
  84x  فيلم            → movie
  83x  ممكن            → maybe
  80x  حد              → limit
  79x  عاملين          → working
  72x  اغنية           → song
  63x  كنت             → i
  61x  بس              → but
  55x  غير             → other
  54x  مسلسل           → it's
  54x  مثلا            → like
  49x  الاغنية         → song
  48x  دول             → states
  46x  اول             → first
  46x  قناتي           → my
  45x  قناة            → channel
  45x  عامل            → worker
  45x  اعلان           → advertisement
  44x  نفسي            → myself
  44x  هنا             → here
  42x  الفيلم          → movie
  40x  احلى            → sweet
  37x  مراجعة          → review
  36x  رمضان           → ramadan
  35x  نفس             → breathe
  35x  يعمل            → it
  35x  الأغ

In [None]:
from pathlib import Path
import json

LEX_PATH = Path("/content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_mt_from_host.json")

with open(LEX_PATH, encoding="utf-8") as f:
    raw = json.load(f)

print("Type of top-level object:", type(raw))

# Normalize into list of dicts: {ar, mt_en}
norm = []

if isinstance(raw, dict):
    # assume: { "آخد": "take", ... }
    for ar, mt_en in raw.items():
        norm.append({"ar": ar, "mt_en": mt_en})
elif isinstance(raw, list):
    # assume: [ {"ar": "...", "en": "..."}, ... ]
    for obj in raw:
        ar = obj.get("ar")
        mt_en = obj.get("en") or obj.get("mt_en") or obj.get("mt") or ""
        if ar is None:
            continue
        norm.append({"ar": ar, "mt_en": mt_en})
else:
    raise ValueError("Unexpected lexicon format")

print(f"Loaded {len(norm)} entries.")
print("Sample:", norm[:10])


Type of top-level object: <class 'dict'>
Loaded 4479 entries.
Sample: [{'ar': '#لحم_غزال', 'mt_en': '♪'}, {'ar': '5مليون', 'mt_en': '$5'}, {'ar': 'آخد', 'mt_en': 'take'}, {'ar': 'آخر', 'mt_en': 'another'}, {'ar': 'آخرة', 'mt_en': 'last'}, {'ar': 'آخـــرهـــم', 'mt_en': 'last'}, {'ar': 'آراء', 'mt_en': 'views'}, {'ar': 'آشتركوا', 'mt_en': 'share'}, {'ar': 'آلة', 'mt_en': 'machine'}, {'ar': 'أبان', 'mt_en': 'aban'}]


In [None]:
import math
import textwrap

BATCH_SIZE = 150   # adjust if you want smaller/larger chunks

def make_batches(items, batch_size):
    for i in range(0, len(items), batch_size):
        yield i // batch_size, items[i:i+batch_size]

def build_prompt(batch_idx, total_batches, batch_items):
    """
    Build a clear instruction prompt for the LLM.
    """
    header = f"""You are helping me clean a bilingual lexicon for code-switched Egyptian Arabic → English.

TASK:
- I will give you a JSON array of objects. Each object has:
  - "ar": an Egyptian Arabic word or short phrase (informal, social media style).
  - "mt_en": an approximate or incorrect machine translation.
- Your job is to produce a new JSON array where:
  - You KEEP the "ar" field exactly as it is.
  - You REPLACE "mt_en" with a better English translation in a new field "en".
  - The translation should be:
    - Short (1–3 English words).
    - The most common / natural meaning in everyday speech and social media.
    - Neutral, not overly formal.
    - If the Arabic is a named entity or proper noun, translate to its common English form (or keep as is if there's no translation).
    - Do NOT include explanations, transliterations, or Arabic text in "en".

FORMAT:
- Input:  JSON array with fields: "ar", "mt_en"
- Output: JSON array with fields: "ar", "en"
- IMPORTANT:
  - Same number of items in the same order.
  - Only change the translation.
  - Do NOT add comments or any text around the JSON.

Now here is batch {batch_idx + 1} of {total_batches} as JSON:
"""

    json_block = json.dumps(batch_items, ensure_ascii=False, indent=2)
    return header + "\n" + json_block


batches = list(make_batches(norm, BATCH_SIZE))
print(f"Total batches: {len(batches)}")

# Show the first prompt as example:
idx0, batch0 = batches[0]
prompt0 = build_prompt(idx0, len(batches), batch0)
print(prompt0[:2000])  # preview first 2000 chars


Total batches: 30
You are helping me clean a bilingual lexicon for code-switched Egyptian Arabic → English.

TASK:
- I will give you a JSON array of objects. Each object has:
  - "ar": an Egyptian Arabic word or short phrase (informal, social media style).
  - "mt_en": an approximate or incorrect machine translation.
- Your job is to produce a new JSON array where:
  - You KEEP the "ar" field exactly as it is.
  - You REPLACE "mt_en" with a better English translation in a new field "en".
  - The translation should be:
    - Short (1–3 English words).
    - The most common / natural meaning in everyday speech and social media.
    - Neutral, not overly formal.
    - If the Arabic is a named entity or proper noun, translate to its common English form (or keep as is if there's no translation).
    - Do NOT include explanations, transliterations, or Arabic text in "en".

FORMAT:
- Input:  JSON array with fields: "ar", "mt_en"
- Output: JSON array with fields: "ar", "en"
- IMPORTANT: 
  - S

In [None]:
for idx, batch_items in batches:
    prompt = build_prompt(idx, len(batches), batch_items)
    print("\n" + "="*80)
    print(f"=== BATCH {idx+1} / {len(batches)} ===")
    print("="*80 + "\n")
    print(prompt)
    # You manually copy this, paste into ChatGPT, and save the returned JSON somewhere.
    # Then go to the next batch.



=== BATCH 1 / 30 ===

You are helping me clean a bilingual lexicon for code-switched Egyptian Arabic → English.

TASK:
- I will give you a JSON array of objects. Each object has:
  - "ar": an Egyptian Arabic word or short phrase (informal, social media style).
  - "mt_en": an approximate or incorrect machine translation.
- Your job is to produce a new JSON array where:
  - You KEEP the "ar" field exactly as it is.
  - You REPLACE "mt_en" with a better English translation in a new field "en".
  - The translation should be:
    - Short (1–3 English words).
    - The most common / natural meaning in everyday speech and social media.
    - Neutral, not overly formal.
    - If the Arabic is a named entity or proper noun, translate to its common English form (or keep as is if there's no translation).
    - Do NOT include explanations, transliterations, or Arabic text in "en".

FORMAT:
- Input:  JSON array with fields: "ar", "mt_en"
- Output: JSON array with fields: "ar", "en"
- IMPORTANT: 


In [None]:
import json
from pathlib import Path

fp = Path("/content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_llm_fixed.jsonl")

with fp.open(encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

print("Total entries:", len(rows))
print("Sample:", rows[:10])


Total entries: 4275
Sample: [{'ar': '5مليون', 'en': '5 million'}, {'ar': 'آخد', 'en': 'take'}, {'ar': 'آخر', 'en': 'last'}, {'ar': 'آخرة', 'en': 'afterlife'}, {'ar': 'آخـــرهـــم', 'en': 'the last one'}, {'ar': 'آراء', 'en': 'opinions'}, {'ar': 'آشتركوا', 'en': 'subscribe'}, {'ar': 'آلة', 'en': 'machine'}, {'ar': 'أبان', 'en': 'Aban'}, {'ar': 'أبحاث', 'en': 'research'}]


In [None]:
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"
LING.mkdir(parents=True, exist_ok=True)

print("BASE:", BASE)
print("DATA:", DATA)
print("LING:", LING)


BASE: /content/drive/MyDrive/cs-senti
DATA: /content/drive/MyDrive/cs-senti/data
LING: /content/drive/MyDrive/cs-senti/data/ling


In [None]:
import json, random
from pathlib import Path
from collections import Counter

random.seed(42)

LING = DATA / "ling"

FP_HOST_TRAIN = LING / "host_train_skeleton_all.jsonl"
FP_LEX_LLM    = LING / "ar_en_lex_llm_fixed.jsonl"   # <- your cleaned file
FP_TRAIN_SW   = LING / "host_train_switched_lex_llm.jsonl"

# 1) load host skeleton
host_rows = []
with open(FP_HOST_TRAIN, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        host_rows.append(json.loads(line))

print("Loaded host skeleton rows:", len(host_rows))

# 2) load LLM lexicon into dict: ar_token -> en_phrase
lex_llm = {}
with open(FP_LEX_LLM, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        ar = str(obj.get("ar", "")).strip()
        en = str(obj.get("en", "")).strip()
        if not ar or not en:
            continue
        # simple filter: must contain at least one Latin letter
        if not any(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in en):
            continue
        # strip quotes
        en = en.replace('"', '').replace("'", "").strip()
        lex_llm[ar] = en

print("Lexicon entries:", len(lex_llm))
sample_items = list(lex_llm.items())[:10]
print("Sample lexicon items:")
for ar, en in sample_items:
    print(" ", ar, "→", en)

# 3) apply switching
switched_rows = []

for r in host_rows:
    toks = r["tokens"]
    label = r["label"]
    domain = r["domain"]
    cand_idx = r["cand_indices"]

    # keep only candidates that exist in lexicon
    valid_idx = [i for i in cand_idx if toks[i] in lex_llm]
    if not valid_idx:
        continue

    # choose 1–2 random positions to switch (avoid over-switching)
    k = random.randint(1, min(2, len(valid_idx)))
    chosen = random.sample(valid_idx, k)

    new_toks = toks[:]
    switches = []
    for idx in chosen:
        src = toks[idx]
        tgt = lex_llm[src]
        new_toks[idx] = tgt
        switches.append({"idx": idx, "src": src, "tgt": tgt})

    switched_text = " ".join(new_toks)

    switched_rows.append({
        "orig_text": r["text"],
        "switched_text": switched_text,
        "label": label,
        "domain": domain,
        "switches": switches
    })

print("Total switched rows:", len(switched_rows))
print("Label dist:", Counter([r["label"] for r in switched_rows]))
print("Domain dist:", Counter([r["domain"] for r in switched_rows]))

# 4) save
with open(FP_TRAIN_SW, "w", encoding="utf-8") as f:
    for r in switched_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("✅ Saved switched TRAIN to:", FP_TRAIN_SW)

# 5) peek a few examples
for ex in switched_rows[:5]:
    print("\nLABEL :", ex["label"], "| DOMAIN:", ex["domain"])
    print("ORIG  :", ex["orig_text"])
    print("SW    :", ex["switched_text"])
    print("SWAPS :", ex["switches"])


Loaded host skeleton rows: 2948
Lexicon entries: 4274
Sample lexicon items:
  5مليون → 5 million
  آخد → take
  آخر → last
  آخرة → afterlife
  آخـــرهـــم → the last one
  آراء → opinions
  آشتركوا → subscribe
  آلة → machine
  أبان → Aban
  أبحاث → research
Total switched rows: 2913
Label dist: Counter({'neu': 1108, 'pos': 1098, 'neg': 707})
Domain dist: Counter({'eesa': 2076, 'mr': 651, 'amg': 186})
✅ Saved switched TRAIN to: /content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl

LABEL : neu | DOMAIN: eesa
ORIG  : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
SW    : *أنسّى pain ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁 Forget the pain , it's just life will not last.
SWAPS : [{'idx': 1, 'src': 'الألمَ', 'tgt': 'pain'}]

LABEL : neg | DOMAIN: eesa
ORIG  : الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه
SW    : الحراميه اللي سارقه musician من اغنيه فنانه كوريه اس

In [None]:
import json, random
from pathlib import Path

FP_SWITCHED_TRAIN = Path("/content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl")

# load all switched rows
switched = []
with open(FP_SWITCHED_TRAIN, encoding="utf-8") as f:
    for line in f:
        if line.strip():
            switched.append(json.loads(line))

print("Total switched rows:", len(switched))

def show_examples(domain, k=5):
    print(f"\n===== DOMAIN: {domain} =====")
    rows = [r for r in switched if r.get("domain") == domain]
    print(f"Found {len(rows)} rows.")
    for ex in random.sample(rows, min(k, len(rows))):
        print("\nLABEL :", ex.get("label"))
        print("ORIG  :", ex.get("orig_text", ex.get("text", "")))
        print("SW    :", ex.get("switched_text", ex.get("sw", "")))
        print("SWAPS :", ex.get("swaps", []))

# show some from each domain
show_examples("eesa", k=5)
show_examples("amg",  k=5)
show_examples("mr",   k=5)


Total switched rows: 2913

===== DOMAIN: eesa =====
Found 2076 rows.

LABEL : pos
ORIG  : غير حاصل في هذا .......😘 
Love from India ❤️
SW    : other happening في هذا .......😘 Love from India ❤️
SWAPS : []

LABEL : neu
ORIG  : ألمشتهيه تجي خاص أصدقائك على YouTube https://youtu.be/addme/3WJaXdGvKC2u1kNFNp7eMTH9LL9l8Q
SW    : ألمشتهيه you come خاص أصدقائك على YouTube https://youtu.be/addme/3WJaXdGvKC2u1kNFNp7eMTH9LL9l8Q
SWAPS : []

LABEL : pos
ORIG  : عايزين من دا كتير الفترة الجاية welcome back بيكم❤️
SW    : عايزين من دا كتير الفترة coming welcome back بيكم❤️
SWAPS : []

LABEL : neg
ORIG  : 81k dislike 😂😂 اهم حاجة محبة الناس
SW    : 81k dislike 😂😂 اهم حاجة محبة people
SWAPS : []

LABEL : pos
ORIG  : هيفاااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااا ، دمرت نسوان الارض ،،، ولسا بتسالي ليش بغاروااااااا ،،،،، love youuuuuuuuuuuu
SW    : هيفاااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااا ، I destroyed نسوان الارض ،،، ولسا بتس

Now we want to train a new XLM-R on:

Train (AUG): mixed_train (real) + host_train_switched_lex_llm (synthetic)

Dev: same mixed_dev (real only, no synthetic)

Test: same eesa_test and mixed_test (unchanged!)

Then compare:

Baseline (Real only) vs Augmented (Real + Lex-switched)
→ on the same test sets.

Your sa_mixed_v3_frozen copy stays as the frozen oracle for GAN rewards. We’re training a new classifier for the augmentation experiment.

In [None]:
import json, random
from pathlib import Path
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"
MODELS = BASE / "models"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# 1) Real mixed train/dev
FP_MIX_TR = DATA / "mixed_train.jsonl"
FP_MIX_DE = DATA / "mixed_dev.jsonl"

mix_train = read_jsonl(FP_MIX_TR)
mix_dev   = read_jsonl(FP_MIX_DE)

print("Real mixed train:", len(mix_train))
print("Real mixed dev  :", len(mix_dev))

# 2) Lex-switched train (EESA+AMG+MR)
FP_SWITCHED_TRAIN = LING / "host_train_switched_lex_llm.jsonl"
switched_raw = read_jsonl(FP_SWITCHED_TRAIN)
print("Switched train raw:", len(switched_raw))

# normalise the structure -> {text, label}
def norm_switched(r):
    # we stored both original text and switched text; prefer switched
    txt = r.get("switched_text") or r.get("sw", r.get("text", ""))
    return {
        "text": txt,
        "label": r["label"]
    }

switched = [norm_switched(r) for r in switched_raw if r.get("label")]

print("Switched usable:", len(switched))
print("Label dist (switched):", Counter([r["label"] for r in switched]))

# 3) Build augmented training set
aug_train = mix_train + switched
print("\nAugmented TRAIN size:", len(aug_train))
print("Label dist (real train):", Counter([r["label"] for r in mix_train]))
print("Label dist (aug train) :", Counter([r["label"] for r in aug_train]))


Real mixed train: 3561
Real mixed dev  : 1092
Switched train raw: 2913
Switched usable: 2913
Label dist (switched): Counter({'neu': 1108, 'pos': 1098, 'neg': 707})

Augmented TRAIN size: 6474
Label dist (real train): Counter({'pos': 1372, 'neu': 1371, 'neg': 818})
Label dist (aug train) : Counter({'neu': 2479, 'pos': 2470, 'neg': 1525})


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

LABELS = ["pos", "neu", "neg"]  # same order you used earlier
label2id = {l: i for i, l in enumerate(LABELS)}
id2label = {i: l for l, i in label2id.items()}

class JsonlDS(Dataset):
    def __init__(self, rows, tok, max_len=160):
        self.rows = rows
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        x = self.rows[i]
        enc = self.tok(
            x["text"],
            max_length=self.max_len,
            truncation=True,
            padding="max_length"
        )
        enc["labels"] = label2id[x["label"]]
        return {k: torch.tensor(v) for k, v in enc.items()}

def eval_loop(model, dataloader):
    model.eval()
    preds, gold = [], []
    with torch.no_grad():
        for batch in dataloader:
            labels = batch["labels"].numpy().tolist()
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits.detach().cpu().numpy()
            preds.extend(logits.argmax(axis=1).tolist())
            gold.extend(labels)
    macro = f1_score(gold, preds, average="macro")
    return gold, preds, macro


Device: cuda


In [None]:
# tokenizer + model from base XLM-R (same as baseline setup)
tok_aug = AutoTokenizer.from_pretrained("xlm-roberta-base")
model_aug = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
).to(device)

# datasets / loaders
ds_tr_aug = JsonlDS(aug_train, tok_aug, 160)
ds_de     = JsonlDS(mix_dev,   tok_aug, 160)

dl_tr_aug = DataLoader(ds_tr_aug, batch_size=16, shuffle=True)
dl_de     = DataLoader(ds_de,     batch_size=32, shuffle=False)

EPOCHS = 3
optim = AdamW(model_aug.parameters(), lr=2e-5)
best_f1 = -1.0
best_state = None

for ep in range(1, EPOCHS+1):
    model_aug.train()
    for batch in dl_tr_aug:
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model_aug(**batch)
        out.loss.backward()
        torch.nn.utils.clip_grad_norm_(model_aug.parameters(), 1.0)
        optim.step()
        optim.zero_grad()

    _, _, f1_dev = eval_loop(model_aug, dl_de)
    print(f"Epoch {ep} → dev macro-F1 (augmented) = {f1_dev:.4f}")
    if f1_dev > best_f1:
        best_f1 = f1_dev
        best_state = model_aug.state_dict().copy()

# restore best dev checkpoint and save it
model_aug.load_state_dict(best_state)
out_dir_aug = MODELS / "xlmr_sentiment_eesa_amg_mr_lexaug"
out_dir_aug.mkdir(parents=True, exist_ok=True)
model_aug.save_pretrained(out_dir_aug.as_posix())
tok_aug.save_pretrained(out_dir_aug.as_posix())
print("✅ Saved AUGMENTED model to:", out_dir_aug)
print("Best dev Macro-F1 (aug):", best_f1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 → dev macro-F1 (augmented) = 0.7693
Epoch 2 → dev macro-F1 (augmented) = 0.8139
Epoch 3 → dev macro-F1 (augmented) = 0.7863
✅ Saved AUGMENTED model to: /content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg_mr_lexaug
Best dev Macro-F1 (aug): 0.8139129162068944


In [None]:
# --- 1) Reload test sets ---
from pathlib import Path
import json

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# EESA held-out test
FP_EESA_TE = DATA / "eesa_test.jsonl"
eesa_te = read_jsonl(FP_EESA_TE)

# Mixed cross-domain test (you saved it earlier)
FP_MIX_TEST = DATA / "mixed_test.jsonl"
mixed_test = read_jsonl(FP_MIX_TEST)

print("EESA test:", len(eesa_te))
print("Mixed test:", len(mixed_test))


EESA test: 817
Mixed test: 180


In [None]:
from torch.utils.data import DataLoader

LABELS = ["pos","neu","neg"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

class JsonlDS(Dataset):
    def __init__(self, rows, tok, max_len=160):
        self.rows = rows
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        x = self.rows[i]
        enc = self.tok(
            x["text"],
            max_length=self.max_len,
            truncation=True,
            padding="max_length"
        )
        enc["labels"] = label2id[x["label"]]
        return {k: torch.tensor(v) for k,v in enc.items()}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, f1_score
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

AUG_DIR = BASE / "models" / "xlmr_sentiment_eesa_amg_mr_lexaug"

tok_aug = AutoTokenizer.from_pretrained(AUG_DIR.as_posix())
model_aug = AutoModelForSequenceClassification.from_pretrained(
    AUG_DIR.as_posix()
).to(device)

# reuse eval_loop from your cell
def eval_loop(model, dataloader):
    model.eval()
    preds, gold = [], []
    with torch.no_grad():
        for batch in dataloader:
            labels = batch["labels"].numpy().tolist()
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits.detach().cpu().numpy()
            preds.extend(logits.argmax(axis=1).tolist())
            gold.extend(labels)
    macro = f1_score(gold, preds, average="macro")
    return gold, preds, macro

# build loaders
ds_eesa_te = JsonlDS(eesa_te, tok_aug, 160)
ds_mix_te  = JsonlDS(mixed_test, tok_aug, 160)

dl_eesa_te = DataLoader(ds_eesa_te, batch_size=32, shuffle=False)
dl_mix_te  = DataLoader(ds_mix_te,  batch_size=32, shuffle=False)

# --- Evaluate on EESA-test ---
gold_e, preds_e, macro_e = eval_loop(model_aug, dl_eesa_te)
print("\n=== XLM-R + LexAug on EESA TEST ===")
print(classification_report(gold_e, preds_e, target_names=LABELS, digits=4))
print("Macro-F1 (EESA):", macro_e)

# --- Evaluate on Mixed-test ---
gold_m, preds_m, macro_m = eval_loop(model_aug, dl_mix_te)
print("\n=== XLM-R + LexAug on MIXED TEST ===")
print(classification_report(gold_m, preds_m, target_names=LABELS, digits=4))
print("Macro-F1 (MIXED):", macro_m)



=== XLM-R + LexAug on EESA TEST ===
              precision    recall  f1-score   support

         pos     0.8441    0.9420    0.8903       362
         neu     0.8377    0.7403    0.7860       258
         neg     0.8324    0.7817    0.8063       197

    accuracy                         0.8397       817
   macro avg     0.8381    0.8213    0.8275       817
weighted avg     0.8393    0.8397    0.8371       817

Macro-F1 (EESA): 0.8275434595177438

=== XLM-R + LexAug on MIXED TEST ===
              precision    recall  f1-score   support

         pos     0.7692    0.8696    0.8163        69
         neu     0.8281    0.7162    0.7681        74
         neg     0.7632    0.7838    0.7733        37

    accuracy                         0.7889       180
   macro avg     0.7868    0.7899    0.7859       180
weighted avg     0.7922    0.7889    0.7877       180

Macro-F1 (MIXED): 0.7859252686581879


🧪 Step 1 – Build the supervised “switch decision” dataset

In [None]:
import json
from pathlib import Path
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_SWITCHED_TRAIN = LING / "host_train_switched_lex_llm.jsonl"

# use the actual file name you see in Drive
FP_LEXICON = LING / "ar_en_lex_llm_fixed.jsonl"   # or .json if that's the extension

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def load_lexicon_flexible(fp: Path):
    """Handle both JSON array and JSONL formats."""
    with open(fp, encoding="utf-8") as f:
        txt = f.read().strip()
    # Case 1: starts with '[' → JSON array
    if txt.startswith("["):
        return json.loads(txt)
    # Case 2: JSONL → one JSON obj per line
    lex = []
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        lex.append(json.loads(line))
    return lex

# 1) load switched rows
switched_rows = read_jsonl(FP_SWITCHED_TRAIN)
print("Loaded switched rows:", len(switched_rows))

# 2) load lexicon (LLM-corrected, any format)
lex_list = load_lexicon_flexible(FP_LEXICON)
lexicon = {e["ar"]: e["en"] for e in lex_list}
print("Lexicon size:", len(lexicon))


Loaded switched rows: 2913
Lexicon size: 4275


In [None]:
switch_examples = []

for r in switched_rows:
    text   = r.get("text", "").strip()
    label  = r.get("label", "neu")
    domain = r.get("domain", "unk")
    toks   = r.get("tokens", [])
    swaps  = r.get("swaps", [])

    if not text or not toks:
        continue

    # indices that were actually switched in your lex-aug data
    switched_idx = {s["idx"] for s in swaps} if swaps else set()

    # candidate indices = tokens that exist in lexicon
    cand_idx = [i for i, tok in enumerate(toks) if tok in lexicon]

    if not cand_idx:
        continue

    for idx in cand_idx:
        if idx < 0 or idx >= len(toks):
            continue
        switch_examples.append({
            "text": text,
            "label": label,
            "domain": domain,
            "token": toks[idx],
            "token_idx": idx,
            "switch": 1 if idx in switched_idx else 0
        })

print("Total switch examples:", len(switch_examples))
print("Switch label dist:", Counter([e["switch"] for e in switch_examples]))
print("Domain dist:", Counter([e["domain"] for e in switch_examples]))

for ex in switch_examples[:5]:
    print("\nTEXT :", ex["text"])
    print("CAND  :", ex["token"], "@", ex["token_idx"])
    print("SWITCH:", ex["switch"])
    print("LBL   :", ex["label"], "| DOMAIN:", ex["domain"])


Total switch examples: 0
Switch label dist: Counter()
Domain dist: Counter()


In [None]:
from itertools import islice
for r in islice(switched_rows, 3):
    print(r.keys())
    print(r)
    print("-"*50)


dict_keys(['orig_text', 'switched_text', 'label', 'domain', 'switches'])
{'orig_text': "*أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁\nForget the pain , it's just life will not last.", 'switched_text': "*أنسّى pain ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁 Forget the pain , it's just life will not last.", 'label': 'neu', 'domain': 'eesa', 'switches': [{'idx': 1, 'src': 'الألمَ', 'tgt': 'pain'}]}
--------------------------------------------------
dict_keys(['orig_text', 'switched_text', 'label', 'domain', 'switches'])
{'orig_text': 'الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه', 'switched_text': 'الحراميه اللي سارقه musician من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها friday نفس melody في البدايه', 'label': 'neg', 'domain': 'eesa', 'switches': [{'idx': 3, 'src': 'الموسيقي', 'tgt': 'musician'}, {'idx': 14, 'src': 'النغمه', 'tgt': 'melody'}]}
--------------------------------------------------
dict_keys(['orig_text', 'switched_text'

In [None]:
import json, random
from pathlib import Path
from collections import Counter

random.seed(42)

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_HOST_TRAIN = LING / "host_train_skeleton_all.jsonl"
FP_LEXICON    = LING / "ar_en_lex_llm_fixed.jsonl"

# ---------- helpers ----------

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

def load_lexicon_flexible(fp: Path):
    """Handle both JSON array and JSONL lexicon formats."""
    with open(fp, encoding="utf-8") as f:
        txt = f.read().strip()
    if txt.startswith("["):
        return json.loads(txt)
    lex = []
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        lex.append(json.loads(line))
    return lex

# ---------- 1) load host skeleton + lexicon ----------

host_rows = read_jsonl(FP_HOST_TRAIN)
print("Loaded host skeleton rows:", len(host_rows))

lex_list = load_lexicon_flexible(FP_LEXICON)
lexicon  = {e["ar"]: e["en"] for e in lex_list}
print("Lexicon entries:", len(lexicon))

# ---------- 2) build switched sentences + token-level labels ----------

switched_rows   = []
switch_examples = []

for r in host_rows:
    text   = r.get("text", "").strip()
    label  = r.get("label", "neu")
    domain = r.get("domain", "unk")
    toks   = r.get("tokens", [])
    cand   = r.get("cand_indices", [])

    if not text or not toks or not cand:
        continue

    # keep only candidates that actually exist in lexicon
    cand_lex = [i for i in cand if 0 <= i < len(toks) and toks[i] in lexicon]
    if not cand_lex:
        continue

    # decide how many to switch (1–3 or all if fewer)
    max_sw = min(3, len(cand_lex))
    n_sw   = random.randint(1, max_sw)
    sw_idx = set(random.sample(cand_lex, n_sw))

    new_toks = toks[:]  # shallow copy
    swaps = []

    for i in cand_lex:
        src_tok = toks[i]
        if src_tok not in lexicon:
            continue
        if i in sw_idx:
            # actually switch
            tgt_tok = lexicon[src_tok]
            new_toks[i] = tgt_tok
            swaps.append({"idx": i, "src": src_tok, "tgt": tgt_tok})
            sw_label = 1
        else:
            # candidate but we keep it Arabic
            tgt_tok = None
            sw_label = 0

        # record token-level supervision
        switch_examples.append({
            "text": text,
            "label": label,
            "domain": domain,
            "token": src_tok,
            "token_idx": i,
            "switch": sw_label
        })

    # build final switched sentence row
    switched_rows.append({
        "orig_text": text,
        "switched_text": " ".join(new_toks),
        "label": label,
        "domain": domain,
        "tokens": toks,
        "cand_indices": cand_lex,
        "switched_indices": sorted(list(sw_idx)),
        "swaps": swaps
    })

print("\nSwitched sentence rows:", len(switched_rows))
print("Token-level examples:", len(switch_examples))
print("Switch label dist:", Counter([e["switch"] for e in switch_examples]))
print("Domain dist:", Counter([e["domain"] for e in switch_examples]))

# ---------- 3) save to disk ----------

FP_SWITCHED_NEW  = LING / "host_train_switched_new.jsonl"
FP_SWITCH_EXAMP  = LING / "switch_examples.jsonl"

with open(FP_SWITCHED_NEW, "w", encoding="utf-8") as f:
    for r in switched_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

with open(FP_SWITCH_EXAMP, "w", encoding="utf-8") as f:
    for e in switch_examples:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")

print("\n✅ Saved switched TRAIN to:", FP_SWITCHED_NEW)
print("✅ Saved token-level switch examples to:", FP_SWITCH_EXAMP)

# quick peek at a couple of rows
for r in switched_rows[:3]:
    print("\nLABEL :", r["label"], "| DOMAIN:", r["domain"])
    print("ORIG  :", r["orig_text"])
    print("SW    :", r["switched_text"])
    print("SWAPS :", r["swaps"])


Loaded host skeleton rows: 2948
Lexicon entries: 4275

Switched sentence rows: 2913
Token-level examples: 11296
Switch label dist: Counter({0: 6505, 1: 4791})
Domain dist: Counter({'eesa': 8407, 'mr': 2398, 'amg': 491})

✅ Saved switched TRAIN to: /content/drive/MyDrive/cs-senti/data/ling/host_train_switched_new.jsonl
✅ Saved token-level switch examples to: /content/drive/MyDrive/cs-senti/data/ling/switch_examples.jsonl

LABEL : neu | DOMAIN: eesa
ORIG  : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
SW    : *أنسّى pain ؟ أنها just حياهَ ولن work continuously ..* 🖤🍁 Forget the pain , it's just life will not last.
SWAPS : [{'idx': 1, 'src': 'الألمَ', 'tgt': 'pain'}, {'idx': 4, 'src': 'مجرد', 'tgt': 'just'}, {'idx': 7, 'src': 'تدوّم', 'tgt': 'work continuously'}]

LABEL : neg | DOMAIN: eesa
ORIG  : الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه
SW    : الحراميه اللي سارقه musicia

In [None]:
from pathlib import Path
import json
from collections import Counter
from sklearn.model_selection import train_test_split

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_SWITCH_EX = LING / "switch_examples.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

switch_rows = read_jsonl(FP_SWITCH_EX)
print("Total token-level examples:", len(switch_rows))

print("Switch label dist:", Counter([r["switch"] for r in switch_rows]))
print("Domain dist:", Counter([r.get("domain","unk") for r in switch_rows]))

# quick peek
for ex in switch_rows[:5]:
    print("\nTEXT :", ex["text"])
    print("TOKEN:", ex["token"], "@", ex["token_idx"])
    print("SWITCH:", ex["switch"], "| LABEL:", ex["label"], "| DOMAIN:", ex["domain"])


Total token-level examples: 11296
Switch label dist: Counter({0: 6505, 1: 4791})
Domain dist: Counter({'eesa': 8407, 'mr': 2398, 'amg': 491})

TEXT : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
TOKEN: الألمَ @ 1
SWITCH: 1 | LABEL: neu | DOMAIN: eesa

TEXT : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
TOKEN: مجرد @ 4
SWITCH: 1 | LABEL: neu | DOMAIN: eesa

TEXT : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
TOKEN: تدوّم @ 7
SWITCH: 1 | LABEL: neu | DOMAIN: eesa

TEXT : الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه
TOKEN: الحراميه @ 0
SWITCH: 0 | LABEL: neg | DOMAIN: eesa

TEXT : الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه
TOKEN: الموسيقي @ 3
SWITCH: 1 | LABEL: neg | DOMAIN: eesa


In [None]:
# we will use all domains together (eesa+amg+mr)
# you can filter by domain later if you want

train_rows, dev_rows = train_test_split(
    switch_rows,
    test_size=0.2,
    random_state=42,
    stratify=[r["switch"] for r in switch_rows]  # keep 0/1 balance
)

print("Train size:", len(train_rows))
print("Dev size  :", len(dev_rows))

print("Train switch dist:", Counter([r["switch"] for r in train_rows]))
print("Dev switch dist  :", Counter([r["switch"] for r in dev_rows]))


Train size: 9036
Dev size  : 2260
Train switch dist: Counter({0: 5204, 1: 3832})
Dev switch dist  : Counter({0: 1301, 1: 959})


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

class SwitchDecisionDS(Dataset):
    def __init__(self, rows, tok, max_len=160):
        self.rows = rows
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        r = self.rows[i]
        text = r["text"]
        tok_str = r["token"]
        idx = r["token_idx"]

        # very simple: re-tokenize text by space to put markers
        # (we rely on the "tokens" that produced token_idx, but here we only have text + token_idx)
        # we approximate by splitting on spaces:
        words = text.split()
        if 0 <= idx < len(words):
            words_marked = (
                words[:idx] +
                ["<SW>", words[idx], "</SW>"] +
                words[idx+1:]
            )
            marked_text = " ".join(words_marked)
        else:
            # fallback: no marking if idx weird
            marked_text = text

        label = int(r["switch"])  # 0 or 1

        enc = self.tok(
            marked_text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length"
        )
        enc["labels"] = label
        return {k: torch.tensor(v) for k, v in enc.items()}

# binary labels
id2label = {0: "no_switch", 1: "switch"}
label2id = {"no_switch": 0, "switch": 1}

tok_sw = AutoTokenizer.from_pretrained("xlm-roberta-base")

model_sw = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
).to(device)

ds_sw_tr = SwitchDecisionDS(train_rows, tok_sw, max_len=160)
ds_sw_de = SwitchDecisionDS(dev_rows,   tok_sw, max_len=160)

dl_sw_tr = DataLoader(ds_sw_tr, batch_size=16, shuffle=True)
dl_sw_de = DataLoader(ds_sw_de, batch_size=32, shuffle=False)


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW

def eval_switch(model, dataloader):
    model.eval()
    preds, gold = [], []
    with torch.no_grad():
        for batch in dataloader:
            labels = batch["labels"].numpy().tolist()
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits.detach().cpu().numpy()
            preds.extend(logits.argmax(axis=1).tolist())
            gold.extend(labels)
    macro = f1_score(gold, preds, average="macro")
    return gold, preds, macro

EPOCHS = 3
optim = AdamW(model_sw.parameters(), lr=2e-5)
best_f1 = -1.0
best_state = None

for ep in range(1, EPOCHS+1):
    model_sw.train()
    for batch in dl_sw_tr:
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model_sw(**batch)
        out.loss.backward()
        torch.nn.utils.clip_grad_norm_(model_sw.parameters(), 1.0)
        optim.step()
        optim.zero_grad()

    _, _, f1_dev = eval_switch(model_sw, dl_sw_de)
    print(f"Epoch {ep} → dev macro-F1 (switch decision) = {f1_dev:.4f}")
    if f1_dev > best_f1:
        best_f1 = f1_dev
        best_state = model_sw.state_dict().copy()

# restore best and save
model_sw.load_state_dict(best_state)
out_dir_sw = MODELS / "xlmr_switch_decider_lexsupervised"
out_dir_sw.mkdir(parents=True, exist_ok=True)
model_sw.save_pretrained(out_dir_sw.as_posix())
tok_sw.save_pretrained(out_dir_sw.as_posix())
print("✅ Saved switch-decider model to:", out_dir_sw)
print("Best dev Macro-F1:", best_f1)


Epoch 1 → dev macro-F1 (switch decision) = 0.6983
Epoch 2 → dev macro-F1 (switch decision) = 0.6778
Epoch 3 → dev macro-F1 (switch decision) = 0.6991
✅ Saved switch-decider model to: /content/drive/MyDrive/cs-senti/models/xlmr_switch_decider_lexsupervised
Best dev Macro-F1: 0.6991487669248215


In [None]:
from sklearn.metrics import classification_report

gold, preds, macro = eval_switch(model_sw, dl_sw_de)
print("\n=== Switch decision dev report ===")
print(classification_report(gold, preds, target_names=["no_switch","switch"], digits=4))
print("Macro-F1:", macro)



=== Switch decision dev report ===
              precision    recall  f1-score   support

   no_switch     0.7290    0.7940    0.7601      1301
      switch     0.6821    0.5996    0.6382       959

    accuracy                         0.7115      2260
   macro avg     0.7055    0.6968    0.6991      2260
weighted avg     0.7091    0.7115    0.7084      2260

Macro-F1: 0.6991487669248215


In [None]:
# CELL 1: imports & paths
from pathlib import Path
import json, random
from collections import Counter

import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)

# fix seed
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

BASE   = Path("/content/drive/MyDrive/cs-senti")
DATA   = BASE / "data"
LING   = DATA / "ling"
MODELS = BASE / "models"

FP_SWITCHED_TRAIN = LING / "host_train_switched_new.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows


In [None]:
from pathlib import Path
import json
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_HOST_SKELETON = LING / "host_train_switched_new.jsonl"
FP_LEXICON       = LING / "ar_en_lex_llm_fixed_clean.json"
FP_SW_TRAIN      = LING / "host_train_switched_supervised.jsonl"
FP_TOK_EXAMPLES  = LING / "switch_examples_supervised.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

print("Loading host skeleton...")
host_rows = read_jsonl(FP_HOST_SKELETON)
print("Loaded host skeleton rows:", len(host_rows))

print("Loading lexicon...")
lex_list = json.load(open(FP_LEXICON, encoding="utf-8"))
lexicon = {e["ar"]: e["en"] for e in lex_list}
print("Lexicon entries:", len(lexicon))

switched_rows = []
token_level_examples = []

for r in host_rows:
    text   = r.get("text", "").strip()
    label  = r.get("label", "neu")
    domain = r.get("domain", "unk")
    toks   = r.get("tokens", [])

    if not text or not toks:
        continue

    sw_tokens = []
    swaps     = []

    for i, tok in enumerate(toks):
        if tok in lexicon:
            en = lexicon[tok]
            sw_tokens.append(en)
            swaps.append({"idx": i, "src": tok, "tgt": en})
        else:
            sw_tokens.append(tok)

    # require at least one actual swap
    if not swaps:
        continue

    switched_text = " ".join(sw_tokens)

    switched_rows.append({
        "text": text,
        "tokens": toks,
        "switched_text": switched_text,
        "sw_tokens": sw_tokens,
        "swaps": swaps,
        "label": label,
        "domain": domain
    })

    # also build token-level examples for possible later use
    for i, tok in enumerate(toks):
        token_level_examples.append({
            "text": text,
            "label": label,
            "domain": domain,
            "token": tok,
            "token_idx": i,
            "switch": 1 if any(s["idx"] == i for s in swaps) else 0
        })

print("\nSwitched sentence rows:", len(switched_rows))
print("Token-level examples:", len(token_level_examples))
print("Switch label dist:", Counter([e["switch"] for e in token_level_examples]))
print("Domain dist:", Counter([e["domain"] for e in token_level_examples]))

# save sentence-level supervised pairs
with open(FP_SW_TRAIN, "w", encoding="utf-8") as f:
    for r in switched_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# save token-level examples
with open(FP_TOK_EXAMPLES, "w", encoding="utf-8") as f:
    for e in token_level_examples:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")

print("\n✅ Saved switched TRAIN to:", FP_SW_TRAIN)
print("✅ Saved token-level switch examples to:", FP_TOK_EXAMPLES)

# quick peek
for ex in switched_rows[:3]:
    print("\nLABEL :", ex["label"], "| DOMAIN:", ex["domain"])
    print("ORIG  :", ex["text"])
    print("SW    :", ex["switched_text"])
    print("SWAPS :", ex["swaps"])
    break


Loading host skeleton...
Loaded host skeleton rows: 2913
Loading lexicon...
Lexicon entries: 4275

Switched sentence rows: 0
Token-level examples: 0
Switch label dist: Counter()
Domain dist: Counter()

✅ Saved switched TRAIN to: /content/drive/MyDrive/cs-senti/data/ling/host_train_switched_supervised.jsonl
✅ Saved token-level switch examples to: /content/drive/MyDrive/cs-senti/data/ling/switch_examples_supervised.jsonl


In [None]:
from pathlib import Path
import json, re

FP = Path("/content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_llm_fixed.jsonl")

print("Loading raw text...")
raw = open(FP, encoding="utf-8").read()

# --- AUTO-REPAIR LOGIC ---
# 1) remove BOM
raw = raw.lstrip("\ufeff")

# 2) extract ALL JSON objects inside the file
objects = re.findall(r"\{[^}]+\}", raw)

print(f"Found {len(objects)} entries.")

# 3) rebuild as proper JSON array
fixed = [json.loads(obj) for obj in objects]

# 4) save as valid JSON
FP_FIXED = FP.parent / "ar_en_lex_llm_fixed_clean.json"
json.dump(fixed, open(FP_FIXED, "w", encoding="utf-8"), ensure_ascii=False, indent=2)

print("\n✅ Saved cleaned lexicon to:", FP_FIXED)


Loading raw text...
Found 4275 entries.

✅ Saved cleaned lexicon to: /content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_llm_fixed_clean.json


In [None]:
import json
lex = json.load(open("/content/drive/MyDrive/cs-senti/data/ling/ar_en_lex_llm_fixed_clean.json"))
print("Entries:", len(lex))
print(lex[:5])


Entries: 4275
[{'ar': '5مليون', 'en': '5 million'}, {'ar': 'آخد', 'en': 'take'}, {'ar': 'آخر', 'en': 'last'}, {'ar': 'آخرة', 'en': 'afterlife'}, {'ar': 'آخـــرهـــم', 'en': 'the last one'}]


In [None]:
# CELL 2: build supervised pairs from lexical-switched file

rows = read_jsonl(FP_SWITCHED_TRAIN)
print("Total rows in host_train_switched_new:", len(rows))

pairs = []
for r in rows:
    src = r.get("text", "").strip()
    tgt = r.get("switched_text") or r.get("sw", "").strip()
    lab = r.get("label", "neu").strip()

    if not src or not tgt:
        continue

    # optional: filter out extremely short outputs
    if len(tgt.split()) < 2:
        continue

    pairs.append({
        "src": src,
        "tgt": tgt,
        "label": lab,
        "domain": r.get("domain", "unk")
    })

print("Usable pairs:", len(pairs))
print("Label dist:", Counter([p["label"] for p in pairs]))
print("Domain dist:", Counter([p["domain"] for p in pairs]) )


Total rows in host_train_switched_new: 2913
Usable pairs: 0
Label dist: Counter()
Domain dist: Counter()


In [None]:
from pathlib import Path
import json
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_SWITCHED_NEW = LING / "host_train_switched_new.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

switched_rows = read_jsonl(FP_SWITCHED_NEW)
print("Total rows in host_train_switched_new:", len(switched_rows))

# peek structure of first few rows
for i, r in enumerate(switched_rows[:3]):
    print(f"\n--- ROW {i} ---")
    print("keys:", list(r.keys()))
    print("label :", r.get("label"))
    print("domain:", r.get("domain"))
    print("text  :", r.get("text"))
    print("switched_text:", r.get("switched_text"))
    print("tokens:", r.get("tokens")[:10] if isinstance(r.get("tokens"), list) else None)
    print("swaps :", r.get("swaps"))


Total rows in host_train_switched_new: 2913

--- ROW 0 ---
keys: ['orig_text', 'switched_text', 'label', 'domain', 'tokens', 'cand_indices', 'switched_indices', 'swaps']
label : neu
domain: eesa
text  : None
switched_text: *أنسّى pain ؟ أنها just حياهَ ولن work continuously ..* 🖤🍁 Forget the pain , it's just life will not last.
tokens: ['*أنسّى', 'الألمَ', '؟', 'أنها', 'مجرد', 'حياهَ', 'ولن', 'تدوّم', '..*', '🖤🍁']
swaps : [{'idx': 1, 'src': 'الألمَ', 'tgt': 'pain'}, {'idx': 4, 'src': 'مجرد', 'tgt': 'just'}, {'idx': 7, 'src': 'تدوّم', 'tgt': 'work continuously'}]

--- ROW 1 ---
keys: ['orig_text', 'switched_text', 'label', 'domain', 'tokens', 'cand_indices', 'switched_indices', 'swaps']
label : neg
domain: eesa
text  : None
switched_text: الحراميه اللي سارقه musician من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها friday نفس النغمه في البدايه
tokens: ['الحراميه', 'اللي', 'سارقه', 'الموسيقي', 'من', 'اغنيه', 'فنانه', 'كوريه', 'اسمها', 'آيو']
swaps : [{'idx': 3, 'src': 'الموسيقي', 'tgt': 'mus

In [None]:
from pathlib import Path
import json
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_SWITCHED_NEW = LING / "host_train_switched_new.jsonl"
FP_GAN_PAIRS    = LING / "gan_supervised_pairs.jsonl"  # output file

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

switched_rows = read_jsonl(FP_SWITCHED_NEW)
print("Total rows in host_train_switched_new:", len(switched_rows))

pairs = []

for r in switched_rows:
    # FIX: use orig_text instead of text
    src = (r.get("orig_text") or "").strip()
    tgt = (r.get("switched_text") or "").strip()

    label  = r.get("label", "neu")
    domain = r.get("domain", "unk")

    # sanity checks
    if not src or not tgt:
        continue
    if src == tgt:
        continue

    # build a proper supervised pair
    pairs.append({
        "host": src,    # original monolingual-ish Arabic
        "cs": tgt,      # lexical-switched target
        "label": label,
        "domain": domain
    })

print("\nUsable pairs:", len(pairs))
print("Label dist:", Counter(p["label"] for p in pairs))
print("Domain dist:", Counter(p["domain"] for p in pairs))

for ex in pairs[:5]:
    print("\nLABEL :", ex["label"], "| DOMAIN:", ex["domain"])
    print("HOST  :", ex["host"])
    print("CS    :", ex["cs"])

# save supervised GAN pairs
with open(FP_GAN_PAIRS, "w", encoding="utf-8") as f:
    for p in pairs:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print("\n✅ Saved GAN supervised pairs to:", FP_GAN_PAIRS)


Total rows in host_train_switched_new: 2913

Usable pairs: 2913
Label dist: Counter({'neu': 1108, 'pos': 1098, 'neg': 707})
Domain dist: Counter({'eesa': 2076, 'mr': 651, 'amg': 186})

LABEL : neu | DOMAIN: eesa
HOST  : *أنسّى الألمَ ؟ أنها مجرد حياهَ ولن تدوّم ..* 🖤🍁
Forget the pain , it's just life will not last.
CS    : *أنسّى pain ؟ أنها just حياهَ ولن work continuously ..* 🖤🍁 Forget the pain , it's just life will not last.

LABEL : neg | DOMAIN: eesa
HOST  : الحراميه اللي سارقه الموسيقي من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها  friday   نفس النغمه في البدايه
CS    : الحراميه اللي سارقه musician من اغنيه فنانه كوريه اسمها آيو الاغنيه اسمها friday نفس النغمه في البدايه

LABEL : neu | DOMAIN: eesa
HOST  : مش عارف ليه حاس اني داخل pupge 😂😂😂
CS    : مش عارف ليه feeling اني داخل pupge 😂😂😂

LABEL : neg | DOMAIN: eesa
HOST  : 500 مليون ههههههه والله العظيم ماتستاهل حتى 50 views
CS    : 500 مليون ههههههه والله great ماتستاهل حتى 50 views

LABEL : pos | DOMAIN: eesa
HOST  : جميل الاعلان

Cell 1 – Load GAN pairs & create train/dev splits

This uses the file we just created:
/content/drive/MyDrive/cs-senti/data/ling/gan_supervised_pairs.jsonl

In [None]:
# === Cell 1: load GAN supervised pairs + train/dev split ===
import json, random
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_GAN_PAIRS = LING / "gan_supervised_pairs.jsonl"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

pairs = read_jsonl(FP_GAN_PAIRS)
print("Total supervised CS pairs:", len(pairs))
print("Label dist:", Counter(p["label"] for p in pairs))
print("Domain dist:", Counter(p["domain"] for p in pairs))

# make sure each example has host + cs
pairs = [p for p in pairs if p.get("host") and p.get("cs")]
print("Usable pairs after sanity check:", len(pairs))

# train/dev split (e.g. 90/10, stratified by label)
labels = [p["label"] for p in pairs]
train_pairs, dev_pairs = train_test_split(
    pairs,
    test_size=0.1,
    random_state=42,
    stratify=labels
)

print("\nTrain size:", len(train_pairs))
print("Dev size  :", len(dev_pairs))

print("Train label dist:", Counter(p["label"] for p in train_pairs))
print("Dev label dist  :", Counter(p["label"] for p in dev_pairs))

# optionally save them (useful for reproducibility)
FP_GAN_TR = LING / "gan_pairs_train.jsonl"
FP_GAN_DE = LING / "gan_pairs_dev.jsonl"

with open(FP_GAN_TR, "w", encoding="utf-8") as f:
    for r in train_pairs:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

with open(FP_GAN_DE, "w", encoding="utf-8") as f:
    for r in dev_pairs:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("\n✅ Saved:")
print(" -", FP_GAN_TR)
print(" -", FP_GAN_DE)


Total supervised CS pairs: 2913
Label dist: Counter({'neu': 1108, 'pos': 1098, 'neg': 707})
Domain dist: Counter({'eesa': 2076, 'mr': 651, 'amg': 186})
Usable pairs after sanity check: 2913

Train size: 2621
Dev size  : 292
Train label dist: Counter({'neu': 997, 'pos': 988, 'neg': 636})
Dev label dist  : Counter({'neu': 111, 'pos': 110, 'neg': 71})

✅ Saved:
 - /content/drive/MyDrive/cs-senti/data/ling/gan_pairs_train.jsonl
 - /content/drive/MyDrive/cs-senti/data/ling/gan_pairs_dev.jsonl


Cell 2 – Build a seq2seq dataset (mT5) for HOST → CS

Here we prepare data for a generator model.
I’ll use google/mt5-small (multilingual, lightweight enough for Colab).

In [None]:
# === Cell 2: build HF Datasets + tokenizer for mT5 ===
from datasets import Dataset
from transformers import AutoTokenizer

MODEL_NAME = "google/mt5-small"

tok_g = AutoTokenizer.from_pretrained(MODEL_NAME)

# reload from disk to be safe
train_pairs = read_jsonl(FP_GAN_TR)
dev_pairs   = read_jsonl(FP_GAN_DE)

print("Train pairs:", len(train_pairs))
print("Dev pairs  :", len(dev_pairs))

# Build HF datasets from Python lists
ds_train = Dataset.from_list(train_pairs)
ds_dev   = Dataset.from_list(dev_pairs)

MAX_SRC_LEN = 64   # host sentence length
MAX_TGT_LEN = 64   # cs sentence length

def preprocess_fn(batch):
    # inputs: host (Arabic-ish)
    model_inputs = tok_g(
        batch["host"],
        max_length=MAX_SRC_LEN,
        truncation=True,
        padding="max_length"
    )
    # targets: cs (code-switched sentence)
    with tok_g.as_target_tokenizer():
        labels = tok_g(
            batch["cs"],
            max_length=MAX_TGT_LEN,
            truncation=True,
            padding="max_length"
        )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

ds_train_tok = ds_train.map(preprocess_fn, batched=True, remove_columns=ds_train.column_names)
ds_dev_tok   = ds_dev.map(preprocess_fn,   batched=True, remove_columns=ds_dev.column_names)

print(ds_train_tok)
print(ds_dev_tok)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Train pairs: 2621
Dev pairs  : 292


Map:   0%|          | 0/2621 [00:00<?, ? examples/s]



Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2621
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 292
})


Cell 3 – Define the generator model + Trainer

Now we create an mT5 generator and a Seq2SeqTrainer to learn HOST → CS mapping.

In [None]:
# === Cell 3: define training arguments + trainer (SAFE VERSION) ===
from transformers import (
    MT5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Load mT5-small model
model_g = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME)

OUT_DIR_G = MODELS / "gan_stage1_generator_supervised"
OUT_DIR_G.mkdir(parents=True, exist_ok=True)

# Collator ensures correct padding for seq2seq
data_collator = DataCollatorForSeq2Seq(tok_g, model=model_g)

# IMPORTANT: ONLY use arguments supported by your transformers version
args = Seq2SeqTrainingArguments(
    output_dir=str(OUT_DIR_G),
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=False,  # avoids errors in older versions
)

trainer_g = Seq2SeqTrainer(
    model=model_g,
    args=args,
    train_dataset=ds_train_tok,
    eval_dataset=ds_dev_tok,       # if you want no eval, set to None
    data_collator=data_collator,
    tokenizer=tok_g,
)

print("Trainer created successfully!")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer_g = Seq2SeqTrainer(


Trainer created successfully!


In [None]:
# === Cell 4: train generator (supervised) ===

# Disable W&B completely to avoid crashes
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_SILENT"] = "true"

print("🚀 Training supervised generator...")
trainer_g.train()

# Save final model
trainer_g.save_model()
tok_g.save_pretrained(OUT_DIR_G)

print("✅ Supervised generator saved to:", OUT_DIR_G)


🚀 Training supervised generator...


Step,Training Loss
50,43.7695
100,35.0834
150,28.5555
200,23.1007
250,20.7295
300,18.3026
350,16.0278
400,14.4773
450,13.2348
500,12.1756


✅ Supervised generator saved to: /content/drive/MyDrive/cs-senti/models/gan_stage1_generator_supervised


Quick sanity check: generate from the supervised generator

Before going to GAN/RL stuff, we should see what it’s producing.

In [None]:
from transformers import AutoTokenizer, MT5ForConditionalGeneration
from pathlib import Path
import json, random

BASE = Path("/content/drive/MyDrive/cs-senti")
MODELS = BASE / "models"
DATA = BASE / "data"
LING = DATA / "ling"

GEN_DIR = MODELS / "gan_stage1_generator_supervised"
tok_g = AutoTokenizer.from_pretrained(GEN_DIR)
gen_mdl = MT5ForConditionalGeneration.from_pretrained(GEN_DIR).to("cuda")

# load a few host sentences to test (the same host we used to build pairs)
FP_GAN_TR = LING / "gan_pairs_train.jsonl"   # from earlier
def read_jsonl(fp):
    rows=[]
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

pairs = read_jsonl(FP_GAN_TR)
print("Total train pairs:", len(pairs))

# sample a few
for ex in random.sample(pairs, 5):
    host = ex["host"]
    gold_cs = ex["cs"]
    inputs = tok_g(
        host,
        return_tensors="pt",
        max_length=64,
        truncation=True
    ).to(gen_mdl.device)

    out_ids = gen_mdl.generate(
        **inputs,
        max_length=64,
        num_beams=4,
        do_sample=False
    )
    pred_cs = tok_g.decode(out_ids[0], skip_special_tokens=True)

    print("\nLABEL:", ex["label"], "| DOMAIN:", ex["domain"])
    print("HOST :", host)
    print("GOLD :", gold_cs)
    print("PRED :", pred_cs)


Total train pairs: 2621

LABEL: pos | DOMAIN: eesa
HOST : TAMIR & RANIN احلى حبيبين وزوجين
GOLD : TAMIR & RANIN احلى lovers and two spouses
PRED : <extra_id_0> وزوجين وزوجين وزوجين وزوجين وزوجين وزوجين وزوجين وزوجين وزوجة                                   

LABEL: pos | DOMAIN: amg
HOST : كنت عايزة أعتذر عن الموقف السخيف اللي حصل يوم الParents' Day
GOLD : كنت عايزة أعتذر عن الموقف annoying اللي حصل يوم الParents' Day
PRED : <extra_id_0> في Parents' Day اللي حصل في 

LABEL: pos | DOMAIN: eesa
HOST : ربنا يخليكي ليا ي قلبي farha
GOLD : ربنا let you to me ي قلبي farha
PRED : <extra_id_0> ي قلبي ي قلبي ي 

LABEL: neg | DOMAIN: eesa
HOST : 4K DISLIKE ? فعلا مجانين و بينكم و بين الزووق بلاد
GOLD : 4K DISLIKE ? فعلا crazy و بينكم و بين taste بلاد
PRED : <extra_id_0> ؟؟ ؟         ال                                                

LABEL: neg | DOMAIN: eesa
HOST : الناس اللى عاملة dislike دى مريضة نفسيا 
ربنا يعينكم عالبليلة اللى فدماغكم
GOLD : الناس اللى عاملة dislike دى مريضة نفسيا ربنا help 

In [None]:
import random
for row in random.sample(train_pairs, 5):
    print("\nHOST:", row['host'])
    print("CS  :", row['cs'])



HOST: الحوار كله "فاهمه قصدي" 
6 times
CS  : dialogue كله "فاهمه قصدي" 6 times

HOST: Hasan goztlik عامل شغل عالي
CS  : Hasan goztlik عامل شغل high

HOST: كل ما تحس بتطهير أو Catharsis.
CS  : all ما تحس بتطهير أو Catharsis.

HOST: اللي عاملين Dislike دول مرضى نفسيين بصراحه
CS  : اللي workers Dislike دول مرضى psychologists بصراحه

HOST: الممثل الوحيد اللي بيعمل heart للكومنتات والله بحبك
CS  : actor only one اللي بيعمل heart للكومنتات والله بحبك


Step 1 – Build RL dataset (prompts + labels)

We’ll use the host side of your supervised pairs + their labels:

In [None]:
from pathlib import Path
import json, random
from collections import Counter

BASE = Path("/content/drive/MyDrive/cs-senti")
DATA = BASE / "data"
LING = DATA / "ling"

FP_GAN_TR = LING / "gan_pairs_train.jsonl"

def read_jsonl(fp):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

pairs = read_jsonl(FP_GAN_TR)
print("Total supervised pairs:", len(pairs))

# RL dataset: (host_prompt, target_label)
rl_data = [
    {
        "host": p["host"],
        "label": p.get("label", "neu"),   # fallback neu
        "domain": p.get("domain", "eesa")
    }
    for p in pairs
    if p.get("host") and p.get("label") in ["pos","neu","neg"]
]

print("RL data size:", len(rl_data))
print("Label dist:", Counter([r["label"] for r in rl_data]))
print("Domain dist:", Counter([r["domain"] for r in rl_data]) )


Total supervised pairs: 2621
RL data size: 2621
Label dist: Counter({'neu': 997, 'pos': 988, 'neg': 636})
Domain dist: Counter({'eesa': 1868, 'mr': 590, 'amg': 163})


4️⃣ Step 2 – Load all oracles + generator

We’ll:

reload your supervised generator

load sentiment oracle

load switch oracle

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# --- generator (start from supervised G0) ---
G_SUP = BASE / "models/gan_stage1_generator_supervised"
tok_g = AutoTokenizer.from_pretrained(G_SUP)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(G_SUP).to(device)
gen_model.eval()  # PPO code will switch to train() later

# --- sentiment oracle (frozen) ---
SA_DIR = BASE / "models/sa_mixed_v3_frozen"
tok_sa = AutoTokenizer.from_pretrained(SA_DIR)
sa_model = AutoModelForSequenceClassification.from_pretrained(SA_DIR).to(device)
sa_model.eval()

sa_labels = json.load(open(SA_DIR / "label_map.json"))["labels"]  # ["pos","neg","neu"] or similar
sa_idx = {lab: i for i, lab in enumerate(sa_labels)}

# --- switch oracle (binary) ---
SW_DIR = BASE / "models/xlmr_switch_decider_lexsupervised"
tok_sw = AutoTokenizer.from_pretrained(SW_DIR)
sw_model = AutoModelForSequenceClassification.from_pretrained(SW_DIR).to(device)
sw_model.eval()


Device: cuda


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

5️⃣ Step 3 – Reward functions
5.1 Sentiment reward

We want a score in [0,1] measuring how confidently the sentiment matches the original label.

In [None]:
import numpy as np

@torch.no_grad()
def sentiment_reward(texts, target_labels):
    """
    texts: list[str] generated CS sentences
    target_labels: list[str] same length, each in {"pos","neu","neg"}
    returns: np.array of shape (batch,)
    """
    enc = tok_sa(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)

    logits = sa_model(**enc).logits  # (B,3)
    probs = logits.softmax(-1)

    idx = torch.tensor([sa_idx[l] for l in target_labels], device=device)
    scores = probs[torch.arange(len(texts), device=device), idx]  # prob of correct label

    return scores.detach().cpu().numpy()  # values in [0,1]


5.2 Switch reward (how “code-switched” is it?)

The switch-decider was trained at token level, but we can derive a sentence-level score: for each token, probability of “switch”; then average over tokens.

In [None]:
@torch.no_grad()
def switch_reward(texts):
    """
    texts: list[str] generated CS sentences
    returns: np.array of shape (batch,), higher = more plausible switching
    """
    enc = tok_sw(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)

    logits = sw_model(**enc).logits  # (B,2)
    probs = logits.softmax(-1)       # class 1 = "switch"
    # Take P(switch) as overall "switchiness" proxy.
    # If your model was token-level, you might want to adapt this.
    p_switch = probs[:, 1]

    return p_switch.detach().cpu().numpy()


5.3 Combine rewards

We combine them with simple weights:

In [None]:
def combined_reward(gen_texts, target_labels, w_sent=0.7, w_switch=0.3):
    r_sent = sentiment_reward(gen_texts, target_labels)   # [0,1]
    r_sw   = switch_reward(gen_texts)                     # [0,1]
    r_tot  = w_sent * r_sent + w_switch * r_sw
    return r_tot, r_sent, r_sw


6️⃣ Step 4 – Generation function for PPO loop

We need a helper that, given hosts, generates candidate CS texts from the current generator:

In [None]:
def generate_cs(host_batch, max_new_tokens=40):
    inputs = tok_g(
        host_batch,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors="pt"
    ).to(device)

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1
    )

    texts = tok_g.batch_decode(outputs, skip_special_tokens=True)
    return texts


7️⃣ Step 5 – PPO / RL training loop (conceptual skeleton)

Here’s the shape of the PPO loop, without going deeply into all math details (because a full implementation is long):

In [None]:
from torch.optim import AdamW

# the generator will now be trainable
gen_model.train()
optimizer = AdamW(gen_model.parameters(), lr=1e-5)

BATCH_SIZE = 16
RL_EPOCHS = 2  # you can increase once stable

def get_minibatch():
    batch = random.sample(rl_data, BATCH_SIZE)
    hosts = [b["host"] for b in batch]
    labels = [b["label"] for b in batch]
    return hosts, labels

for epoch in range(1, RL_EPOCHS+1):
    print(f"\n=== RL Epoch {epoch} ===")
    for step in range(200):  # number of PPO steps per epoch (tune this)
        hosts, labels = get_minibatch()

        # 1) generate CS from current generator
        gen_model.eval()
        with torch.no_grad():
            gen_texts = generate_cs(hosts)
        gen_model.train()

        # 2) compute rewards
        r_total, r_sent, r_sw = combined_reward(gen_texts, labels)
        r_total_t = torch.tensor(r_total, dtype=torch.float32, device=device)

        # 3) re-encode generated texts to compute logprobs
        enc = tok_g(
            hosts,
            truncation=True,
            padding=True,
            max_length=64,
            return_tensors="pt"
        ).to(device)

        with tok_g.as_target_tokenizer():
            tgt = tok_g(
                gen_texts,
                truncation=True,
                padding=True,
                max_length=64,
                return_tensors="pt"
            ).input_ids.to(device)

        # shift labels etc. for seq2seq cross-entropy
        out = gen_model(
            **enc,
            labels=tgt
        )
        # out.loss is average NLL over tokens; we want policy gradient approximated as:
        # loss_rl = - E[R * logpi] ≈ out.loss * (-R_normalized)
        # So we weight loss by normalized reward:

        # normalize rewards (zero mean, unit std) to stabilize
        r_norm = (r_total_t - r_total_t.mean()) / (r_total_t.std() + 1e-6)
        # we multiply loss by (-r_norm) so high reward -> low loss -> parameters push toward it
        loss = out.loss * (-r_norm.mean())

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(gen_model.parameters(), 1.0)
        optimizer.step()

        if step % 20 == 0:
            print(f"step {step:03d} | loss={loss.item():.4f} | R={r_total.mean():.3f} | R_sent={r_sent.mean():.3f} | R_sw={r_sw.mean():.3f}")

    # optionally save checkpoint after each RL epoch
    OUT_RL = BASE / "models/gan_stage2_generator_rl"
    OUT_RL.mkdir(parents=True, exist_ok=True)
    gen_model.save_pretrained(OUT_RL.as_posix())
    tok_g.save_pretrained(OUT_RL.as_posix())
    print("💾 Saved RL generator checkpoint to:", OUT_RL)



=== RL Epoch 1 ===




OutOfMemoryError: CUDA out of memory. Tried to allocate 490.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 432.12 MiB is free. Process 7112 has 14.32 GiB memory in use. Of the allocated memory 13.29 GiB is allocated by PyTorch, and 911.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import gc, torch

# 1) move oracles to CPU
sa_model.to("cpu")
sw_model.to("cpu")

# keep generator on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gen_model.to(device)

# 2) clear any leftover GPU cache
torch.cuda.empty_cache()
gc.collect()

print("Generator on:", device)
print("Sentiment oracle on CPU, switch oracle on CPU.")


Generator on: cuda
Sentiment oracle on CPU, switch oracle on CPU.


In [None]:
import numpy as np
import torch

@torch.no_grad()
def sentiment_reward(texts, target_labels):
    """
    texts: list[str] generated CS sentences
    target_labels: list[str] in {"pos","neu","neg"}
    Uses sa_model on CPU.
    """
    enc = tok_sa(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )  # stays on CPU

    logits = sa_model(**enc).logits  # CPU
    probs = logits.softmax(-1)

    idx = torch.tensor([sa_idx[l] for l in target_labels], dtype=torch.long)
    scores = probs[torch.arange(len(texts)), idx]

    return scores.detach().numpy()  # [0,1]


@torch.no_grad()
def switch_reward(texts):
    """
    texts: list[str] generated CS sentences
    Uses sw_model on CPU.
    """
    enc = tok_sw(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )  # CPU

    logits = sw_model(**enc).logits  # (B,2)
    probs = logits.softmax(-1)
    p_switch = probs[:, 1]

    return p_switch.detach().numpy()


def combined_reward(gen_texts, target_labels, w_sent=0.7, w_switch=0.3):
    r_sent = sentiment_reward(gen_texts, target_labels)   # [0,1]
    r_sw   = switch_reward(gen_texts)                     # [0,1]
    r_tot  = w_sent * r_sent + w_switch * r_sw
    return r_tot, r_sent, r_sw


In [None]:
from torch.optim import AdamW

gen_model.train()
optimizer = AdamW(gen_model.parameters(), lr=1e-5)

BATCH_SIZE = 4          # smaller batch to save memory
RL_EPOCHS  = 1          # start with 1 just to test
RL_STEPS   = 50         # PPO-ish steps per epoch (you can increase later)

def get_minibatch():
    batch = random.sample(rl_data, BATCH_SIZE)
    hosts  = [b["host"]  for b in batch]
    labels = [b["label"] for b in batch]
    return hosts, labels

def generate_cs(host_batch, max_new_tokens=40):
    inputs = tok_g(
        host_batch,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors="pt"
    ).to(device)  # generator on GPU

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1
    )

    texts = tok_g.batch_decode(outputs, skip_special_tokens=True)
    return texts

for epoch in range(1, RL_EPOCHS + 1):
    print(f"\n=== RL Epoch {epoch} ===")
    for step in range(RL_STEPS):
        hosts, labels = get_minibatch()

        # 1) generate CS with current generator
        gen_model.eval()
        with torch.no_grad():
            gen_texts = generate_cs(hosts)
        gen_model.train()

        # 2) compute rewards (on CPU)
        r_total, r_sent, r_sw = combined_reward(gen_texts, labels)
        r_total_t = torch.tensor(r_total, dtype=torch.float32, device=device)

        # 3) compute seq2seq loss for generated outputs
        enc = tok_g(
            hosts,
            truncation=True,
            padding=True,
            max_length=64,
            return_tensors="pt"
        ).to(device)

        tgt = tok_g(
            text_target=gen_texts,
            truncation=True,
            padding=True,
            max_length=64,
            return_tensors="pt"
        ).input_ids.to(device)

        out = gen_model(
            **enc,
            labels=tgt
        )  # out.loss is average NLL

        # normalize rewards
        r_norm = (r_total_t - r_total_t.mean()) / (r_total_t.std() + 1e-6)
        # scalar surrogate loss: high reward => reduce loss
        loss = out.loss * (-r_norm.mean())

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(gen_model.parameters(), 1.0)
        optimizer.step()

        if step % 10 == 0:
            print(
                f"step {step:03d} | loss={loss.item():.4f} | "
                f"R={r_total.mean():.3f} | R_sent={r_sent.mean():.3f} | R_sw={r_sw.mean():.3f}"
            )

    # save after RL epoch
    OUT_RL = BASE / "models/gan_stage2_generator_rl"
    OUT_RL.mkdir(parents=True, exist_ok=True)
    gen_model.save_pretrained(OUT_RL.as_posix())
    tok_g.save_pretrained(OUT_RL.as_posix())
    print("💾 Saved RL generator checkpoint to:", OUT_RL)



=== RL Epoch 1 ===


OutOfMemoryError: CUDA out of memory. Tried to allocate 490.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 306.12 MiB is free. Process 7112 has 14.44 GiB memory in use. Of the allocated memory 13.43 GiB is allocated by PyTorch, and 900.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch, json
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)

BASE   = Path("/content/drive/MyDrive/cs-senti")
MODELS = BASE / "models"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# 1) Generator (mT5) ON GPU
GEN_DIR = MODELS / "gan_stage1_generator_supervised"
tok_g   = AutoTokenizer.from_pretrained(GEN_DIR.as_posix())
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_DIR.as_posix()).to(device)

# 2) Sentiment oracle ON CPU
SA_DIR = MODELS / "sa_mixed_v3_frozen"
tok_sa = AutoTokenizer.from_pretrained(SA_DIR.as_posix())
sa_model = AutoModelForSequenceClassification.from_pretrained(SA_DIR.as_posix())
sa_model.eval()     # CPU

sa_labels = json.load(open(SA_DIR / "label_map.json"))["labels"]
sa_idx    = {l:i for i,l in enumerate(sa_labels)}

# 3) Switch-decider ON CPU
SW_DIR = MODELS / "xlmr_switch_decider_lexsupervised"
tok_sw = AutoTokenizer.from_pretrained(SW_DIR.as_posix())
sw_model = AutoModelForSequenceClassification.from_pretrained(SW_DIR.as_posix())
sw_model.eval()     # CPU

print("✅ Loaded generator on", device, "| oracles on CPU")


Device: cuda
✅ Loaded generator on cuda | oracles on CPU


In [None]:
import json
from pathlib import Path

DATA = BASE / "data"
LING = DATA / "ling"

FP_GAN_TR = LING / "gan_pairs_train.jsonl"

def read_jsonl(fp):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

rl_data = read_jsonl(FP_GAN_TR)
print("RL data size:", len(rl_data))
print("Sample:", rl_data[0])


RL data size: 2621
Sample: {'host': 'اهداء مني إلى ميشو حبيبتي الغاليا   mesooo ⚘⚘⚘🌹🌼🌺🌼🌼🌻😍😍😍😚😙', 'cs': 'dedication مني إلى ميشو حبيبتي الغاليا mesooo ⚘⚘⚘🌹🌼🌺🌼🌼🌻😍😍😍😚😙', 'label': 'pos', 'domain': 'eesa'}


In [None]:
import numpy as np
import torch

@torch.no_grad()
def sentiment_reward(texts, target_labels):
    enc = tok_sa(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"       # CPU
    )
    logits = sa_model(**enc).logits
    probs  = logits.softmax(-1)

    idx = torch.tensor([sa_idx[l] for l in target_labels], dtype=torch.long)
    scores = probs[torch.arange(len(texts)), idx]
    return scores.detach().numpy()

@torch.no_grad()
def switch_reward(texts):
    enc = tok_sw(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    logits = sw_model(**enc).logits
    probs  = logits.softmax(-1)
    p_switch = probs[:, 1]
    return p_switch.detach().numpy()

def combined_reward(gen_texts, target_labels, w_sent=0.7, w_switch=0.3):
    r_sent = sentiment_reward(gen_texts, target_labels)
    r_sw   = switch_reward(gen_texts)
    r_tot  = w_sent * r_sent + w_switch * r_sw
    return r_tot, r_sent, r_sw


In [None]:
from transformers import Adafactor
import random

gen_model.train()

# Adafactor = more memory friendly than AdamW for T5/mT5
optimizer = Adafactor(
    gen_model.parameters(),
    lr=1e-4,
    relative_step=False,
    scale_parameter=False
)

BATCH_SIZE = 2
RL_EPOCHS  = 1
RL_STEPS   = 40   # keep it modest

def get_minibatch():
    batch = random.sample(rl_data, BATCH_SIZE)
    hosts  = [b["host"]  for b in batch]
    labels = [b["label"] for b in batch]
    return hosts, labels

def generate_cs(host_batch, max_new_tokens=32):
    inputs = tok_g(
        host_batch,
        truncation=True,
        padding=True,
        max_length=48,
        return_tensors="pt"
    ).to(device)   # only this goes to GPU

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1
    )

    texts = tok_g.batch_decode(outputs, skip_special_tokens=True)
    return texts

for epoch in range(1, RL_EPOCHS + 1):
    print(f"\n=== RL Epoch {epoch} (GPU) ===")
    for step in range(RL_STEPS):
        hosts, labels = get_minibatch()

        # 1) generate on GPU
        with torch.no_grad():
            gen_texts = generate_cs(hosts)

        # 2) rewards on CPU
        r_total, r_sent, r_sw = combined_reward(gen_texts, labels)
        r_total_t = torch.tensor(r_total, dtype=torch.float32, device=device)

        # 3) teacher-forcing loss on GPU w.r.t generated text
        enc = tok_g(
            hosts,
            truncation=True,
            padding=True,
            max_length=48,
            return_tensors="pt"
        ).to(device)

        tgt_ids = tok_g(
            text_target=gen_texts,
            truncation=True,
            padding=True,
            max_length=48,
            return_tensors="pt"
        ).input_ids.to(device)

        out = gen_model(
            **enc,
            labels=tgt_ids
        )
        base_loss = out.loss

        # normalize reward
        r_norm = (r_total_t - r_total_t.mean()) / (r_total_t.std() + 1e-6)
        # simple shaping: encourage higher reward ⇒ smaller loss
        loss = base_loss * (1.0 - r_norm.mean())

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(gen_model.parameters(), 1.0)
        optimizer.step()

        if step % 5 == 0:
            print(
                f"step {step:02d} | loss={loss.item():.4f} | "
                f"R={r_total.mean():.3f} | R_sent={r_sent.mean():.3f} | R_sw={r_sw.mean():.3f}"
            )
            torch.cuda.empty_cache()

OUT_RL = MODELS / "gan_stage2_generator_rl_gpu"
OUT_RL.mkdir(parents=True, exist_ok=True)
gen_model.save_pretrained(OUT_RL.as_posix())
tok_g.save_pretrained(OUT_RL.as_posix())
print("💾 Saved RL generator checkpoint to:", OUT_RL)



=== RL Epoch 1 (GPU) ===
step 00 | loss=16.7842 | R=0.564 | R_sent=0.520 | R_sw=0.667
step 05 | loss=7.7543 | R=0.399 | R_sent=0.273 | R_sw=0.693
step 10 | loss=7.3457 | R=0.382 | R_sent=0.215 | R_sw=0.772
step 15 | loss=10.1090 | R=0.307 | R_sent=0.084 | R_sw=0.828
step 20 | loss=7.1873 | R=0.261 | R_sent=0.069 | R_sw=0.710
step 25 | loss=9.8213 | R=0.569 | R_sent=0.500 | R_sw=0.732
step 30 | loss=9.4836 | R=0.584 | R_sent=0.486 | R_sw=0.814
step 35 | loss=8.2279 | R=0.624 | R_sent=0.563 | R_sw=0.767
💾 Saved RL generator checkpoint to: /content/drive/MyDrive/cs-senti/models/gan_stage2_generator_rl_gpu


Step 1: qualitative comparison between:

In [None]:
import json, random
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

BASE   = Path("/content/drive/MyDrive/cs-senti")
DATA   = BASE / "data"
LING   = DATA / "ling"
MODELS = BASE / "models"

FP_GAN_TR = LING / "gan_pairs_train.jsonl"

GEN_SUP_DIR = MODELS / "gan_stage1_generator_supervised"
GEN_RL_DIR  = MODELS / "gan_stage2_generator_rl_gpu"

def read_jsonl(fp: Path):
    rows = []
    with open(fp, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

pairs = read_jsonl(FP_GAN_TR)
print("Total train pairs:", len(pairs))

# load tokenizer + both generators
tok_g = AutoTokenizer.from_pretrained(GEN_SUP_DIR)

gen_sup = AutoModelForSeq2SeqLM.from_pretrained(GEN_SUP_DIR).to(device)
gen_rl  = AutoModelForSeq2SeqLM.from_pretrained(GEN_RL_DIR).to(device)

gen_sup.eval()
gen_rl.eval()

def generate_cs(model, tokenizer, host_text, max_src_len=64, max_new_tokens=64):
    """Generate a CS sentence from a host sentence."""
    enc = tokenizer(
        host_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_src_len
    ).to(device)

    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            do_sample=False
        )
    text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    return text.strip()


Device: cuda
Total train pairs: 2621


In [None]:
# how many examples to inspect
N = 15
sampled = random.sample(pairs, min(N, len(pairs)))

for i, ex in enumerate(sampled, 1):
    host   = ex["host"]
    cs_gold = ex["cs"]
    label  = ex.get("label", "neu")
    domain = ex.get("domain", "unk")

    cs_sup = generate_cs(gen_sup, tok_g, host)
    cs_rl  = generate_cs(gen_rl, tok_g, host)

    print("="*80)
    print(f"[{i}] LABEL: {label} | DOMAIN: {domain}")
    print(f"HOST : {host}")
    print(f"GOLD : {cs_gold}")
    print(f"SUP  : {cs_sup}")
    print(f"RL   : {cs_rl}")


[1] LABEL: neu | DOMAIN: eesa
HOST : ممكن تشجعوني و تشتركوا بقناتي . please 
📢📢📢📢😊😊😊😔😔😔
GOLD : ممكن encourage me و you subscribe بقناتي . please 📢📢📢📢😊😊😊😔😔😔
SUP  : <extra_id_0> 📢📢📢😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
RL   : <extra_id_0>😊😊😊😊😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
[2] LABEL: pos | DOMAIN: eesa
HOST : مش قادره اوصفلك اغانيك بتديني positive energy اد أي♥️♥️♥️♥️
GOLD : مش قادره describe to you اغانيك بتديني positive energy اد أي♥️♥️♥️♥️
SUP  : <extra_id_0> 😂♥️♥️♥️♥️♥️♥️♥️ 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂😂 😂😂
RL   : <extra_id_0> 😂❤️♥️♥️♥️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️ 😂 😂 😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
[3] LABEL: pos | DOMAIN: eesa
HOST : We ⁦💜💜💜
 فخوره اني بشتغل فيها 💜⁦🖐️⁩
الاعلان تحفه 👏👏
GOLD : We ⁦💜💜💜 فخوره اني بشتغل فيها 💜⁦🖐️⁩ announcement masterpiece 👏👏
SUP  : <extra_id_0>😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
RL   : <extra_id_0> ⁦💜💜💜 👍😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
[4] LABEL: pos