In [None]:
import os, json, random
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

BASE = "/content/drive/MyDrive/cs-senti"   # adjust if your mount path is different
DATA = f"{BASE}/data"
OUT  = f"{BASE}/chang"
os.makedirs(OUT, exist_ok=True)
os.makedirs(f"{OUT}/checkpoints", exist_ok=True)
os.makedirs(f"{OUT}/samples", exist_ok=True)


In [None]:
!pip install -q datasets


In [None]:
from datasets import load_dataset

# load the full dataset from HF
ds = load_dataset("HeshamHaroon/ArzEn-MultiGenre")
ds


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/26047 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['EGY', 'ENG'],
        num_rows: 26047
    })
})

In [None]:
# show first 5 examples from the main split (often 'train')
for i in range(5):
    print(ds["train"][i])


{'EGY': '\u202bلحق؟\u202c', 'ENG': 'Already?'}
{'EGY': '\u202bمعلش يا جماعة أخرتكم.\u202c', 'ENG': 'Sorry to keep you waiting.'}
{'EGY': '\u202bلا، ولا يهمك.\u202c', 'ENG': 'No problem.'}
{'EGY': '\u202bبس الsystem down.\u202c', 'ENG': 'The system was down.'}
{'EGY': '\u202bطيب. خلاص إحنا كدا قفلنا الjoint account \u202bزودنا عنوان هشام الجديد على النظام،\u202c\u202b وعملنا حساب لـعلا \u202c\u202bهيتحط فيه فلوس مصاريف الولاد كل شهر.\u202c', 'ENG': "Well, we closed your joint account. We added Hisham's new address. We created an account for Ola for the child support expenses."}


In [None]:
from datasets import load_dataset
import os, json, re

BASE = "/content/drive/MyDrive/cs-senti"
DATA = f"{BASE}/data"
os.makedirs(DATA, exist_ok=True)

# 1) load HF dataset
ds = load_dataset("HeshamHaroon/ArzEn-MultiGenre")["train"]
print("rows:", len(ds))

# regexes
ARABIC_RE  = re.compile(r"[\u0600-\u06FF]")
LATIN_RE   = re.compile(r"[A-Za-z]")

# clean RTL marks like \u202b \u202c
def strip_rtl(s: str) -> str:
    return s.replace("\u202b", "").replace("\u202c", "").strip()

# open outputs
mono_ar_out = open(f"{DATA}/amg_ar_mono.jsonl", "w", encoding="utf-8")
mono_en_out = open(f"{DATA}/amg_en_mono.jsonl", "w", encoding="utf-8")
cs_ar_out   = open(f"{DATA}/amg_cs_from_amg.jsonl", "w", encoding="utf-8")

mono_count = 0
cs_count = 0
skip_none = 0

for row in ds:
    ar_txt = strip_rtl(row.get("EGY", "") or "")
    en_txt = row.get("ENG", "")
    if en_txt is None:
        skip_none += 1
        continue
    en_txt = en_txt.strip()
    if not ar_txt or not en_txt:
        continue

    # must have Arabic
    has_ar = bool(ARABIC_RE.search(ar_txt))
    has_lat = bool(LATIN_RE.search(ar_txt))

    if has_ar and not has_lat:
        # pure/mostly Arabic (monolingual host)
        mono_ar_out.write(json.dumps({"text": ar_txt}, ensure_ascii=False) + "\n")
        mono_en_out.write(json.dumps({"text": en_txt}, ensure_ascii=False) + "\n")
        mono_count += 1
    else:
        # keep CS examples separate
        cs_ar_out.write(json.dumps({"text": ar_txt}, ensure_ascii=False) + "\n")
        cs_count += 1

mono_ar_out.close()
mono_en_out.close()
cs_ar_out.close()

print("mono host rows:", mono_count)
print("cs-ish rows  :", cs_count)
print("skipped ENG=None:", skip_none)
print("saved to:", f"{DATA}/amg_ar_mono.jsonl", f"{DATA}/amg_en_mono.jsonl", f"{DATA}/amg_cs_from_amg.jsonl")


rows: 26047
mono host rows: 24850
cs-ish rows  : 679
skipped ENG=None: 424
saved to: /content/drive/MyDrive/cs-senti/data/amg_ar_mono.jsonl /content/drive/MyDrive/cs-senti/data/amg_en_mono.jsonl /content/drive/MyDrive/cs-senti/data/amg_cs_from_amg.jsonl


In [None]:
import random, json

ar_path = f"{DATA}/amg_ar_mono.jsonl"
en_path = f"{DATA}/amg_en_mono.jsonl"

with open(ar_path, "r", encoding="utf-8") as f:
    ar_lines = [json.loads(l)["text"] for l in f]
with open(en_path, "r", encoding="utf-8") as f:
    en_lines = [json.loads(l)["text"] for l in f]

for i in random.sample(range(min(len(ar_lines), len(en_lines))), 5):
    print("="*70)
    print("AR:", ar_lines[i])
    print("EN:", en_lines[i])


AR: ليه بس كدا؟ ليه ليه؟
EN: Why?
AR: أيوه
EN: No, you don't.
AR: على الأقل لحد الوقت إللي المحامي العام وقف كلام فيه وبعد فترة من السكوت، كمل بصوت واطي جدا وعميق جدا: "نفس المحكمة دي يا سادة، هتحكم بكرة في أكثر جناية بشعة قتل أب"
EN: A moment came when the Prosecutor paused and, after a short silence, said in a low, vibrant voice: "This same court, gentlemen, will be called on to try tomorrow that most odious of crimes, the murder of a father by his son."
AR: لا يعني أنا آسفة إن أنا بقول كدا، بس يعني...
EN: Well, I'm sorry to be saying this, but what I mean is ...
AR: إيه هواية مامتك المفضلة؟
EN: What's your mother's favorite hobby?


In [None]:
import os, json, random
from pathlib import Path
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

BASE = "/content/drive/MyDrive/cs-senti"   # change if your mount is different
DATA = f"{BASE}/data"
OUT  = f"{BASE}/chang"
os.makedirs(OUT, exist_ok=True)
os.makedirs(f"{OUT}/checkpoints", exist_ok=True)
os.makedirs(f"{OUT}/samples", exist_ok=True)

def read_jsonl(path, n=None):
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue
            obj = json.loads(line)
            txt = obj.get("text", "").strip()
            if txt:
                out.append(txt)
            if n is not None and len(out) >= n:
                break
    return out

# 1) host = cleaned AMG monolingual arabic
amg_ar_path = f"{DATA}/amg_ar_mono.jsonl"   # <--- the file we just made from HF
HOST_TEXTS  = read_jsonl(amg_ar_path)
print("HOST_TEXTS:", len(HOST_TEXTS))

# 2) parallel for lexicon (same filtered subset)
amg_en_path = f"{DATA}/amg_en_mono.jsonl"
AMG_AR = read_jsonl(amg_ar_path)
AMG_EN = read_jsonl(amg_en_path)

# clip to overlap to stay aligned
min_len = min(len(AMG_AR), len(AMG_EN))
AMG_AR  = AMG_AR[:min_len]
AMG_EN  = AMG_EN[:min_len]
print("AMG_AR lines:", len(AMG_AR))
print("AMG_EN lines:", len(AMG_EN))
print("Using parallel overlap:", min_len)

# 3) real CS pools
REAL_CS_TEXTS = []

# 3a) AMG human-labeled CS
amg_cs_human_path = f"{DATA}/amg_cs_human_labels.jsonl"
if os.path.exists(amg_cs_human_path):
    AMG_CS_TXT = read_jsonl(amg_cs_human_path)
    REAL_CS_TEXTS += AMG_CS_TXT
else:
    AMG_CS_TXT = []

# 3b) CS we auto-split from the HF AMG
amg_cs_auto_path = f"{DATA}/amg_cs_from_amg.jsonl"
if os.path.exists(amg_cs_auto_path):
    REAL_CS_TEXTS += read_jsonl(amg_cs_auto_path)

# 3c) EESA CS
eesa_train_path = f"{DATA}/eesa_train.jsonl"
if os.path.exists(eesa_train_path):
    EESA_TXT = read_jsonl(eesa_train_path)
    REAL_CS_TEXTS += EESA_TXT
else:
    EESA_TXT = []

# 3d) MR CS
mr_cs_path = f"{DATA}/mr_cs.jsonl"
if os.path.exists(mr_cs_path):
    MR_TEXTS = read_jsonl(mr_cs_path)
    REAL_CS_TEXTS += MR_TEXTS
else:
    MR_TEXTS = []

print("REAL_CS_TEXTS:", len(REAL_CS_TEXTS))
print("  AMG_CS_TXT (human):", len(AMG_CS_TXT))
print("  EESA_TXT           :", len(EESA_TXT))
print("  MR_TEXTS           :", len(MR_TEXTS))

# 4) quick preview to confirm
print("\n--- sample HOST (AMG AR mono) ---")
for t in random.sample(HOST_TEXTS, 3):
    print(t)

print("\n--- sample parallel AR/EN ---")
for i in random.sample(range(len(AMG_AR)), 3):
    print("="*70)
    print("AR:", AMG_AR[i])
    print("EN:", AMG_EN[i])

print("\n--- sample REAL CS ---")
for t in random.sample(REAL_CS_TEXTS, 3):
    print(t)


HOST_TEXTS: 24850
AMG_AR lines: 24850
AMG_EN lines: 24850
Using parallel overlap: 24850
REAL_CS_TEXTS: 15994
  AMG_CS_TXT (human): 373
  EESA_TXT           : 2463
  MR_TEXTS           : 12479

--- sample HOST (AMG AR mono) ---
لفيتها بلاد
بتخبي ليه
ده سهل.

--- sample parallel AR/EN ---
AR: إيه اللي إنتي بتعمليه ده؟
EN: What are you doing?
AR: كلها كام يوم كده إن شاء الله ونقولهم مع بعض
EN: We'll tell them together in a few days.
AR: كلمتك. تليفونك كان مقفول
EN: I did, but your phone was turned off.

--- sample REAL CS ---
وفي 2011، هتقدّم "آبل" أول Official Emoji Keyboard للمستخدمين،
بس فعشان كده Engineering، و بعدين لما دخلت كنت حاطط في دماغي اللي هو يا Mechatronics يا production يا ممكن شوية CS, يعني مش هابعد عن الحاجات دي اوي يعني، فيعني لاقيت الدنيا رسيت ان هو مش هسافر فهقعد في مصر، فهو كانت .. فاخترت الجامعة اللي أنا فيها دي دلوقت، يعني كانت احسن option
بس دي ال favorite song بتاعتي اللي بحب أسمعها دايما.


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# we already have these from the previous cell
# BASE, DATA, device, AMG_AR, AMG_EN are in scope

# 1) build lexicon from parallel AMG
ar2en = {}

for ar_sent, en_sent in zip(AMG_AR, AMG_EN):
    ar_toks = ar_sent.split()
    en_toks = en_sent.split()
    # keep it simple: only use sentences with same #tokens
    if len(ar_toks) == len(en_toks):
        for a, e in zip(ar_toks, en_toks):
            a = a.strip()
            e = e.strip().lower()
            if a and (a not in ar2en):
                ar2en[a] = e

print("lexicon size (from AMG mono):", len(ar2en))

# 2) MT fallback (for words not in the lexicon)
mt_name = "Helsinki-NLP/opus-mt-ar-en"
mt_tok  = AutoTokenizer.from_pretrained(mt_name)
mt_mod  = AutoModelForSeq2SeqLM.from_pretrained(mt_name).to(device)
mt_mod.eval()

@torch.no_grad()
def mt_translate_one(token: str) -> str:
    enc = mt_tok(token, return_tensors="pt").to(device)
    out = mt_mod.generate(**enc, max_length=16, num_beams=4)
    txt = mt_tok.decode(out[0], skip_special_tokens=True)
    # keep only the first word to stay token-like
    return txt.split()[0] if txt else token

def translate_token(ar_tok: str) -> str:
    ar_tok = ar_tok.strip()
    if ar_tok in ar2en:
        return ar2en[ar_tok]
    return mt_translate_one(ar_tok)

# 3) quick sanity check
test_words = [
    "الجامعة",
    "الموبايل",
    "البيت",
    "فلوس",
    "كمبيوتر",
    "العربي",
]

print("\n--- sample translations ---")
for w in test_words:
    print(f"{w:>10s} -> {translate_token(w)}")

# 4) check on a real host sentence
import random, regex as re
AR_WORD = re.compile(r"\S+")

def tokenize_ar(text: str):
    return AR_WORD.findall(text)

sample_host = random.choice(HOST_TEXTS)
toks = tokenize_ar(sample_host)
translated = [translate_token(t) for t in toks]

print("\n--- sample host sentence ---")
print("AR:", sample_host)
print("EN-tokens (1-1):", translated)


lexicon size (from AMG mono): 5160


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


--- sample translations ---
   الجامعة -> University
  الموبايل -> phone.
     البيت -> this
      فلوس -> you
   كمبيوتر -> Computer
    العربي -> Arab

--- sample host sentence ---
AR: أسطورة الشعب اللي كانوا مابيحلموش
EN-tokens (1-1): ['and', 'The', 'cover', 'were', "What's"]


fixing daatsets-  mr labelling

AMG CS SUBSET: we are checking the labels of the three annotators, and doing majority voting

In [None]:
import pandas as pd

path = "/content/drive/MyDrive/cs-senti/labeling/amg_cs_final_adjudicated.csv"
df = pd.read_csv(path)
print(df.columns.tolist())
df.head(3)


['id', 'label_model', 'pred_conf', 'label_human', 'label_llm', 'final_label', 'n_unique']


Unnamed: 0,id,label_model,pred_conf,label_human,label_llm,final_label,n_unique
0,0,neg,0.987439,neg,neg,neg,1
1,1,neu,0.846126,neu,neu,neu,1
2,2,neg,0.947345,neg,neg,neg,1


In [None]:
import pandas as pd
import json
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti")

# 1) original AMG-CS with text (from when you first labeled with XLM)
orig_path  = BASE / "labeling" / "amg_cs_pred_labelsXLM.csv"
# 2) final adjudicated labels (no text)
final_path = BASE / "labeling" / "amg_cs_final_adjudicated.csv"
# 3) output jsonl
out_jsonl  = BASE / "data" / "amg_cs_final_adjudicated.jsonl"

orig_df  = pd.read_csv(orig_path)          # has: id, text, eng, pred_label, ...
final_df = pd.read_csv(final_path)         # has: id, final_label, ...

# merge on id
merged = orig_df.merge(final_df[["id","final_label"]], on="id", how="inner")

print("merged shape:", merged.shape)
print(merged.head(3)[["id","text","final_label"]])

# write to jsonl
with open(out_jsonl, "w", encoding="utf-8") as f:
    for _, row in merged.iterrows():
        rec = {
            "text": str(row["text"]),
            "label": str(row["final_label"])
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("✅ wrote:", out_jsonl)


merged shape: (373, 6)
   id                                               text final_label
0   0                                  بس الsystem down.         neg
1   1  طيب. خلاص إحنا كدا قفلنا الjoint account زودنا...         neu
2   2                              كل ده والsystem down؟         neg
✅ wrote: /content/drive/MyDrive/cs-senti/data/amg_cs_final_adjudicated.jsonl


In [None]:
import json

# Path to your new jsonl
path = "/content/drive/MyDrive/cs-senti/data/amg_cs_final_adjudicated.jsonl"

# Read and preview 10 samples
with open(path, "r", encoding="utf-8") as f:
    samples = [json.loads(next(f)) for _ in range(10)]

for s in samples:
    print(f"🗣️ Text: {s['text']}")
    print(f"🎯 Label: {s['label']}")
    print("-"*80)


🗣️ Text: بس الsystem down.
🎯 Label: neg
--------------------------------------------------------------------------------
🗣️ Text: طيب. خلاص إحنا كدا قفلنا الjoint account زودنا عنوان هشام الجديد على النظام، وعملنا حساب لعلا هيتحط فيه فلوس مصاريف الولاد كل شهر.
🎯 Label: neu
--------------------------------------------------------------------------------
🗣️ Text: كل ده والsystem down؟
🎯 Label: neg
--------------------------------------------------------------------------------
🗣️ Text: ده fresh graduates مش لاقيين.
🎯 Label: neg
--------------------------------------------------------------------------------
🗣️ Text: حلو الlook ده.
🎯 Label: pos
--------------------------------------------------------------------------------
🗣️ Text: اه، قابلت بابي النهاردة وكله كان حلو أوي ومشي smooth أوي. واتفقنا على كل حاجة.
🎯 Label: pos
--------------------------------------------------------------------------------
🗣️ Text: وبعدين هو الصراحة كان متحضر ومحترم وهيتكلف بمصاريف العيال كلهم. حتى الextras.


NUMBER OF SAMPLES PER DATASETS AND SPLITS; AMG DOESNT HAVE A SPLIT YET

In [None]:
import json
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/data")

def read_jsonl(p):
    return [json.loads(l) for l in open(p, encoding="utf-8")]

eesa_train = read_jsonl(BASE / "eesa_train.jsonl")
eesa_dev   = read_jsonl(BASE / "eesa_dev.jsonl")
eesa_test  = read_jsonl(BASE / "eesa_test.jsonl")

amg_rows   = read_jsonl(BASE / "amg_cs_final_adjudicated.jsonl")

print("EESA train:", len(eesa_train))
print("EESA dev:", len(eesa_dev))
print("EESA test:", len(eesa_test))
print("AMG CS (final):", len(amg_rows))


EESA train: 2463
EESA dev: 817
EESA test: 817
AMG CS (final): 373


AMG SPLIT TRAIN, DEV (TEST?)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

amg_df = pd.DataFrame(amg_rows)   # has 'text', 'label'

amg_train_df, amg_dev_df = train_test_split(
    amg_df,
    test_size=0.1,
    stratify=amg_df["label"],
    random_state=42
)

print("AMG train:", len(amg_train_df))
print("AMG dev:", len(amg_dev_df))

# (optional) see distribution
print("\nAMG train dist:\n", amg_train_df["label"].value_counts())
print("\nAMG dev dist:\n", amg_dev_df["label"].value_counts())


AMG train: 335
AMG dev: 38

AMG train dist:
 label
neu    147
pos     95
neg     93
Name: count, dtype: int64

AMG dev dist:
 label
neu    17
pos    11
neg    10
Name: count, dtype: int64


MERGING SPLITS

In [None]:
import json
from pathlib import Path

OUT_DIR = Path("/content/drive/MyDrive/cs-senti/data")

def write_jsonl(path, rows):
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps({"text": r["text"], "label": r["label"]}, ensure_ascii=False) + "\n")

# merge to train/dev
merged_train = eesa_train + amg_train_df.to_dict(orient="records")
merged_dev   = eesa_dev   + amg_dev_df.to_dict(orient="records")
merged_test  = eesa_test  # unchanged

write_jsonl(OUT_DIR / "eesa_amg_train.jsonl", merged_train)
write_jsonl(OUT_DIR / "eesa_amg_dev.jsonl", merged_dev)
write_jsonl(OUT_DIR / "eesa_test.jsonl", merged_test)

print("✅ wrote merged train/dev")
print("train:", len(merged_train), "dev:", len(merged_dev), "test:", len(merged_test))


✅ wrote merged train/dev
train: 2798 dev: 855 test: 817


MERGED STATISTICS OF AMG AND EESA

In [None]:
from collections import Counter
import json, os

for name in ["eesa_amg_train.jsonl", "eesa_amg_dev.jsonl", "eesa_test.jsonl"]:
    path = OUT_DIR / name
    rows = [json.loads(l) for l in open(path, encoding="utf-8")]
    cnt = Counter(r["label"] for r in rows)
    print(name, "→", len(rows), "samples")
    print(cnt)
    print("-"*40)


eesa_amg_train.jsonl → 2798 samples
Counter({'pos': 1187, 'neu': 925, 'neg': 686})
----------------------------------------
eesa_amg_dev.jsonl → 855 samples
Counter({'pos': 374, 'neu': 275, 'neg': 206})
----------------------------------------
eesa_test.jsonl → 817 samples
Counter({'pos': 362, 'neu': 258, 'neg': 197})
----------------------------------------


LABEL DISTRIBUTION FOR AMG+EESA

In [None]:
import json
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/data")

def read_jsonl(p):
    return [json.loads(l) for l in open(p, encoding="utf-8")]

train_rows = read_jsonl(BASE / "eesa_amg_train.jsonl")
dev_rows   = read_jsonl(BASE / "eesa_amg_dev.jsonl")
test_rows  = read_jsonl(BASE / "eesa_test.jsonl")

print(len(train_rows), len(dev_rows), len(test_rows))
print({l: sum(r["label"]==l for r in train_rows) for l in ["pos","neu","neg"]})


2798 855 817
{'pos': 1187, 'neu': 925, 'neg': 686}


FINE TUNING SENTIMENT CLASSIFIER

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score
from pathlib import Path

LABELS = ["pos","neu","neg"]  # note the order! match to your data
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

class JsonlDS(Dataset):
    def __init__(self, data, tok, max_len=128):
        self.data = data
        self.tok = tok
        self.max_len = max_len
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        ex = self.data[idx]
        enc = self.tok(
            ex["text"],
            max_length=self.max_len,
            truncation=True,
            padding="max_length"
        )
        enc["labels"] = label2id[ex["label"]]
        return {k: torch.tensor(v) for k,v in enc.items()}

# init
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

ds_tr = JsonlDS(train_rows, tok, 128)
ds_de = JsonlDS(dev_rows, tok, 128)

dl_tr = DataLoader(ds_tr, batch_size=16, shuffle=True)
dl_de = DataLoader(ds_de, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optim = AdamW(model.parameters(), lr=2e-5)

best_f1 = -1.0
best_state = None

EPOCHS = 3
for ep in range(1, EPOCHS+1):
    model.train()
    for batch in dl_tr:
        batch = {k:v.to(device) for k,v in batch.items()}
        out = model(**batch)
        out.loss.backward()
        optim.step()
        optim.zero_grad()

    # dev eval
    model.eval()
    preds, gold = [], []
    with torch.no_grad():
        for batch in dl_de:
            labels = batch["labels"].numpy().tolist()
            batch = {k:v.to(device) for k,v in batch.items()}
            logits = model(**batch).logits.detach().cpu().numpy()
            preds.extend(logits.argmax(axis=1).tolist())
            gold.extend(labels)
    macro_f1 = f1_score(gold, preds, average="macro")
    print(f"Epoch {ep} → dev macro-F1 = {macro_f1:.4f}")
    if macro_f1 > best_f1:
        best_f1 = macro_f1
        best_state = model.state_dict()

# save best
OUT_DIR = Path("/content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg")
OUT_DIR.mkdir(parents=True, exist_ok=True)
model.load_state_dict(best_state)
model.save_pretrained(OUT_DIR.as_posix())
tok.save_pretrained(OUT_DIR.as_posix())
print("✅ saved best to", OUT_DIR)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 → dev macro-F1 = 0.7504
Epoch 2 → dev macro-F1 = 0.7942
Epoch 3 → dev macro-F1 = 0.7756
✅ saved best to /content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg


In [None]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

LABELS = ["pos","neu","neg"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

DATA_DIR = Path("/content/drive/MyDrive/cs-senti/data")
MODEL_DIR = "/content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg"

# load test (same old EESA test)
test_rows = [json.loads(l) for l in open(DATA_DIR / "eesa_test.jsonl", encoding="utf-8")]

class JsonlDS(torch.utils.data.Dataset):
    def __init__(self, data, tok, max_len=128):
        self.data=data; self.tok=tok; self.max_len=max_len
    def __len__(self): return len(self.data)
    def __getitem__(self, i):
        ex = self.data[i]
        enc = self.tok(ex["text"], max_length=self.max_len,
                       truncation=True, padding="max_length")
        enc["labels"] = label2id[ex["label"]]
        return {k: torch.tensor(v) for k,v in enc.items()}

tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

ds_te = JsonlDS(test_rows, tok, 128)
dl_te = DataLoader(ds_te, batch_size=32, shuffle=False)

device = next(model.parameters()).device
gold, preds = [], []
with torch.no_grad():
    for batch in dl_te:
        labels = batch["labels"].numpy().tolist()
        batch = {k:v.to(device) for k,v in batch.items()}
        logits = model(**batch).logits.detach().cpu().numpy()
        pred = logits.argmax(axis=1).tolist()
        gold.extend(labels); preds.extend(pred)

print("\n=== XLM-R (EESA+AMG) on EESA TEST ===")
print(classification_report(gold, preds, target_names=LABELS, digits=4))
print("Macro-F1:", f1_score(gold, preds, average="macro"))



=== XLM-R (EESA+AMG) on EESA TEST ===
              precision    recall  f1-score   support

         pos     0.8071    0.9475    0.8717       362
         neu     0.8804    0.6279    0.7330       258
         neg     0.7740    0.8173    0.7951       197

    accuracy                         0.8152       817
   macro avg     0.8205    0.7976    0.7999       817
weighted avg     0.8223    0.8152    0.8094       817

Macro-F1: 0.7999193171743851


MR TRI ANNOTATING

In [None]:
import json, re, random
import pandas as pd

MR_FP = "/content/drive/MyDrive/cs-senti/data/mr_cs.jsonl"

AR = re.compile(r"[\u0600-\u06FF]")
EN = re.compile(r"[A-Za-z]")

def en_share(s):
    toks = re.findall(r"[A-Za-z\u0600-\u06FF]+", s)
    if not toks: return 0.0
    en = sum(1 for t in toks if EN.search(t) and not AR.search(t))
    return en/len(toks)

rows = [json.loads(l) for l in open(MR_FP, encoding="utf-8")]
for r in rows:
    r["en_share"] = en_share(r["text"])

# bucket by en-share
low  = [r for r in rows if r["en_share"] < 0.10]
mid  = [r for r in rows if 0.10 <= r["en_share"] < 0.22]
high = [r for r in rows if r["en_share"] >= 0.22]

def sample(lst, k): return random.sample(lst, min(k, len(lst)))

sampled = sample(low, 300) + sample(mid, 350) + sample(high, 350)
random.shuffle(sampled)

mr_sample_df = pd.DataFrame(sampled)
mr_sample_df.to_csv("/content/drive/MyDrive/cs-senti/data/mr_cs_sample_for_labeling.csv", index=False)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

MODEL_DIR = "/content/drive/MyDrive/cs-senti/models/xlmr_sentiment_eesa_amg"
df = pd.read_csv("/content/drive/MyDrive/cs-senti/data/mr_cs_sample_for_labeling.csv")

tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

LABELS = ["pos","neu","neg"]

pred_labels = []
pred_confs  = []

BATCH = 64
texts = df["text"].tolist()
for i in range(0, len(texts), BATCH):
    batch = texts[i:i+BATCH]
    enc = tok(batch, padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**enc).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    labs  = probs.argmax(axis=1)
    confs = probs.max(axis=1)
    pred_labels.extend([LABELS[j] for j in labs])
    pred_confs.extend(confs)

df["label_model"] = pred_labels
df["pred_conf"]   = pred_confs

# save intermediate
df.to_csv("/content/drive/MyDrive/cs-senti/labeling/mr_cs_sample_with_model.csv", index=False, encoding="utf-8-sig")
print("✅ saved model-labelled MR sample")


✅ saved model-labelled MR sample


In [None]:
import pandas as pd
from pathlib import Path

LABEL_DIR = Path("/content/drive/MyDrive/cs-senti/labeling")
LABEL_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(LABEL_DIR / "mr_cs_sample_with_model.csv")

# give stable ids
df = df.reset_index(drop=True)
df["id"] = df.index

# 1) human file
human_path = LABEL_DIR / "mr_cs_for_human.xlsx"
df[["id","text","label_model","pred_conf","en_share"]].to_excel(human_path, index=False)
print("✅ human file:", human_path)

# 2) llm file
llm_path = LABEL_DIR / "mr_cs_for_llm.csv"
df[["id","text"]].to_csv(llm_path, index=False, encoding="utf-8-sig")
print("✅ llm file:", llm_path)


✅ human file: /content/drive/MyDrive/cs-senti/labeling/mr_cs_for_human.xlsx
✅ llm file: /content/drive/MyDrive/cs-senti/labeling/mr_cs_for_llm.csv


In [None]:
import json, pandas as pd, os

# path to your text/jsonl file
llm_path = "/content/drive/MyDrive/cs-senti/labeling/mr_cs_llm.txt"   # or .jsonl if that's the actual name

# load each line as json
rows = []
with open(llm_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                print("❌ Skipping malformed line:", line[:80])

llm_df = pd.DataFrame(rows)

# keep only id + label
if "label" in llm_df.columns:
    llm_df = llm_df[["id","label"]].rename(columns={"label":"label_llm"})

out_path = "/content/drive/MyDrive/cs-senti/labeling/mr_cs_llm.csv"
llm_df.to_csv(out_path, index=False, encoding="utf-8-sig")
print(f"✅ Saved clean LLM labels to {out_path}")
print(llm_df.head())


✅ Saved clean LLM labels to /content/drive/MyDrive/cs-senti/labeling/mr_cs_llm.csv
   id label_llm
0   0       neg
1   1       neu
2   2       neu
3   3       neg
4   4       neu


In [None]:
import pandas as pd
from pathlib import Path

LABEL_DIR = Path("/content/drive/MyDrive/cs-senti/labeling")

model_df = pd.read_excel(LABEL_DIR / "/content/drive/MyDrive/cs-senti/labeling/mr_cs_for_human.xlsx")   # id, text, label_model, pred_conf, en_share
human_df = pd.read_excel(LABEL_DIR / "mr_cs_human.xlsx")            # id, (corrected) label_model
llm_df   = pd.read_csv(LABEL_DIR / "mr_cs_llm.csv")                 # id, label_llm

# make column names consistent
human_df = human_df.rename(columns={"label_model": "label_human"})

# merge on id
df = (
    model_df[["id","text","label_model","pred_conf","en_share"]]
    .merge(human_df[["id","label_human"]], on="id")
    .merge(llm_df[["id","label_llm"]], on="id")
)

print(df.head())
print(len(df))


   id                                               text label_model  \
0   0  أما ال .. ال .. الشغل at least جوا مصر, انت عن...         neg   
1   1                      إيه بقى هي حكاية BYD الصينية؟         neu   
2   2  أنت دلوقتي أي حد ممكن ي subs .. ي subscribe لل...         neu   
3   3  ليه؟ عشان يعني لو احتجت .. يعني let's say لو أ...         neg   
4   4  عشان أثناء ما بتعكس التيار الكهربي في الLoop ا...         neu   

   pred_conf  en_share label_human label_llm  
0   0.688790  0.176471         neg       neg  
1   0.691024  0.166667         neu       neu  
2   0.854123  0.166667         neu       neu  
3   0.531762  0.193548         neg       neg  
4   0.541873  0.000000         neu       neu  
1000


In [None]:
print(df.tail())

      id                                               text label_model  \
995  995                                     ماعرفش طبعا لأ         neg   
996  996  لأ و انا في .. لأ و انا ايام ال evaluation اه ...         pos   
997  997  ولا سفر ل"طوكيو" أو "باريس" أو "كاليفورنيا" أو...         neu   
998  998  ال languages اللي كت في المدرسة عربي English ,...         neu   
999  999  الكتاب الذي يحتوي على المركز الأول والتالت وال...         pos   

     pred_conf  en_share label_human label_llm  
995   0.488081  0.000000         neg       neg  
996   0.796531  0.190476         pos       neu  
997   0.772558  0.000000         neg       neu  
998   0.871015  0.300000         neu       neu  
999   0.985842  0.090909         pos       pos  


In [None]:
from sklearn.metrics import cohen_kappa_score

pairs = [
    ("label_model", "label_human"),
    ("label_model", "label_llm"),
    ("label_human", "label_llm"),
]

for a, b in pairs:
    kappa = cohen_kappa_score(df[a], df[b])
    print(f"Cohen’s κ ({a} vs {b}): {kappa:.3f}")

# how many rows had at least 1 disagreement?
df["n_unique"] = df[["label_model","label_human","label_llm"]].nunique(axis=1)
n_dis = (df["n_unique"] > 1).sum()
print(f"\nDisagreements: {n_dis} / {len(df)} = {n_dis/len(df)*100:.1f}%")


Cohen’s κ (label_model vs label_human): 0.618
Cohen’s κ (label_model vs label_llm): 0.430
Cohen’s κ (label_human vs label_llm): 0.662

Disagreements: 374 / 1000 = 37.4%


In [None]:
changed_by_human = (df["label_model"] != df["label_human"]).sum()
print("changed by human:", changed_by_human, "/", len(df),
      f"= {changed_by_human/len(df)*100:.1f}%")

changed_by_llm = (df["label_model"] != df["label_llm"]).sum()
print("changed by llm:", changed_by_llm, "/", len(df),
      f"= {changed_by_llm/len(df)*100:.1f}%")


changed by human: 233 / 1000 = 23.3%
changed by llm: 341 / 1000 = 34.1%


In [None]:
def majority_vote(row):
    votes = [row["label_model"], row["label_human"], row["label_llm"]]
    # simple plurality
    return max(set(votes), key=votes.count)

df["final_label"] = df.apply(majority_vote, axis=1)


In [None]:
# 1) csv
final_csv = LABEL_DIR / "mr_cs_final_adjudicated.csv"
df.to_csv(final_csv, index=False, encoding="utf-8-sig")
print("✅ saved:", final_csv)

# 2) jsonl (text + final_label) for training
import json
final_jsonl = LABEL_DIR / "mr_cs_final_adjudicated.jsonl"
with open(final_jsonl, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        f.write(json.dumps({"text": row["text"], "label": row["final_label"]}, ensure_ascii=False) + "\n")
print("✅ saved:", final_jsonl)


✅ saved: /content/drive/MyDrive/cs-senti/labeling/mr_cs_final_adjudicated.csv
✅ saved: /content/drive/MyDrive/cs-senti/labeling/mr_cs_final_adjudicated.jsonl


In [None]:
# === paths ===
BASE = "/content/drive/MyDrive/cs-senti"
DATA = f"{BASE}/data"
LABL = f"{BASE}/labeling"

FP_EESA_TR = f"{DATA}/eesa_train.jsonl"
FP_EESA_DE = f"{DATA}/eesa_dev.jsonl"
FP_EESA_TE = f"{DATA}/eesa_test.jsonl"

FP_AMG     = f"{DATA}/amg_cs_final_adjudicated.jsonl"     # text + final_label
FP_MR_CSV  = f"{LABL}/mr_cs_final_adjudicated.csv"        # needs cleaning

# outputs
FP_MR_JSON = f"{DATA}/mr_cs_final_adjudicated.jsonl"
FP_AMG_TR  = f"{DATA}/amg_train.jsonl"
FP_AMG_DE  = f"{DATA}/amg_dev.jsonl"

FP_MIX_TR  = f"{DATA}/eesa_amg_mr_train.jsonl"
FP_MIX_DE  = f"{DATA}/eesa_amg_mr_dev.jsonl"
FP_MIX_TE  = f"{DATA}/mixed_test.jsonl"  # optional cross-domain test


In [None]:
import pandas as pd, re, json

MIN_ENSHARE = 0.0   # set to 0.05 if you want to filter very low EN-share

AR = re.compile(r"[\u0600-\u06FF]")
EN = re.compile(r"[A-Za-z]")
def en_share(s):
    toks = re.findall(r"[A-Za-z\u0600-\u06FF]+", s or "")
    if not toks: return 0.0
    en = sum(1 for t in toks if EN.search(t) and not AR.search(t))
    return en/len(toks)

mr = pd.read_csv(FP_MR_CSV)
# try common column names defensively
text_col = "text" if "text" in mr.columns else ("Text" if "Text" in mr.columns else None)
assert text_col is not None, f"Couldn't find a text column in {FP_MR_CSV}"

label_col = "final_label" if "final_label" in mr.columns else None
assert label_col is not None, f"Couldn't find final_label column in {FP_MR_CSV}"

mr = mr[[ "id", text_col, label_col ]].rename(columns={text_col:"text", label_col:"label"}).dropna(subset=["text","label"])
mr["en_share"] = mr["text"].map(en_share)
mr = mr[mr["en_share"] >= MIN_ENSHARE].reset_index(drop=True)

# write JSONL
with open(FP_MR_JSON, "w", encoding="utf-8") as f:
    for _, row in mr.iterrows():
        f.write(json.dumps({"id": int(row["id"]), "text": row["text"], "label": row["label"]}, ensure_ascii=False) + "\n")

len(mr), mr["label"].value_counts()


(1000,
 label
 neu    578
 pos    244
 neg    178
 Name: count, dtype: int64)

In [None]:
import json, random
from collections import Counter
import pandas as pd

random.seed(42)

def read_jsonl(fp):
    with open(fp, encoding="utf-8") as f:
        return [json.loads(l) for l in f]

def to_df(rows):
    return pd.DataFrame(rows)

# --- load sources ---
eesa_tr = read_jsonl(FP_EESA_TR)
eesa_de = read_jsonl(FP_EESA_DE)
eesa_te = read_jsonl(FP_EESA_TE)
amg_all = read_jsonl(FP_AMG)       # should have fields: text, final_label/label
mr_all  = read_jsonl(FP_MR_JSON)

# normalize label key for AMG if needed
for r in amg_all:
    if "label" not in r:
        r["label"] = r.get("final_label")
for r in amg_all:
    r.pop("final_label", None)

# stratified 90/10 split for AMG
from sklearn.model_selection import train_test_split
amg_df = to_df(amg_all)[["text","label"]].dropna()
amg_tr_df, amg_de_df = train_test_split(amg_df, test_size=0.10, random_state=42, stratify=amg_df["label"])

def write_jsonl(df, fp):
    with open(fp, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(json.dumps({"text": row["text"], "label": row["label"]}, ensure_ascii=False) + "\n")

write_jsonl(amg_tr_df, FP_AMG_TR)
write_jsonl(amg_de_df, FP_AMG_DE)

def show_stats(name, rows):
    lab = [r["label"] for r in rows]
    print(f"{name} → {len(rows)} samples")
    print(Counter(lab))
    print("-"*40)

show_stats("EESA train", eesa_tr)
show_stats("EESA dev",   eesa_de)
show_stats("EESA test",  eesa_te)
show_stats("AMG train",  read_jsonl(FP_AMG_TR))
show_stats("AMG dev",    read_jsonl(FP_AMG_DE))
show_stats("MR all (cleaned)", mr_all[:10])  # just to show it's loaded


EESA train → 2463 samples
Counter({'pos': 1092, 'neu': 778, 'neg': 593})
----------------------------------------
EESA dev → 817 samples
Counter({'pos': 363, 'neu': 258, 'neg': 196})
----------------------------------------
EESA test → 817 samples
Counter({'pos': 362, 'neu': 258, 'neg': 197})
----------------------------------------
AMG train → 335 samples
Counter({'neu': 147, 'pos': 95, 'neg': 93})
----------------------------------------
AMG dev → 38 samples
Counter({'neu': 17, 'pos': 11, 'neg': 10})
----------------------------------------
MR all (cleaned) → 10 samples
Counter({'neu': 7, 'neg': 2, 'pos': 1})
----------------------------------------


In [None]:
# re-check FULL MR and all splits (no slicing)
from collections import Counter
import json, pandas as pd

def read_jsonl(fp):
    with open(fp, encoding="utf-8") as f:
        return [json.loads(l) for l in f]

mr_all = read_jsonl(FP_MR_JSON)          # /content/drive/MyDrive/cs-senti/data/mr_cs_final_adjudicated.jsonl
amg_tr = read_jsonl(FP_AMG_TR)
amg_de = read_jsonl(FP_AMG_DE)

print("MR all (cleaned) →", len(mr_all), "samples")
print(Counter([r["label"] for r in mr_all]))
print("-"*40)

# if you already ran the split cell, re-materialize the MR splits it created:
mr_df  = pd.DataFrame(mr_all)[["text","label"]]
from sklearn.model_selection import train_test_split
mr_tr_df, mr_de_df = train_test_split(mr_df, test_size=0.20, random_state=42, stratify=mr_df["label"])

print("MR train →", len(mr_tr_df), Counter(mr_tr_df["label"]))
print("MR dev   →", len(mr_de_df), Counter(mr_de_df["label"]))
print("-"*40)

print("AMG train →", len(amg_tr), Counter([r["label"] for r in amg_tr]))
print("AMG dev   →", len(amg_de), Counter([r["label"] for r in amg_de]))


MR all (cleaned) → 1000 samples
Counter({'neu': 578, 'pos': 244, 'neg': 178})
----------------------------------------
MR train → 800 Counter({'neu': 462, 'pos': 195, 'neg': 143})
MR dev   → 200 Counter({'neu': 116, 'pos': 49, 'neg': 35})
----------------------------------------
AMG train → 335 Counter({'neu': 147, 'pos': 95, 'neg': 93})
AMG dev   → 38 Counter({'neu': 17, 'pos': 11, 'neg': 10})


3) Build mixed train/dev (EESA + AMG + MR) and an optional mixed_test

Train = EESA_train + AMG_train + 80% of MR

Dev = EESA_dev + AMG_dev + 20% of MR

Test (primary) = EESA_test (kept clean for comparability)

Test (optional mixed) = small slices from AMG + MR (+ tiny EESA slice if you want)

In [None]:
from sklearn.model_selection import train_test_split
import json, random
from collections import Counter

# MR split 80/20 stratified
mr_df = to_df(mr_all)[["text","label"]].dropna()
mr_tr_df, mr_de_df = train_test_split(mr_df, test_size=0.20, random_state=42, stratify=mr_df["label"])

# helpers
def append_rows(a, b):
    out = []
    out.extend(a); out.extend(b)
    return out

# load amg splits just written
amg_tr = read_jsonl(FP_AMG_TR)
amg_de = read_jsonl(FP_AMG_DE)

# build mixed train/dev
mix_tr = append_rows(eesa_tr, amg_tr) + [{"text":t,"label":l} for t,l in zip(mr_tr_df["text"], mr_tr_df["label"])]
mix_de = append_rows(eesa_de, amg_de) + [{"text":t,"label":l} for t,l in zip(mr_de_df["text"], mr_de_df["label"])]

# optional small mixed test (10% AMG + 10% MR + 100 EESA)
amg_test_slice = amg_de[:max(1, len(amg_de)//2)]
mr_test_slice  = [{"text":t,"label":l} for t,l in zip(mr_de_df["text"], mr_de_df["label"])]
eesa_test_slice= eesa_te[:100] if len(eesa_te) > 100 else eesa_te
mix_te = amg_test_slice + mr_test_slice + eesa_test_slice

# write
def write_rows(rows, fp):
    with open(fp,"w",encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps({"text": r["text"], "label": r["label"]}, ensure_ascii=False) + "\n")

write_rows(mix_tr, FP_MIX_TR)
write_rows(mix_de, FP_MIX_DE)
write_rows(mix_te, FP_MIX_TE)

print("written:")
print(FP_MIX_TR)
print(FP_MIX_DE)
print(FP_MIX_TE)

# show distributions
def dist(rows):
    return Counter([r["label"] for r in rows])

print("\nTRAIN mix:", len(mix_tr), dist(mix_tr))
print("DEV   mix:", len(mix_de), dist(mix_de))
print("TEST  EESA:", len(eesa_te), dist(eesa_te))
print("TEST  mixed:", len(mix_te), dist(mix_te))


written:
/content/drive/MyDrive/cs-senti/data/eesa_amg_mr_train.jsonl
/content/drive/MyDrive/cs-senti/data/eesa_amg_mr_dev.jsonl
/content/drive/MyDrive/cs-senti/data/mixed_test.jsonl

TRAIN mix: 3598 Counter({'neu': 1387, 'pos': 1382, 'neg': 829})
DEV   mix: 1055 Counter({'pos': 423, 'neu': 391, 'neg': 241})
TEST  EESA: 817 Counter({'pos': 362, 'neu': 258, 'neg': 197})
TEST  mixed: 319 Counter({'neu': 159, 'pos': 100, 'neg': 60})


In [None]:
BASE = "/content/drive/MyDrive/cs-senti"
DATA = f"{BASE}/data"

FP_MIX_TR = f"{DATA}/eesa_amg_mr_train.jsonl"
FP_MIX_DE = f"{DATA}/eesa_amg_mr_dev.jsonl"
FP_MIX_TE = f"{DATA}/mixed_test.jsonl"
FP_EESA_TE = f"{DATA}/eesa_test.jsonl"
import json

def read_jsonl(fp):
    with open(fp, encoding="utf-8") as f:
        return [json.loads(l) for l in f]

def reorder(rows):
    out = []
    for r in rows:
        lab = r["label"]
        if lab in ["positive", "negative", "neutral"]:
            lab = {"positive": "pos", "negative": "neg", "neutral": "neu"}[lab]
        out.append({"text": r["text"], "label": lab})
    return out
tr_rows = reorder(read_jsonl(FP_MIX_TR))
de_rows = reorder(read_jsonl(FP_MIX_DE))
te_eesa = reorder(read_jsonl(FP_EESA_TE))
te_mix  = reorder(read_jsonl(FP_MIX_TE))

print(len(tr_rows), "train")
print(len(de_rows), "dev")
print(len(te_eesa), "eesa test")
print(len(te_mix), "mixed test")


3598 train
1055 dev
817 eesa test
319 mixed test


In [None]:
from transformers import DataCollatorWithPadding

# ... keep the rest of Cell 7 as-is up to where you build DataLoaders

data_collator = DataCollatorWithPadding(tokenizer=tok)  # pads per-batch to longest

BATCH_SIZE = 16
# num_workers=0 is safer in Colab; you can try 2 later if you want
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                      num_workers=0, pin_memory=True, collate_fn=data_collator)
dev_dl   = DataLoader(dev_ds,   batch_size=BATCH_SIZE, shuffle=False,
                      num_workers=0, pin_memory=True, collate_fn=data_collator)
eesa_dl  = DataLoader(eesa_ds,  batch_size=BATCH_SIZE, shuffle=False,
                      num_workers=0, pin_memory=True, collate_fn=data_collator)
mixed_dl = DataLoader(mixed_ds, batch_size=BATCH_SIZE, shuffle=False,
                      num_workers=0, pin_memory=True, collate_fn=data_collator)


In [None]:
# replace the scaler line with:
scaler = torch.amp.GradScaler('cuda') if torch.cuda.is_available() else None

# and wrap the scale/backward/step conditionally
def do_backward(loss):
    if scaler is not None:
        scaler.scale(loss/GRAD_ACCUM).backward()
    else:
        (loss/GRAD_ACCUM).backward()

def do_step():
    if scaler is not None:
        scaler.step(optim); scaler.update()
    else:
        optim.step()

# inside the train loop:
with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
    out = model(**batch)
    loss = criterion(out.logits, batch["labels"])

do_backward(loss)

if step % GRAD_ACCUM == 0:
    do_step()
    optim.zero_grad(set_to_none=True)
    scheduler.step()


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


NameError: name 'batch' is not defined

In [None]:
import os, json, math, random, numpy as np, torch
from pathlib import Path
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, classification_report

BASE = "/content/drive/MyDrive/cs-senti"
DATA = f"{BASE}/data"
MODELS_DIR = f"{BASE}/models"
REGISTRY_CSV = f"{BASE}/runs_sentiment.csv"
os.makedirs(MODELS_DIR, exist_ok=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

# load the JSONL prepared in Cell 6
def read_jsonl(fp):
    with open(fp, encoding="utf-8") as f:
        return [json.loads(l) for l in f]

train_rows = read_jsonl(f"{DATA}/eesa_amg_mr_train.jsonl")
dev_rows   = read_jsonl(f"{DATA}/eesa_amg_mr_dev.jsonl")
test_eesa  = read_jsonl(f"{DATA}/eesa_test.jsonl")
test_mixed = read_jsonl(f"{DATA}/mixed_test.jsonl")

label2id = {"pos":0, "neg":1, "neu":2}
id2label = {v:k for k,v in label2id.items()}

class TxtDS(Dataset):
    def __init__(self, rows, tok, max_len=160):
        self.rows = rows; self.tok = tok; self.max_len = max_len
    def __len__(self): return len(self.rows)
    def __getitem__(self, i):
        r = self.rows[i]
        enc = self.tok(
            r["text"],
            truncation=True, padding=False, max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k:v.squeeze(0) for k,v in enc.items()}
        item["labels"] = torch.tensor(label2id[r["label"]], dtype=torch.long)
        return item

MODEL_NAME = "xlm-roberta-base"
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

BATCH_SIZE = 16
train_ds = TxtDS(train_rows, tok)
dev_ds   = TxtDS(dev_rows, tok)
eesa_ds  = TxtDS(test_eesa, tok)
mixed_ds = TxtDS(test_mixed, tok)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True, collate_fn=None)
dev_dl   = DataLoader(dev_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
eesa_dl  = DataLoader(eesa_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
mixed_dl = DataLoader(mixed_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

print("Train/Dev/Test sizes:",
      len(train_ds), len(dev_ds), len(eesa_ds), len(mixed_ds))

# class weights (optional; helps with imbalance)
from torch import nn
train_counts = Counter(r["label"] for r in train_rows)
weights = []
for lab in ["pos","neg","neu"]:
    weights.append(len(train_rows)/max(1, train_counts[lab]))
cls_weights = torch.tensor(weights, dtype=torch.float)
print("Class weights (pos,neg,neu):", cls_weights.tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Train/Dev/Test sizes: 3598 1055 817 319
Class weights (pos,neg,neu): [2.60347318649292, 4.3401689529418945, 2.594088077545166]


In [None]:
from torch import nn
from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
).to(device)

EPOCHS = 3
LR = 2e-5
WARMUP_RATIO = 0.06
GRAD_ACCUM = 1
USE_CLASS_WEIGHTS = True

optim = torch.optim.AdamW(model.parameters(), lr=LR)
num_update_steps_per_epoch = math.ceil(len(train_dl)/GRAD_ACCUM)
t_total = EPOCHS * num_update_steps_per_epoch
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=int(WARMUP_RATIO*t_total),
    num_training_steps=t_total
)

criterion = nn.CrossEntropyLoss(weight=cls_weights.to(device)) if USE_CLASS_WEIGHTS else nn.CrossEntropyLoss()

scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def evaluate(dl):
    model.eval()
    all_preds, all_golds = [], []
    with torch.no_grad():
        for batch in dl:
            batch = {k:v.to(device) for k,v in batch.items()}
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                out = model(**batch)
            logits = out.logits
            preds = logits.argmax(dim=-1).detach().cpu().numpy().tolist()
            golds = batch["labels"].detach().cpu().numpy().tolist()
            all_preds.extend(preds); all_golds.extend(golds)
    macro_f1 = f1_score(all_golds, all_preds, average="macro")
    return macro_f1, all_preds, all_golds

best_f1 = -1.0
best_path = f"{MODELS_DIR}/xlmr_sentiment_eesa_amg_mr"
history = []

for epoch in range(1, EPOCHS+1):
    model.train()
    running = 0.0
    optim.zero_grad(set_to_none=True)
    pbar = tqdm(train_dl, desc=f"Epoch {epoch}")
    for step, batch in enumerate(pbar, 1):
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            out = model(**batch)
            loss = criterion(out.logits, batch["labels"])  # override internal loss to inject weights
        scaler.scale(loss/GRAD_ACCUM).backward()
        running += loss.item()

        if step % GRAD_ACCUM == 0:
            scaler.step(optim); scaler.update()
            optim.zero_grad(set_to_none=True)
            scheduler.step()

        if step % 50 == 0:
            pbar.set_postfix(loss=f"{running/step:.4f}")

    # dev eval
    dev_f1, _, _ = evaluate(dev_dl)
    history.append((epoch, dev_f1))
    print(f"Epoch {epoch} → dev macro-F1 = {dev_f1:.4f}")

    if dev_f1 > best_f1:
        best_f1 = dev_f1
        model.save_pretrained(best_path)
        tok.save_pretrained(best_path)
        print(f"✅ saved best to {best_path}")

print("History:", history)
print("Best dev macro-F1:", best_f1)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


Epoch 1:   0%|          | 0/225 [00:00<?, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/collate.py", line 398, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/collate.py", line 172, in collate
    key: collate(
         ^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/collate.py", line 155, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/_utils/collate.py", line 272, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: stack expects each tensor to be equal size, but got [11] at entry 0 and [38] at entry 1


In [None]:
!pip -q install -U transformers datasets evaluate scikit-learn

import os, json, random, numpy as np
from pathlib import Path

BASE = "/content/drive/MyDrive/cs-senti"
DATA = f"{BASE}/data"
MODEL_DIR = f"{BASE}/models/xlmr_sentiment_eesa_amg_mr"  # where to save

FP_MIX_TR = f"{DATA}/eesa_amg_mr_train.jsonl"
FP_MIX_DE = f"{DATA}/eesa_amg_mr_dev.jsonl"
FP_EESA_TE = f"{DATA}/eesa_test.jsonl"
FP_MIX_TE  = f"{DATA}/mixed_test.jsonl"

SEED = 42
random.seed(SEED); np.random.seed(SEED)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/511.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m