In [None]:
!pip install transformers sentence-transformers datasets accelerate -q


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
from pathlib import Path
import random, json
from tqdm import tqdm


In [None]:
!mkdir -p "/content/drive/MyDrive/cs-senti/labse"
!git lfs install
!git clone https://huggingface.co/sentence-transformers/LaBSE "/content/drive/MyDrive/cs-senti/labse"


In [None]:
import json, os
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/annotated_with_id")

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location (str or path-like) of a UTF-8 encoded file that holds
            one JSON document per line.

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

eesa_train_annotated = load_jsonl(BASE/"eesa_train_annotated.jsonl")
eesa_dev_annotated   = load_jsonl(BASE/"eesa_dev_annotated.jsonl")
eesa_test_annotated  = load_jsonl(BASE/"eesa_test_annotated.jsonl")

mr_cs_labeled  = load_jsonl(BASE/"mr_cs_labeled_annotated.jsonl")
amg_cs_labeled = load_jsonl(BASE/"amg_cs_labeled_annotated.jsonl")
amg_ar_mono    = load_jsonl(BASE/"amg_ar_mono_annotated.jsonl")


In [None]:
import json

LEXICON_PATH = "/content/drive/MyDrive/cs-senti/data/ling/ar_en_lexicon_expanded_with_synonyms.jsonl"

def load_lexicon(path):
    """Build an Arabic -> English lookup from a JSON Lines lexicon file.

    Each line must be a JSON object with an "ar" key and an "en" key, and may
    carry a "syn" list. The value stored for an Arabic key is the "en" entry
    followed by the "syn" entries. When the same "ar" key appears on several
    lines, the last line wins.

    Args:
        path: Location of the UTF-8 encoded lexicon file.

    Returns:
        dict: Maps each "ar" value to a list ``[en, *syn]``.
    """
    lexicon = {}
    with open(path, "r", encoding="utf-8") as handle:
        # map() is lazy, so lines are still parsed one at a time.
        for entry in map(json.loads, handle):
            primary = entry["en"]
            extras = entry.get("syn", [])
            lexicon[entry["ar"]] = [primary] + extras   # list
    return lexicon

LEXICON = load_lexicon(LEXICON_PATH)
print("Loaded lexicon entries:", len(LEXICON))


# **New lexicon** (merged file — this cell redefines `load_lexicon` and replaces `LEXICON` from the cell above)

In [None]:
import json

LEXICON_PATH = "/content/drive/MyDrive/cs-senti/repo/data/ar_en_lexicon_MERGED.jsonl"

def load_lexicon(path):
    """Build an Arabic -> English lookup from the merged JSON Lines lexicon.

    Every line must be a JSON object with an "ar" key and an "en" key; "syn"
    is optional. Both "en" and "syn" may be a single string or a list. The
    stored value is the "en" entries followed by the "syn" entries, with
    duplicates removed while keeping the first occurrence. When an "ar" key
    repeats, the last line wins.

    Args:
        path: Location of the UTF-8 encoded lexicon file.

    Returns:
        dict: Maps each "ar" value to an ordered, de-duplicated list.
    """

    def _listify(value):
        # A bare string is treated as a one-element list; anything else is
        # passed through untouched.
        return [value] if isinstance(value, str) else value

    lexicon = {}
    with open(path, "r", encoding="utf-8") as handle:
        for entry in map(json.loads, handle):
            arabic = entry["ar"]
            english = _listify(entry["en"])
            synonyms = _listify(entry.get("syn", []))

            # dict.fromkeys keeps insertion order, so this is an
            # order-preserving de-duplication.
            lexicon[arabic] = list(dict.fromkeys(english + synonyms))

    return lexicon

LEXICON = load_lexicon(LEXICON_PATH)
print("Loaded lexicon entries:", len(LEXICON))


In [None]:
import re

def is_english_word(w):
    """Return True when `w` consists solely of one or more ASCII letters."""
    match = re.fullmatch(r"[A-Za-z]+", w)
    return match is not None

def generate_candidates(tokens, mask_positions, lexicon, lang_tags, pos_tags, has_al, top_k=5):
    """Propose replacement strings for each masked token position.

    For every position two sources are tried, in this order:

    1. Lexicon: if the token is a key of `lexicon`, the FIRST entry of its
       list is used. It is prefixed with "ال " when `has_al[pos]` is truthy
       and the entry is a plain ASCII word.
    2. Fallback: if the token itself is a plain ASCII word, its lower-cased
       form is added (prefixed with "ال " when `has_al[pos]` is truthy),
       unless it is already present.

    Args:
        tokens: Sequence of token strings.
        mask_positions: Iterable of indices into `tokens`.
        lexicon: Mapping from token to a list of replacements.
        lang_tags: Unused.
        pos_tags: Unused.
        has_al: Sequence of flags, indexed like `tokens`.
        top_k: Maximum number of candidates kept per position.

    Returns:
        dict: position -> list of candidate strings (possibly empty).
    """

    def _with_article(word, attach):
        return "ال " + word if attach else word

    proposals = {}

    for position in mask_positions:
        token = tokens[position]
        needs_article = has_al[position]
        options = []

        # 1. Lexicon (only the first entry is ever used)
        if token in lexicon:
            primary = lexicon[token][0]
            options.append(
                _with_article(primary, needs_article and is_english_word(primary))
            )

        # 2. Fallback
        if is_english_word(token):
            fallback = _with_article(token.lower(), needs_article)
            if fallback not in options:
                options.append(fallback)

        proposals[position] = options[:top_k] if options else []

    return proposals


In [None]:
def get_cand_embeddings(cand_list):
    """
    Encode every candidate string with the module-level `similarity_model`
    and return the resulting tensor moved to the "cuda" device.

    The notebook loads LaBSE as `similarity_model`, so the result is expected
    to be of shape [N, 768] — TODO confirm against the loaded model.
    """
    encoded = similarity_model.encode(cand_list, convert_to_tensor=True)
    on_gpu = encoded.to("cuda")
    return on_gpu


In [None]:
import torch
import torch.nn as nn
import random

class Generator(nn.Module):
    """Two-layer MLP that gives each candidate embedding one unnormalised score.

    Args:
        hidden: Width of the hidden layer.
        input_dim: Size of each input embedding. Defaults to 768, the
            embedding width this notebook feeds the generator, so existing
            `Generator()` / `Generator(hidden=...)` calls are unaffected.
    """

    def __init__(self, hidden=256, input_dim=768):
        super().__init__()
        # Attribute names are part of the state_dict layout; keep them stable.
        self.linear1 = nn.Linear(input_dim, hidden)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(hidden, 1)

    def forward(self, x):
        """Map embeddings of shape [..., input_dim] to scores of shape [..., 1]."""
        h = self.activation(self.linear1(x))
        return self.linear2(h)


generator = Generator().to("cuda")
optimizer = torch.optim.Adam(generator.parameters(), lr=1e-4)

print("Generator ready ✓")


In [None]:
!pip uninstall -y transformers tokenizers huggingface_hub sentence-transformers
!pip install --upgrade transformers==4.40.2
!pip install sentence-transformers==2.6.1
!pip install huggingface_hub==0.22.2


In [None]:
import transformers, sentence_transformers, huggingface_hub

print("transformers:", transformers.__version__)
print("sentence-transformers:", sentence_transformers.__version__)
print("huggingface_hub:", huggingface_hub.__version__)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

reward_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/cs-senti/baseline_marbert_v1/checkpoint-1920"
)
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/cs-senti/baseline_marbert_v1/checkpoint-1920"
).to("cuda")
reward_model.eval()


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

disc_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

disc_model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    num_labels=2    # REAL = 1, FAKE = 0
).to("cuda")

disc_optimizer = torch.optim.Adam(disc_model.parameters(), lr=2e-5)

print("Discriminator ready ✓")


In [None]:
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# Correct LaBSE load
similarity_model = SentenceTransformer("sentence-transformers/LaBSE")

def reward_similarity(orig, gen):
    """Cosine similarity between the `similarity_model` embeddings of two texts.

    Both arguments are encoded separately (first `orig`, then `gen`) and the
    cosine is taken along dimension 0, so each is expected to be a single
    string that encodes to a 1-D vector.

    Returns:
        float: The cosine similarity.
    """
    vec_orig, vec_gen = (
        similarity_model.encode(text, convert_to_tensor=True) for text in (orig, gen)
    )
    score = F.cosine_similarity(vec_orig, vec_gen, dim=0)
    return float(score.item())

def semantic_similarity(a, b):
    """Return the cosine similarity (as a float) of the embeddings of `a` and `b`.

    Each text is encoded on its own with the module-level `similarity_model`;
    the cosine is computed along dimension 0 of the two resulting vectors.
    """
    embed = lambda text: similarity_model.encode(text, convert_to_tensor=True)
    cosine = F.cosine_similarity(embed(a), embed(b), dim=0)
    return float(cosine.item())


In [None]:
def train_discriminator(real_text, fake_text):
    """Run one optimisation step of the discriminator on a real/fake text pair.

    Uses the module-level `disc_tokenizer`, `disc_model` and `disc_optimizer`.
    The real text is scored against class 1 and the fake text against class 0;
    the two cross-entropy losses are summed and back-propagated together.

    Args:
        real_text: Text labelled as real (target 1).
        fake_text: Text labelled as fake (target 0).

    Returns:
        float: The summed loss of this step.
    """
    disc_optimizer.zero_grad()

    # (text, target class) in the order REAL, FAKE
    partial_losses = []
    for text, target in ((real_text, 1), (fake_text, 0)):
        encoded = disc_tokenizer(text, return_tensors="pt", truncation=True).to("cuda")
        scores = disc_model(**encoded).logits
        expected = torch.tensor([target]).to("cuda")
        partial_losses.append(torch.nn.functional.cross_entropy(scores, expected))

    total = partial_losses[0] + partial_losses[1]
    total.backward()
    disc_optimizer.step()

    return float(total.item())


In [None]:
def reinforce_step_combined(
    example,
    candidates_dict,
    generator,
    optimizer,
    disc_model,
    disc_tokenizer,
    reward_model,
    reward_tokenizer,
    sim_model,
    alpha=0.5,
    beta=0.4,
    gamma=0.1,
):
    """
    One REINFORCE update of the generator for a single example.

    For every position that offers at least two candidates, one candidate is
    sampled from the generator's softmax distribution, substituted into a copy
    of the original tokens, and the resulting sentence is scored with:

    - Discriminator reward  (alpha): probability of class 1 ("real")
    - Sentiment reward      (beta):  reward-model logit of the example's label
    - Semantic similarity   (gamma): cosine similarity to example["original"],
                                     clipped below at 0

    Positions with zero or one candidate give the generator no choice, so
    they contribute no gradient and are skipped before any model is run.

    Args:
        example: Dict with "tokens", "original" and a "label" in
            {"neg", "neu", "pos"}.
        candidates_dict: Mapping position -> list of candidate strings.
        generator: Module scoring candidate embeddings.
        optimizer: Optimizer over the generator's parameters.
        disc_model, disc_tokenizer: Real/fake discriminator and its tokenizer.
        reward_model, reward_tokenizer: Sentiment classifier and its tokenizer.
        sim_model: Currently unused; similarity goes through the module-level
            `reward_similarity` helper.
        alpha, beta, gamma: Weights of the three reward terms.

    Returns:
        float | None: The policy-gradient loss, or None when the example has
        no label or no position offered a choice.
    """

    tokens = example["tokens"]
    sent_label = example.get("label")

    # Skip unlabeled samples
    if sent_label is None:
        return None

    idx_map = {"neg": 0, "neu": 1, "pos": 2}

    log_probs = []
    rewards = []

    for pos, cand_list in candidates_dict.items():

        # With fewer than two candidates there is nothing to choose, hence no
        # policy gradient; skip before paying for any model forward pass.
        if len(cand_list) < 2:
            continue

        # ============================
        # 1. Sample a candidate from the generator's policy
        # ============================
        cand_vecs = get_cand_embeddings(cand_list).clone().detach().requires_grad_(True)

        logits = generator(cand_vecs).squeeze(-1)

        # Fix NaNs
        if torch.isnan(logits).any():
            logits = torch.zeros_like(logits)

        probs = torch.softmax(logits, dim=0)

        # Fix zero-prob or NaNs
        if probs.sum() == 0 or torch.isnan(probs).any():
            probs = torch.ones_like(probs) / len(probs)

        idx = torch.multinomial(probs, 1).item()
        chosen = cand_list[idx]

        # Small epsilon keeps log() finite if a probability underflows to 0.
        log_probs.append(torch.log(probs[idx] + 1e-12))

        # ============================
        # 2. Build the switched sentence (only this position is replaced)
        # ============================
        replaced = tokens.copy()
        replaced[pos] = chosen
        new_text = " ".join(replaced)

        # The critics only supply scalar rewards, so no autograd graph is
        # needed for them.
        with torch.no_grad():
            # ============================
            # 3. DISCRIMINATOR reward (D)
            # ============================
            disc_inputs = disc_tokenizer(
                new_text, return_tensors="pt", truncation=True
            ).to("cuda")

            disc_probs = disc_model(**disc_inputs).logits.softmax(-1)
            D_reward = float(disc_probs[0][1].item())  # probability it is "real"

            # ============================
            # 4. SENTIMENT reward (S)
            # ============================
            sent_inputs = reward_tokenizer(
                new_text, return_tensors="pt", truncation=True
            ).to("cuda")

            sent_logits = reward_model(**sent_inputs).logits
            # NOTE(review): this is a raw logit, not a probability, so its
            # scale differs from D_reward and Sim_reward — confirm intended.
            S_reward = float(sent_logits[0][idx_map[sent_label]].item())

        # ============================
        # 5. SEMANTIC SIMILARITY reward (Sim)
        # ============================
        Sim_reward = max(0, reward_similarity(example["original"], new_text))

        # ============================
        # 6. Combined reward
        # ============================
        rewards.append(alpha * D_reward + beta * S_reward + gamma * Sim_reward)

    # ============================
    # 7. Policy gradient optimization
    # ============================
    if len(log_probs) > 0:  # only when the generator actually made a choice
        log_probs = torch.stack(log_probs)
        rewards = torch.tensor(rewards).to(log_probs.device)

        loss = -(log_probs * rewards).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return float(loss.item())

    return None

In [None]:
def gan_generate_text(example, generator, reward_tokenizer, reward_model):
    """
    Greedy generation of the switched sentence for one example.

    Candidates for every masked position come from `generate_candidates`
    (using the module-level `LEXICON`). Where a position has several
    candidates, the one with the highest generator score replaces the token;
    a lone candidate is used directly without running any model; positions
    without candidates keep their token.

    Args:
        example: Dict with "tokens", "mask_positions", "original", "lang",
            "pos" and "has_al".
        generator: Module scoring candidate embeddings; only called for
            positions that have more than one candidate.
        reward_tokenizer: Unused; kept so existing callers keep working.
        reward_model: Unused; kept so existing callers keep working.

    Returns:
        str: example["original"] when there are no mask positions, otherwise
        the space-joined tokens after substitution.
    """
    tokens = example["tokens"].copy()
    mask_positions = example["mask_positions"]

    if len(mask_positions) == 0:
        return example["original"]

    candidates_dict = generate_candidates(
        tokens=tokens,
        mask_positions=mask_positions,
        lexicon=LEXICON,
        lang_tags=example["lang"],
        pos_tags=example["pos"],
        has_al=example["has_al"]
    )

    for pos, cand_list in candidates_dict.items():

        # skip empty candidate list
        if len(cand_list) == 0:
            continue

        # A single candidate is always the arg-max, so there is no need to
        # embed it or run the generator.
        if len(cand_list) == 1:
            tokens[pos] = cand_list[0]
            continue

        # Same embedding helper the REINFORCE step uses.
        cand_vecs = get_cand_embeddings(cand_list)

        with torch.no_grad():
            logits = generator(cand_vecs).squeeze(-1)

        # softmax is monotonic, so the arg-max of the logits is the arg-max
        # of the probabilities.
        best_idx = torch.argmax(logits).item()
        tokens[pos] = cand_list[best_idx]

    generated_text = " ".join(tokens).replace("  ", " ").strip()
    return generated_text


## Training

In [None]:
import json, os
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/annotated_with_id")

def load_jsonl(path):
    """Parse a JSON Lines file into a list of records.

    Args:
        path: Location (str or path-like) of a UTF-8 encoded file with one
            JSON document per line.

    Returns:
        list: Parsed JSON values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Whitespace-only lines hold no record; json.loads would raise
            # on them, so they are skipped.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# --------- MAIN DATASETS ----------
eesa_train_annotated = load_jsonl(BASE/"eesa_train_annotated.jsonl")
mr_cs_labeled        = load_jsonl(BASE/"mr_cs_labeled_annotated.jsonl")
amg_cs_labeled       = load_jsonl(BASE/"amg_cs_labeled_annotated.jsonl")
#amg_ar_mono          = load_jsonl(BASE/"amg_ar_mono_annotated.jsonl")

print("EESA:", len(eesa_train_annotated))
print("MR-CS:", len(mr_cs_labeled))
print("AMG-CS:", len(amg_cs_labeled))
#print("AMG-MONO:", len(amg_ar_mono))


In [None]:
import json
from tqdm import tqdm

# =====================================================
# 1. Load the original AMG-AR-MONO file
# =====================================================

AMG_MONO_PATH = "/content/drive/MyDrive/cs-senti/repo/data/annotated_with_id/amg_ar_mono_annotated.jsonl"

amg_mono = []
with open(AMG_MONO_PATH, "r", encoding="utf-8") as f:
    for line in f:
        amg_mono.append(json.loads(line))

print("Loaded AMG-AR-MONO samples:", len(amg_mono))

# =====================================================
# 2. Define function to compute sentiment label
#    USING YOUR EXISTING SENTIMENT MODEL
# =====================================================

def predict_sentiment(text):
    """Predict a sentiment tag for `text` with the module-level reward model.

    Args:
        text: The sentence to classify.

    Returns:
        str: "neg", "neu" or "pos", chosen by the arg-max class index.
        NOTE(review): assumes the checkpoint's class order is
        neg=0, neu=1, pos=2 — confirm against the model config.
    """
    inputs = reward_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to("cuda")

    # Pure inference: no_grad avoids building an autograd graph for every
    # one of the auto-labelled samples.
    with torch.no_grad():
        logits = reward_model(**inputs).logits

    # softmax is monotonic, so arg-max over logits gives the same class.
    label_index = logits.argmax(-1).item()

    return ["neg", "neu", "pos"][label_index]

# =====================================================
# 3. Auto-label entire AMG-AR-MONO dataset
# =====================================================

amg_mono_labeled = []

for ex in tqdm(amg_mono, desc="Auto-labeling AMG-AR-MONO"):
    text = ex["original"]

    # predict sentiment using your reward model
    auto_label = predict_sentiment(text)

    # attach new label
    ex["label"] = auto_label

    amg_mono_labeled.append(ex)

print("Done. Example labeled sample:")
print(amg_mono_labeled[0])

# =====================================================
# 4. Save NEW labeled version
# =====================================================

SAVE_PATH = "/content/drive/MyDrive/cs-senti/repo/data/annotated_with_id/amg_ar_mono_auto_labeled.jsonl"

with open(SAVE_PATH, "w", encoding="utf-8") as f:
    for ex in amg_mono_labeled:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")

print("Saved auto-labeled AMG-MONO to:")
print(SAVE_PATH)


In [None]:
amg_ar_mono = load_jsonl(BASE/"amg_ar_mono_auto_labeled.jsonl")


In [None]:
import random

# ---- create reproducible subset ----
random.seed(42)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the dataset size.
SUBSET_SIZE = 2000
amg_ar_subset = random.sample(amg_ar_mono, min(SUBSET_SIZE, len(amg_ar_mono)))

print("Subset size:", len(amg_ar_subset))
print("Original size:", len(amg_ar_mono))


In [None]:
from tqdm import tqdm

def train_gan(
    dataset,
    epochs,
    batch_size,
    generator,
    optimizer,
    disc_model,
    disc_tokenizer,
    reward_model,
    reward_tokenizer,
    sim_model,
    alpha=0.5,
    beta=0.4,
    gamma=0.1
):
    """
    Alternating generator / discriminator training loop.

    For every labelled example, in shuffled order:
      1. candidates are generated for its mask positions,
      2. the generator takes one REINFORCE step (`reinforce_step_combined`),
      3. the discriminator takes one step on (original = real, greedy
         generator output = fake), using the module-level `disc_optimizer`.

    Updates happen per example; `batch_size` only sets how the dataset is
    sliced for the progress bar.

    The discriminator step is skipped when the generated text equals the
    original, because that would train it on one sentence labelled both real
    and fake.

    Args:
        dataset: List of example dicts ("tokens", "mask_positions", "lang",
            "pos", "has_al", "original", optional "label"). Not modified.
        epochs: Number of passes over the dataset.
        batch_size: Slice size used when iterating.
        generator, optimizer: Generator module and its optimizer.
        disc_model, disc_tokenizer: Discriminator and its tokenizer.
        reward_model, reward_tokenizer: Sentiment critic and its tokenizer.
        sim_model: Passed through to `reinforce_step_combined`.
        alpha, beta, gamma: Reward weights passed to `reinforce_step_combined`.

    Returns:
        None. Average losses are printed at the end of every epoch.
    """

    for ep in range(epochs):
        print(f"\n===== EPOCH {ep+1}/{epochs} =====")

        # Shuffle a private copy: shuffling `dataset` itself would silently
        # reorder the caller's list, which is reused after training.
        epoch_data = list(dataset)
        random.shuffle(epoch_data)

        gen_losses = []
        disc_losses = []

        # ==== Iterate over dataset in batches ====
        for i in tqdm(range(0, len(epoch_data), batch_size)):
            batch = epoch_data[i:i+batch_size]

            for ex in batch:

                # ---------- Skip unlabeled samples ----------
                if ex.get("label") is None:
                    continue

                tokens = ex["tokens"]
                mask_positions = ex["mask_positions"]

                # ---------- Candidate generation ----------
                candidates = generate_candidates(
                    tokens=tokens,
                    mask_positions=mask_positions,
                    lexicon=LEXICON,
                    lang_tags=ex["lang"],
                    pos_tags=ex["pos"],
                    has_al=ex["has_al"]
                )

                # ---------- GENERATOR UPDATE (RL) ----------
                loss_g = reinforce_step_combined(
                    ex,
                    candidates,
                    generator,
                    optimizer,
                    disc_model,
                    disc_tokenizer,
                    reward_model,
                    reward_tokenizer,
                    sim_model,
                    alpha,
                    beta,
                    gamma
                )

                if loss_g is not None:
                    gen_losses.append(loss_g)

                # ---------- DISCRIMINATOR UPDATE ----------
                fake_text = gan_generate_text(ex, generator, reward_tokenizer, reward_model)
                real_text = ex["original"]

                # Nothing was switched: the "fake" is the real sentence, so
                # a real-vs-fake update would carry contradictory targets.
                if fake_text.strip() == real_text.strip():
                    continue

                disc_optimizer.zero_grad()

                # Real example: target = 1
                real_inp = disc_tokenizer(real_text, return_tensors="pt", truncation=True).to("cuda")
                real_logits = disc_model(**real_inp).logits
                real_loss = torch.nn.functional.cross_entropy(
                    real_logits,
                    torch.tensor([1]).to("cuda")
                )

                # Fake example: target = 0
                fake_inp = disc_tokenizer(fake_text, return_tensors="pt", truncation=True).to("cuda")
                fake_logits = disc_model(**fake_inp).logits
                fake_loss = torch.nn.functional.cross_entropy(
                    fake_logits,
                    torch.tensor([0]).to("cuda")
                )

                loss_d = real_loss + fake_loss
                loss_d.backward()
                disc_optimizer.step()

                disc_losses.append(float(loss_d.item()))

        # ===== END OF EPOCH: PRINT LOSSES =====
        if gen_losses:
            print("GEN avg loss:", sum(gen_losses)/len(gen_losses))
        else:
            print("GEN avg loss: no updates")

        if disc_losses:
            print("DISC avg loss:", sum(disc_losses)/len(disc_losses))
        else:
            print("DISC avg loss: no updates")


# **new training V2**

In [None]:
SAVE_DIR = "/content/drive/MyDrive/cs-senti/gan_synthetic2"
os.makedirs(SAVE_DIR, exist_ok=True)


In [None]:
#lolll
print("====== TRAINING ON EESA ======")

train_gan(
    eesa_train_annotated,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
# NOTE: generate_all_samples is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be executed before this one.
eesa_gan_samples2 = generate_all_samples(
    eesa_train_annotated,
    generator,
    reward_tokenizer,
    reward_model
)

# Persist the generated rows as JSON Lines (one object per line), keeping
# non-ASCII characters unescaped.
with open(f"{SAVE_DIR}/eesa_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in eesa_gan_samples2:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved eesa-CS GAN synthetic data.")


In [None]:
# NOTE: align_domain and det_rows are defined in later cells of this
# notebook; on a fresh kernel those cells have to be executed first.
# This cell works on the EESA samples, so the banner and title say EESA.
print("====== ALIGN EESA ======")
align_domain(det_rows, eesa_gan_samples2, domain="eesa", limit=20)
def show_examples(samples, title="Examples"):
    """
    Print every generated GAN sample of a domain (no limit).

    Args:
        samples: Iterable of dicts with "original", "generated" and "label";
            "id" is optional and shown as N/A when absent.
        title: Text placed in the banner printed before the samples.
    """
    print(f"\n========== {title} ==========\n")

    for ex in samples:
        print(f"ID        : {ex.get('id', 'N/A')}")
        print(f"ORIGINAL  : {ex['original']}")
        print(f"GAN       : {ex['generated']}")
        print(f"LABEL     : {ex['label']}")
        print("-" * 70)

show_examples(eesa_gan_samples2,  "EESA Synthetic GAN Data")

In [None]:
#lolll
print("====== TRAINING ON AMG-CS ======")

train_gan(
    amg_cs_labeled,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
def generate_all_samples(dataset, generator, reward_tokenizer, reward_model):
    """
    Run `gan_generate_text` over every example and collect the results.

    Args:
        dataset: Iterable of example dicts; each needs "id", "original" and
            "label" in addition to whatever `gan_generate_text` reads.
        generator, reward_tokenizer, reward_model: Forwarded unchanged to
            `gan_generate_text`.

    Returns:
        list[dict]: One dict per example with the keys "id", "original",
        "generated" and "label", in dataset order.
    """

    def _to_record(sample):
        # Generate first, then read the bookkeeping fields.
        synthetic = gan_generate_text(sample, generator, reward_tokenizer, reward_model)
        return {
            "id": sample["id"],
            "original": sample["original"],
            "generated": synthetic,
            "label": sample["label"]
        }

    return [_to_record(sample) for sample in dataset]


In [None]:
amg_gan_samples2 = generate_all_samples(
    amg_cs_labeled,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/amg_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in amg_gan_samples2:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved AMG-CS GAN synthetic data.")


In [None]:
DET_PATH = "/content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl"

# Load the deterministic rows, one JSON object per line. Loading stays
# best-effort (bad lines are skipped), but only JSON parse errors are
# tolerated and they are counted, so a corrupted file is visible instead of
# silently yielding a short list.
det_rows = []
skipped_lines = 0
with open(DET_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        try:
            det_rows.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1

print("Loaded deterministic:", len(det_rows))
if skipped_lines:
    print("Skipped malformed lines:", skipped_lines)


In [None]:
def align_domain(det_rows, gan_samples, domain=None, limit=20):
    """Pair each GAN sample with the deterministic switch of the same original text.

    Args:
        det_rows: Iterable of dicts with "orig_text" and "switched_text";
            only rows whose "domain" equals `domain` are used, unless
            `domain` is None.
        gan_samples: Iterable of dicts with "id", "original", "generated"
            and "label". Samples sharing the same "original" collapse to the
            last one.
        domain: Optional domain filter applied to `det_rows`.
        limit: How many aligned rows to print.

    Returns:
        list[dict]: Rows with "id", "original", "deterministic" (None when no
        deterministic row matches), "gan" and "label".
    """
    # original text -> deterministic switched text (later rows overwrite)
    switched_by_text = {}
    for det in det_rows:
        if domain is None or det.get("domain") == domain:
            source_text = det["orig_text"]
            switched_by_text[source_text] = det["switched_text"]

    # original text -> GAN sample (later samples overwrite)
    sample_by_text = {}
    for sample in gan_samples:
        sample_by_text[sample["original"]] = sample

    aligned = [
        {
            "id": sample["id"],
            "original": text,
            "deterministic": switched_by_text.get(text),
            "gan": sample["generated"],
            "label": sample["label"]
        }
        for text, sample in sample_by_text.items()
    ]

    # print first N
    divider = "-" * 70
    for entry in aligned[:limit]:
        print("ID:", entry["id"])
        print("ORIGINAL:", entry["original"])
        print("DETERMINISTIC:", entry["deterministic"])
        print("GAN:", entry["gan"])
        print(divider)

    return aligned



In [None]:
print("====== ALIGN AMG-CS ======")
align_domain(det_rows, amg_gan_samples2, domain="amg", limit=20)
def show_examples(samples, title="Examples"):
    """
    Print all generated GAN samples, preceded by a banner containing `title`.

    Each sample must provide "original", "generated" and "label"; "id" is
    optional and rendered as N/A when missing. There is no cap on how many
    samples are printed.
    """
    print(f"\n========== {title} ==========\n")

    divider = "-" * 70
    for sample in samples:
        sample_id = sample.get('id', 'N/A')
        print(f"ID        : {sample_id}")
        print(f"ORIGINAL  : {sample['original']}")
        print(f"GAN       : {sample['generated']}")
        print(f"LABEL     : {sample['label']}")
        print(divider)

show_examples(amg_gan_samples2,  "AMG Synthetic GAN Data")

In [None]:
# NOTE(review): evaluate_domain is not defined in any cell visible in this
# notebook — presumably it comes from elsewhere; confirm it is in scope
# before running this cell.
evaluate_domain("AMG-CS", amg_gan_samples2,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)

In [None]:
print("====== TRAINING ON MR-CS ======")

train_gan(
    mr_cs_labeled,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:

mr_gan_samples2 = generate_all_samples(
    mr_cs_labeled,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/mr_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in mr_gan_samples2:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved MR-CS GAN synthetic data.")


In [None]:
# These are the MR-CS samples, so the report is labelled MR-CS.
evaluate_domain("MR-CS", mr_gan_samples2,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)

In [None]:
import pandas as pd
import re

def normalize(t):
    """Light text clean-up used before comparing original and generated text.

    Steps, in order: strip outer whitespace, collapse whitespace runs to one
    space, delete tatweel characters, remove a trailing run of characters
    that are neither word characters, whitespace, nor in U+0600-U+06FF.

    Args:
        t: A string, or None.

    Returns:
        str: The cleaned text; "" when `t` is None.
    """
    if t is None:
        return ""

    # (pattern, replacement) pairs, applied in sequence
    substitutions = (
        (r"\s+", " "),                       # collapse spaces
        (r"[ـ]+", ""),                       # remove tatweel or filler chars
        (r"[^\w\s\u0600-\u06FF]+$", ""),     # strip trailing punctuation
    )

    cleaned = t.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def load_gan(path):
    """Load a GAN-samples JSON Lines file into a DataFrame.

    Every line is parsed as a JSON object. "original" and "generated" are
    passed through `normalize` (a missing key is treated as ""), and
    "generated" is stored under the column name "text".

    Args:
        path: Location of the UTF-8 encoded JSON Lines file.

    Returns:
        pandas.DataFrame: Columns "id", "original", "text", "label".
    """

    def _to_row(record):
        return {
            "id": record.get("id"),
            "original": normalize(record.get("original", "")),
            "text": normalize(record.get("generated", "")),
            "label": record.get("label"),
        }

    with open(path, "r", encoding="utf-8") as handle:
        table = [_to_row(json.loads(raw)) for raw in handle]
    return pd.DataFrame(table)


eesa_gan = load_gan("/content/drive/MyDrive/cs-senti/gan_synthetic2/eesa_gan_samples.jsonl")
amg_gan  = load_gan("/content/drive/MyDrive/cs-senti/gan_synthetic2/amg_gan_samples.jsonl")
mr_gan   = load_gan("/content/drive/MyDrive/cs-senti/gan_synthetic2/mr_gan_samples.jsonl")


In [None]:
def remove_no_change(df):
    """Drop rows whose "text" is identical to "original" and report how many went.

    Args:
        df: DataFrame with "text" and "original" columns.

    Returns:
        pandas.DataFrame: The rows where the two columns differ, re-indexed
        from 0. The input frame is left as it was.
    """
    total = len(df)
    changed = df["text"] != df["original"]
    kept = df.loc[changed].reset_index(drop=True)
    print(f"Removed {total - len(kept)} unchanged samples out of {total}")
    return kept

eesa_gan_f = remove_no_change(eesa_gan)
amg_gan_f  = remove_no_change(amg_gan)
mr_gan_f   = remove_no_change(mr_gan)


In [None]:
import json
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

def load_clean(path):
    """Load a JSON Lines file into a DataFrame, one row per JSON object.

    Args:
        path: Location of the UTF-8 encoded JSON Lines file.

    Returns:
        pandas.DataFrame: One row per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically; blank lines
    # (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(l) for l in f if l.strip()]
    return pd.DataFrame(records)

eesa = load_clean(BASE / "eesa_train_clean.jsonl")
mr   = load_clean(BASE / "mr_cs_clean.jsonl")
amg  = load_clean(BASE / "amg_cs_clean.jsonl")

print(len(eesa), len(mr), len(amg))


In [None]:
print("EESA columns:", eesa.columns.tolist())
print("MR-CS columns:", mr.columns.tolist())
print("AMG-CS columns:", amg.columns.tolist())


In [None]:
print("EESA-GAN columns:", eesa_gan_f.columns.tolist())
print("AMG-GAN columns:", amg_gan_f.columns.tolist())
print("MR-GAN columns:", mr_gan_f.columns.tolist())


In [None]:
# Drop the original text column (not needed)
eesa_gan_clean = eesa_gan_f.drop(columns=["original"])
amg_gan_clean  = amg_gan_f.drop(columns=["original"])
mr_gan_clean   = mr_gan_f.drop(columns=["original"])

print(eesa_gan_clean.columns)
print(amg_gan_clean.columns)
print(mr_gan_clean.columns)


In [None]:
eesa_orig_small = eesa[['text','label']]
mr_orig_small   = mr[['text','label']]
amg_orig_small  = amg[['text','label']]

eesa_gan_small = eesa_gan_clean[['text','label']]
amg_gan_small  = amg_gan_clean[['text','label']]
mr_gan_small   = mr_gan_clean[['text','label']]


In [None]:
eesa_merged = pd.concat([eesa_orig_small, eesa_gan_small], ignore_index=True)
mr_merged   = pd.concat([mr_orig_small, mr_gan_small], ignore_index=True)
amg_merged  = pd.concat([amg_orig_small, amg_gan_small], ignore_index=True)


In [None]:
eesa_merged = eesa_merged.drop_duplicates(subset="text")
mr_merged   = mr_merged.drop_duplicates(subset="text")
amg_merged  = amg_merged.drop_duplicates(subset="text")


In [None]:
unified_gan_augmented = pd.concat(
    [eesa_merged, mr_merged, amg_merged],
    ignore_index=True
).drop_duplicates(subset="text")

print("Final unified size:", len(unified_gan_augmented))
unified_gan_augmented.head()


In [None]:
# Path to save
out_path_jsonl = "/content/drive/MyDrive/cs-senti/gan_synthetic2/unified_gan_augmented.jsonl"
out_path_csv   = "/content/drive/MyDrive/cs-senti/gan_synthetic2/unified_gan_augmented.csv"

# Save as JSONL
unified_gan_augmented.to_json(
    out_path_jsonl,
    orient="records",
    lines=True,
    force_ascii=False
)

# Save as CSV
unified_gan_augmented.to_csv(
    out_path_csv,
    index=False
)

print("Saved JSONL to:", out_path_jsonl)
print("Saved CSV to:", out_path_csv)
print("Final dataset size:", len(unified_gan_augmented))
unified_gan_augmented.head()


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/cs-senti/gan_synthetic2/unified_gan_augmented.csv")
df.head()


In [None]:
df[df["label"].isna() | ~df["label"].isin(["neg", "neu", "pos"])].head(20)


In [None]:
df = df[df["label"].isin(["neg", "neu", "pos"])]
df = df.dropna(subset=["label"])


In [None]:
label2id = {"neg":0, "neu":1, "pos":2}
id2label = {v:k for k,v in label2id.items()}

df["label_id"] = df["label"].map(label2id)


In [None]:
import json
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")
GAN  = Path("/content/drive/MyDrive/cs-senti/gan_synthetic2")


# ---------------------------
# 1. LOAD ORIGINAL CLEAN DATA
# ---------------------------
def load_jsonl(path):
    """Read a JSON Lines file and return the parsed records as a list.

    Args:
        path: Location of a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: Parsed JSON values in file order; blank lines are ignored.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Open through a context manager so the file handle is always closed.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(l) for l in f if l.strip()]

# Original datasets
eesa_train = pd.DataFrame(load_jsonl(BASE/"eesa_train_clean.jsonl"))
eesa_dev   = pd.DataFrame(load_jsonl(BASE/"eesa_dev_clean.jsonl"))
eesa_test  = pd.DataFrame(load_jsonl(BASE/"eesa_test_clean.jsonl"))

mr_df  = pd.DataFrame(load_jsonl(BASE/"mr_cs_clean.jsonl"))
amg_df = pd.DataFrame(load_jsonl(BASE/"amg_cs_clean.jsonl"))


# -------------------------------
# 2. LOAD UNIFIED GAN-AUGMENTED
# -------------------------------
unified_gan = pd.read_json(GAN/"unified_gan_augmented.jsonl", lines=True)

print("Train (GAN-augmented) size:", len(unified_gan))
print("Dev (EESA) size:", len(eesa_dev))
print("Test (EESA) size:", len(eesa_test))
print("Test (MR) size:", len(mr_df))
print("Test (AMG) size:", len(amg_df))


# --------------------------------------
# 3. ASSIGN TRAIN / DEV / TEST SPLITS
# --------------------------------------

train_df = unified_gan.copy()   # GAN-augmented unified training set
dev_df   = eesa_dev.copy()      # EESA dev for validation
test_in  = eesa_test.copy()     # EESA test (in-domain)
test_mr  = mr_df.copy()         # MR test (cross-domain)
test_amg = amg_df.copy()        # AMG test (generalization)


# -------------------------------
# 4. SHOW FINAL SHAPES
# -------------------------------
print("\n===== FINAL SPLITS =====")
print("TRAIN:", len(train_df))
print("DEV:", len(dev_df))
print("TEST-IN-DOMAIN (EESA):", len(test_in))
print("TEST-MR:", len(test_mr))
print("TEST-AMG:", len(test_amg))


In [None]:
import pandas as pd

# Paths
base = "/content/drive/MyDrive/cs-senti"

train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
eesa_dev_path = f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"
eesa_test_path = f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"
mr_path = f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"

# Load
train_df = pd.read_csv(train_path)
dev_df   = pd.read_json(eesa_dev_path, lines=True)
test_df  = pd.read_json(eesa_test_path, lines=True)
mr_df    = pd.read_json(mr_path, lines=True)
amg_df   = pd.read_json(amg_path, lines=True)

print(len(train_df), len(dev_df), len(test_df), len(mr_df), len(amg_df))


In [None]:
import pandas as pd

base = "/content/drive/MyDrive/cs-senti"

# Paths
train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
eesa_dev_path = f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"
eesa_test_path = f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"
mr_path = f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"

# ----------------------------
# Helper: load + standardize
# ----------------------------
def load_and_fix(path, filetype="jsonl"):
    """Load a dataset file and reduce it to cleaned ``text`` / ``label`` columns.

    Args:
        path: Location of the file to read.
        filetype: ``"jsonl"`` reads with ``pd.read_json(..., lines=True)``;
            any other value reads with ``pd.read_csv``.

    Returns:
        A new DataFrame holding only ``text`` and ``label``, with rows whose
        label is NaN removed and ``text`` cast to ``str`` and stripped.

    Raises:
        ValueError: If the file has no ``text`` or no ``label`` column.
    """
    if filetype == "jsonl":
        df = pd.read_json(path, lines=True)
    else:
        df = pd.read_csv(path)

    # Both columns are mandatory; fail with the offending path in the message.
    for required in ("text", "label"):
        if required not in df.columns:
            raise ValueError(f"Missing '{required}' in {path}")

    # Take an explicit copy: the column assignment below would otherwise write
    # into a slice of the loaded frame (chained assignment / SettingWithCopyWarning).
    df = df[["text", "label"]].dropna(subset=["label"]).copy()

    # Strip text
    df["text"] = df["text"].astype(str).str.strip()

    return df

# ----------------------------
# Load all datasets
# ----------------------------
train_df = load_and_fix(train_path, filetype="csv")
dev_df   = load_and_fix(eesa_dev_path)
test_df  = load_and_fix(eesa_test_path)
mr_df    = load_and_fix(mr_path)
amg_df   = load_and_fix(amg_path)

print(
    "Train:", len(train_df),
    "\nDev:", len(dev_df),
    "\nTest:", len(test_df),
    "\nMR:", len(mr_df),
    "\nAMG:", len(amg_df)
)


In [None]:
# Sentiment tag -> integer class id.
label_map = dict(neg=0, neu=1, pos=2)

def encode_df(df):
    """Return a copy of ``df`` with an added ``y`` column.

    ``y`` is ``df["label"]`` mapped through ``label_map``; labels that are not
    keys of ``label_map`` come out as NaN. The input frame is left untouched.
    """
    return df.assign(y=df["label"].map(label_map))

train_df = encode_df(train_df)
dev_df   = encode_df(dev_df)
test_df  = encode_df(test_df)
mr_df    = encode_df(mr_df)
amg_df   = encode_df(amg_df)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=3,
    max_df=0.95,
    sublinear_tf=True
)

X_train = vectorizer.fit_transform(train_df["text"])
X_dev   = vectorizer.transform(dev_df["text"])
X_test  = vectorizer.transform(test_df["text"])
X_mr    = vectorizer.transform(mr_df["text"])
X_amg   = vectorizer.transform(amg_df["text"])

y_train = train_df["y"]
y_dev   = dev_df["y"]
y_test  = test_df["y"]
y_mr    = mr_df["y"]
y_amg   = amg_df["y"]


In [None]:
label_map = {"neg":0, "neu":1, "pos":2}
train_df["y"] = train_df["label"].map(label_map)

# Check again
train_df["y"].isna().sum()


In [None]:
train_df["label"].value_counts(dropna=False)


In [None]:
label_map = {"neg":0, "neu":1, "pos":2}
train_df["y"] = train_df["label"].map(label_map)
dev_df["y"]   = dev_df["label"].map(label_map)
test_df["y"]  = test_df["label"].map(label_map)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_features=50000,
    ngram_range=(1,2),
    min_df=2
)

X_train = vectorizer.fit_transform(train_df["text"])
y_train = train_df["y"]

X_dev = vectorizer.transform(dev_df["text"])
y_dev = dev_df["y"]

X_test = vectorizer.transform(test_df["text"])
y_test = test_df["y"]


In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    max_iter=3000,
    class_weight="balanced",
    n_jobs=-1,
    C=3.0
)

lr.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, f1_score

pred_dev = lr.predict(X_dev)
pred_test = lr.predict(X_test)

print("DEV F1:", f1_score(y_dev, pred_dev, average="macro"))
print("TEST F1:", f1_score(y_test, pred_test, average="macro"))

print("\nTest Classification Report:\n")
print(classification_report(y_test, pred_test, target_names=["neg","neu","pos"]))


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

svm = LinearSVC(
    C=1.0,
    class_weight="balanced"
)

svm.fit(X_train, y_train)
pred_dev = svm.predict(X_dev)
print("DEV F1:", f1_score(y_dev, pred_dev, average="macro"))
pred_test = svm.predict(X_test)
print("EESA TEST F1:", f1_score(y_test, pred_test, average="macro"))
print("\nClassification Report (EESA TEST):\n")
print(classification_report(y_test, pred_test, target_names=["neg","neu","pos"]))


In [None]:
# Cross-dataset check of the fitted `svm`: vectorize each external set with the
# already-fitted `vectorizer`, predict, and print the macro-F1.
# NOTE(review): `mr_test_df` / `amg_test_df` are not defined in the nearby cells
# (those load `mr_df` / `amg_df`) -- confirm they exist earlier in the notebook,
# otherwise this cell raises NameError on a fresh kernel.
X_mr = vectorizer.transform(mr_test_df["text"])
y_mr = mr_test_df["label"].map(label_map)  # unmapped labels become NaN

pred_mr = svm.predict(X_mr)
print("MR Generalization F1:", f1_score(y_mr, pred_mr, average="macro"))
X_amg = vectorizer.transform(amg_test_df["text"])
y_amg = amg_test_df["label"].map(label_map)  # unmapped labels become NaN

pred_amg = svm.predict(X_amg)
print("AMG Generalization F1:", f1_score(y_amg, pred_amg, average="macro"))


In [None]:
import pandas as pd

# Paths
base = "/content/drive/MyDrive/cs-senti"

train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
eesa_dev_path = f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"
eesa_test_path = f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"
mr_path = f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"

# Load datasets
train_df = pd.read_csv(train_path)
dev_df   = pd.read_json(eesa_dev_path, lines=True)
test_df  = pd.read_json(eesa_test_path, lines=True)
mr_df    = pd.read_json(mr_path, lines=True)
amg_df   = pd.read_json(amg_path, lines=True)

print(len(train_df), len(dev_df), len(test_df), len(mr_df), len(amg_df))


In [None]:
label_map = {"pos": 2, "neu": 1, "neg": 0}

train_df["y"] = train_df["label"].map(label_map)
dev_df["y"]   = dev_df["label"].map(label_map)
test_df["y"]  = test_df["label"].map(label_map)
mr_df["y"]    = mr_df["label"].map(label_map)
amg_df["y"]   = amg_df["label"].map(label_map)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    sublinear_tf=True
)

X_train = vectorizer.fit_transform(train_df["text"])
y_train = train_df["y"]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Logistic Regression
lr = LogisticRegression(max_iter=5000, class_weight="balanced")
lr.fit(X_train, y_train)

# SVM
svm = LinearSVC(class_weight="balanced")
svm.fit(X_train, y_train)


In [None]:
# Diagnostics for labels that failed to encode.
# Only the LAST expression of a notebook cell is rendered, so the first and
# third lines below produce no visible output; just the NaN count is shown.
train_df[train_df["label"].isna()].head()
bad_labels = train_df[train_df["y"].isna()]  # rows whose label was not in label_map (or was missing)
bad_labels
train_df["y"].isna().sum()


In [None]:
# Remove any rows with missing labels
train_df = train_df.dropna(subset=["label"])

# Ensure labels belong to {pos, neu, neg}
valid = {"pos", "neu", "neg"}
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the old frame (SettingWithCopyWarning).
train_df = train_df[train_df["label"].isin(valid)].copy()

# Remap labels after cleaning
train_df["y"] = train_df["label"].map({"pos":2, "neu":1, "neg":0})


In [None]:
train_df["y"].isna().sum()


In [None]:
# Re-vectorize the cleaned training set.
X_train = vectorizer.fit_transform(train_df["text"])
y_train = train_df["y"]

# Refitting the vectorizer changes the feature space, so any classifier fitted
# earlier no longer matches these columns. Rebuild and refit both models here
# (same hyper-parameters as the cell that first created them) so that the
# evaluation cell below scores models trained on the current features.
lr = LogisticRegression(max_iter=5000, class_weight="balanced")
lr.fit(X_train, y_train)

svm = LinearSVC(class_weight="balanced")
svm.fit(X_train, y_train)


In [None]:
from sklearn.metrics import f1_score

def eval_model(model, df, name):
    """Score ``model`` on ``df`` and print the macro-F1 under ``name``.

    The text column is transformed with the module-level ``vectorizer`` and
    compared against ``df["y"]``. Returns the macro-F1 value.
    """
    features = vectorizer.transform(df["text"])
    score = f1_score(df["y"], model.predict(features), average="macro")
    print(f"{name}: {score:.4f}")
    return score


In [None]:
print("====== LOGISTIC REGRESSION RESULTS ======")
eval_model(lr, dev_df,  "EESA Dev")
eval_model(lr, test_df, "EESA Test")
eval_model(lr, mr_df,   "MR Generalization")
eval_model(lr, amg_df,  "AMG Generalization")

print("\n====== SVM RESULTS ======")
eval_model(svm, dev_df,  "EESA Dev")
eval_model(svm, test_df, "EESA Test")
eval_model(svm, mr_df,   "MR Generalization")
eval_model(svm, amg_df,  "AMG Generalization")


# **Neural models**

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Confirm train/dev/test
print(len(train_df), len(dev_df), len(test_df), len(mr_df), len(amg_df))

# Extract text + labels
train_texts = train_df["text"].astype(str).tolist()
dev_texts   = dev_df["text"].astype(str).tolist()
test_texts  = test_df["text"].astype(str).tolist()
mr_texts    = mr_df["text"].astype(str).tolist()
amg_texts   = amg_df["text"].astype(str).tolist()

train_labels = train_df["label"].tolist()
dev_labels   = dev_df["label"].tolist()
test_labels  = test_df["label"].tolist()
mr_labels    = mr_df["label"].tolist()
amg_labels   = amg_df["label"].tolist()

# Label mapping
label_map = {"neg":0, "neu":1, "pos":2}

y_train = np.array([label_map[x] for x in train_labels])
y_dev   = np.array([label_map[x] for x in dev_labels])
y_test  = np.array([label_map[x] for x in test_labels])
y_mr    = np.array([label_map[x] for x in mr_labels])
y_amg   = np.array([label_map[x] for x in amg_labels])


In [None]:
MAX_WORDS = 30000
MAX_LEN = 40   # tuned for short CS sentences

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

def encode(texts):
    """Turn raw texts into fixed-length id sequences.

    Uses the fitted module-level ``tokenizer``; sequences are padded and
    truncated at the end ("post") to ``MAX_LEN``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )

X_train = encode(train_texts)
X_dev   = encode(dev_texts)
X_test  = encode(test_texts)
X_mr    = encode(mr_texts)
X_amg   = encode(amg_texts)

print(X_train.shape, X_dev.shape, X_test.shape)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

EMB_DIM = 128
LSTM_UNITS = 128

model = models.Sequential([
    layers.Embedding(MAX_WORDS, EMB_DIM, input_length=MAX_LEN),
    layers.Bidirectional(layers.LSTM(LSTM_UNITS, return_sequences=False)),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(3, activation="softmax"),
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()


In [None]:
!pip install transformers datasets evaluate accelerate -q


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Label mapping
label_map = {"neg": 0, "neu": 1, "pos": 2}

# Overwrite the string labels with integer ids, in place on each frame.
# The dtype guard keeps the cell idempotent: once a column is numeric it has
# already been encoded, and mapping it again would turn every value into NaN.
for _frame in (train_df, dev_df, test_df, mr_df, amg_df):
    if not pd.api.types.is_numeric_dtype(_frame["label"]):
        _frame["label"] = _frame["label"].map(label_map)
del _frame

# Convert to HF datasets
ds_train = Dataset.from_pandas(train_df[["text", "label"]])
ds_dev   = Dataset.from_pandas(dev_df[["text", "label"]])
ds_test  = Dataset.from_pandas(test_df[["text", "label"]])
ds_mr    = Dataset.from_pandas(mr_df[["text", "label"]])
ds_amg   = Dataset.from_pandas(amg_df[["text", "label"]])


In [None]:
from transformers import AutoTokenizer

model_name = "UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

MAX_LEN = 64

def tokenize(batch):
    """Tokenize ``batch["text"]`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``MAX_LEN`` tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=MAX_LEN)

ds_train_enc = ds_train.map(tokenize, batched=True)
ds_dev_enc   = ds_dev.map(tokenize, batched=True)
ds_test_enc  = ds_test.map(tokenize, batched=True)
ds_mr_enc    = ds_mr.map(tokenize, batched=True)
ds_amg_enc   = ds_amg.map(tokenize, batched=True)

cols = ["input_ids", "attention_mask", "label"]
ds_train_enc.set_format("torch", columns=cols)
ds_dev_enc.set_format("torch", columns=cols)
ds_test_enc.set_format("torch", columns=cols)
ds_mr_enc.set_format("torch", columns=cols)
ds_amg_enc.set_format("torch", columns=cols)


In [None]:
!pip install --upgrade transformers datasets evaluate accelerate -q

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import json
import numpy as np
import evaluate
from sklearn.metrics import classification_report
import os

os.environ["WANDB_DISABLED"] = "true"


# ============================================================
# Helper: Load JSONL
# ============================================================
def load_jsonl(path):
    """Read a UTF-8 JSON Lines file into a list of parsed objects.

    The file is opened in a ``with`` block so the handle is always closed.
    Whitespace-only lines (for example a trailing blank line) are skipped
    instead of being passed to ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


# ============================================================
# FIX: unify all augmented data to {text, label}
# ============================================================
# Reduce every augmented record to {text, label}: use the non-empty "generated"
# string when the record has one, otherwise fall back to its "text" field.
clean_train = [
    {
        "text": (
            item["generated"]
            if ("generated" in item and item["generated"])
            else item["text"]
        ).strip(),
        "label": item["label"],
    }
    for item in combined_aug
]

# Sentiment tag <-> integer class id lookups.
label2id = dict(neg=0, neu=1, pos=2)
id2label = {idx: tag for tag, idx in label2id.items()}

def encode(example):
    """Replace the string ``label`` of ``example`` with its integer id (in place)."""
    class_id = label2id[example["label"]]
    example["label"] = class_id
    return example

train_ds = Dataset.from_list(clean_train).map(encode)


# ============================================================
# DEV + TEST (EESA)
# ============================================================
DEV = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_dev_clean.jsonl"
TEST = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_test_clean.jsonl"

eesa_dev = Dataset.from_list(load_jsonl(DEV)).map(encode)
eesa_test = Dataset.from_list(load_jsonl(TEST)).map(encode)


# ============================================================
# DatasetDict
# ============================================================
dataset = DatasetDict({
    "train": train_ds,
    "dev": eesa_dev,
    "test": eesa_test
})

print(dataset)


# ============================================================
# TRAINING FUNCTION (fixed tokenizer handling)
# ============================================================
def train_any_model(model_name, output_dir):
    """Fine-tune ``model_name`` on the module-level ``dataset`` and report test results.

    Steps: tokenize all splits of ``dataset`` (fixed length 128), train a
    3-class sequence classifier for 4 epochs with per-epoch evaluation on the
    "dev" split, then print the evaluation metrics and a classification report
    for the "test" split.

    Args:
        model_name: Hub identifier passed to the tokenizer and model loaders.
        output_dir: Directory handed to ``TrainingArguments`` for checkpoints.

    Returns:
        The ``Trainer`` instance after training.
    """
    print(f"\n============== Training {model_name} ==============\n")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

    # Trainer expects the target column to be called "labels".
    tokenized = dataset.map(tokenize, batched=True).rename_column("label", "labels")
    tokenized.set_format("torch")

    # ---------------- Model ----------------
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=3, id2label=id2label, label2id=label2id
    )

    f1_metric = evaluate.load("f1")

    def compute_metrics(pred):
        logits, gold = pred
        scored = f1_metric.compute(
            predictions=np.argmax(logits, axis=-1),
            references=gold,
            average="macro",
        )
        return {"macro_f1": scored["f1"]}

    training_config = dict(
        output_dir=output_dir,
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=2,
        logging_strategy="epoch",
        report_to=[],
    )
    args = TrainingArguments(**training_config)

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["dev"],
        compute_metrics=compute_metrics,
    )
    trainer.train()

    test_split = tokenized["test"]

    print("\n===== FINAL TEST RESULTS =====")
    test_out = trainer.evaluate(test_split)
    print(test_out)

    # Full classification report
    test_logits = trainer.predict(test_split).predictions
    pred_labels = np.argmax(test_logits, axis=1)
    true_labels = np.array(test_split["labels"])

    print("\n===== CLASSIFICATION REPORT =====")
    report = classification_report(
        true_labels, pred_labels, target_names=["neg", "neu", "pos"]
    )
    print(report)

    return trainer


In [None]:
import pandas as pd
import json

base = "/content/drive/MyDrive/cs-senti"


# ============================================================
# 1. Helper Functions
# ============================================================

def load_jsonl(path):
    """Reads a JSONL file into a list of dicts.

    The file is opened in a ``with`` block so the handle is always closed.
    Whitespace-only lines are skipped instead of being passed to ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def normalize_df(df, source_name=""):
    """
    Normalize any dataset to have columns:
        text, label, id
    Remove rows with missing or invalid entries.

    The text is taken from the first of ``generated``, ``text``, ``original``
    that exists as a column. An ``id`` column (0..n-1) is added when absent.
    Text and labels are cast to ``str`` and stripped; rows are kept only if
    the source text was not null, the stripped text is non-empty, and the
    label is one of ``neg`` / ``neu`` / ``pos``. The ``generated`` and
    ``original`` columns are dropped and the index is reset.

    Works on a copy, so the caller's frame is not modified.

    Raises:
        ValueError: If no text column or no ``label`` column is present
            (``source_name`` is included in the message).
    """
    df = df.copy()

    # Standardize text column
    for candidate in ("generated", "text", "original"):
        if candidate in df.columns:
            raw_text = df[candidate]
            break
    else:
        raise ValueError(f"No text field found in {source_name}")

    # Record missingness BEFORE casting: astype(str) turns NaN/None into the
    # literal strings "nan"/"None", which a later notna() check cannot catch.
    has_text = raw_text.notna()
    df["text"] = raw_text.astype(str).str.strip()

    # Standardize ID
    if "id" not in df.columns:
        df["id"] = range(len(df))

    # Ensure correct label column
    if "label" not in df.columns:
        raise ValueError(f"No label field found in {source_name}")

    df["label"] = df["label"].astype(str).str.strip()

    # Keep rows with a valid label and real, non-empty text
    valid = {"neg", "neu", "pos"}
    keep = df["label"].isin(valid) & has_text & (df["text"] != "")
    df = df.loc[keep]

    # Drop unused columns
    drop_cols = [c for c in ["generated", "original"] if c in df.columns]
    df = df.drop(columns=drop_cols, errors="ignore")

    return df.reset_index(drop=True)


# ============================================================
# 2. Load ORIGINAL CLEAN DATASETS
# ============================================================

# Unified GAN-train data
train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
train_df = pd.read_csv(train_path)
train_df = normalize_df(train_df, "GAN-TRAIN")

# EESA dev/test
eesa_dev = pd.DataFrame(load_jsonl(f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"))
eesa_dev = normalize_df(eesa_dev, "EESA-DEV")

eesa_test = pd.DataFrame(load_jsonl(f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"))
eesa_test = normalize_df(eesa_test, "EESA-TEST")

# MR and AMG for generalization
mr_df = pd.DataFrame(load_jsonl(f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"))
mr_df = normalize_df(mr_df, "MR-CS")

amg_df = pd.DataFrame(load_jsonl(f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"))
amg_df = normalize_df(amg_df, "AMG-CS")


# ============================================================
# 3. Load GAN SYNTHETIC DATA
# ============================================================

gan_eesa = pd.DataFrame(load_jsonl(f"{base}/gan_synthetic2/eesa_gan_samples.jsonl"))
gan_eesa = normalize_df(gan_eesa, "GAN-EESA")

gan_mr = pd.DataFrame(load_jsonl(f"{base}/gan_synthetic2/mr_gan_samples.jsonl"))
gan_mr = normalize_df(gan_mr, "GAN-MR")

gan_amg = pd.DataFrame(load_jsonl(f"{base}/gan_synthetic2/amg_gan_samples.jsonl"))
gan_amg = normalize_df(gan_amg, "GAN-AMG")


# ============================================================
# 4. Print Dataset Sizes
# ============================================================

print("TRAIN (GAN-augmented unified):", len(train_df))
print("EESA DEV:", len(eesa_dev))
print("EESA TEST:", len(eesa_test))
print("MR-CS (Generalization):", len(mr_df))
print("AMG-CS (Generalization):", len(amg_df))

print("GAN-EESA:", len(gan_eesa))
print("GAN-MR:", len(gan_mr))
print("GAN-AMG:", len(gan_amg))


# Show first rows to confirm structure
train_df.head()


In [None]:
from datasets import Dataset, DatasetDict

# -------------------------------
# LABEL MAPPING
# -------------------------------
label2id = dict(neg=0, neu=1, pos=2)
id2label = dict(zip(label2id.values(), label2id.keys()))

def encode_labels(batch):
    """Swap the string ``label`` of a single record for its integer id (in place)."""
    class_id = label2id[batch["label"]]
    batch["label"] = class_id
    return batch


# -------------------------------
# Convert Pandas → HF Dataset
# -------------------------------
train_ds = Dataset.from_pandas(train_df)
dev_ds   = Dataset.from_pandas(eesa_dev)
test_ds  = Dataset.from_pandas(eesa_test)

# -------------------------------
# Keep only text + label columns
# -------------------------------
for col in train_ds.column_names:
    if col not in ["text", "label"]:
        train_ds = train_ds.remove_columns(col)

for col in dev_ds.column_names:
    if col not in ["text", "label"]:
        dev_ds = dev_ds.remove_columns(col)

for col in test_ds.column_names:
    if col not in ["text", "label"]:
        test_ds = test_ds.remove_columns(col)

# -------------------------------
# Encode labels
# -------------------------------
train_ds = train_ds.map(encode_labels)
dev_ds   = dev_ds.map(encode_labels)
test_ds  = test_ds.map(encode_labels)

# -------------------------------
# Final dataset structure
# -------------------------------
dataset = DatasetDict({
    "train": train_ds,
    "dev":   dev_ds,
    "test":  test_ds
})

dataset


In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "UBC-NLP/MARBERTv2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    """Tokenize ``batch["text"]``, truncating and padding every sequence to 128 tokens."""
    encode_kwargs = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(batch["text"], **encode_kwargs)

tokenized_ds = dataset.map(tokenize, batched=True)

# Remove original text column
tokenized_ds = tokenized_ds.remove_columns(["text"])

# Rename label → labels automatically by Trainer
tokenized_ds = tokenized_ds.rename_column("label", "labels")

tokenized_ds.set_format("torch")

tokenized_ds


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import classification_report

f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    """Return ``{"macro_f1": ...}`` for a ``(logits, labels)`` evaluation pair.

    Predictions are the argmax over the last logits axis; the score comes from
    the module-level ``f1_metric`` with macro averaging.
    """
    logits, gold = pred
    scored = f1_metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=gold,
        average="macro",
    )
    return {"macro_f1": scored["f1"]}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="/content/marbertv2-gan",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["dev"],
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
print("=== FINAL TEST RESULTS (EESA Test) ===")
print(trainer.evaluate(tokenized_ds["test"]))

preds = trainer.predict(tokenized_ds["test"]).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array(tokenized_ds["test"]["labels"])

print(classification_report(
    true_labels,
    pred_labels,
    target_names=["neg", "neu", "pos"]
))


In [None]:
import pandas as pd
from datasets import Dataset
import numpy as np
from sklearn.metrics import classification_report, f1_score

# -----------------------------
# Load clean external test sets
# -----------------------------
mr_path  = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/amg_cs_clean.jsonl"

mr_df  = pd.read_json(mr_path, lines=True)
amg_df = pd.read_json(amg_path, lines=True)

# Ensure consistent columns
mr_df  = mr_df[["text", "label"]]
amg_df = amg_df[["text", "label"]]

# -----------------------------
# Encode labels
# -----------------------------
def encode_labels(example):
    """Replace the record's string ``label`` with its id from ``label2id`` (in place)."""
    class_id = label2id[example["label"]]
    example["label"] = class_id
    return example

mr_ds  = Dataset.from_pandas(mr_df).map(encode_labels)
amg_ds = Dataset.from_pandas(amg_df).map(encode_labels)

# -----------------------------
# Tokenize with existing tokenizer
# -----------------------------
def tokenize_eval(batch):
    """Tokenize ``batch["text"]`` with the current ``tokenizer`` at a fixed length of 128."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

mr_tok  = mr_ds.map(tokenize_eval, batched=True)
amg_tok = amg_ds.map(tokenize_eval, batched=True)

mr_tok  = mr_tok.remove_columns(["text"])
amg_tok = amg_tok.remove_columns(["text"])

mr_tok  = mr_tok.rename_column("label", "labels")
amg_tok = amg_tok.rename_column("label", "labels")

mr_tok.set_format("torch")
amg_tok.set_format("torch")

# -----------------------------
# Evaluate with trainer
# -----------------------------
print("\n================= MR GENERALIZATION =================")
mr_results = trainer.evaluate(mr_tok)
print(mr_results)

mr_preds = np.argmax(trainer.predict(mr_tok).predictions, axis=1)
mr_true  = np.array(mr_tok["labels"])

print("\nMR Classification Report:")
print(classification_report(mr_true, mr_preds, target_names=["neg","neu","pos"]))

# Individual macro F1
mr_f1 = f1_score(mr_true, mr_preds, average="macro")
print("MR Macro-F1:", mr_f1)


print("\n================= AMG GENERALIZATION =================")
amg_results = trainer.evaluate(amg_tok)
print(amg_results)

amg_preds = np.argmax(trainer.predict(amg_tok).predictions, axis=1)
amg_true  = np.array(amg_tok["labels"])

print("\nAMG Classification Report:")
print(classification_report(amg_true, amg_preds, target_names=["neg","neu","pos"]))

# Individual macro F1
amg_f1 = f1_score(amg_true, amg_preds, average="macro")
print("AMG Macro-F1:", amg_f1)


In [None]:
# ============================================
# A R A B E R T   T R A I N I N G  C E L L
# ============================================

model_name = "aubmindlab/bert-base-arabertv2"

print(f"\n============== Training {model_name} ==============\n")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Tokenization function — same as MARBERT
def tokenize(batch):
    """Tokenize ``batch["text"]``; every sequence is truncated/padded to 128 tokens."""
    fixed_length = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **fixed_length)

# Convert Pandas → HF Dataset
train_ds = Dataset.from_pandas(train_df[["text", "label"]]).map(encode_labels)
dev_ds   = Dataset.from_pandas(eesa_dev[["text", "label"]]).map(encode_labels)
test_ds  = Dataset.from_pandas(eesa_test[["text", "label"]]).map(encode_labels)

# Build DatasetDict
dataset = DatasetDict({
    "train": train_ds,
    "dev": dev_ds,
    "test": test_ds
})

# Apply tokenizer
tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.remove_columns(["text"])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

# Metrics
f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    """Macro-F1 of argmax predictions against the reference labels."""
    logits, gold = pred
    predicted = np.argmax(logits, axis=-1)
    scored = f1_metric.compute(predictions=predicted, references=gold, average="macro")
    macro_f1 = scored["f1"]
    return {"macro_f1": macro_f1}

# Training settings
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs-senti/arabert_augmented",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_total_limit=2,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["dev"],
    compute_metrics=compute_metrics
)

trainer.train()

# ================================
# Final Test Evaluation
# ================================
print("\n===== FINAL TEST RESULTS (EESA) =====")
print(trainer.evaluate(tokenized["test"]))

preds = trainer.predict(tokenized["test"]).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array(tokenized["test"]["labels"])

print("\n===== CLASSIFICATION REPORT =====")
print(classification_report(
    true_labels, pred_labels,
    target_names=["neg", "neu", "pos"]
))


In [None]:
print("\n\n================= MR GENERALIZATION =================")

# Convert MR dataframe → HF dataset
mr_ds = Dataset.from_pandas(mr_df[["text", "label"]]).map(encode_labels)

# Tokenize
mr_tok = mr_ds.map(lambda b: tokenizer(
    b["text"],
    truncation=True,
    padding="max_length",
    max_length=128
), batched=True)

mr_tok = mr_tok.remove_columns(["text"])
mr_tok = mr_tok.rename_column("label", "labels")
mr_tok.set_format("torch")

# Evaluate
mr_results = trainer.evaluate(mr_tok)
print(mr_results)

# Classification report
preds = trainer.predict(mr_tok).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array(mr_tok["labels"])

print("\nMR Classification Report:")
print(classification_report(
    true_labels, pred_labels,
    target_names=["neg", "neu", "pos"]
))


print("\n\n================= AMG GENERALIZATION =================")

# Convert AMG → HF dataset
amg_ds = Dataset.from_pandas(amg_df[["text", "label"]]).map(encode_labels)

# Tokenize
amg_tok = amg_ds.map(lambda b: tokenizer(
    b["text"],
    truncation=True,
    padding="max_length",
    max_length=128
), batched=True)

amg_tok = amg_tok.remove_columns(["text"])
amg_tok = amg_tok.rename_column("label", "labels")
amg_tok.set_format("torch")

# Evaluate
amg_results = trainer.evaluate(amg_tok)
print(amg_results)

# Classification report
preds = trainer.predict(amg_tok).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array(amg_tok["labels"])

print("\nAMG Classification Report:")
print(classification_report(
    true_labels, pred_labels,
    target_names=["neg", "neu", "pos"]
))


In [None]:
# ============================================================
# XLM-R TRAINING (same pipeline you used for MARBERT/AraBERT)
# ============================================================

model_name = "xlm-roberta-base"
output_dir = "/content/drive/MyDrive/cs-senti/models/xlm-r-gan"

print(f"\n============== Training {model_name} ==============\n")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize(batch):
    """Tokenize the ``text`` field of ``batch`` at a fixed length of 128 tokens."""
    # Always reads "text".
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

tokenized = dataset.map(tokenize, batched=True)

# Remove irrelevant columns
for col in ["id", "original", "generated"]:
    if col in tokenized["train"].column_names:
        tokenized = tokenized.remove_columns(col)

tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

# ---- Metrics ----
f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    """Return the macro-averaged F1 of argmax predictions as ``{"macro_f1": value}``."""
    logits, labels = pred
    scored = f1_metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
        average="macro",
    )
    return {"macro_f1": scored["f1"]}

# ---- Training Args ----
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_total_limit=2,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["dev"],
    compute_metrics=compute_metrics
)

trainer.train()

# ========== TEST (EESA) ==========
print("\n===== FINAL TEST RESULTS (EESA) =====")
print(trainer.evaluate(tokenized["test"]))

preds = trainer.predict(tokenized["test"]).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array([x["labels"] for x in tokenized["test"]])

print("\n===== CLASSIFICATION REPORT =====")
print(classification_report(true_labels, pred_labels, target_names=["neg", "neu", "pos"]))


In [None]:
print("\n================= MR GENERALIZATION =================")

mr_tokenized = mr_ds.map(tokenize, batched=True)
mr_tokenized = mr_tokenized.rename_column("label", "labels")
mr_tokenized.set_format("torch")

print(trainer.evaluate(mr_tokenized))

preds = trainer.predict(mr_tokenized).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array([x["labels"] for x in mr_tokenized])

print("\nMR Classification Report:")
print(classification_report(true_labels, pred_labels, target_names=["neg", "neu", "pos"]))


In [None]:
print("\n================= AMG GENERALIZATION =================")

amg_tokenized = amg_ds.map(tokenize, batched=True)
amg_tokenized = amg_tokenized.rename_column("label", "labels")
amg_tokenized.set_format("torch")

print(trainer.evaluate(amg_tokenized))

preds = trainer.predict(amg_tokenized).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = np.array([x["labels"] for x in amg_tokenized])

print("\nAMG Classification Report:")
print(classification_report(true_labels, pred_labels, target_names=["neg", "neu", "pos"]))


In [None]:
train_df = train_df.dropna(subset=["label"])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

# ----------------------------------------------------------
# 1. LOAD ALL DATASETS
# ----------------------------------------------------------
base = "/content/drive/MyDrive/cs-senti"

train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
eesa_dev_path = f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"
eesa_test_path = f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"
mr_path = f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"

train_df = pd.read_csv(train_path)
eesa_dev = pd.read_json(eesa_dev_path, lines=True)
eesa_test = pd.read_json(eesa_test_path, lines=True)
mr_df = pd.read_json(mr_path, lines=True)
amg_df = pd.read_json(amg_path, lines=True)

print("Loaded sizes:")
print(len(train_df), len(eesa_dev), len(eesa_test), len(mr_df), len(amg_df))

# ----------------------------------------------------------
# 2. CLEAN LABELS (THE IMPORTANT PART)
# ----------------------------------------------------------

def clean_labels(df):
    """Return a cleaned copy of ``df`` with labels encoded as integers.

    Labels are converted to strings and stripped of surrounding whitespace.
    Rows whose label is not exactly "neg", "neu" or "pos" (including missing
    labels) are dropped, and the remaining labels are mapped to 0, 1 and 2.
    The input frame is never modified.

    Args:
        df: DataFrame with a "label" column.

    Returns:
        A new DataFrame containing only the valid rows, with an integer
        "label" column. The original index values of kept rows are preserved.
    """
    label_to_id = {"neg": 0, "neu": 1, "pos": 2}

    normalized = df["label"].astype(str).str.strip()
    keep = normalized.isin(list(label_to_id))

    # Take an explicit copy of the filtered rows before assigning to it;
    # writing into a boolean-filtered slice is chained assignment and
    # triggers pandas' SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    cleaned["label"] = normalized[keep].map(label_to_id)
    return cleaned

train_df = clean_labels(train_df)
eesa_dev = clean_labels(eesa_dev)
eesa_test = clean_labels(eesa_test)
mr_df = clean_labels(mr_df)
amg_df = clean_labels(amg_df)

print("\nLabel distribution (train):")
print(train_df["label"].value_counts())

# ----------------------------------------------------------
# 3. BUILD TRAIN/DEV/TEST MATRICES
# ----------------------------------------------------------

vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1,2))

X_train = vectorizer.fit_transform(train_df["text"])
y_train = train_df["label"]

X_dev = vectorizer.transform(eesa_dev["text"])
y_dev = eesa_dev["label"]

X_test = vectorizer.transform(eesa_test["text"])
y_test = eesa_test["label"]

X_mr = vectorizer.transform(mr_df["text"])
y_mr = mr_df["label"]

X_amg = vectorizer.transform(amg_df["text"])
y_amg = amg_df["label"]

# ----------------------------------------------------------
# SAFETY CHECK — ENSURE NO NaN EXISTS
# ----------------------------------------------------------
print("\nAny NaN in y_train?", y_train.isna().sum())
print("Any NaN in X_train?", pd.isna(X_train.data).sum())

# ----------------------------------------------------------
# 4. TRAIN LOGISTIC REGRESSION
# ----------------------------------------------------------
print("\n================ Logistic Regression ================")
lr = LogisticRegression(max_iter=5000, class_weight="balanced")
lr.fit(X_train, y_train)

print("\nEESA Test F1:", f1_score(y_test, lr.predict(X_test), average="macro"))
print("MR F1:", f1_score(y_mr, lr.predict(X_mr), average="macro"))
print("AMG F1:", f1_score(y_amg, lr.predict(X_amg), average="macro"))

# ----------------------------------------------------------
# 5. TRAIN SVM
# ----------------------------------------------------------
print("\n====================== SVM ======================")
svm = LinearSVC(class_weight="balanced")
svm.fit(X_train, y_train)

print("\nEESA Test F1:", f1_score(y_test, svm.predict(X_test), average="macro"))
print("MR F1:", f1_score(y_mr, svm.predict(X_mr), average="macro"))
print("AMG F1:", f1_score(y_amg, svm.predict(X_amg), average="macro"))


In [None]:
# ============================================================
# 1. Imports
# ============================================================
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ============================================================
# 2. Load datasets
# ============================================================
base = "/content/drive/MyDrive/cs-senti"

train_path = f"{base}/gan_synthetic2/unified_gan_augmented.csv"
eesa_dev_path = f"{base}/repo/data/cleaned/eesa_dev_clean.jsonl"
eesa_test_path = f"{base}/repo/data/cleaned/eesa_test_clean.jsonl"
mr_path = f"{base}/repo/data/cleaned/mr_cs_clean.jsonl"
amg_path = f"{base}/repo/data/cleaned/amg_cs_clean.jsonl"

train_df = pd.read_csv(train_path)
dev_df   = pd.read_json(eesa_dev_path, lines=True)
test_df  = pd.read_json(eesa_test_path, lines=True)
mr_df    = pd.read_json(mr_path, lines=True)
amg_df   = pd.read_json(amg_path, lines=True)

print("Dataset sizes:")
print(len(train_df), len(dev_df), len(test_df), len(mr_df), len(amg_df))

# ============================================================
# Ensure consistent columns + label mapping
# ============================================================
label_map = {"neg": 0, "neu": 1, "pos": 2}

def _prepare_split(df):
    """Keep only text/label, encode labels as 0/1/2 and drop unusable rows.

    Labels are stripped before mapping. Rows whose label is not one of the
    keys of ``label_map`` (including missing labels) or whose text is missing
    are removed: otherwise the unmapped labels become NaN and are passed on to
    the scikit-learn and Keras ``fit`` calls further down.
    """
    # .copy() so the assignments below never write into a slice of the
    # frame that was loaded from disk.
    out = df[["text", "label"]].copy()
    out["label"] = out["label"].astype(str).str.strip().map(label_map)
    out = out.dropna(subset=["text", "label"])
    out["label"] = out["label"].astype(int)
    return out

train_df = _prepare_split(train_df)
dev_df   = _prepare_split(dev_df)
test_df  = _prepare_split(test_df)
mr_df    = _prepare_split(mr_df)
amg_df   = _prepare_split(amg_df)

# ============================================================
# 3. TF-IDF Vectorizer (shared across classical models)
# ============================================================
vectorizer = TfidfVectorizer(
    max_features=40000,
    ngram_range=(1,2),
    min_df=2
)

X_train = vectorizer.fit_transform(train_df["text"])
y_train = train_df["label"]

X_dev  = vectorizer.transform(dev_df["text"])
y_dev  = dev_df["label"]

X_test = vectorizer.transform(test_df["text"])
y_test = test_df["label"]

X_mr   = vectorizer.transform(mr_df["text"])
y_mr   = mr_df["label"]

X_amg  = vectorizer.transform(amg_df["text"])
y_amg  = amg_df["label"]

# ============================================================
# 4. Logistic Regression
# ============================================================
print("\n================ Logistic Regression ================")
lr = LogisticRegression(max_iter=5000, class_weight="balanced")
lr.fit(X_train, y_train)

pred_test = lr.predict(X_test)
pred_mr   = lr.predict(X_mr)
pred_amg  = lr.predict(X_amg)

print("\nEESA Test F1:", f1_score(y_test, pred_test, average="macro"))
print(classification_report(y_test, pred_test))

print("\nMR Generalization F1:", f1_score(y_mr, pred_mr, average="macro"))
print(classification_report(y_mr, pred_mr))

print("\nAMG Generalization F1:", f1_score(y_amg, pred_amg, average="macro"))
print(classification_report(y_amg, pred_amg))

# ============================================================
# 5. Linear SVM
# ============================================================
print("\n================ Linear SVM ================")
svm = LinearSVC(class_weight="balanced")
svm.fit(X_train, y_train)

pred_test = svm.predict(X_test)
pred_mr   = svm.predict(X_mr)
pred_amg  = svm.predict(X_amg)

print("\nEESA Test F1:", f1_score(y_test, pred_test, average="macro"))
print(classification_report(y_test, pred_test))

print("\nMR Generalization F1:", f1_score(y_mr, pred_mr, average="macro"))
print(classification_report(y_mr, pred_mr))

print("\nAMG Generalization F1:", f1_score(y_amg, pred_amg, average="macro"))
print(classification_report(y_amg, pred_amg))

# ============================================================
# 6. BiLSTM Neural Baseline
# ============================================================
print("\n================ BiLSTM ================")

# ----------------------------
# Tokenization
# ----------------------------
tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["text"])

def tokenize_series(series, max_len=64):
    """Convert texts to fixed-length integer sequences.

    Uses the module-level Keras ``tokenizer`` to turn each text into a list of
    token ids, then pads or truncates at the end so every row has ``max_len``
    entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(series),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

X_train_nn = tokenize_series(train_df["text"])
X_dev_nn   = tokenize_series(dev_df["text"])
X_test_nn  = tokenize_series(test_df["text"])
X_mr_nn    = tokenize_series(mr_df["text"])
X_amg_nn   = tokenize_series(amg_df["text"])

# ----------------------------
# Build model
# ----------------------------
model = tf.keras.Sequential([
    Embedding(50000, 128, input_length=64),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dense(3, activation="softmax")
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

# ----------------------------
# Train
# ----------------------------
history = model.fit(
    X_train_nn, y_train,
    validation_data=(X_dev_nn, y_dev),
    epochs=5,
    batch_size=64,
    verbose=1
)

# ----------------------------
# Evaluate BiLSTM
# ----------------------------
def evaluate_nn(split_name, X, y):
    """Print macro-F1 and a classification report for the module-level ``model``.

    Args:
        split_name: label used in the printed header.
        X: model input passed to ``model.predict``.
        y: gold labels compared against the arg-max class of each prediction.
    """
    class_scores = model.predict(X)
    predicted = np.argmax(class_scores, axis=1)

    print(f"\n=== {split_name} F1 ===")
    print(f1_score(y, predicted, average="macro"))
    print(classification_report(y, predicted))

evaluate_nn("EESA Test", X_test_nn, y_test)
evaluate_nn("MR Generalization", X_mr_nn, y_mr)
evaluate_nn("AMG Generalization", X_amg_nn, y_amg)



# **old training**

In [None]:
#still needs to run
print("====== TRAINING ON EESA ======")

train_gan(
    eesa_train_annotated,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
print("====== TRAINING ON MR-CS ======")

train_gan(
    mr_cs_labeled,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
print("====== TRAINING ON AMG-CS ======")

train_gan(
    amg_cs_labeled,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
#still needs to run
print("====== TRAINING ON AMG-mono ======")

train_gan(
    amg_ar_subset,
    epochs=3,
    batch_size=8,
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


generation

In [None]:
SAVE_DIR = "/content/drive/MyDrive/cs-senti/gan_synthetic"
os.makedirs(SAVE_DIR, exist_ok=True)


In [None]:
def generate_all_samples(dataset, generator, reward_tokenizer, reward_model):
    """Run ``gan_generate_text`` on every example of ``dataset``.

    Returns a list with one dict per example, in input order, holding the
    example's "id", its "original" text, the "generated" text and its "label".
    """
    def _to_record(ex):
        # Generate first, then copy the identifying fields next to the output.
        gan_text = gan_generate_text(ex, generator, reward_tokenizer, reward_model)
        return {
            "id": ex["id"],
            "original": ex["original"],
            "generated": gan_text,
            "label": ex["label"],
        }

    return [_to_record(ex) for ex in dataset]


In [None]:
#still needs to run

eesa_gan_samples = generate_all_samples(
    eesa_train_annotated,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/eesa_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in eesa_gan_samples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved EESA GAN synthetic data.")


In [None]:

mr_gan_samples = generate_all_samples(
    mr_cs_labeled,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/mr_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in mr_gan_samples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved MR-CS GAN synthetic data.")


In [None]:
amg_gan_samples = generate_all_samples(
    amg_cs_labeled,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/amg_gan_samples.jsonl", "w", encoding="utf-8") as f:
    for row in amg_gan_samples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved AMG-CS GAN synthetic data.")


In [None]:
#still needs to run
amg_ar_gan_samples = generate_all_samples(
    amg_ar_subset,
    generator,
    reward_tokenizer,
    reward_model
)

with open(f"{SAVE_DIR}/amg_ar_gan_samplesV2.jsonl", "w", encoding="utf-8") as f:
    for row in amg_ar_gan_samples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved AMG-ar-CS GAN synthetic data.")


In [None]:
# =======================================
#       TRAINING ON AMG-AR MONO
# =======================================

print("====== TRAINING ON AMG-AR ======")

train_gan(
    amg_ar_subset,            # ← your 2000-sentence auto-labeled subset
    epochs=3,                 # same as EESA
    batch_size=8,             # same as EESA
    generator=generator,
    optimizer=optimizer,
    disc_model=disc_model,
    disc_tokenizer=disc_tokenizer,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
    sim_model=similarity_model
)


In [None]:
# =======================================================
#        GAN GENERATION FOR AMG-AR SYNTHETIC DATA
# =======================================================

print("====== GENERATING AMG-AR SYNTHETIC DATA ======")

amg_ar_gan_samples = generate_all_samples(
    amg_ar_subset,        # ← your 2000 auto-labeled mono subset
    generator,            # trained generator
    reward_tokenizer,     # sentiment reward tokenizer
    reward_model          # sentiment reward model
)

# Save output
OUTPUT_PATH = f"{SAVE_DIR}/amg_ar_gan_samplesV3.jsonl"

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for row in amg_ar_gan_samples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved AMG-AR GAN synthetic data to:")
print(OUTPUT_PATH)


In [None]:
show_examples(amg_ar_gan_samples,  "AMG ar Synthetic GAN Data")

comparison with deterministic

In [None]:
DET_PATH = "/content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl"

# Load the deterministic (lexicon/LLM) switched sentences, one JSON object per
# line. Malformed lines are still skipped (best effort), but only JSON decode
# errors are tolerated and the number of skipped lines is reported, instead of
# silently swallowing every possible exception.
det_rows = []
skipped_lines = 0
with open(DET_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # blank line, nothing to parse
        try:
            det_rows.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1

print("Loaded deterministic:", len(det_rows))
if skipped_lines:
    print("Skipped malformed lines:", skipped_lines)


In [None]:
def align_domain(det_rows, gan_samples, domain=None, limit=20):
    """Pair each GAN sample with the deterministic rewrite of the same source text.

    Args:
        det_rows: dicts with "orig_text" and "switched_text" (and optionally
            "domain").
        gan_samples: dicts with "id", "original", "generated" and "label".
        domain: when given, only deterministic rows whose "domain" equals this
            value are used.
        limit: number of aligned rows to print.

    Returns:
        One dict per distinct "original" text (the last GAN sample wins when a
        text occurs several times) with keys "id", "original",
        "deterministic" (None when no deterministic row matches), "gan" and
        "label".
    """
    # original text -> deterministic switched text, restricted to `domain`
    switched_by_source = {}
    for det in det_rows:
        if domain is not None and det.get("domain") != domain:
            continue
        source_text = det["orig_text"]
        switched_by_source[source_text] = det["switched_text"]

    # original text -> GAN sample
    sample_by_source = {}
    for sample in gan_samples:
        sample_by_source[sample["original"]] = sample

    aligned = [
        {
            "id": sample["id"],
            "original": source_text,
            "deterministic": switched_by_source.get(source_text, None),
            "gan": sample["generated"],
            "label": sample["label"],
        }
        for source_text, sample in sample_by_source.items()
    ]

    # Preview the first `limit` alignments.
    fields = (
        ("ID:", "id"),
        ("ORIGINAL:", "original"),
        ("DETERMINISTIC:", "deterministic"),
        ("GAN:", "gan"),
    )
    for entry in aligned[:limit]:
        for tag, key in fields:
            print(tag, entry[key])
        print("-" * 70)

    return aligned


In [None]:
print("====== ALIGN EESA ======")
align_domain(det_rows, eesa_gan_samples, domain="eesa", limit=20)


In [None]:
print("====== ALIGN MR-CS ======")
align_domain(det_rows, mr_gan_samples, domain="mr", limit=20)


In [None]:
print("====== ALIGN AMG-CS ======")
align_domain(det_rows, amg_gan_samples, domain="amg", limit=20)


# **showing samples**

In [None]:
def show_examples(samples, title="Examples"):
    """Print every sample of a domain under a titled banner.

    For each sample the id ("N/A" when absent), the original text, the
    generated text and the label are printed, followed by a divider line.
    There is no cap on the number of samples shown.
    """
    print(f"\n========== {title} ==========\n")

    divider = "-" * 70
    for sample in samples:
        print(
            f"ID        : {sample.get('id', 'N/A')}",
            f"ORIGINAL  : {sample['original']}",
            f"GAN       : {sample['generated']}",
            f"LABEL     : {sample['label']}",
            divider,
            sep="\n",
        )


In [None]:
show_examples(eesa_gan_samples, "EESA Synthetic GAN Data")



In [None]:
show_examples(mr_gan_samples,   "MR-CS Synthetic GAN Data")


In [None]:
show_examples(amg_gan_samples,  "AMG Synthetic GAN Data")

In [None]:
show_examples(amg_ar_gan_samples,  "AMG ar Synthetic GAN Data")

In [None]:
#version2
show_examples(amg_ar_gan_samples,  "AMG ar Synthetic GAN Data")

trials

In [None]:
def show_examples(samples, title="Examples", max_show=None):
    """Print samples under a titled banner, optionally stopping early.

    Each sample's original text, generated text and label are printed,
    followed by a divider. When ``max_show`` is truthy, printing stops once
    that many samples have been shown; otherwise all samples are printed.
    """
    print(f"\n====== {title} ======\n")
    for shown, sample in enumerate(samples, start=1):
        print("ORIGINAL :", sample["original"])
        print("GAN      :", sample["generated"])
        print("LABEL    :", sample["label"])
        print("-" * 60)

        if max_show and shown >= max_show:
            break


# Example usage
show_examples(mr_gan_samples, "MR-CS GAN", max_show=50)


In [None]:
def save_samples(path, samples):
    """Write ``samples`` to ``path`` as UTF-8 JSON Lines and report the path.

    Each sample becomes one line of JSON; non-ASCII characters are written
    as-is rather than escaped. An existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples
        )
    print("Saved:", path)


In [None]:
save_samples("/content/drive/MyDrive/cs-senti/repo/data/gan_outputs/mr_gan_samples.jsonl", mr_gan_samples)


# **intrinsic evaluation**

In [None]:
import torch
from statistics import mean

def get_sentiment(text, tokenizer, model):
    """Classify ``text`` and return "neg", "neu" or "pos".

    Args:
        text: the string to classify.
        tokenizer: callable producing model inputs (called with
            ``return_tensors="pt"``, truncation and padding enabled).
        model: sequence classifier whose output exposes ``.logits``.

    Returns:
        The name of the highest-scoring class, using the fixed index order
        neg=0, neu=1, pos=2.
        NOTE(review): this assumes the model's class indices follow that
        order -- confirm against the checkpoint's id2label.
    """
    # Run on whatever device the model lives on instead of assuming a GPU;
    # fall back to the previous behaviour for objects without `.device`.
    device = getattr(model, "device", "cuda")

    inp = tokenizer(
        text, return_tensors="pt",
        truncation=True, padding=True
    ).to(device)

    # Inference only: no autograd graph is needed. The arg-max of the logits
    # equals the arg-max of their softmax, so the softmax is skipped.
    with torch.no_grad():
        pred = model(**inp).logits.argmax(-1).item()
    return ["neg", "neu", "pos"][pred]

def evaluate_sentiment(samples, reward_tokenizer, reward_model):
    """Fraction of samples whose generated text keeps the dataset label.

    For every sample, the sentiment predicted for ``row["generated"]`` by
    ``get_sentiment`` is compared with ``row["label"]``.

    Returns:
        matches / total, or 0 when ``samples`` is empty.
    """
    agreements = [
        row["label"] == get_sentiment(row["generated"], reward_tokenizer, reward_model)
        for row in samples
    ]
    if not agreements:
        return 0
    return sum(1 for agrees in agreements if agrees) / len(agreements)


In [None]:
import numpy as np
import torch

def cosine(a, b):
    """Cosine similarity of two vectors as a Python float.

    A small constant (1e-12) is added to the denominator so that zero
    vectors yield 0.0 instead of a division by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(dot_product / (norm_product + 1e-12))

def evaluate_similarity(samples, sim_model):
    """Mean cosine similarity between each original text and its generated text.

    Each sample's "original" and "generated" strings are embedded together
    with ``sim_model.encode`` and compared with ``cosine``.

    Returns:
        The mean similarity as a float, or 0 when ``samples`` is empty.
    """
    def _pair_similarity(row):
        vectors = sim_model.encode(
            [row["original"], row["generated"]],
            convert_to_numpy=True
        )
        return cosine(vectors[0], vectors[1])

    scores = [_pair_similarity(row) for row in samples]
    if not scores:
        return 0
    return float(np.mean(scores))


In [None]:
import re

def is_english(tok):
    """Return True when ``tok`` is non-empty and made only of the letters A-Z / a-z."""
    return re.fullmatch(r"[A-Za-z]+", tok) is not None

def evaluate_switch_rate(samples):
    """Average share of token positions whose language class changed.

    Both texts of a sample are split on whitespace. A position counts as
    switched when exactly one of the two aligned tokens is English according
    to ``is_english``. Samples whose token counts differ cannot be aligned
    position by position and are skipped, as are samples with no tokens at
    all.

    Returns:
        The mean per-sample switch ratio as a float, or 0 when no sample
        could be scored.
    """
    switch_ratios = []

    for row in samples:
        orig_tokens = row["original"].split()
        gen_tokens = row["generated"].split()

        # Empty texts have equal length (0 == 0) and would otherwise reach
        # the division below and raise ZeroDivisionError.
        if not orig_tokens or len(orig_tokens) != len(gen_tokens):
            continue

        switched = sum(
            1 for o, g in zip(orig_tokens, gen_tokens)
            if is_english(o) != is_english(g)
        )
        switch_ratios.append(switched / len(orig_tokens))

    return float(np.mean(switch_ratios)) if switch_ratios else 0


In [None]:
def evaluate_realness(samples, disc_tokenizer, disc_model):
    """Mean probability the discriminator assigns to class index 1 for the generated texts.

    Args:
        samples: dicts with a "generated" text.
        disc_tokenizer: callable producing model inputs (called with
            ``return_tensors="pt"``, truncation and padding enabled).
        disc_model: classifier whose output exposes ``.logits``.

    Returns:
        The mean softmax probability of class index 1 as a float, or 0 when
        ``samples`` is empty.
        NOTE(review): index 1 is treated as the "real" class -- confirm this
        matches how the discriminator was trained.
    """
    # Run on whatever device the model lives on instead of assuming a GPU;
    # fall back to the previous behaviour for objects without `.device`.
    device = getattr(disc_model, "device", "cuda")

    scores = []
    for row in samples:
        inputs = disc_tokenizer(
            row["generated"], return_tensors="pt",
            truncation=True, padding=True
        ).to(device)

        with torch.no_grad():
            probs = disc_model(**inputs).logits.softmax(-1)

        scores.append(float(probs[0][1]))     # class 1 = "real"

    return float(np.mean(scores)) if scores else 0


In [None]:
def evaluate_domain(name, samples, reward_tokenizer, reward_model, sim_model, disc_tokenizer, disc_model):
    """Run the four intrinsic metrics on one domain's samples and print them.

    Computes, in this order: sentiment consistency, meaning similarity,
    language switch rate and discriminator realness.

    Returns:
        A dict with keys "sentiment", "similarity", "switch_rate" and
        "realness".
    """
    print(f"\n\n========== EVALUATION FOR: {name} ==========\n")

    scores = {
        "sentiment": evaluate_sentiment(samples, reward_tokenizer, reward_model),
        "similarity": evaluate_similarity(samples, sim_model),
        "switch_rate": evaluate_switch_rate(samples),
        "realness": evaluate_realness(samples, disc_tokenizer, disc_model),
    }

    print(f"Sentiment Consistency : {scores['sentiment']:.4f}")
    print(f"Meaning Similarity    : {scores['similarity']:.4f}")
    print(f"Language Switch Rate  : {scores['switch_rate']:.4f}")
    print(f"Discriminator Realness: {scores['realness']:.4f}")

    return scores


In [None]:

evaluate_domain("AMG-CS", amg_gan_samples,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)


In [None]:
evaluate_domain("EESA", eesa_gan_samples,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)



In [None]:
evaluate_domain("MR-CS", mr_gan_samples,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)


In [None]:

evaluate_domain("AMG-ar-CS", amg_ar_gan_samples,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)


In [None]:

evaluate_domain("AMG-ar-CS", amg_ar_gan_samples,
                reward_tokenizer, reward_model,
                similarity_model,
                disc_tokenizer, disc_model)


# **augmentation**

In [None]:
import json

def load_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its records as a list, one per line."""
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(record) for record in handle]

# ORIGINAL CLEANED DATASETS
BASE_CLEAN = "/content/drive/MyDrive/cs-senti/repo/data/cleaned"

eesa_train = load_jsonl(f"{BASE_CLEAN}/eesa_train_clean.jsonl")
mr_clean   = load_jsonl(f"{BASE_CLEAN}/mr_cs_clean.jsonl")
amg_clean  = load_jsonl(f"{BASE_CLEAN}/amg_cs_clean.jsonl")

# EXISTING GAN SYNTHETIC DATA
BASE_GAN = "/content/drive/MyDrive/cs-senti/gan_synthetic"

eesa_gan = load_jsonl(f"{BASE_GAN}/eesa_gan_samples.jsonl")
mr_gan   = load_jsonl(f"{BASE_GAN}/mr_gan_samples.jsonl")
amg_gan  = load_jsonl(f"{BASE_GAN}/amg_gan_samples.jsonl")

print(len(eesa_train), len(eesa_gan))
print(len(mr_clean), len(mr_gan))
print(len(amg_clean), len(amg_gan))


In [None]:
def convert_gan_to_training(gan_list):
    """Reshape GAN output rows into training rows.

    Each row's "generated" text becomes the "text" field; "id" and "label"
    are carried over unchanged. Order is preserved.
    """
    return [
        {"id": row["id"], "text": row["generated"], "label": row["label"]}
        for row in gan_list
    ]

eesa_gan_conv = convert_gan_to_training(eesa_gan)
mr_gan_conv   = convert_gan_to_training(mr_gan)
amg_gan_conv  = convert_gan_to_training(amg_gan)


In [None]:
aug_eesa = eesa_train + eesa_gan_conv
aug_mr   = mr_clean + mr_gan_conv
aug_amg  = amg_clean + amg_gan_conv

print("NEW SIZES:")
print(len(aug_eesa), len(aug_mr), len(aug_amg))


In [None]:
SAVE_OUT = "/content/drive/MyDrive/cs-senti/augmented_datasets"
os.makedirs(SAVE_OUT, exist_ok=True)

def save_jsonl(data, path):
    """Write ``data`` to ``path`` as UTF-8 JSON Lines (one record per line, non-ASCII kept as-is)."""
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in data)
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(serialized)

save_jsonl(aug_eesa, f"{SAVE_OUT}/eesa_augmented.jsonl")
save_jsonl(aug_mr,   f"{SAVE_OUT}/mr_augmented.jsonl")
save_jsonl(aug_amg,  f"{SAVE_OUT}/amg_augmented.jsonl")


In [None]:
from datasets import Dataset
import json

def load_jsonl(path):
    """Parse every line of a UTF-8 JSON Lines file and return the records in file order."""
    with open(path, "r", encoding="utf-8") as jsonl_file:
        records = list(map(json.loads, jsonl_file))
    return records

AUG_PATH = "/content/drive/MyDrive/cs-senti/augmented_datasets"

eesa_aug = load_jsonl(f"{AUG_PATH}/eesa_augmented.jsonl")
mr_aug   = load_jsonl(f"{AUG_PATH}/mr_augmented.jsonl")
amg_aug  = load_jsonl(f"{AUG_PATH}/amg_augmented.jsonl")

print(len(eesa_aug), len(mr_aug), len(amg_aug))


In [None]:
# COMBINE ALL AUGMENTED DATA
combined_aug = eesa_aug + mr_aug + amg_aug

len(combined_aug)


In [None]:
bad = [x for x in combined_aug if x["label"] is None]
len(bad)


In [None]:
combined_aug = [x for x in combined_aug if x["label"] is not None]


In [None]:
label2id = {"neg":0, "neu":1, "pos":2}
id2label = {0:"neg", 1:"neu", 2:"pos"}

def encode_label(row):
    """Replace the row's string label with its integer id from ``label2id`` and return the row."""
    label_id = label2id[row["label"]]
    row["label"] = label_id
    return row

ds = Dataset.from_list(combined_aug)
ds = ds.map(encode_label)



In [None]:
DEV = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_dev_clean.jsonl"
TEST = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_test_clean.jsonl"

eesa_dev = load_jsonl(DEV)
eesa_test = load_jsonl(TEST)

eesa_dev = Dataset.from_list(eesa_dev).map(encode_label)
eesa_test = Dataset.from_list(eesa_test).map(encode_label)

print(len(eesa_dev), len(eesa_test))


In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize ``batch["text"]`` with the module-level tokenizer, padding/truncating to 128 tokens."""
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encoding_options)

train_ds = ds.map(tokenize, batched=True)
dev_ds   = eesa_dev.map(tokenize, batched=True)
test_ds  = eesa_test.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["text", "id"])
dev_ds   = dev_ds.remove_columns(["text", "id"])
test_ds  = test_ds.remove_columns(["text", "id"])


In [None]:
!pip install --upgrade transformers datasets evaluate accelerate -q

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import json
import numpy as np
import evaluate
from sklearn.metrics import classification_report
import os

os.environ["WANDB_DISABLED"] = "true"


# ============================================================
# Helper: Load JSONL
# ============================================================
def load_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    The file is opened in a ``with`` block so the handle is closed as soon as
    reading finishes. Blank lines (for example a trailing empty line) are
    skipped instead of raising a JSON decode error.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(l) for l in f if l.strip()]


# ============================================================
# FIX: unify all augmented data to {text, label}
# ============================================================
# Reduce every augmented row to {text, label}: use the stripped "generated"
# text when the row has a non-empty one, otherwise the stripped "text".
clean_train = [
    {
        "text": (item.get("generated") or item["text"]).strip(),
        "label": item["label"],
    }
    for item in combined_aug
]

label2id = {"neg":0, "neu":1, "pos":2}
id2label = {0:"neg", 1:"neu", 2:"pos"}

def encode(example):
    """Swap the example's string label for its integer id (via ``label2id``) and return the example."""
    encoded = label2id[example["label"]]
    example["label"] = encoded
    return example

train_ds = Dataset.from_list(clean_train).map(encode)


# ============================================================
# DEV + TEST (EESA)
# ============================================================
DEV = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_dev_clean.jsonl"
TEST = "/content/drive/MyDrive/cs-senti/repo/data/cleaned/eesa_test_clean.jsonl"

eesa_dev = Dataset.from_list(load_jsonl(DEV)).map(encode)
eesa_test = Dataset.from_list(load_jsonl(TEST)).map(encode)


# ============================================================
# DatasetDict
# ============================================================
dataset = DatasetDict({
    "train": train_ds,
    "dev": eesa_dev,
    "test": eesa_test
})

print(dataset)


# ============================================================
# TRAINING FUNCTION (fixed tokenizer handling)
# ============================================================
def train_any_model(model_name, output_dir):
    """Fine-tune a sequence classifier on the module-level ``dataset`` and report test results.

    The module-level ``dataset`` (splits "train", "dev", "test") is tokenized
    with the tokenizer of ``model_name``, a 3-label classification model is
    trained with the Hugging Face ``Trainer`` using "dev" for per-epoch
    evaluation, and the metrics plus a classification report for the "test"
    split are printed.

    Args:
        model_name: identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: directory given to ``TrainingArguments`` as ``output_dir``.

    Returns:
        The ``Trainer`` instance after training.
    """

    print(f"\n============== Training {model_name} ==============\n")

    # Local tokenizer: deliberately not the notebook-level `tokenizer`, so each
    # model is paired with its own vocabulary.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def tokenize(batch):
        # Every text is padded/truncated to exactly 128 tokens.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=128
        )

    # Tokenize all splits, rename the target column to "labels" and return
    # torch tensors when rows are accessed.
    tokenized = dataset.map(tokenize, batched=True)
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch")

    # ---------------- Model ----------------
    # id2label / label2id come from the notebook's module-level mappings.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    )

    f1_metric = evaluate.load("f1")

    def compute_metrics(pred):
        # `pred` unpacks into (logits, labels); the reported metric is macro-F1
        # over the arg-max predictions.
        logits, labels = pred
        preds = np.argmax(logits, axis=-1)
        f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")
        return {"macro_f1": f1["f1"]}

    # Evaluate, checkpoint and log once per epoch; keep at most two checkpoints
    # and reload the best one when training ends.
    # NOTE(review): no `metric_for_best_model` is set, so "best" is presumably
    # decided by the Trainer's default criterion rather than by macro_f1 --
    # confirm against the TrainingArguments documentation.
    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=2,
        logging_strategy="epoch",
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["dev"],
        compute_metrics=compute_metrics
    )

    trainer.train()

    print("\n===== FINAL TEST RESULTS =====")
    test_out = trainer.evaluate(tokenized["test"])
    print(test_out)

    # Full classification report
    # (a second pass over the test split, this time to obtain the raw logits)
    preds = trainer.predict(tokenized["test"]).predictions
    pred_labels = np.argmax(preds, axis=1)
    true_labels = np.array(tokenized["test"]["labels"])

    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(
        true_labels, pred_labels,
        target_names=["neg", "neu", "pos"]
    ))

    return trainer


In [None]:
trainer = train_any_model(
    model_name="UBC-NLP/MARBERT",
    output_dir="/content/drive/MyDrive/cs-senti/marbert_gan_aug"
)


# **version 2 GAN Model**

In [None]:
!pip install transformers sentence-transformers -q

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
from tqdm import tqdm


In [None]:
import json
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/annotated_with_id")

def load_jsonl(path):
    """Load a UTF-8 JSON Lines file into a list of parsed records (one per line, in file order)."""
    with open(path, "r", encoding="utf-8") as source:
        parsed = [json.loads(raw_line) for raw_line in source]
    return parsed

# Load your GAN-ready annotated datasets
eesa_train_annotated = load_jsonl(BASE/"eesa_train_annotated.jsonl")
eesa_dev_annotated   = load_jsonl(BASE/"eesa_dev_annotated.jsonl")
eesa_test_annotated  = load_jsonl(BASE/"eesa_test_annotated.jsonl")

mr_cs_labeled  = load_jsonl(BASE/"mr_cs_labeled_annotated.jsonl")
amg_cs_labeled = load_jsonl(BASE/"amg_cs_labeled_annotated.jsonl")
amg_ar_mono    = load_jsonl(BASE/"amg_ar_mono_annotated.jsonl")

print("EESA train:", len(eesa_train_annotated))
print("MR-CS:", len(mr_cs_labeled))
print("AMG-CS:", len(amg_cs_labeled))
print("AMG-mono:", len(amg_ar_mono))


In [None]:
import json

LEXICON_PATH = "/content/drive/MyDrive/cs-senti/repo/data/ar_en_lexicon_MERGED.jsonl"

def load_lexicon(path):
    """Build an Arabic -> English-candidates dictionary from a JSONL lexicon.

    Each line must be a JSON object with an ``"ar"`` key and an ``"en"`` key,
    and may carry a ``"syn"`` key. ``"en"`` and ``"syn"`` may each be a single
    string or a list. The value stored for an Arabic word is the English
    entries followed by the synonyms, de-duplicated with first occurrence
    kept. If the same ``"ar"`` key appears on several lines, the last line
    wins.
    """
    def _listify(value):
        # A bare string stands for a one-element list.
        return [value] if isinstance(value, str) else value

    lexicon = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = json.loads(raw)
            arabic = entry["ar"]
            english = _listify(entry["en"])
            synonyms = _listify(entry.get("syn", []))
            # dict.fromkeys drops duplicates while preserving order
            lexicon[arabic] = list(dict.fromkeys(english + synonyms))
    return lexicon

# Module-level lexicon consulted by generate_candidates_for_single
LEXICON = load_lexicon(LEXICON_PATH)
print("✓ Loaded lexicon entries:", len(LEXICON))


In [None]:
# Hard-coded device: every model below is moved to CUDA, so this cell needs a GPU runtime.
DEVICE = "cuda"

# ============================
# (1) XLM-R contextual encoder
# ============================
# Used by extract_xlmr_embeddings to get per-token contextual vectors.
xlmr_tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlmr = AutoModel.from_pretrained("xlm-roberta-base").to(DEVICE)
xlmr.eval()

# ============================
# (2) NEW MARBERT sentiment reward model
# ============================
# Fine-tuned classifier checkpoint loaded from Drive; compute_reward reads its
# softmax probability for the gold label.
reward_path = "/content/drive/MyDrive/cs-senti/baseline_marbert_v1/checkpoint-1920"

reward_tokenizer = AutoTokenizer.from_pretrained(reward_path)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_path).to(DEVICE)
reward_model.eval()

print("✓ Loaded NEW MARBERT reward model for GAN sentiment reward")

# ============================
# (3) AraBERT v2 discriminator (REAL vs FAKE)
# ============================
# NOTE(review): this loads the base "aubmindlab/bert-base-arabertv2" checkpoint
# with a fresh 2-label head, and no discriminator training step is visible in
# this section. If it is not fine-tuned elsewhere, the "real" probability used
# in compute_reward comes from an untrained head — confirm.
disc_tok = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
disc_model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    num_labels=2
).to(DEVICE)

disc_model.eval()
print("✓ Loaded Discriminator (ArabicBERT REAL/FAKE classifier)")

# ============================
# (4) LaBSE semantic similarity model
# ============================
# Sentence encoder used by compute_reward for the original-vs-generated cosine similarity.
from sentence_transformers import SentenceTransformer
labse = SentenceTransformer("sentence-transformers/LaBSE").to(DEVICE)

print("✓ Loaded LaBSE for semantic similarity")


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Same value as the DEVICE constant set in the model-loading cell above;
# hard-coded, so this cell also assumes a CUDA runtime.
DEVICE = "cuda"

# Already loaded in Block 1
# xlmr_tok
# xlmr

@torch.no_grad()
def extract_xlmr_embeddings(tokens):
    """
    Return one contextual XLM-R embedding per input token.

    ``tokens`` is a list of pre-split words. The words are encoded with
    ``is_split_into_words=True`` and the last-layer hidden states of the
    sub-tokens belonging to the same word are averaged.

    The returned list always has exactly ``len(tokens)`` entries, and entry
    ``i`` always belongs to ``tokens[i]``. A word that contributes no
    sub-tokens (it was dropped by the tokenizer, or lies beyond the truncation
    limit) gets an all-zero vector. The previous implementation appended
    buckets by count instead of indexing by word id, so one such word shifted
    every later embedding onto the wrong token, and a truncated sentence
    returned fewer embeddings than tokens.
    """
    encoded = xlmr_tok(
        tokens, return_tensors="pt", is_split_into_words=True, truncation=True
    ).to(DEVICE)

    outputs = xlmr(**encoded)
    hidden = outputs.last_hidden_state[0]  # (seq_len, hidden_size)

    # One bucket per input word, addressed directly by its word id.
    buckets = [[] for _ in tokens]
    for hid, wid in zip(hidden, encoded.word_ids()):
        if wid is None:  # special tokens (<s>, </s>) belong to no word
            continue
        buckets[wid].append(hid)

    # Placeholder for words with no sub-tokens; same dtype and device as hidden.
    missing = hidden.new_zeros(hidden.shape[-1])
    return [torch.stack(b).mean(dim=0) if b else missing for b in buckets]


In [None]:
# POS tag set you used during annotation:
# Sorted so that every tag gets a stable index; after sorting, index 0 is "ADJ".
POS_TAGS = sorted(list(set([
    "NOUN","VERB","ADJ","ADV","PRON","DET","ADP","CONJ",
    "PART","NUM","PROPN","X","PUNCT","SYM"
])))

# tag -> position in the one-hot POS vector built by token_to_vector
pos2id = {p:i for i,p in enumerate(POS_TAGS)}

# trainable projection (768 -> 300), randomly initialised here.
# NOTE(review): gen_optimizer in the generator cell is built from
# generator.parameters() only, so in the code visible in this section these
# weights are never updated — confirm whether "trainable" is intended.
proj = nn.Linear(768, 300).to(DEVICE)

def token_to_vector(tok_emb, pos, lang, maskable, has_al):
    """Assemble the generator's feature vector for one token.

    Layout of the returned 1-D tensor (317 values with the 14-tag POS set):
      * ``proj(tok_emb)``  - 300-dim projection of the contextual embedding
      * one-hot POS        - ``len(POS_TAGS)`` values; a tag missing from
                             ``pos2id`` falls back to index 0, which is "ADJ"
                             in the sorted tag list
      * language flag      - 1.0 when ``lang == "en"``, else 0.0
      * maskable flag      - ``float(maskable)``
      * has_al flag        - ``float(has_al)``
    """
    one_hot = torch.zeros(len(POS_TAGS), device=DEVICE)
    one_hot[pos2id.get(pos, 0)] = 1.0

    is_english = 1.0 if lang == "en" else 0.0
    projected = proj(tok_emb)

    # The three scalar flags are packed into a single tensor; concatenation
    # order (language, maskable, has_al) is unchanged.
    flags = torch.tensor(
        [is_english, float(maskable), float(has_al)],
        device=DEVICE,
    )
    return torch.cat([projected, one_hot, flags])


In [None]:
class SwitchGenerator(nn.Module):
    """Two-layer MLP that scores each candidate position for switching.

    ``forward`` takes a tensor whose last dimension is ``feature_size`` and
    returns ``(probs, logits)``: the sigmoid switch probability and the raw
    score, both with the trailing size-1 dimension squeezed away.
    """

    def __init__(self, feature_size=FEATURE_SIZE, hidden=128):
        super().__init__()
        # Attribute names are kept as-is so existing state_dicts still load.
        self.linear1 = nn.Linear(feature_size, hidden)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden, 1)

    def forward(self, x):
        activated = self.relu(self.linear1(x))
        scores = self.linear2(activated).squeeze(-1)  # raw scores
        return torch.sigmoid(scores), scores


# Instantiate the switch policy and its optimizer.
# NOTE(review): FEATURE_SIZE is not defined in this section of the notebook;
# it presumably comes from an earlier cell and should equal the length of the
# vector returned by token_to_vector (317) — confirm.
generator = SwitchGenerator().to(DEVICE)
# Only the generator's own parameters are optimised.
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=1e-4)

print("✓ Generator updated with FEATURE_SIZE =", FEATURE_SIZE)


In [None]:
def reinforce_step(example):
    """Run one REINFORCE update of the switch generator on a single example.

    The original function stopped after building the switched sentence: it
    never scored it, never updated the generator, and always returned None.
    It is completed here with the same reward and policy-gradient step that
    train_gan_amg performs inline.

    Parameters
    ----------
    example : dict
        Annotated record with ``tokens``, ``mask_positions``, ``pos``,
        ``lang``, ``has_al``, ``label`` and ``original`` (``original`` is read
        the same way train_gan_amg reads it).

    Returns
    -------
    dict or None
        ``None`` when the example has no maskable position (no update is
        made); otherwise ``{"loss": float, "reward": float, "text": str}``.
    """
    tokens = example["tokens"]
    mask_positions = example["mask_positions"]
    pos = example["pos"]
    lang = example["lang"]
    has_al = example["has_al"]
    label = example["label"]

    if len(mask_positions) == 0:
        return None

    xlmr_embs = extract_xlmr_embeddings(tokens)

    # Build feature vectors for mask positions only (they are maskable by definition)
    vecs = torch.stack([
        token_to_vector(xlmr_embs[i], pos[i], lang[i], True, has_al[i])
        for i in mask_positions
    ])

    # Sample one switch/keep action per mask position from the policy
    probs, _ = generator(vecs)
    dist = torch.distributions.Bernoulli(probs)
    actions = dist.sample()            # 1 = switch, 0 = keep
    log_probs = dist.log_prob(actions)

    # Apply the sampled switches using the first lexicon candidate
    new_tokens = tokens.copy()
    for act, pos_idx in zip(actions, mask_positions):
        if act.item() == 1:
            cands = generate_candidates_for_single(example, pos_idx)
            if cands:
                new_tokens[pos_idx] = cands[0]  # best candidate
    new_text = " ".join(new_tokens)

    # Score the switched sentence; the reward is a plain Python float
    reward = compute_reward(
        original=example["original"],
        new_text=new_text,
        gold_label=label,
    )

    # Policy-gradient update: raise the log-probability of rewarded actions
    loss = -(log_probs * reward).mean()
    gen_optimizer.zero_grad()
    loss.backward()
    gen_optimizer.step()

    return {"loss": loss.item(), "reward": reward, "text": new_text}


In [None]:
@torch.no_grad()
def compute_reward(original, new_text, gold_label):
    """Score a generated sentence; returns a Python float.

    The reward is ``0.5*D + 0.4*S + 0.1*max(Sim, 0)`` where
      * ``S``   - reward_model's softmax probability of ``gold_label``
                  (one of "neg", "neu", "pos") on ``new_text``;
      * ``D``   - disc_model's softmax probability of class 1 on ``new_text``;
      * ``Sim`` - LaBSE cosine similarity between ``original`` and ``new_text``.

    Only ``.item()`` scalars leave this function, so no gradient ever flows
    through the scoring models. The whole function therefore runs under
    ``torch.no_grad()``, which avoids building an autograd graph for the two
    classifier passes on every training example.

    Raises KeyError if ``gold_label`` is not "neg", "neu" or "pos".
    """
    # Sentiment reward
    lab2id = {"neg":0,"neu":1,"pos":2}
    sent_in = reward_tokenizer(new_text, return_tensors="pt", truncation=True).to(DEVICE)
    sent_probs = reward_model(**sent_in).logits.softmax(-1)
    S_reward = sent_probs[0][lab2id[gold_label]].item()

    # Discriminator
    disc_in = disc_tok(new_text, return_tensors="pt", truncation=True).to(DEVICE)
    D_reward = disc_model(**disc_in).logits.softmax(-1)[0][1].item()

    # Semantic similarity
    emb_a = labse.encode(original, convert_to_tensor=True)
    emb_b = labse.encode(new_text, convert_to_tensor=True)
    Sim = torch.nn.functional.cosine_similarity(emb_a, emb_b, dim=0).item()

    # weights (negative similarity is clipped to 0)
    return 0.5*D_reward + 0.4*S_reward + 0.1*max(Sim,0)


In [None]:
def gan_generate(example):
    """Produce the switched sentence for one example with the current generator.

    Decoding is deterministic: a mask position is switched when the generator's
    probability for it is not below 0.5 and the lexicon offers at least one
    candidate, in which case the first candidate replaces the token. Returns
    the tokens joined by single spaces.
    """
    out_tokens = example["tokens"].copy()
    embeddings = extract_xlmr_embeddings(out_tokens)

    for idx in example["mask_positions"]:
        features = token_to_vector(
            embeddings[idx],
            example["pos"][idx],
            example["lang"][idx],
            True,                      # mask positions are maskable
            example["has_al"][idx],
        ).unsqueeze(0)                 # add a batch dimension of 1

        with torch.no_grad():
            switch_prob = generator(features)[0]

        # Below the threshold: keep the original token
        if switch_prob.item() < 0.5:
            continue

        replacements = generate_candidates_for_single(example, idx)
        if replacements:
            out_tokens[idx] = replacements[0]

    return " ".join(out_tokens)


In [None]:
def generate_candidates_for_single(example, pos_idx):
    """Return the LEXICON candidate list for the token at ``pos_idx``.

    The lookup key is ``example["tokens"][pos_idx]``; a token that is not in
    the lexicon yields an empty list.
    """
    return LEXICON.get(example["tokens"][pos_idx], [])


In [None]:
from tqdm import tqdm
import random

def train_gan_amg(dataset, epochs=3, batch_size=8):
    """Train the switch generator with REINFORCE, one update per example.

    Parameters
    ----------
    dataset : list of dict
        Annotated examples with ``tokens``, ``mask_positions``, ``pos``,
        ``lang``, ``has_al``, ``original`` and ``label``. The list is not
        modified: each epoch shuffles a copy. The previous in-place
        ``random.shuffle(dataset)`` reordered the caller's list, which changed
        the order of the later generation/export loop.
    epochs : int
        Number of passes over the data.
    batch_size : int
        Only controls how examples are chunked for the progress bar; the
        generator is still updated after every single example.

    Prints the average loss and reward at the end of each epoch.
    """
    for ep in range(epochs):
        print(f"\n===== AMG-CS — EPOCH {ep+1}/{epochs} =====")
        # Shuffle a copy so the caller's list keeps its original order
        epoch_data = list(dataset)
        random.shuffle(epoch_data)

        total_loss = []
        total_reward = []

        for i in tqdm(range(0, len(epoch_data), batch_size)):
            batch = epoch_data[i:i+batch_size]

            for ex in batch:

                # --- skip if no maskable positions ---
                if len(ex["mask_positions"]) == 0:
                    continue

                # -----------------------------------------
                # Step 1: Extract token embeddings
                # -----------------------------------------
                xlmr_embs = extract_xlmr_embeddings(ex["tokens"])

                # -----------------------------------------
                # Step 2: Build feature vectors for mask positions
                # -----------------------------------------
                vecs = []
                for mp in ex["mask_positions"]:
                    vec = token_to_vector(
                        xlmr_embs[mp],
                        ex["pos"][mp],
                        ex["lang"][mp],
                        True,
                        ex["has_al"][mp]
                    )
                    vecs.append(vec)

                vecs = torch.stack(vecs)  # (num_masked, feature_size)

                # -----------------------------------------
                # Step 3: Generator forward → probs & log_probs
                # -----------------------------------------
                probs, logits = generator(vecs)
                dist = torch.distributions.Bernoulli(probs)
                actions = dist.sample()
                log_probs = dist.log_prob(actions)

                # -----------------------------------------
                # Step 4: Build new sentence according to actions
                # -----------------------------------------
                new_tokens = ex["tokens"].copy()
                for act, pos_idx in zip(actions, ex["mask_positions"]):
                    if act.item() == 1:
                        cands = generate_candidates_for_single(ex, pos_idx)
                        if cands:
                            new_tokens[pos_idx] = cands[0]

                new_text = " ".join(new_tokens)

                # -----------------------------------------
                # Step 5: Compute reward
                # -----------------------------------------
                reward = compute_reward(
                    original=ex["original"],
                    new_text=new_text,
                    gold_label=ex["label"]
                )
                total_reward.append(reward)

                # -----------------------------------------
                # Step 6: Policy gradient update
                # -----------------------------------------
                # reward is a plain float, so gradients flow only through log_probs
                loss = -(log_probs * reward).mean()

                gen_optimizer.zero_grad()
                loss.backward()
                gen_optimizer.step()

                total_loss.append(loss.item())

        # ----- end of epoch -----
        avg_loss = sum(total_loss) / len(total_loss) if total_loss else 0
        avg_reward = sum(total_reward) / len(total_reward) if total_reward else 0

        print(f"Epoch {ep+1} — Avg Loss = {avg_loss:.4f}, Avg Reward = {avg_reward:.4f}")


In [None]:
# Train the switch generator on the AMG annotated set (3 epochs).
print("🚀 Training GAN Switch Predictor on AMG-CS…")
train_gan_amg(amg_cs_labeled, epochs=3, batch_size=8)


In [None]:
# Run the trained generator over every AMG example and collect one record per
# example: id, original text, generated text and the unchanged label.
gan_amg_outputs = []

for ex in amg_cs_labeled:
    out = gan_generate(ex)
    gan_amg_outputs.append({
        "id": ex["id"],
        "original": ex["original"],
        "gan": out,
        "label": ex["label"]
    })


In [None]:
# Write the generated samples to Drive as JSON Lines (one record per line).
# `os` and `json` are not imported in this cell; it relies on imports made in
# earlier cells.
SAVE_DIR = "/content/drive/MyDrive/cs-senti/gan_synthetic"
os.makedirs(SAVE_DIR, exist_ok=True)

with open(f"{SAVE_DIR}/amg_gan_samplesV3.jsonl", "w", encoding="utf-8") as f:
    for row in gan_amg_outputs:
        # ensure_ascii=False keeps Arabic text readable in the file
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("✓ Saved AMG GAN synthetic samples.")


In [None]:
import json

def show_all_gan_samples(path):
    """Print every record of a GAN-samples JSONL file, one block per record.

    Each line of ``path`` must decode to an object with ``id``, ``label``,
    ``original`` and ``gan`` keys. Nothing is returned.
    """
    rule = "=" * 100
    print("📌 Displaying ALL GAN-generated samples:\n")

    with open(path, "r", encoding="utf-8") as handle:
        # map() is lazy, so each line is decoded right before it is printed
        for record in map(json.loads, handle):
            print(rule)
            print(f"🆔 ID: {record['id']}   |   🏷️ Label: {record['label']}")
            print("\n🔹 Original:")
            print(record["original"])

            print("\n🔹 GAN Generated:")
            print(record["gan"])
            print("\n")

    print(rule)
    print("✓ Finished displaying all samples.")


In [None]:
# Print every saved sample; output grows with the size of the file.
show_all_gan_samples(
    "/content/drive/MyDrive/cs-senti/gan_synthetic/amg_gan_samplesV3.jsonl"
)


# **Extrinsic evaluation**

In [None]:
!pip install transformers datasets evaluate accelerate -q


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import json
from pathlib import Path
import numpy as np
import evaluate
from sklearn.metrics import classification_report


In [None]:
def load_jsonl(path):
    """Return the records of the UTF-8 JSON Lines file at ``path``, in file order."""
    with open(path, "r", encoding="utf-8") as handle:
        records = list(map(json.loads, handle))
    return records


In [None]:
# Cleaned datasets; this rebinds BASE, which earlier cells pointed at the
# annotated_with_id directory.
BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

eesa_train = load_jsonl(BASE/"eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(BASE/"eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(BASE/"eesa_test_clean.jsonl")

mr_cs      = load_jsonl(BASE/"mr_cs_clean.jsonl")
amg_cs     = load_jsonl(BASE/"amg_cs_clean.jsonl")

# Split sizes
print(len(eesa_train), len(eesa_dev), len(eesa_test))
print(len(mr_cs), len(amg_cs))


In [None]:
# Training pool = EESA train + all of MR-CS + all of AMG-CS; dev/test are EESA only.
# NOTE(review): mr_cs and amg_cs are part of train_data and are also scored
# later in this section as the "MR-CS" / "AMG-CS" evaluation sets, so those
# scores are in-sample rather than held-out.
train_data = eesa_train + mr_cs + amg_cs
dev_data   = eesa_dev
test_data  = eesa_test

print("Final train size:", len(train_data))
print("Final dev size:", len(dev_data))
print("Final test size:", len(test_data))


In [None]:
# Class names in id order; both lookup tables are derived from this tuple.
_LABEL_NAMES = ("neg", "neu", "pos")
label2id = {name: idx for idx, name in enumerate(_LABEL_NAMES)}
id2label = dict(enumerate(_LABEL_NAMES))

def encode_label(example):
    """Replace the string label of ``example`` with its integer id, in place.

    Returns the same (mutated) example. Not idempotent: a second call on the
    same record raises KeyError because the label is already an int.
    """
    name = example["label"]
    example["label"] = label2id[name]
    return example


In [None]:
# Convert each split to a Hugging Face Dataset and turn string labels into ids.
train_dataset = Dataset.from_list(train_data).map(encode_label)
dev_dataset   = Dataset.from_list(dev_data).map(encode_label)
test_dataset  = Dataset.from_list(test_data).map(encode_label)

# Bundle the three splits under the keys used by the tokenisation and Trainer cells.
dataset = DatasetDict({
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset
})


In [None]:
# Checkpoint shared by the tokenizer here and the classifier loaded below.
MODEL = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)


In [None]:
def tokenize_function(batch):
    """Tokenize ``batch["text"]`` with the module-level tokenizer.

    Every sequence is truncated or padded to exactly 128 tokens.
    """
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **encode_kwargs)

# Tokenize all splits, drop the raw columns, rename "label" to the "labels"
# column name used by the model, and return torch tensors.
tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.remove_columns(["text", "id"])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")


In [None]:
# 3-way sentiment classifier on top of MODEL; the label maps are stored in the
# model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)


In [None]:
# F1 scorer from the `evaluate` library, loaded once and reused by compute_metrics.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return ``{"macro_f1": ...}`` for a ``(logits, labels)`` evaluation pair.

    Predictions are the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted, references=labels, average="macro")
    return {"macro_f1": scores["f1"]}


In [None]:
# Training configuration for the XLM-R baseline: 4 epochs, lr 2e-5, batch size 8,
# with evaluation, checkpointing and logging once per epoch.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs-senti/baseline_xlmr_v0",
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=50,  # NOTE(review): presumably ignored, since logging_strategy="epoch" below — confirm
    save_total_limit=2,
    # NOTE(review): no metric_for_best_model is given, so "best" presumably
    # falls back to the Trainer default (eval loss) rather than macro_f1 — confirm.
    load_best_model_at_end=True,

    # 🔥 Disable ALL logging integrations
    report_to=[],         # <-- THIS TURNS OFF wandb COMPLETELY
    logging_strategy="epoch",
    save_strategy="epoch",
    eval_strategy="epoch",
)


In [None]:
# Train on the combined training split; evaluate on the EESA dev split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
import os
# Switch Weights & Biases off through the environment as well. These are set
# after TrainingArguments(report_to=[]) and the Trainer were already created.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_API_KEY"] = "dummy"   # placeholder value, not a real key
os.environ["WANDB_MODE"] = "offline"


In [None]:
# Fine-tune; checkpoints are written under args.output_dir.
trainer.train()


In [None]:
# Metrics on the EESA test split (includes macro_f1 from compute_metrics).
results = trainer.evaluate(tokenized["test"])
results


In [None]:
# Per-class report on the EESA test split.
preds = trainer.predict(tokenized["test"]).predictions
pred_labels = np.argmax(preds, axis=1)

# Gold ids are read back row by row from the tokenized dataset.
true_labels = np.array([x["labels"] for x in tokenized["test"]])

print(classification_report(true_labels, pred_labels, target_names=["neg", "neu", "pos"]))


In [None]:
# Score the fine-tuned model on MR-CS and AMG-CS.
# NOTE(review): both sets were included in train_data above, so these numbers
# are in-sample, not held-out.
mr_dataset = Dataset.from_list(mr_cs).map(encode_label)
amg_dataset = Dataset.from_list(amg_cs).map(encode_label)

# NOTE(review): unlike the main pipeline, "label" is not renamed to "labels"
# here; presumably the Trainer's data collator handles that — confirm.
mr_tokenized = mr_dataset.map(tokenize_function, batched=True)
mr_tokenized = mr_tokenized.remove_columns(["text", "id"])
mr_tokenized.set_format("torch")

amg_tokenized = amg_dataset.map(tokenize_function, batched=True)
amg_tokenized = amg_tokenized.remove_columns(["text", "id"])
amg_tokenized.set_format("torch")

print("MR-CS:", trainer.evaluate(mr_tokenized))
print("AMG-CS:", trainer.evaluate(amg_tokenized))


# **Final transformer-based sentiment classifiers**

In [None]:
!pip install --upgrade transformers datasets evaluate accelerate -q

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import json
import numpy as np
import evaluate
from pathlib import Path
from sklearn.metrics import classification_report
import os

# Disable Weights & Biases before any Trainer is created in this section.
os.environ["WANDB_DISABLED"] = "true"



# ======================================================================
# 1. Load Datasets
# ======================================================================
def load_jsonl(path):
    """Return the records of the UTF-8 JSON Lines file at ``path``, in file order.

    The file is opened in a ``with`` block so the handle is closed
    deterministically; the previous one-liner left it open until garbage
    collection.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(l) for l in f]

# Cleaned datasets on Drive.
BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

eesa_train = load_jsonl(BASE/"eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(BASE/"eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(BASE/"eesa_test_clean.jsonl")

mr_cs  = load_jsonl(BASE/"mr_cs_clean.jsonl")
amg_cs = load_jsonl(BASE/"amg_cs_clean.jsonl")

# Training pool = EESA train + all of MR-CS + all of AMG-CS; dev/test are EESA only.
# NOTE(review): mr_cs and amg_cs are scored again after training in the cells
# below, so those "MR-CS" / "AMG-CS" scores are in-sample.
train_data = eesa_train + mr_cs + amg_cs
dev_data   = eesa_dev
test_data  = eesa_test

# Sentiment class <-> integer id tables shared by every model in this section.
label2id = dict(zip(("neg", "neu", "pos"), range(3)))
id2label = {idx: name for name, idx in label2id.items()}

def encode_label(example):
    """Swap the string label of ``example`` for its integer id (in place) and return it.

    Raises KeyError for a label outside neg/neu/pos, which includes a record
    that has already been encoded.
    """
    encoded = label2id[example["label"]]
    example["label"] = encoded
    return example

# Label-encoded splits; train_any_model reads this module-level object.
dataset = DatasetDict({
    "train": Dataset.from_list(train_data).map(encode_label),
    "dev":   Dataset.from_list(dev_data).map(encode_label),
    "test":  Dataset.from_list(test_data).map(encode_label),
})



# ======================================================================
# 2. Main training function
# ======================================================================
def train_any_model(model_name, output_dir):
    """Fine-tune ``model_name`` as a 3-class sentiment classifier and report test results.

    Uses the module-level ``dataset`` (splits "train", "dev", "test") together
    with ``label2id`` / ``id2label``. Trains for 4 epochs with per-epoch
    evaluation on "dev", prints the "test" metrics and a per-class report, and
    returns the ``Trainer``.

    Parameters
    ----------
    model_name : str
        Hugging Face checkpoint id or path, used for both tokenizer and model.
    output_dir : str
        Directory handed to ``TrainingArguments`` for checkpoints.
    """

    print(f"\n============== Training {model_name} ==============\n")

    # ---------------- Tokenizer ----------------
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def tokenize(batch):
        # Fixed-length encoding: truncate or pad every text to 128 tokens
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=128
        )

    tokenized = dataset.map(tokenize, batched=True)

    # drop the raw columns when they are present
    for col in ["text", "id"]:
        if col in tokenized["train"].column_names:
            tokenized = tokenized.remove_columns(col)

    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch")

    # ---------------- Model ----------------
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    )

    # ---------------- Metrics ----------------
    f1_metric = evaluate.load("f1")

    def compute_metrics(pred):
        # Macro-F1 of the argmax predictions
        logits, labels = pred
        preds = np.argmax(logits, axis=-1)
        f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")
        return {"macro_f1": f1["f1"]}

    # ---------------- Arguments ----------------
    # NOTE(review): load_best_model_at_end is set without metric_for_best_model,
    # so the "best" checkpoint is presumably chosen by the Trainer default
    # (eval loss) rather than by macro_f1 — confirm this is intended.
    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        save_total_limit=2,
        load_best_model_at_end=True,
        # Corrected argument names
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        report_to=[]
    )

    # ---------------- Trainer ----------------
    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["dev"],
        compute_metrics=compute_metrics
    )

    trainer.train()

    # final test set eval
    print(trainer.evaluate(tokenized["test"]))

    # Second pass over the test split to get raw logits for the per-class report
    preds = trainer.predict(tokenized["test"]).predictions
    pred_labels = np.argmax(preds, axis=1)
    true_labels = np.array([x["labels"] for x in tokenized["test"]])

    print(classification_report(
        true_labels, pred_labels,
        target_names=["neg", "neu", "pos"]
    ))

    return trainer

In [None]:
# XLM-R baseline
trainer_xlmr = train_any_model(
    "xlm-roberta-base",
    "/content/drive/MyDrive/cs-senti/baseline_xlmr_fixed"
)


In [None]:
# Score the XLM-R model on MR-CS and AMG-CS.
# NOTE(review): both sets are part of train_data, so these scores are in-sample.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base", use_fast=True)
def tokenize(batch):
    # Same fixed-length (128) encoding used inside train_any_model
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

mr_dataset = Dataset.from_list(mr_cs).map(encode_label)
amg_dataset = Dataset.from_list(amg_cs).map(encode_label)

mr_tokenized = mr_dataset.map(tokenize, batched=True)
amg_tokenized = amg_dataset.map(tokenize, batched=True)

# Remove unused columns if they exist
for col in ["text", "id"]:
    if col in mr_tokenized.column_names:
        mr_tokenized = mr_tokenized.remove_columns(col)
    if col in amg_tokenized.column_names:
        amg_tokenized = amg_tokenized.remove_columns(col)

mr_tokenized.set_format("torch")
amg_tokenized.set_format("torch")
print("MR-CS:", trainer_xlmr.evaluate(mr_tokenized))
print("AMG-CS:", trainer_xlmr.evaluate(amg_tokenized))


In [None]:
# AraBERT v2 baseline
trainer_arabert = train_any_model(
    "aubmindlab/bert-base-arabertv2",
    "/content/drive/MyDrive/cs-senti/baseline_arabert_v1"
)


In [None]:
# Score the AraBERT model on MR-CS and AMG-CS.
# This cell now mirrors the XLM-R and MARBERT evaluation cells: it rebuilds the
# label-encoded datasets itself instead of relying on mr_dataset / amg_dataset
# left over from another cell, and it drops the raw columns and sets the torch
# format explicitly.
# NOTE(review): both sets are part of train_data, so these scores are in-sample.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2", use_fast=True)

def tokenize(batch):
    # Same fixed-length (128) encoding used inside train_any_model
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

mr_dataset = Dataset.from_list(mr_cs).map(encode_label)
amg_dataset = Dataset.from_list(amg_cs).map(encode_label)

mr_tokenized = mr_dataset.map(tokenize, batched=True)
amg_tokenized = amg_dataset.map(tokenize, batched=True)

# Remove unused columns if they exist
for col in ["text", "id"]:
    if col in mr_tokenized.column_names:
        mr_tokenized = mr_tokenized.remove_columns(col)
    if col in amg_tokenized.column_names:
        amg_tokenized = amg_tokenized.remove_columns(col)

mr_tokenized.set_format("torch")
amg_tokenized.set_format("torch")
print("MR-CS:", trainer_arabert.evaluate(mr_tokenized))
print("AMG-CS:", trainer_arabert.evaluate(amg_tokenized))



In [None]:
# MARBERTv2 baseline; the GAN section loads a checkpoint from this same
# output directory as its sentiment reward model.
trainer_marbert = train_any_model(
    "UBC-NLP/MARBERTv2",
    "/content/drive/MyDrive/cs-senti/baseline_marbert_v1"
)


In [None]:
# Score the MARBERTv2 model on MR-CS and AMG-CS.
# NOTE(review): both sets are part of train_data, so these scores are in-sample.
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERTv2", use_fast=True)
def tokenize(batch):
    # Same fixed-length (128) encoding used inside train_any_model
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

mr_dataset = Dataset.from_list(mr_cs).map(encode_label)
amg_dataset = Dataset.from_list(amg_cs).map(encode_label)

mr_tokenized = mr_dataset.map(tokenize, batched=True)
amg_tokenized = amg_dataset.map(tokenize, batched=True)

# Remove unused columns if they exist
for col in ["text", "id"]:
    if col in mr_tokenized.column_names:
        mr_tokenized = mr_tokenized.remove_columns(col)
    if col in amg_tokenized.column_names:
        amg_tokenized = amg_tokenized.remove_columns(col)

mr_tokenized.set_format("torch")
amg_tokenized.set_format("torch")
print("MR-CS:", trainer_marbert.evaluate(mr_tokenized))
print("AMG-CS:", trainer_marbert.evaluate(amg_tokenized))


# **Classical Baselines (TF-IDF + SVM + Logistic Regression)**

In [None]:
import json

BASE = "/content/drive/MyDrive/cs-senti/repo/data/cleaned"

def load_jsonl(path):
    """Decode every line of the UTF-8 JSON Lines file at ``path``; returns a list in file order."""
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw_line) for raw_line in handle]

# Reload the cleaned splits (BASE is a plain string in this section).
eesa_train = load_jsonl(f"{BASE}/eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(f"{BASE}/eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(f"{BASE}/eesa_test_clean.jsonl")

mr_cs      = load_jsonl(f"{BASE}/mr_cs_clean.jsonl")
amg_cs     = load_jsonl(f"{BASE}/amg_cs_clean.jsonl")

# Split sizes
print(len(eesa_train), len(eesa_dev), len(eesa_test))
print(len(mr_cs), len(amg_cs))


In [None]:
# Training pool = EESA train + all of MR-CS + all of AMG-CS; dev/test are EESA only.
# NOTE(review): the LR / SVM "MR-CS Test" and "AMG-CS Test" reports further
# down score these same mr_cs / amg_cs records, so they are in-sample.
train_data = eesa_train + mr_cs + amg_cs
dev_data   = eesa_dev
test_data  = eesa_test

print("TRAIN:", len(train_data))
print("DEV:", len(dev_data))
print("TEST:", len(test_data))


In [None]:
def prepare_xy(data):
    """Split a list of records into parallel ``(texts, labels)`` lists.

    Reads the ``"text"`` and ``"label"`` keys of every record, preserving order.
    """
    texts = list(map(lambda rec: rec["text"], data))
    labels = list(map(lambda rec: rec["label"], data))
    return texts, labels

# Raw texts and string labels for each split
X_train, y_train = prepare_xy(train_data)
X_dev,   y_dev   = prepare_xy(dev_data)
X_test,  y_test  = prepare_xy(test_data)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the training labels only, then apply it to dev/test.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_dev_enc   = le.transform(y_dev)
y_test_enc  = le.transform(y_test)

# Show the class order learned by the encoder
le.classes_


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni- and bi-gram TF-IDF, capped at 30k features; terms seen in fewer
# than 2 documents are dropped.
tfidf = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1,2),
    min_df=2
)

# Vocabulary and IDF weights come from the training texts only.
X_train_vec = tfidf.fit_transform(X_train)
X_dev_vec   = tfidf.transform(X_dev)
X_test_vec  = tfidf.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with class-balanced weights
lr = LogisticRegression(
    max_iter=2000,
    C=3.0,
    class_weight="balanced",
)

lr.fit(X_train_vec, y_train_enc)


In [None]:
from sklearn.svm import LinearSVC

# Linear SVM baseline with class-balanced weights
svm = LinearSVC(
    C=1.0,
    class_weight="balanced"
)

svm.fit(X_train_vec, y_train_enc)


In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

def evaluate(model, X, y_true, title=""):
    """Print accuracy, macro-F1 and a per-class report for a fitted sklearn model.

    ``X`` is the feature matrix passed to ``model.predict`` and ``y_true`` the
    encoded gold labels; class names come from the module-level ``le``.

    NOTE(review): this function reuses the name of the ``evaluate`` package
    imported in the transformer sections. Once this cell has run,
    ``evaluate.load("f1")`` in those sections fails until the package is
    imported again.
    """
    y_pred = model.predict(X)
    print(f"\n===== {title} =====")

    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)

    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("Macro F1:", macro_f1)

    report = classification_report(y_true, y_pred, target_names=le.classes_)
    print(report)


In [None]:
# Encode and vectorise MR-CS / AMG-CS with the already-fitted encoder and TF-IDF.
X_mr, y_mr = prepare_xy(mr_cs)
y_mr_enc   = le.transform(y_mr)

X_amg, y_amg = prepare_xy(amg_cs)
y_amg_enc    = le.transform(y_amg)

X_mr_vec  = tfidf.transform(X_mr)
X_amg_vec = tfidf.transform(X_amg)


In [None]:
# Logistic regression: EESA test is held out; MR-CS / AMG-CS were part of the
# training pool, so those two reports are in-sample.
evaluate(lr, X_test_vec, y_test_enc, "LR — EESA Test")
evaluate(lr, X_mr_vec,  y_mr_enc,   "LR — MR-CS Test")
evaluate(lr, X_amg_vec, y_amg_enc,  "LR — AMG-CS Test")


In [None]:
# Linear SVM: EESA test is held out; MR-CS / AMG-CS were part of the training
# pool, so those two reports are in-sample.
evaluate(svm, X_test_vec, y_test_enc, "SVM — EESA Test")
evaluate(svm, X_mr_vec,  y_mr_enc,   "SVM — MR-CS Test")
evaluate(svm, X_amg_vec, y_amg_enc,  "SVM — AMG-CS Test")


## **NEURAL BASELINES**

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz
!gunzip cc.ar.300.bin.gz


In [None]:
!pip install fasttext
import fasttext
ft = fasttext.load_model("cc.ar.300.bin")

In [None]:
import json

BASE = "/content/drive/MyDrive/cs-senti/repo/data/cleaned"

def load_jsonl(path):
    """Return the records of the UTF-8 JSON Lines file at ``path``, in file order.

    The file is read inside a ``with`` block so the handle is always closed;
    the previous one-liner never closed it explicitly.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(l) for l in f]

# Reload the cleaned splits for the neural baselines.
eesa_train = load_jsonl(f"{BASE}/eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(f"{BASE}/eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(f"{BASE}/eesa_test_clean.jsonl")
mr_cs      = load_jsonl(f"{BASE}/mr_cs_clean.jsonl")
amg_cs     = load_jsonl(f"{BASE}/amg_cs_clean.jsonl")

# Training pool = EESA train + all of MR-CS + all of AMG-CS; dev/test are EESA only.
# NOTE(review): mr_cs / amg_cs are scored again after training (BiLSTM cells
# below), so those scores are in-sample.
train_data = eesa_train + mr_cs + amg_cs
dev_data   = eesa_dev
test_data  = eesa_test


In [None]:
def fix_labels(ds):
    """Unwrap list-valued labels (e.g. ``["neg"]`` -> ``"neg"``) in every record.

    Records are modified in place; the return value is a new list holding the
    same record objects in the same order.
    """
    def _unwrap(record):
        label = record["label"]
        # if label is list like ["neg"], keep its first element
        record["label"] = label[0] if isinstance(label, list) else label
        return record

    return [_unwrap(record) for record in ds]

# NOTE(review): none of the final_* names is defined in the cells visible in
# this section (the cell above builds train_data / dev_data / test_data), and
# the tokenizer / to_seq cells below read train_data, not final_train.
# Presumably the final_* lists come from an earlier part of the notebook —
# confirm, otherwise this cell raises NameError on a fresh kernel.
final_train = fix_labels(final_train)
final_dev   = fix_labels(final_dev)
final_test_eesa = fix_labels(final_test_eesa)
final_test_amg  = fix_labels(final_test_amg)
final_test_mr   = fix_labels(final_test_mr)


In [None]:
# Sanity check: print the first training record whose label is not one of the
# three expected classes, then stop. Prints nothing when all labels are valid.
valid = {"neg","neu","pos"}

for row in final_train:
    if row["label"] not in valid:
        print("Unexpected:", row)
        break


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a Keras word-level tokenizer on the training texts.
# This rebinds `tokenizer`, which earlier sections used for Hugging Face tokenizers.
texts = [d["text"] for d in train_data]
tokenizer = Tokenizer(num_words=50000, lower=True)
tokenizer.fit_on_texts(texts)

# Fixed sequence length used by to_seq for padding / truncation
max_len = 50

In [None]:
def to_seq(data):
    """Turn records into ``(X, y)`` for the Keras models.

    ``X`` holds the integer sequences of each record's ``"text"`` produced by
    the module-level ``tokenizer``, post-padded to ``max_len``; ``y`` is the
    list of raw ``"label"`` values in the same order.
    """
    sequences = tokenizer.texts_to_sequences([rec["text"] for rec in data])
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")
    labels = [rec["label"] for rec in data]
    return padded, labels

# Padded id sequences and raw labels for each split; this overwrites the
# X_* / y_* names used by the classical baselines.
X_train, y_train = to_seq(train_data)
X_dev, y_dev = to_seq(dev_data)
X_test, y_test = to_seq(test_data)


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode string labels as integers; y_* are overwritten in place, so this cell
# should be run once after the to_seq cell.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_dev   = le.transform(y_dev)
y_test  = le.transform(y_test)


In [None]:
import numpy as np

# Embedding table size: tokenizer vocabulary (+1 for the padding id 0), capped at 50k
vocab_size = min(50000, len(tokenizer.word_index) + 1)
embedding_dim = 300

# Row i holds the fastText vector of the word with tokenizer index i; row 0 stays all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    # Skip words whose index falls outside the capped table
    if i >= vocab_size:
        continue
    embedding_matrix[i] = ft.get_word_vector(word)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# BiLSTM baseline: frozen fastText embeddings -> BiLSTM(128) -> dropout -> 3-way softmax
model_bilstm = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.3),
    Dense(3, activation="softmax")
])

# Sparse loss because labels are integer ids, not one-hot vectors
model_bilstm.compile(loss="sparse_categorical_crossentropy",
                     optimizer="adam",
                     metrics=["accuracy"])
model_bilstm.summary()


In [None]:
# Train for 8 epochs, validating on the EESA dev split
history_bilstm = model_bilstm.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    batch_size=64,
    epochs=8
)


In [None]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D

# CNN-LSTM baseline: frozen fastText embeddings -> Conv1D(128, width 5)
# -> max-pool(2) -> LSTM(128) -> dropout -> 3-way softmax
model_cnn_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(2),
    LSTM(128),
    Dropout(0.3),
    Dense(3, activation="softmax")
])

model_cnn_lstm.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
model_cnn_lstm.summary()


In [None]:
# Train for 8 epochs, validating on the EESA dev split
history_cnn_lstm = model_cnn_lstm.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    batch_size=64,
    epochs=8
)


In [None]:
from tensorflow.keras.layers import Layer

class Attention(Layer):
    """Parameter-free dot-product self-attention followed by mean pooling.

    In ``model_attn`` the input is the per-timestep output of the BiLSTM
    (``return_sequences=True``). Each timestep attends to all timesteps via a
    softmax over unscaled dot products; the attended sequence is then averaged
    over axis 1. The layer has no trainable weights and applies no padding mask.
    """

    def call(self, inputs):
        similarity = tf.matmul(inputs, inputs, transpose_b=True)
        weights = tf.nn.softmax(similarity, axis=-1)
        attended = tf.matmul(weights, inputs)
        return tf.reduce_mean(attended, axis=1)

# BiLSTM + attention baseline: the BiLSTM returns the full sequence so the
# Attention layer can pool over timesteps.
model_attn = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Attention(),
    Dropout(0.3),
    Dense(3, activation="softmax")
])

model_attn.compile(loss="sparse_categorical_crossentropy",
                   optimizer="adam",
                   metrics=["accuracy"])
model_attn.summary()


In [None]:
# Train for 8 epochs, validating on the EESA dev split
history_attn = model_attn.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    batch_size=64,
    epochs=8
)


In [None]:
from sklearn.metrics import classification_report, f1_score

def eval_model(model, X, y, title):
    """Print macro-F1 and a per-class report for a Keras classifier.

    Predictions are the argmax over axis 1 of ``model.predict(X)``; ``y`` holds
    the encoded gold labels and class names come from the module-level ``le``.
    """
    probabilities = model.predict(X)
    predicted = np.argmax(probabilities, axis=1)
    print(f"\n===== {title} =====")

    macro_f1 = f1_score(y, predicted, average="macro")
    print("Macro F1:", macro_f1)

    report = classification_report(y, predicted, target_names=le.classes_)
    print(report)


In [None]:
# Held-out EESA test results for the three neural baselines
eval_model(model_bilstm,    X_test, y_test, "BiLSTM — EESA Test")
eval_model(model_cnn_lstm, X_test, y_test, "CNN-LSTM — EESA Test")
eval_model(model_attn,     X_test, y_test, "BiLSTM-Attn — EESA Test")


In [None]:
# BiLSTM on MR-CS (only this model is scored on MR-CS here).
# NOTE(review): mr_cs is part of train_data, so this score is in-sample.
X_mr, y_mr = to_seq(mr_cs)
y_mr = le.transform(y_mr)

eval_model(model_bilstm, X_mr, y_mr, "BiLSTM — MR-CS Test")


In [None]:
# BiLSTM on AMG-CS (only this model is scored on AMG-CS here).
# NOTE(review): amg_cs is part of train_data, so this score is in-sample.
X_amg, y_amg = to_seq(amg_cs)
y_amg = le.transform(y_amg)

eval_model(model_bilstm, X_amg, y_amg, "BiLSTM — AMG-CS Test")


In [None]:
import pandas as pd

# Summary table of pre-augmentation macro-F1 scores.
# The numbers are typed in by hand rather than read from the runs above, so
# they must be updated manually after any re-run. None marks model/dataset
# pairs that are not evaluated in this notebook (only BiLSTM is scored on
# MR-CS / AMG-CS among the neural baselines).
data = {
    "Model": [
        "Logistic Regression", "SVM",
        "BiLSTM", "CNN-LSTM", "BiLSTM-Attn",
        "XLM-R", "AraBERTv2", "MARBERTv2"
    ],
    "EESA_F1": [
        0.776, 0.774,
        0.701, 0.670, 0.725,
        0.816, 0.818, 0.883
    ],
    "MRCS_F1": [
        0.957, 0.980,
        0.671, None, None,
        0.810, 0.808, 0.950
    ],
    "AMGCS_F1": [
        0.949, 0.980,
        0.627, None, None,
        0.748, 0.769, 0.933
    ]
}

df = pd.DataFrame(data)
df


In [None]:
from tabulate import tabulate

print(tabulate(df, headers='keys', tablefmt='github'))


In [None]:
df.to_csv("pre_augmentation_results.csv", index=False)
print("Saved as pre_augmentation_results.csv")


# **Post-augmentation (deterministic)**

In [None]:
DET_PATH = "/content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl"

# Load the deterministic augmentation rows from DET_PATH.
# Lines that are not valid JSON are skipped, but they are counted and the
# first few are reported, so a damaged file does not pass unnoticed.
det_rows = []
det_skipped = 0
with open(DET_PATH, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue  # blank line: nothing to parse
        try:
            det_rows.append(json.loads(line))
        except json.JSONDecodeError as err:
            det_skipped += 1
            if det_skipped <= 5:
                print(f"Skipping malformed line {line_no}: {err}")

print("Loaded deterministic:", len(det_rows))
if det_skipped:
    print("Skipped malformed lines:", det_skipped)


In [None]:
import json

DET_PATH = "/content/drive/MyDrive/cs-senti/data/ling/host_train_switched_lex_llm.jsonl"

print("=== RAW FILE INSPECTION ===")
with open(DET_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(f"\n--- LINE {i+1} ---")
        print(line)
        if i >= 4:  # show only first 5 lines
            break


In [None]:
def show_aug_samples(rows, n=10):
    """Print up to `n` randomly sampled (original, augmented, label) triples.

    Missing fields are shown as empty strings. The input list is not modified.
    """
    print("\n===== SAMPLE AUGMENTED SENTENCES =====\n")
    sample_size = n if n < len(rows) else len(rows)
    separator = "-" * 60

    for row in random.sample(rows, sample_size):
        original = row.get("orig_text", "")
        augmented = row.get("switched_text", "")
        label = row.get("label", "")

        print("ORIG :", original)
        print("AUG  :", augmented)
        print("LABEL:", label)
        print(separator)


show_aug_samples(det_rows)


In [None]:
import torch
import torch.nn.functional as F

# -----------------------------
# Semantic similarity
# -----------------------------
def semantic_similarity(a, b, model):
    """Cosine similarity between the embeddings of texts `a` and `b`.

    `model` must expose `encode(text, convert_to_tensor=True)` returning a
    1-D tensor; the similarity is returned as a plain Python float.
    """
    emb_a, emb_b = (model.encode(text, convert_to_tensor=True) for text in (a, b))
    score = F.cosine_similarity(emb_a, emb_b, dim=0)
    return float(score.item())

# -----------------------------
# Switch rate based on JSON
# -----------------------------
def switch_rate(row):
    """Ratio of recorded switches to whitespace tokens in the original text.

    Reads `row["switches"]` (defaulting to an empty list) and
    `row["orig_text"]`. The token count is floored at 1 so an empty
    original text never causes a division by zero.
    """
    n_switched = len(row.get("switches", []))
    n_tokens = len(row["orig_text"].split())
    denominator = n_tokens if n_tokens > 1 else 1
    return n_switched / denominator

# -----------------------------
# Sentiment consistency check
# (using your MARBERT reward model)
# -----------------------------
idx_map = {"neg":0, "neu":1, "pos":2}

def sentiment_match(text, gold_label, clf, tok):
    """Return True when `clf` predicts `gold_label` for `text`.

    Args:
        text: Input sentence.
        gold_label: One of the keys of `idx_map` ("neg", "neu", "pos").
        clf: Sequence classifier whose output exposes `.logits`.
        tok: Tokenizer matching `clf`.

    Raises:
        KeyError: if `gold_label` is not a key of `idx_map`.
    """
    # Put the inputs on the classifier's own device instead of assuming CUDA,
    # so the check also works on CPU-only runtimes.
    device = getattr(clf, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    inp = tok(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():  # inference only: no autograd graph needed
        pred = clf(**inp).logits.argmax(-1).item()
    return pred == idx_map[gold_label]


In [None]:
def sentiment_flip(orig_text, aug_text, gold_label, clf, tok, margin=0.15):
    """
    Decide whether augmentation flipped the sentiment of a sample.

    Returns True ONLY if:
    - original prediction matches gold label
    - augmented prediction is a different sentiment class
    - AND the gold-class probability dropped by at least `margin`

    Args:
        orig_text: Sentence before augmentation.
        aug_text: Sentence after augmentation.
        gold_label: Key of the module-level `idx_map` ("neg", "neu", "pos").
        clf: Sequence classifier whose output exposes `.logits`.
        tok: Tokenizer matching `clf`.
        margin: Minimum drop in gold-class probability to count as a flip.
    """
    # Use the classifier's own device rather than a hard-coded "cuda".
    device = getattr(clf, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    def _class_probs(text):
        enc = tok(text, return_tensors="pt", truncation=True).to(device)
        return clf(**enc).logits.softmax(-1)

    # Score both sentences with the same model, without building a graph.
    with torch.no_grad():
        o_soft = _class_probs(orig_text)
        a_soft = _class_probs(aug_text)

    orig_pred = o_soft.argmax(-1).item()
    aug_pred  = a_soft.argmax(-1).item()
    gold_idx  = idx_map[gold_label]

    # If model already misclassified original → ignore it
    if orig_pred != gold_idx:
        return False  # not a flip; model is wrong on original

    # If class didn't change → not a flip
    if orig_pred == aug_pred:
        return False

    # Apply confidence margin to avoid false flips
    drop = (o_soft[0, gold_idx] - a_soft[0, gold_idx]).item()
    if drop < margin:
        return False  # too weak to be a real flip

    return True  # true sentiment flip


In [None]:
def filter_augmented(
    rows,
    sim_model,
    clf,
    clf_tok,
    min_sim=0.75,
    max_switch=0.40,   # your new threshold
    margin=0.15
):
    """Split augmented rows into kept and rejected according to three checks.

    Checks are applied in order and the first failure decides the reason:
      1. semantic similarity between original and augmented text below `min_sim`
      2. switch rate above `max_switch`
      3. sentiment flip detected by `sentiment_flip` with the given `margin`

    Args:
        rows: Dicts with "orig_text", "switched_text" and "label".
        sim_model: Encoder passed to `semantic_similarity`.
        clf, clf_tok: Classifier and tokenizer passed to `sentiment_flip`.

    Returns:
        (kept, rejected). `kept` holds the input row objects unchanged.
        `rejected` holds shallow copies with an added "reason" key, so the
        caller's rows are not mutated and re-running the filter never sees
        a stale reason.
    """
    kept = []
    rejected = []

    def _reject(row, reason):
        rejected.append({**row, "reason": reason})

    for r in rows:
        orig = r["orig_text"]
        aug  = r["switched_text"]
        lab  = r["label"]

        # 1. semantic similarity
        sim = semantic_similarity(orig, aug, sim_model)
        if sim < min_sim:
            _reject(r, f"low similarity ({sim:.2f})")
            continue

        # 2. switch rate
        sr = switch_rate(r)
        if sr > max_switch:
            _reject(r, f"over-switching ({sr:.2f})")
            continue

        # 3. sentiment flip
        if sentiment_flip(orig, aug, lab, clf, clf_tok, margin):
            _reject(r, "sentiment flip")
            continue

        kept.append(r)

    return kept, rejected


In [None]:
kept, rejected = filter_augmented(
    det_rows,
    sim_model=similarity_model,
    clf=reward_model,
    clf_tok=reward_tokenizer,
    min_sim=0.75,
    max_switch=0.40
)

# Guard the ratio: an empty input file would otherwise raise ZeroDivisionError.
retention_rate = len(kept) / len(det_rows) if det_rows else 0.0

print("========== FILTERING SUMMARY ==========")
print(f"Total samples        : {len(det_rows)}")
print(f"Kept after filtering : {len(kept)}")
print(f"Rejected             : {len(rejected)}")
print(f"Retention rate       : {retention_rate:.2%}")
print("========================================\n")

# Show examples of kept data
print("===== KEPT SAMPLES =====")
for r in kept[:5]:
    print("\nORIG:", r["orig_text"])
    print("AUG :", r["switched_text"])
    print("LABEL:", r["label"])
    print("----")

# Show examples of rejected data, with the reason recorded by the filter
print("\n===== REJECTED SAMPLES =====")
for r in rejected[:5]:
    print("\nORIG:", r["orig_text"])
    print("AUG :", r["switched_text"])
    print("LABEL:", r["label"])
    print("REASON:", r.get("reason", ""))
    print("----")


In [None]:
from collections import Counter

# Count reasons among rejected rows
reason_counts = Counter([r["reason"] for r in rejected])

total_rej = sum(reason_counts.values())

print("===== REJECTION BREAKDOWN =====")
for reason, count in reason_counts.items():
    print(f"{reason:20} : {count:4d} ({count/total_rej*100:.2f}%)")

print("\nTotal rejected:", total_rej)


In [None]:
import json

save_path = "/content/drive/MyDrive/cs-senti/repo/data/filtered_aug.jsonl"

with open(save_path, "w", encoding="utf-8") as f:
    for r in kept:
        # Save only the transformed sentence + label
        f.write(json.dumps({
            "orig_text": r["orig_text"],
            "switched_text": r["switched_text"],
            "label": r["label"],
            "domain": r.get("domain", "")
        }, ensure_ascii=False) + "\n")

print("Saved filtered augmented dataset:", len(kept))


In [None]:
reject_path = "/content/drive/MyDrive/cs-senti/repo/data/rejected_aug.jsonl"

with open(reject_path, "w", encoding="utf-8") as f:
    for r in rejected:
        f.write(json.dumps({
            "orig_text": r["orig_text"],
            "switched_text": r["switched_text"],
            "label": r["label"],
            "reason": r["reason"],
            "domain": r.get("domain", "")
        }, ensure_ascii=False) + "\n")

print("Saved rejected samples:", len(rejected))


In [None]:
import json

AUG_PATH = "/content/drive/MyDrive/cs-senti/repo/data/filtered_aug.jsonl"

aug_rows = []
with open(AUG_PATH, "r", encoding="utf-8") as f:
    for line in f:
        aug_rows.append(json.loads(line))

print("Loaded augmented samples:", len(aug_rows))


In [None]:
aug_training_format = []

for r in aug_rows:
    aug_training_format.append({
        "text": r["switched_text"],
        "label": r["label"],
        "domain": "augmented"
    })

print("Formatted augmented samples:", len(aug_training_format))


In [None]:
from datasets import Dataset, DatasetDict
import json
from pathlib import Path

# load_jsonl again (if missing)
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    The file is opened in a `with` block so the handle is always closed,
    and blank lines (for example a trailing newline) are skipped instead
    of raising a JSON decode error.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

eesa_train = load_jsonl(BASE/"eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(BASE/"eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(BASE/"eesa_test_clean.jsonl")

mr_cs  = load_jsonl(BASE/"mr_cs_clean.jsonl")
amg_cs = load_jsonl(BASE/"amg_cs_clean.jsonl")

train_data = eesa_train + mr_cs + amg_cs
dev_data   = eesa_dev
test_data  = eesa_test


In [None]:
post_aug_train = eesa_train + mr_cs + amg_cs + aug_training_format
print("Original train:", len(eesa_train + mr_cs + amg_cs))
print("Post-aug train:", len(post_aug_train))


In [None]:
SAVE_PATH = "/content/drive/MyDrive/cs-senti/repo/data/post_aug_train.jsonl"

with open(SAVE_PATH, "w", encoding="utf-8") as f:
    for row in post_aug_train:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved post_aug_train to:", SAVE_PATH)
print("Total samples saved:", len(post_aug_train))


In [None]:
import json
from collections import Counter

PATH = "/content/drive/MyDrive/cs-senti/repo/data/post_aug_train.jsonl"

rows = []
with open(PATH, "r", encoding="utf-8") as f:
    for line in f:
        rows.append(json.loads(line))

print("✅ Loaded samples:", len(rows))
print("\n🔍 Example row:")
print(rows[0])

# Show label distribution if labels exist
if "label" in rows[0]:
    labels = [r["label"] for r in rows]
    print("\n📊 Label distribution:")
    print(Counter(labels))
else:
    print("\n⚠ No 'label' field found in the rows!")


In [None]:
import random
import numpy as np
import torch

def set_global_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode and turns off its benchmark
    autotuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Reproducible cuDNN behaviour (LSTMs, CNNs)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_global_seed(42)
print("Global SEED set to 42 ✓")


In [None]:
from transformers import set_seed
set_seed(42)


In [None]:
import random
random.shuffle(post_aug_train)


In [None]:
# String label <-> integer id mappings shared by the cells below.
label2id = dict(neg=0, neu=1, pos=2)
id2label = {idx: name for name, idx in label2id.items()}

def encode_label(example):
    """Replace the string label of `example` with its integer id, in place, and return it."""
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example


In [None]:
dataset = DatasetDict({
    "train": Dataset.from_list(train_data).map(encode_label),
    "dev":   Dataset.from_list(dev_data).map(encode_label),
    "test":  Dataset.from_list(test_data).map(encode_label),
})


In [None]:
post_aug_dataset = DatasetDict({
    "train": Dataset.from_list(post_aug_train).map(encode_label),
    "dev":   dataset["dev"],
    "test":  dataset["test"],
})


In [None]:
from transformers import AutoTokenizer

model_name = "UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize(batch):
    """Tokenize `batch["text"]` with the module-level tokenizer, truncated and padded to 128 tokens."""
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **encode_options)

tokenized_post = post_aug_dataset.map(tokenize, batched=True)

# remove unused columns
for col in ["text", "id"]:
    if col in tokenized_post["train"].column_names:
        tokenized_post = tokenized_post.remove_columns(col)

tokenized_post = tokenized_post.rename_column("label", "labels")
tokenized_post.set_format("torch")

print("✓ Tokenization complete")


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import classification_report

f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    """Macro-F1 callback for the Trainer; `pred` unpacks to (logits, labels)."""
    logits, labels = pred
    predicted_ids = np.argmax(logits, axis=-1)
    scores = f1_metric.compute(
        predictions=predicted_ids,
        references=labels,
        average="macro",
    )
    return {"macro_f1": scores["f1"]}


model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs-senti/post_aug_marbert",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_total_limit=2,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=[]
)

trainer_post = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_post["train"],
    eval_dataset=tokenized_post["dev"],
    compute_metrics=compute_metrics
)

trainer_post.train()


In [None]:
print("\n=== POST-AUG MARBERT TEST RESULTS ===")
results = trainer_post.evaluate(tokenized_post["test"])
print(results)

preds = trainer_post.predict(tokenized_post["test"]).predictions
pred_labels = np.argmax(preds, axis=1)
true_labels = tokenized_post["test"]["labels"]

print(classification_report(
    true_labels,
    pred_labels,
    target_names=["neg", "neu", "pos"]
))


In [None]:
# Build tokenized MR-CS and AMG-CS evaluation sets for the post-aug model.
mr_cs_data = load_jsonl(BASE/"mr_cs_clean.jsonl")
amg_cs_data = load_jsonl(BASE/"amg_cs_clean.jsonl")
mr_dataset = Dataset.from_list(mr_cs_data).map(encode_label)
amg_dataset = Dataset.from_list(amg_cs_data).map(encode_label)
mr_tok = mr_dataset.map(tokenize, batched=True)
amg_tok = amg_dataset.map(tokenize, batched=True)

# `remove_columns` returns a new dataset, so the result has to be bound back
# to the real variable; rebinding a loop variable leaves both datasets unchanged.
if "text" in mr_tok.column_names:
    mr_tok = mr_tok.remove_columns("text")
if "text" in amg_tok.column_names:
    amg_tok = amg_tok.remove_columns("text")

mr_tok = mr_tok.rename_column("label", "labels")
amg_tok = amg_tok.rename_column("label", "labels")

mr_tok.set_format("torch")
amg_tok.set_format("torch")


In [None]:
print("\n=== POST-AUG → MR-CS GENERALIZATION ===")
mr_results = trainer_post.evaluate(mr_tok)
print(mr_results)

mr_preds = trainer_post.predict(mr_tok).predictions
mr_pred_labels = np.argmax(mr_preds, axis=1)
mr_true_labels = mr_tok["labels"]

print(classification_report(
    mr_true_labels,
    mr_pred_labels,
    target_names=["neg", "neu", "pos"]
))


print("\n=== POST-AUG → AMG-CS GENERALIZATION ===")
amg_results = trainer_post.evaluate(amg_tok)
print(amg_results)

amg_preds = trainer_post.predict(amg_tok).predictions
amg_pred_labels = np.argmax(amg_preds, axis=1)
amg_true_labels = amg_tok["labels"]

print(classification_report(
    amg_true_labels,
    amg_pred_labels,
    target_names=["neg", "neu", "pos"]
))


In [None]:
def train_transformer_post(model_name, output_dir):
    """Fine-tune `model_name` on the post-augmentation data and report EESA test metrics.

    Tokenizes the module-level `post_aug_dataset` (max length 128), trains a
    3-class sequence classifier for 4 epochs with evaluation on the dev split
    after every epoch, then prints the evaluation dict and a per-class report
    for the test split.

    Args:
        model_name: Hugging Face model id or path, used for tokenizer and model.
        output_dir: Directory where the Trainer writes its checkpoints.

    Returns:
        The fitted `Trainer`.
    """
    print(f"\n=== Training POST-AUG {model_name} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

    tokenized = post_aug_dataset.map(tokenize, batched=True)
    tokenized = tokenized.rename_column("label", "labels")

    # Drop the raw-text / id columns split by split, and only where they exist.
    # Removing "text" must not depend on an "id" column being present, and a
    # column missing from one split must not make the removal fail.
    for split in list(tokenized.keys()):
        unused = [c for c in ("text", "id") if c in tokenized[split].column_names]
        if unused:
            tokenized[split] = tokenized[split].remove_columns(unused)
    tokenized.set_format("torch")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    )

    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["dev"],
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Evaluate on EESA test
    eval_result = trainer.evaluate(tokenized["test"])
    print(f"\n=== POST-AUG → EESA TEST ({model_name}) ===")
    print(eval_result)

    # Full report
    preds = trainer.predict(tokenized["test"]).predictions.argmax(axis=1)
    print(classification_report(tokenized["test"]["labels"], preds, target_names=["neg","neu","pos"]))

    return trainer


In [None]:
trainer_xlmr = train_transformer_post("xlm-roberta-base", "/content/drive/MyDrive/cs-senti/xlmr_post_aug")
trainer_arabert = train_transformer_post("aubmindlab/bert-base-arabertv2", "/content/drive/MyDrive/cs-senti/arabert_post_aug")


In [None]:
label2id = {"neg": 0, "neu": 1, "pos": 2}

# POST-AUG TRAIN SET
train_texts  = [x["text"] for x in post_aug_train]
train_labels = [label2id[x["label"]] for x in post_aug_train]

# ORIGINAL TEST SET (EESA)
eesa_test_dataset = dataset["test"]

test_texts  = [x["text"] for x in eesa_test_dataset]
test_labels = [x["label"] for x in eesa_test_dataset]

print("Train label set:", set(train_labels))
print("Test label set:", set(test_labels))


Classical classifiers (TF-IDF + Logistic Regression / Linear SVM)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# --------------------------
# TRAIN SET (post-augmented)
# --------------------------
train_texts  = [x["text"] for x in post_aug_train]
train_labels = [label2id[x["label"]] for x in post_aug_train]

# --------------------------
# TEST SET (EESA test)
# --------------------------
eesa_test_dataset = dataset["test"]
test_texts  = [x["text"] for x in eesa_test_dataset]
test_labels = [x["label"] for x in eesa_test_dataset]   # already numeric

# --------------------------
# TF-IDF
# --------------------------
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
X_train = tfidf.fit_transform(train_texts)
X_test  = tfidf.transform(test_texts)

# --------------------------
# Logistic Regression
# --------------------------
logreg = LogisticRegression(max_iter=4000)
logreg.fit(X_train, train_labels)
logreg_preds = logreg.predict(X_test)

print("\n=== POST-AUG Logistic Regression (EESA Test) ===")
print(classification_report(test_labels, logreg_preds, target_names=["neg","neu","pos"]))

# --------------------------
# Linear SVM
# --------------------------
svm = LinearSVC()
svm.fit(X_train, train_labels)
svm_preds = svm.predict(X_test)

print("\n=== POST-AUG SVM (EESA Test) ===")
print(classification_report(test_labels, svm_preds, target_names=["neg","neu","pos"]))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score

# --------------------------
# Helpers
# --------------------------
def _label_id(label):
    """Return the integer class id for `label`; string names go through label2id."""
    return label2id[label] if isinstance(label, str) else int(label)

def _print_report(title, y_true, y_pred):
    """Print the per-class report and macro-F1 under `title`; return the macro-F1."""
    print(f"\n=== {title} ===")
    print(classification_report(y_true, y_pred, target_names=["neg","neu","pos"]))
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("Macro F1:", macro_f1)
    return macro_f1

# --------------------------
# TRAIN SET (post-augmented)
# --------------------------
# Every label list below goes through _label_id so train, test and
# generalization targets share one encoding (flat integer ids). Mixing string
# and integer labels between y_true and y_pred makes the sklearn metrics fail.
train_texts  = [x["text"] for x in post_aug_train]
train_labels = [_label_id(x["label"]) for x in post_aug_train]

# --------------------------
# TEST SET (EESA test)
# --------------------------
eesa_test_dataset = dataset["test"]
test_texts  = [x["text"] for x in eesa_test_dataset]
test_labels = [_label_id(x["label"]) for x in eesa_test_dataset]

# --------------------------
# TF-IDF
# --------------------------
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
X_train = tfidf.fit_transform(train_texts)
X_test  = tfidf.transform(test_texts)

# --------------------------
# Logistic Regression
# --------------------------
logreg = LogisticRegression(max_iter=4000)
logreg.fit(X_train, train_labels)
logreg_preds = logreg.predict(X_test)

eesa_f1_logreg = _print_report("POST-AUG Logistic Regression (EESA Test)", test_labels, logreg_preds)

# --------------------------
# Linear SVM
# --------------------------
svm = LinearSVC()
svm.fit(X_train, train_labels)
svm_preds = svm.predict(X_test)

eesa_f1_svm = _print_report("POST-AUG SVM (EESA Test)", test_labels, svm_preds)


# ====================================================
#        GENERALIZATION TESTS (MR-CS and AMG-CS)
# ====================================================
# NOTE(review): post_aug_train appears to be assembled from
# eesa_train + mr_cs + amg_cs + augmented rows earlier in the notebook, so the
# MR-CS / AMG-CS texts scored here were also seen during fitting. Confirm
# before reading these numbers as held-out generalization.

# --------------------------
# MR-CS
# --------------------------
mr_texts  = [x["text"] for x in mr_cs]
mr_labels = [_label_id(x["label"]) for x in mr_cs]   # flat list, one id per sample

X_mr = tfidf.transform(mr_texts)

mr_logreg_preds = logreg.predict(X_mr)
mr_svm_preds    = svm.predict(X_mr)

mr_f1_logreg = _print_report("POST-AUG Logistic Regression → MR-CS", mr_labels, mr_logreg_preds)
mr_f1_svm    = _print_report("POST-AUG SVM → MR-CS", mr_labels, mr_svm_preds)


# --------------------------
# AMG-CS
# --------------------------
amg_texts  = [x["text"] for x in amg_cs]
amg_labels = [_label_id(x["label"]) for x in amg_cs]   # flat list, one id per sample

X_amg = tfidf.transform(amg_texts)

amg_logreg_preds = logreg.predict(X_amg)
amg_svm_preds    = svm.predict(X_amg)

amg_f1_logreg = _print_report("POST-AUG Logistic Regression → AMG-CS", amg_labels, amg_logreg_preds)
amg_f1_svm    = _print_report("POST-AUG SVM → AMG-CS", amg_labels, amg_svm_preds)


# Neural classifier (BiLSTM + multi-head self-attention)

In [None]:
!pip install torch torchvision transformers -q


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import f1_score, classification_report


In [None]:
import json
from pathlib import Path
from datasets import Dataset, DatasetDict

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Uses a `with` block so the file handle is closed, and skips blank lines
    rather than failing on them.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]

# Load original datasets
eesa_train = load_jsonl(BASE/"eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(BASE/"eesa_dev_clean.jsonl")
eesa_test  = load_jsonl(BASE/"eesa_test_clean.jsonl")

mr_cs  = load_jsonl(BASE/"mr_cs_clean.jsonl")
amg_cs = load_jsonl(BASE/"amg_cs_clean.jsonl")

print(
    len(eesa_train), len(eesa_dev), len(eesa_test),
    len(mr_cs), len(amg_cs)
)


In [None]:
AUG_PATH = "/content/drive/MyDrive/cs-senti/repo/data/filtered_aug.jsonl"

aug_rows = [json.loads(l) for l in open(AUG_PATH, "r", encoding="utf-8")]
aug_training_format = [
    {"text": r["switched_text"], "label": r["label"], "domain": "augmented"}
    for r in aug_rows
]
print("Aug samples:", len(aug_training_format))


In [None]:
label2id = {"neg": 0, "neu": 1, "pos": 2}

def encode_label(row):
    """Convert `row["label"]` from a string name to its integer id, in place.

    The conversion is idempotent: a label that is no longer a string is left
    as it is. The rows are mutated in place by the cell that applies this
    function, so re-running that cell must not fail on already-encoded rows.

    Raises:
        KeyError: if a string label is not a key of `label2id`.
    """
    label = row["label"]
    if isinstance(label, str):
        row["label"] = label2id[label]
    return row


In [None]:
eesa_train = [encode_label(x) for x in eesa_train]
eesa_dev   = [encode_label(x) for x in eesa_dev]
eesa_test  = [encode_label(x) for x in eesa_test]

mr_cs = [encode_label(x) for x in mr_cs]
amg_cs = [encode_label(x) for x in amg_cs]
aug_training_format = [encode_label(x) for x in aug_training_format]


In [None]:
post_aug_train = eesa_train + mr_cs + amg_cs + aug_training_format


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
MAX_LEN = 128

def encode_batch(batch):
    """Tokenize `batch["text"]` with the module-level tokenizer, truncated and padded to MAX_LEN."""
    texts = batch["text"]
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(texts, **encode_options)


In [None]:
post_aug_hf = Dataset.from_list(post_aug_train)
eesa_test_hf = Dataset.from_list(eesa_test)
mr_hf = Dataset.from_list(mr_cs)
amg_hf = Dataset.from_list(amg_cs)


In [None]:
tokenized_train = post_aug_hf.map(
    encode_batch,
    batched=True,
    remove_columns=["text", "label"]
)

tokenized_test = eesa_test_hf.map(
    encode_batch,
    batched=True,
    remove_columns=["text", "label"]
)

tokenized_mr = mr_hf.map(
    encode_batch,
    batched=True,
    remove_columns=["text", "label"]
)

tokenized_amg = amg_hf.map(
    encode_batch,
    batched=True,
    remove_columns=["text", "label"]
)


In [None]:
tokenized_train = tokenized_train.add_column("labels", [x["label"] for x in post_aug_train])
tokenized_test  = tokenized_test.add_column("labels",  [x["label"] for x in eesa_test])
tokenized_mr    = tokenized_mr.add_column("labels",    [x["label"] for x in mr_cs])
tokenized_amg   = tokenized_amg.add_column("labels",   [x["label"] for x in amg_cs])


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class HFTextDataset(Dataset):
    """Torch `Dataset` view over an indexable collection of tokenized rows.

    Each item is a dict of long tensors for "input_ids", "attention_mask"
    and "labels", read from the row at the same index of the wrapped object.
    """

    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, ds):
        self.ds = ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        record = self.ds[idx]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._FIELDS
        }


In [None]:
train_loader = DataLoader(HFTextDataset(tokenized_train), batch_size=32, shuffle=True)
test_loader  = DataLoader(HFTextDataset(tokenized_test),  batch_size=32)
mr_loader    = DataLoader(HFTextDataset(tokenized_mr),    batch_size=32)
amg_loader   = DataLoader(HFTextDataset(tokenized_amg),   batch_size=32)


In [None]:
import torch
import torch.nn as nn

class BiLSTM_MHAttn(nn.Module):
    """Embedding -> BiLSTM -> multi-head self-attention -> masked mean pool -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        lstm_dim: Hidden size per LSTM direction (the BiLSTM output is 2*lstm_dim wide).
        num_heads: Number of attention heads; must divide 2*lstm_dim.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim=128, lstm_dim=128, num_heads=4, num_classes=3):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embed_dim,
            lstm_dim,
            batch_first=True,
            bidirectional=True
        )

        # Multi-Head Self-Attention
        self.attn = nn.MultiheadAttention(
            embed_dim=2*lstm_dim,
            num_heads=num_heads,
            batch_first=True
        )

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(2*lstm_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, num_classes).

        `attention_mask` is 1 for real tokens and 0 for padding, with the same
        (batch, seq) shape as `input_ids`.
        """
        x = self.embedding(input_ids)

        lstm_out, _ = self.lstm(x)

        # Multi-head self-attention; padded positions are excluded as keys.
        attn_out, _ = self.attn(lstm_out, lstm_out, lstm_out,
                                key_padding_mask=(attention_mask == 0))

        # Masked mean pooling: key_padding_mask only hides padded KEYS, so the
        # outputs at padded QUERY positions still exist. Averaging over the full
        # sequence would let those positions dominate when inputs are padded to
        # a fixed length, so only real-token positions are averaged.
        weights = attention_mask.unsqueeze(-1).to(attn_out.dtype)
        token_sum = (attn_out * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on an all-padding row
        pooled = token_sum / token_count

        logits = self.fc(self.dropout(pooled))
        return logits


In [None]:
from torch.utils.data import Dataset, DataLoader

class HFWrapper(Dataset):
    """Adapter exposing a tokenized dataset as a torch `Dataset` of long tensors."""

    def __init__(self, hf_ds):
        self.ds = hf_ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        row = self.ds[idx]

        def as_long(value):
            return torch.tensor(value, dtype=torch.long)

        input_ids = as_long(row["input_ids"])
        attention_mask = as_long(row["attention_mask"])
        labels = as_long(row["labels"])
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}



In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

VOCAB_SIZE = tokenizer.vocab_size
model = BiLSTM_MHAttn(VOCAB_SIZE).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)


In [None]:
def collate_fn(batch):
    """Stack the per-example tensors of `batch` into one tensor per field."""
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.stack([example[field] for example in batch])
        for field in fields
    }


In [None]:
train_ds = HFWrapper(tokenized_train)
test_ds  = HFWrapper(tokenized_test)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:
# Number of full passes over train_loader.
EPOCHS = 6

# Plain supervised training loop: one optimizer step per batch, and the
# mean batch loss is printed at the end of every epoch.
for epoch in range(EPOCHS):
    model.train()  # training mode (enables dropout)
    total_loss = 0

    for batch in train_loader:
        # Move the batch to the same device as the model.
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        logits = model(ids, mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss = {total_loss/len(train_loader):.4f}")


In [None]:
from sklearn.metrics import classification_report
import numpy as np

def evaluate_loader(model, loader):
    """Run `model` over `loader`, print a per-class report and return the macro-F1.

    Batches are moved to the module-level `device`; predictions are the argmax
    over the class dimension of the logits.
    """
    model.eval()
    all_preds, all_trues = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attn_mask = batch["attention_mask"].to(device)
            gold = batch["labels"].cpu().numpy()

            batch_logits = model(input_ids, attn_mask)
            batch_pred = torch.argmax(batch_logits, dim=1).cpu().numpy()

            all_preds.extend(batch_pred)
            all_trues.extend(gold)

    report = classification_report(all_trues, all_preds, target_names=["neg","neu","pos"])
    print(report)
    macro_f1 = f1_score(all_trues, all_preds, average="macro")
    print("Macro F1:", macro_f1)

    return macro_f1



In [None]:
print("\n=== BiLSTM + Multi-Head Self-Attention (EESA Test) ===")
evaluate_loader(model, test_loader)


In [None]:
mr_loader  = DataLoader(HFWrapper(tokenized_mr), batch_size=32, shuffle=False, collate_fn=collate_fn)
amg_loader = DataLoader(HFWrapper(tokenized_amg), batch_size=32, shuffle=False, collate_fn=collate_fn)

print("\n=== → MR-CS GENERALIZATION ===")
evaluate_loader(model, mr_loader)

print("\n=== → AMG-CS GENERALIZATION ===")
evaluate_loader(model, amg_loader)


Summary of results

In [None]:
import pandas as pd

results = {
    "Model": [
        "Logistic Regression",
        "SVM",
        "BiLSTM-Attention",
        "XLM-R Base",
        "AraBERTv2",
        "MARBERTv2"
    ],

    # ------------------------------
    # EESA Test (in-domain performance)
    # ------------------------------
    "EESA_F1": [
        0.776,   # logistic (pre-aug baseline)
        0.778,   # svm (pre-aug baseline)
        0.727,   # BiLSTM-Attn
        0.840,   # xlm-r
        0.835,   # arabert
        0.886    # marbert
    ],

    # ------------------------------
    # MR-CS Generalization (post-augmentation)
    # ------------------------------
    "MRCS_F1": [
        0.926,     # Logistic Regression POST-AUG
        0.993,     # SVM POST-AUG
        0.937,     # BiLSTM-Attn
        None,      # xlm-r not computed
        "ERROR",   # arabert crashed
        0.888      # marbert
    ],

    # ------------------------------
    # AMG-CS Generalization (post-augmentation)
    # ------------------------------
    "AMGCS_F1": [
        0.902,     # Logistic Regression POST-AUG
        0.978,     # SVM POST-AUG
        0.941,     # BiLSTM-Attn
        None,      # xlm-r not computed
        "ERROR",   # arabert
        0.824      # marbert
    ]
}

df = pd.DataFrame(results)
df


|    | Model               |   EESA_F1 |   MRCS_F1 |   AMGCS_F1 |
|----|---------------------|-----------|-----------|------------|
|  0 | Logistic Regression |     0.776 |     0.957 |      0.949 |
|  1 | SVM                 |     0.774 |     0.98  |      0.98  |
|  2 | BiLSTM              |     0.701 |     0.671 |      0.627 |
|  3 | CNN-LSTM            |     0.67  |   nan     |    nan     |
|  4 | BiLSTM-Attn         |     0.725 |   nan     |    nan     |
|  5 | XLM-R               |     0.816 |     0.81  |      0.748 |
|  6 | AraBERTv2           |     0.818 |     0.808 |      0.769 |
|  7 | MARBERTv2           |     0.883 |     0.95  |      0.933 |

## **Transformer Baselines**

In [None]:
import json
from pathlib import Path

BASE = Path("/content/drive/MyDrive/cs-senti/repo/data/cleaned")

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    The handle is closed by the `with` block, and blank lines are ignored
    so a trailing newline does not raise a decode error.
    """
    with open(path, "r", encoding="utf-8") as src:
        return [json.loads(line) for line in src if line.strip()]

eesa_train = load_jsonl(BASE/"eesa_train_clean_fixed.jsonl")
eesa_dev   = load_jsonl(BASE/"eesa_dev_clean_fixed.jsonl")
eesa_test  = load_jsonl(BASE/"eesa_test_clean_fixed.jsonl")
mr_cs      = load_jsonl(BASE/"mr_cs_clean_fixed.jsonl")
amg_cs     = load_jsonl(BASE/"amg_cs_clean_fixed.jsonl")

# Unified train split
final_train = eesa_train + mr_cs + amg_cs
final_dev   = eesa_dev

# For reporting
final_test_eesa = eesa_test
final_test_mr   = mr_cs
final_test_amg  = amg_cs


In [None]:
from datasets import Dataset

train_ds = Dataset.from_list(final_train)
dev_ds   = Dataset.from_list(final_dev)

test_eesa_ds = Dataset.from_list(final_test_eesa)
test_mr_ds   = Dataset.from_list(final_test_mr)
test_amg_ds  = Dataset.from_list(final_test_amg)


In [None]:
import json
from collections import Counter

def check_labels(ds, name):
    """Print a short report of rows whose ``label`` value is not a string.

    Args:
        ds: Sized iterable of mapping-like rows, each with a ``"label"`` key.
        name: Human-readable split name shown in the report header.

    Returns:
        None. The report (total rows, number of offending rows and up to five
        ``(index, label)`` examples) is written to stdout.
    """
    print(f"\n=== Checking {name} ===")
    offenders = [
        (position, record["label"])
        for position, record in enumerate(ds)
        if not isinstance(record["label"], str)
    ]
    print("Total samples:", len(ds))
    print("Bad labels:", len(offenders))
    if not offenders:
        return
    # Only a handful of examples are shown to keep the cell output short.
    print("Examples:")
    print(offenders[:5])

# Report non-string labels for the training pool, the dev split and each of
# the three reporting sets.
check_labels(final_train, "TRAIN")
check_labels(final_dev,   "DEV")
check_labels(final_test_eesa, "TEST EESA")
check_labels(final_test_amg,  "TEST AMG")
check_labels(final_test_mr,   "TEST MR")


In [None]:
# Sentiment class names <-> integer ids used as classifier targets.
label2id = dict(neg=0, neu=1, pos=2)
# Inverse lookup, built from label2id so the two maps cannot drift apart.
id2label = {class_id: class_name for class_name, class_id in label2id.items()}


In [None]:
def make_tokenize_fn(tokenizer):
    """Build a batch-tokenization callback bound to ``tokenizer``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=...,
            padding=..., max_length=...)`` that returns a mutable mapping.

    Returns:
        A function taking a batch with ``"text"`` and ``"label"`` columns and
        returning the tokenizer output with an added ``"labels"`` entry holding
        the ids looked up in the module-level ``label2id``.
    """
    # Every text is truncated/padded to the same fixed length of 128.
    tokenizer_kwargs = dict(truncation=True, padding="max_length", max_length=128)

    def tokenize(batch):
        encoded = tokenizer(batch["text"], **tokenizer_kwargs)
        # An unknown label string raises KeyError here.
        encoded["labels"] = list(map(label2id.__getitem__, batch["label"]))
        return encoded

    return tokenize


In [None]:
from transformers import TrainingArguments

def make_args(output_dir):
    """Create the shared ``TrainingArguments`` for the transformer baselines.

    Args:
        output_dir: Directory where checkpoints are written.

    Returns:
        TrainingArguments: 3 epochs, batch size 8, learning rate 2e-5, with
        evaluation and checkpointing every 200 steps and the best checkpoint
        (by ``macro_f1``) reloaded at the end of training.
    """
    return TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_steps=50,

        # load_best_model_at_end requires the evaluation and save strategies to
        # match. Without an explicit eval_strategy the default is "no", so
        # eval_steps was ignored and argument validation rejected the config.
        eval_strategy="steps",
        save_strategy="steps",
        save_steps=200,
        eval_steps=200,
        save_total_limit=2,
        load_best_model_at_end=True,
        # Must equal a key returned by the Trainer's compute_metrics callback.
        metric_for_best_model="macro_f1",
        greater_is_better=True,

        report_to="none",   # disable wandb
    )


In [None]:
!pip install evaluate
import evaluate

# Load the "f1" metric once; compute_metrics reads this module-level object.
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score predictions with macro-averaged F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` must support
            ``.argmax(-1)`` to pick the predicted class per example.

    Returns:
        dict: ``{"macro_f1": <value>}`` where the value is the ``"f1"`` entry
        returned by the module-level ``f1_metric``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(-1)
    scores = f1_metric.compute(
        predictions=predicted_ids,
        references=labels,
        average="macro",
    )
    return {"macro_f1": scores["f1"]}


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import evaluate

# -------------------------
# LABEL MAP
# -------------------------
# Class order defines the ids: neg -> 0, neu -> 1, pos -> 2.
label2id = {class_name: class_id for class_id, class_name in enumerate(["neg", "neu", "pos"])}
# Reverse mapping (id -> name) passed to the model config.
id2label = dict(zip(label2id.values(), label2id.keys()))

# -------------------------
# METRIC
# -------------------------
# Rebinds f1_metric (an earlier cell loads the same "f1" metric); the
# compute_metrics defined in this cell reads it at call time.
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score predictions with macro-averaged F1.

    This definition replaces an earlier ``compute_metrics`` in the notebook
    that reported the score under ``"macro_f1"``. Training configurations in
    this notebook select the best checkpoint with
    ``metric_for_best_model="macro_f1"``, so the score is published under both
    names to keep those configurations and any reader of ``"f1"`` working.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` must support
            ``.argmax(-1)`` to pick the predicted class per example.

    Returns:
        dict: ``{"f1": value, "macro_f1": value}`` with the same macro-F1 value
        under both keys.
    """
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    macro_f1 = f1_metric.compute(
        predictions=preds,
        references=labels,
        average="macro"
    )["f1"]
    # "f1" is the key this cell originally returned; "macro_f1" is the alias
    # expected by metric_for_best_model elsewhere in the notebook.
    return {"f1": macro_f1, "macro_f1": macro_f1}

# -------------------------
# TOKENIZATION FN
# -------------------------
def make_tokenize_fn(tokenizer):
    """Return a batched preprocessing function that closes over ``tokenizer``.

    The returned callable expects a batch with ``"text"`` and ``"label"``
    columns. It pads/truncates each text to 128 tokens via ``tokenizer`` and
    stores the integer class ids (looked up in the module-level ``label2id``)
    under ``"labels"`` in the tokenizer's output, which it returns.
    """
    def tokenize(batch):
        features = tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )
        label_ids = []
        for tag in batch["label"]:
            # KeyError here means the batch contains an unexpected label string.
            label_ids.append(label2id[tag])
        features["labels"] = label_ids
        return features

    return tokenize

# -------------------------
# MAIN TRAINER
# -------------------------
def train_transformer(model_name, output_dir):
    """Fine-tune a pretrained encoder as a 3-class sentiment classifier.

    Tokenizes the module-level ``final_train`` and ``final_dev`` record lists
    with the checkpoint's own tokenizer, trains for 3 epochs while evaluating
    and saving once per epoch, and returns the trainer.

    Args:
        model_name: Model id or path passed to both ``from_pretrained`` calls.
        output_dir: Directory where checkpoints are written.

    Returns:
        Trainer: The trainer after ``train()`` has run. It is configured with
        ``load_best_model_at_end=True`` and selects the best checkpoint by the
        ``"f1"`` value reported by ``compute_metrics``.
    """

    print(f"\n============== Training {model_name} ==============\n")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    tokenize_fn = make_tokenize_fn(tokenizer)

    # BUILD DATASETS
    # Only train/dev are needed here. The EESA test set was previously
    # tokenized too but never used, which only cost time on every call.
    train_ds = Dataset.from_list(final_train).map(tokenize_fn, batched=True)
    dev_ds   = Dataset.from_list(final_dev).map(tokenize_fn, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    )

    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,

        # 4.57.2-friendly params
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # Pick the best checkpoint by the macro-F1 that compute_metrics reports
        # under "f1" rather than by the default selection metric.
        metric_for_best_model="f1",
        greater_is_better=True,

        learning_rate=2e-5,
        weight_decay=0.01,

        logging_steps=100,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        # NOTE(review): newer transformers releases prefer `processing_class`
        # over `tokenizer` -- confirm against the pinned version before upgrading.
        tokenizer=tokenizer,
        args=args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer


## **Reward Module**

In [None]:
import json
from datasets import Dataset, DatasetDict

def load_jsonl(path):
    """Load a JSON Lines file as a ``Dataset``.

    Note that this redefines the notebook's earlier ``load_jsonl``, which
    returned a plain list of records.

    Args:
        path: Location of a UTF-8 encoded file with one JSON value per line.

    Returns:
        The result of ``Dataset.from_list`` applied to the decoded lines, in
        file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        records = [json.loads(raw_line) for raw_line in handle]
    return Dataset.from_list(records)

# Directory holding the cleaned corpora. BASE is a plain string here (the
# transformer-baseline cell binds it to a Path), hence the f-string joins below.
BASE = "/content/drive/MyDrive/cs-senti/repo/data/cleaned"

# These assignments rebind eesa_train / eesa_dev / mr_cs / amg_cs, which the
# transformer-baseline cell defined as plain lists, to Dataset objects.
# NOTE(review): this cell reads the "*_clean.jsonl" files, whereas the
# transformer-baseline cell reads "*_clean_fixed.jsonl" -- confirm which
# version the reward model should be trained on.
eesa_train = load_jsonl(f"{BASE}/eesa_train_clean.jsonl")
eesa_dev   = load_jsonl(f"{BASE}/eesa_dev_clean.jsonl")

mr_cs = load_jsonl(f"{BASE}/mr_cs_clean.jsonl")
amg_cs = load_jsonl(f"{BASE}/amg_cs_clean.jsonl")

# Show the four loaded objects as a quick sanity check.
print(eesa_train, eesa_dev, mr_cs, amg_cs)


In [None]:
# Reward-model training pool: EESA train followed by MR-CS and AMG-CS rows.
train_data = Dataset.from_list(
    [row for split in (eesa_train, mr_cs, amg_cs) for row in split.to_list()]
)

# Validation uses the EESA dev split only.
reward_dataset = DatasetDict({"train": train_data, "validation": eesa_dev})

reward_dataset


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the reward model; same checkpoint name as the model loaded
# for it later ("xlm-roberta-base").
reward_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_fn(batch):
    """Tokenize the ``"text"`` column of a batch with ``reward_tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever ``reward_tokenizer`` returns for those texts when asked to
        truncate and pad to a fixed length of 128.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return reward_tokenizer(batch["text"], **encode_options)

# Label names <-> class ids for the reward classifier.
label2id = {"neg": 0, "neu": 1, "pos": 2}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Tokenize in batches, attach the integer target as "labels" (one example at
# a time), then drop the raw "text" and "label" columns.
tokenized_reward = (
    reward_dataset
    .map(tokenize_fn, batched=True)
    .map(lambda example: {"labels": label2id[example["label"]]})
    .remove_columns(["text", "label"])
)
tokenized_reward


In [None]:
from transformers import AutoModelForSequenceClassification

# Reward model: "xlm-roberta-base" loaded with a 3-label sequence-classification
# head; the label maps are passed to from_pretrained along with num_labels.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)


In [None]:
from transformers import TrainingArguments

# Training configuration for the XLM-R reward model: 3 epochs, batch size 8,
# evaluation and checkpointing once per epoch, best checkpoint reloaded at the end.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs-senti/reward_xlmr_v1",

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,

    num_train_epochs=3,
    learning_rate=2e-5,

    # MUST MATCH!
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword and
    # is the spelling used by the other training cell in this notebook.
    eval_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    # Must equal a key returned by the compute_metrics callback given to the Trainer.
    metric_for_best_model="macro_f1",
    greater_is_better=True,

    logging_dir="/content/drive/MyDrive/cs-senti/logs_reward",
    logging_steps=50,

    save_total_limit=2,
    report_to="none"   # disable wandb
)


In [None]:
import transformers
# Show the installed transformers version; the accepted TrainingArguments
# keyword names depend on it.
print(transformers.__version__)


In [None]:
from transformers.training_args import TrainingArguments
# NOTE(review): looks like a debugging leftover -- this prints the full
# TrainingArguments help text into the notebook output; consider removing it.
help(TrainingArguments)
