# Kaggle Setup

In [None]:
import os

# Kaggle's writable working directory; the project tree is copied into it below.
path = "/kaggle/working"
# The directory counts as "empty" when it holds at most 3 entries.
# NOTE(review): the threshold of 3 presumably tolerates files the platform creates
# by default — confirm.
is_empty = (len(os.listdir(path)) <= 3)

# Show the current contents before deciding whether to copy.
!ls /kaggle/working/

# !cp -r /kaggle/input/arageneval2025-task1-sc/* /kaggle/working/

if is_empty:
    print(f"✅ '{path}' is empty. copying project strcuture into it ...")
    # Copy the project files from the attached input dataset into the working directory.
    !cp -r /kaggle/input/arageneval2025-task1-sc/* /kaggle/working/
else:
    # Nothing is copied in this branch; only the current entry count is reported.
    print(f"❌ '{path}' is not empty — contains {len(os.listdir(path))} items.")

# Change to project directory (later cells use paths relative to it)
%cd /kaggle/working/ 

# Confirm the project files are present
!ls

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# # Navigate to project folder
# %cd /content/drive/MyDrive/AraGenEval2025
# !ls

In [None]:
# Add project root to Python path
import os
import sys
# Put the current working directory at the front of the import search path so
# modules that live next to the notebook can be imported.
project_dir = os.getcwd()
sys.path.insert(0, project_dir)
print("✅ Project root added to sys.path:", project_dir)

# Dependencies

In [None]:
!pip install transformers==4.41.2 datasets==2.19.1 torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 scikit-learn wandb python-dotenv evaluate --quiet
!pip install -U peft==0.11.1 --quiet

In [None]:
import os
# Send signal 9 to the kernel's own process, terminating it immediately.
# NOTE(review): presumably done so the freshly installed packages are loaded after
# the kernel restarts; a "Run All" stops here, so continue from the next cell once
# the kernel is back.
os.kill(os.getpid(), 9)

In [None]:
import os
# Configure PyTorch's CUDA allocator through its environment variable. This cell
# runs before the cell that imports torch, so keep that ordering.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import torch
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score  
import pandas as pd
import numpy as np


In [None]:
# Relative paths to the training and validation Excel files.
# NOTE(review): the loading cell further down reads Config.TRAIN_FILE instead, so
# these two variables appear to be unused in this notebook — confirm before relying
# on them.
train_dataset_path ='data/AuthorshipStyleTransferTrain.xlsx'
val_dataset_path =  'data/AuthorshipStyleTransferVal.xlsx'

In [None]:
import sys
import os

# Treat the current working directory as the project root and make sure it is on
# the import path exactly once.
project_root = os.getcwd()
already_registered = project_root in sys.path
if not already_registered:
    sys.path.insert(0, project_root)

print(f"Project root added to sys.path: {project_root}")

In [None]:
# Print the shell's working directory to confirm where relative paths resolve.
!pwd

In [None]:
from config import Config

# Show which training file the project's Config points to.
print(Config.TRAIN_FILE)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
print(f"Using device: {device}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import gc

def load_and_sample_dataset(
    file_path,
    sample_mode="all",    # Options: "all", "random", "stratified"
    sample_size=None,     # For "random"/"stratified": float (proportion) or int (absolute count)
    random_state=42,
    stratify_col="author",
):
    """
    Load a dataset from an Excel file and optionally sub-sample it.

    All arguments are validated before the file is opened, so a bad configuration
    fails immediately instead of after a slow Excel load.

    Args:
        file_path (str): Path to the Excel dataset file (read with openpyxl).
        sample_mode (str): "all" (no sampling), "random", or "stratified".
        sample_size (float|int): Proportion (0-1) or absolute row count. Required
            for "random" and "stratified"; ignored for "all".
        random_state (int): Random seed for reproducibility.
        stratify_col (str): Column whose value distribution is preserved in
            "stratified" mode. Defaults to "author".

    Returns:
        pd.DataFrame: The full dataset ("all") or the sampled subset.

    Raises:
        ValueError: Unknown sample_mode, or a missing/invalid sample_size for the
            chosen mode.
        KeyError: "stratified" mode was requested but `stratify_col` is not a
            column of the loaded data.
    """
    import numbers  # local import: only needed for the integer-count check below

    if sample_mode not in ("all", "random", "stratified"):
        raise ValueError(f"Invalid sample_mode: {sample_mode}")

    # bool is a subclass of int; never interpret True/False as a row count.
    size_is_bool = isinstance(sample_size, bool)
    size_is_count = isinstance(sample_size, numbers.Integral) and not size_is_bool

    if sample_mode == "random" and (sample_size is None or size_is_bool):
        raise ValueError("For 'random' mode, SAMPLE_SIZE must be set.")

    if sample_mode == "stratified":
        if sample_size is None or size_is_bool:
            size_ok = False
        elif size_is_count:
            size_ok = sample_size >= 1
        else:
            size_ok = 0 < sample_size < 1
        if not size_ok:
            raise ValueError(
                "For 'stratified' mode, SAMPLE_SIZE must be a proportion between 0 and 1 "
                "or a positive integer row count."
            )

    print(f"📂 Loading dataset from: {file_path}")
    df = pd.read_excel(file_path, engine='openpyxl')

    if sample_mode == "all":
        # Return the freshly loaded frame directly; copying it would only double
        # peak memory for no benefit.
        print(f"✅ Loaded full dataset with {len(df)} samples.")
        return df

    if sample_mode == "random":
        n_rows = int(sample_size) if size_is_count else int(len(df) * sample_size)
        sampled_df = df.sample(n=n_rows, random_state=random_state)
        print(f"✅ Randomly sampled {len(sampled_df)} samples.")
    else:  # "stratified"
        if stratify_col not in df.columns:
            raise KeyError(
                f"Stratify column '{stratify_col}' not found in dataset columns: {list(df.columns)}"
            )
        # train_test_split accepts either a proportion (float) or a row count (int).
        sampled_df, _ = train_test_split(
            df,
            train_size=int(sample_size) if size_is_count else sample_size,
            stratify=df[stratify_col],
            random_state=random_state
        )
        print(f"✅ Stratified sampled {len(sampled_df)} samples ({stratify_col} distribution preserved).")

    # Release the full frame promptly; only the sample is kept.
    del df
    gc.collect()

    return sampled_df


In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model_name = "megabot131/m-e5-large-toxic-classification-lora"   # "microsoft/Multilingual-MiniLM-L12-H384" #"CAMeL-Lab/bert-base-arabic-camelbert-ca" "aubmindlab/aragpt2-mega-detector-long"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# print("Max tokens:", tokenizer.model_max_length)

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     # num_labels=num_labels
#     # remove use_safetensors=True
# )
# print("Model context limit:", model.config.max_position_embeddings)

# Load Dataset

In [None]:
# === CONFIG ===
# NOTE: these two settings only take effect when the `sample_mode` / `sample_size`
# arguments in the call below are uncommented; as written, the defaults of
# load_and_sample_dataset apply and the full dataset is loaded.
SAMPLE_MODE = "stratified"  # "all", "random", "stratified"
SAMPLE_SIZE = 0.1           # 10% for stratified or random. For the full dataset use SAMPLE_MODE = "all" (None is rejected by both sampling modes).

# === LOAD AND SAMPLE ===
train_df = load_and_sample_dataset(
    Config.TRAIN_FILE,
    # sample_mode=SAMPLE_MODE,
    # sample_size=SAMPLE_SIZE
)

# === LOAD AND SAMPLE (validation split, currently disabled) ===
# val_df = load_and_sample_dataset(
#     Config.VAL_FILE,
#     # sample_mode=SAMPLE_MODE,
#     # sample_size=SAMPLE_SIZE
# )

# Quick sanity summary: row/column counts and number of distinct authors.
print("Train shape:", train_df.shape)
# print("Validation shape:", val_df.shape)
print("Authors in Train:", train_df['author'].nunique())
# print("Authors in Val:", val_df['author'].nunique())

# train_df.head()

## Labeling

In [None]:
# ── cell: mappings ──
# Arabic author name → integer class ID (21 authors, IDs 0-20).
# NOTE(review): said to come from a JSON file; keep the two in sync.
author2id = {
  "أحمد أمين": 0,  "أحمد تيمور باشا": 1,  "أحمد شوقي": 2,
  "أمين الريحاني": 3,  "ثروت أباظة": 4,  "جبران خليل جبران": 5,
  "جُرجي زيدان": 6,  "حسن حنفي": 7,  "روبرت بار": 8,
  "سلامة موسى": 9,  "طه حسين": 10, "عباس محمود العقاد": 11,
  "عبد الغفار مكاوي": 12, "غوستاف لوبون": 13, "فؤاد زكريا": 14,
  "كامل كيلاني": 15, "محمد حسين هيكل": 16, "نجيب محفوظ": 17,
  "نوال السعداوي": 18, "ويليام شيكسبير": 19, "يوسف إدريس": 20
}

# Integer class ID → Arabic author name (inverse of author2id).
id2author = {v:k for k,v in author2id.items()}

# Integer class ID → ASCII label. These labels are used in output directory and
# dataset names later in the notebook, so changing one changes those paths.
id2english = {
   0:"Ahmed_Amin",      1:"Ahmad_Taymour_Basha", 2:"Ahmed_Shawqi",
   3:"Ameen_Rihani",    4:"Tharwat_Abaza",        5:"Gibran_Khalil_Gibran",
   6:"Jurji_Zaydan",    7:"Hassan_Hanifi",        8:"Robert_Barr",
   9:"Salama_Moussa",   10:"Taha_Hussein",        11:"Abbas_Al-Aqqad",
   12:"AbdelGhaffar_Makawi", 13:"Gustave_Lebon",   14:"Fouad_Zakaria",
   15:"Kamel_Kilani",   16:"Mohamed_Hosseini_Hekal",17:"Naguib_Mahfouz",
   18:"Nawal_El_Saadawi",19:"William_Shakespeare",20:"Youssef_Edrees"
}

In [None]:
# ── cell: build binary train/val lists ──

from datasets import Dataset

# Target author for this one-vs-rest classifier, chosen by class ID
# (see author2id / id2english for the available IDs).
TARGET_ID = 4

# Look up both display names of the target for logging and output paths.
TARGET_AR, TARGET_EN = id2author[TARGET_ID], id2english[TARGET_ID]
print(f"▶ Building binary dataset for: ({TARGET_ID}) {TARGET_AR} / {TARGET_EN}")

# Label each training text 1 when it belongs to the target author, else 0.
# An author missing from author2id raises KeyError here.
train_data = []
for _, record in train_df.iterrows():
    is_target = author2id[record["author"]] == TARGET_ID
    train_data.append({"text": record["text_in_author_style"], "label": int(is_target)})

# Wrap the records in a 🤗 Dataset. The validation split is currently disabled;
# it would be built the same way from `val_df`.
train_dataset = Dataset.from_list(train_data)


In [None]:
# import json
# with open("/kaggle/working/evaluation/ar_style_classifier/results/author2id.json", "w") as f:
#     json.dump(author2id, f, indent=4)

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to fine-tune; other candidate checkpoints are kept in the trailing comment.
model_name = "CAMeL-Lab/bert-base-arabic-camelbert-ca" #"allenai/longformer-base-4096" #"UBC-NLP/AraT5v2-base-1024"
# num_labels = len(unique_authors)
# Binary task: label 1 = target author, label 0 = any other author.
num_labels = 2
# NOTE(review): `use_safetensors` is a weight-loading option; it presumably has no
# effect when passed to a tokenizer — confirm and drop it if so.
tokenizer = AutoTokenizer.from_pretrained(model_name,use_safetensors=True)
# Load the encoder with a new 2-way classification head and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    # use_safetensors=True
).to(device)

def preprocess_sliding(examples, indices=None, chunk_size=512, stride=256):
    """
    Tokenize a batch of texts into overlapping fixed-size chunks.

    Every text is split into windows of at most `chunk_size` tokens, with
    consecutive windows overlapping by `stride` tokens. Each window becomes its
    own output row carrying the label of the text it came from.

    Args:
        examples (dict): Batch with "text" and "label" lists of equal length.
        indices (list[int] | None): Dataset row index of each example, as supplied
            by `Dataset.map(..., with_indices=True)`. When omitted, positions
            within the batch are used, which are only unique inside one batch.
        chunk_size (int): Maximum number of tokens per chunk.
        stride (int): Number of tokens shared by consecutive chunks.

    Returns:
        dict: "input_ids", "attention_mask", "label" and "sample_id" lists with one
        entry per chunk. "sample_id" identifies the source text of each chunk.
    """
    texts, labels = examples["text"], examples["label"]
    if indices is None:
        indices = range(len(texts))

    input_ids_list = []
    attention_mask_list = []
    label_list = []
    sample_id_list = []

    for sample_id, text, label in zip(indices, texts, labels):
        encodings = tokenizer(
            text,
            truncation=True,
            max_length=chunk_size,
            stride=stride,
            return_overflowing_tokens=True,
            return_attention_mask=True,
        )

        # One text yields one or more chunks; replicate its label and id per chunk.
        n_chunks = len(encodings["input_ids"])
        input_ids_list.extend(encodings["input_ids"])
        attention_mask_list.extend(encodings["attention_mask"])
        label_list.extend([label] * n_chunks)
        sample_id_list.extend([sample_id] * n_chunks)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "label": label_list,
        "sample_id": sample_id_list
    }


# with_indices=True passes the global row indices, so `sample_id` stays unique
# across batches instead of restarting at 0 in every batch.
train_dataset = train_dataset.map(
    preprocess_sliding,
    batched=True,
    with_indices=True,
    remove_columns=["text"],
)
# val_dataset = val_dataset.map(preprocess_sliding, batched=True, remove_columns=["text"])

In [None]:
# Inspect the first row after chunking (input_ids, attention_mask, label, sample_id).
print(train_dataset[0])


## Evaluation

# Training Setup

In [None]:
# Number of rows in the tokenized dataset (one row per chunk, not per text).
print(len(train_dataset))

In [None]:
import os
# Display the number of CPUs reported by the OS.
# NOTE(review): presumably used to pick dataloader_num_workers — confirm.
os.cpu_count()

In [None]:
from transformers import Trainer, TrainingArguments

## batch size calculation
# Target batch of 256 reached as 32 samples per step x 8 accumulation steps.
# NOTE(review): batch_size is per device; with more than one GPU the real
# effective batch is presumably larger than 256 — confirm.
effective_batch_size = 256
batch_size = 32
accumulation_steps = effective_batch_size // batch_size
epochs = 5
# Checkpoints for this target author are written under this directory.
OUTPUT_DIR=f"evaluation/ar_style_classifier/sc_{TARGET_ID}_{TARGET_EN}"  # output_dir = f"sc_{TARGET_ID}_{TARGET_EN}"

#config training env
model.gradient_checkpointing_disable()
import os
# NOTE(review): the same variable is already set in an earlier cell, and the model
# has been moved to `device` by now, so this assignment may come too late to change
# the allocator — confirm whether it is still needed here.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# os.environ["WANDB_DISABLED"] = "true"



# args
# Evaluation-related options stay commented out because no eval dataset is used.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=-1,
    save_strategy="epoch",
    # save_steps=500,
    # save_strategy="steps",
    # save_steps=10,
    # eval_strategy="epoch",
    # eval_steps=1000,
    # eval_strategy="steps",
    # eval_steps=10,  # ⬅️ super frequent eval for debugging
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    # per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    gradient_accumulation_steps=accumulation_steps,
    # load_best_model_at_end=True,
    # metric_for_best_model="f1",
    save_total_limit=1, 
    # Evaluates to 8 * 25 = 200.
    # NOTE(review): logging_steps is presumably counted in optimizer steps, so the
    # extra accumulation factor may make logging rarer than intended — confirm.
    logging_steps=accumulation_steps * 25,
    logging_strategy="steps",
    dataloader_num_workers=0,
    report_to="none"
)

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import Trainer, TrainingArguments, DefaultDataCollator

from transformers import DataCollatorWithPadding
# Chunks are stored unpadded, so this collator is given the tokenizer to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer)

# Training-only Trainer: no eval dataset or metric function is attached.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    callbacks=[],  # adds no extra callbacks. NOTE(review): an empty list presumably does not remove the Trainer's default callbacks; reporting is turned off via report_to="none" — confirm
    # eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
)

print(f"Train dataset size: {len(train_dataset)}")
# print(f"Eval dataset size: {len(val_dataset)}")

In [None]:
import torch
import gc
# Free unreferenced Python objects, then release cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

## Launch Training

In [None]:
from datetime import datetime
import time
import os
from config import Config


import re  # cell-local import: only used to recognise numbered checkpoint folders

print(f"📢 Starting training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

start_time = time.time()

# Detect checkpoint
last_checkpoint = None
CHECKPOINT_DIR = OUTPUT_DIR    #os.path.join(Config.MODEL_WEIGHTS_FOLDER,safe_model_name(MODEL_CONFIG["name"]))
RESUME_TRAINING=False

if RESUME_TRAINING:
    if os.path.isdir(CHECKPOINT_DIR):
        # Only "checkpoint-<step>" directories are resumable. Other names, such as
        # the "checkpoint-interrupted" folder written by the KeyboardInterrupt
        # handler below, have no numeric step and are skipped.
        checkpoint_steps = {}
        for entry in os.listdir(CHECKPOINT_DIR):
            step_match = re.fullmatch(r"checkpoint-(\d+)", entry)
            if step_match and os.path.isdir(os.path.join(CHECKPOINT_DIR, entry)):
                checkpoint_steps[entry] = int(step_match.group(1))
        checkpoints = list(checkpoint_steps)

        if checkpoints:
            # Pick the checkpoint with the highest step number
            latest_checkpoint = max(checkpoints, key=checkpoint_steps.get)
            last_checkpoint = os.path.join(CHECKPOINT_DIR, latest_checkpoint)
            print(f"✅ Found checkpoint at {last_checkpoint}. Will resume training from there.")
        else:
            print("⚠️ No checkpoints found. Starting from scratch.")
    else:
        print("⚠️ Checkpoint directory doesn't exist. Starting from scratch.")
else:
    print("🆕 Starting fresh training (no checkpoint resume).")

# Start training
try:
    if last_checkpoint:
        train_output = trainer.train(
            resume_from_checkpoint=last_checkpoint,
            # ignore_keys_for_eval=["optimizer", "scheduler"]
        )
    else:
        train_output = trainer.train()

    total_time = time.time() - start_time
    print(f"✅ Training completed in {total_time/3600:.2f} hours")

    if train_output.metrics:
        print("\n📊 Final Training Metrics:")
        for key, value in train_output.metrics.items():
            # Format numbers to 4 decimals; print anything else as-is so a
            # non-numeric metric cannot crash the summary.
            if isinstance(value, (int, float)):
                print(f" {key}: {value:.4f}")
            else:
                print(f" {key}: {value}")

except KeyboardInterrupt:
    # Manual stop: save the current model so the work done so far is not lost.
    # This folder is written with save_model only, and the resume logic above
    # deliberately skips it.
    print("\n⚠️ Training interrupted by user. Saving checkpoint...")
    interrupted_path = os.path.join(CHECKPOINT_DIR, "checkpoint-interrupted")
    trainer.save_model(interrupted_path)
    print(f"💾 Checkpoint saved at {interrupted_path}")

# Save Results

In [None]:
# Final model and tokenizer go into a "results" subfolder of this author's output
# directory, so the folder can be loaded again with from_pretrained.
save_dir = f"evaluation/ar_style_classifier/sc_{TARGET_ID}_{TARGET_EN}/results"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved to {save_dir}")

## Save to Dataset

In [None]:
# Install the Kaggle API client.
!pip install -q kaggle

!mkdir -p ~/.kaggle
# SECURITY: a commented-out `echo` line containing a literal Kaggle username and
# API key used to sit here. It has been removed; treat that key as compromised and
# rotate it. Credentials must only come from the private `kaggle-json` input below.
!cp /kaggle/input/kaggle-json/kaggle.json  ~/.kaggle/  # Assuming you uploaded it as kaggle-json dataset
# Restrict the credentials file to the owner.
!chmod 600 ~/.kaggle/kaggle.json
# Place a second copy under /root/.config/kaggle/ as well.
!mkdir -p /root/.config/kaggle/
!cp /kaggle/input/kaggle-json/kaggle.json /root/.config/kaggle/
!chmod 600 /root/.config/kaggle/kaggle.json

In [None]:
# Re-copy the credentials into /root/.config/kaggle and restrict permissions.
# NOTE(review): this repeats the last two commands of the previous cell — confirm
# whether this cell is still needed.
!cp /kaggle/input/kaggle-json/kaggle.json /root/.config/kaggle
!chmod 600 /root/.config/kaggle/kaggle.json

In [None]:
# Verify the credentials file exists in the input dataset and in both target folders.
!ls /kaggle/input/kaggle-json
!ls ~/.kaggle
!ls /root/.config/kaggle/

In [None]:
import os
import json
import shutil
from kaggle.api.kaggle_api_extended import KaggleApi
from datetime import datetime
import zipfile

def _resolve_kaggle_username(default_username):
    """Return the Kaggle username: kaggle.json overrides the default, KAGGLE_USERNAME overrides both."""
    username = default_username
    kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")
    if os.path.exists(kaggle_json_path):
        try:
            with open(kaggle_json_path) as f:
                creds = json.load(f)
            if isinstance(creds, dict):
                username = creds.get("username", username)
        except (OSError, ValueError) as e:
            # Best effort: an unreadable or malformed file falls back to the
            # default, but the reason is reported instead of silently swallowed.
            print(f"⚠️ Could not read '{kaggle_json_path}': {e}")
    return os.environ.get("KAGGLE_USERNAME", username)


def _zip_path_into(zip_path, source_path, top_level_name):
    """Zip a file or directory tree so every entry sits under a single `top_level_name/` folder."""
    source_root = os.path.normpath(source_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        if os.path.isdir(source_root):
            for root, _, files in os.walk(source_root):
                for f in files:
                    abs_p = os.path.join(root, f)
                    # Relative to the folder itself (not its parent), so the
                    # top-level folder name appears exactly once in the archive.
                    rel_p = os.path.relpath(abs_p, source_root)
                    zf.write(abs_p, arcname=os.path.join(top_level_name, rel_p))
        else:
            zf.write(source_root, arcname=top_level_name)


def upload_to_kaggle_dataset(dataset_name, folder_path, dataset_title=None,
                             override=True, version_append=True,
                             local_backup_path=None):
    """
    Zip a folder or file and upload it to a Kaggle dataset.

    The dataset is created if it is not found among the user's datasets;
    otherwise a new version is created. With version_append=True the files of
    the existing dataset are downloaded first so they are re-uploaded together
    with the new zip. If that download fails (e.g. private dataset), the
    contents of `local_backup_path` are used instead when available.

    Args:
        dataset_name (str): Name used to derive the dataset slug (lower-cased,
            "_" and spaces replaced by "-").
        folder_path (str): Folder or single file to zip and upload.
        dataset_title (str | None): Human-readable title; derived from
            dataset_name when omitted.
        override (bool): When a zip with the same name already exists in the
            staging directory, replace it (True) or skip the upload (False).
        version_append (bool): Keep the files of the existing dataset version.
        local_backup_path (str | None): Fallback source for the old files.

    Returns:
        None. Upload failures are reported with a message, not raised.

    Raises:
        FileNotFoundError: `folder_path` does not exist (checked before any
            network call is made).
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Nothing to upload: '{folder_path}' does not exist.")

    kaggle_username = _resolve_kaggle_username("hafsousaalilou")
    dataset_slug = dataset_name.lower().replace("_", "-").replace(" ", "-")
    dataset_id = f"{kaggle_username}/{dataset_slug}"
    if dataset_title is None:
        dataset_title = dataset_name.replace("-", " ").title()

    api = KaggleApi()
    api.authenticate()

    # Detect existence.
    # NOTE(review): this only inspects what a single dataset_list call returns;
    # if the listing is paginated, an existing dataset could be missed — confirm.
    dataset_exists = any(ds.ref.lower() == dataset_id.lower()
                         for ds in api.dataset_list(user=kaggle_username))

    # Fresh staging directory for everything that goes into the upload.
    temp_dir = f"/tmp/{dataset_slug}_upload"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    try:
        # 1) Download old version if required
        if dataset_exists and version_append:
            print(f"⬇️ Downloading existing dataset '{dataset_id}'…")
            try:
                api.dataset_download_files(dataset_id, path=temp_dir, unzip=True)
            except Exception as e:
                print(f"⚠️ dataset_download_files failed: {e}")
                if local_backup_path and os.path.exists(local_backup_path):
                    print(f"ℹ️ Falling back to local backup at '{local_backup_path}'")
                    shutil.copytree(local_backup_path, temp_dir, dirs_exist_ok=True)
                else:
                    print("❌ Cannot preserve old files—no download and no local backup.")

        # Debug: show what is in temp_dir before adding the new zip
        print("📂 temp_dir contents BEFORE zipping:")
        for name in sorted(os.listdir(temp_dir)):
            print("   ", name)

        # 2) Create new folder zip
        folder_basename = os.path.basename(os.path.normpath(folder_path))
        zip_path = os.path.join(temp_dir, f"{folder_basename}.zip")

        # Pre-zip conflict check (a zip of the same name came with the old files)
        if os.path.exists(zip_path):
            if override:
                os.remove(zip_path)
            else:
                print(f"⚠️ '{folder_basename}.zip' already exists—skipping.")
                return

        print(f"📦 Zipping '{folder_path}' → '{zip_path}'…")
        _zip_path_into(zip_path, folder_path, folder_basename)

        # 3) Final temp_dir debug
        print("📂 temp_dir contents BEFORE upload:")
        for name in sorted(os.listdir(temp_dir)):
            print("   ", name)

        # 4) Metadata file written into the staging directory for the Kaggle API
        meta = {"title": dataset_title, "id": dataset_id, "licenses":[{"name":"CC0-1.0"}]}
        with open(os.path.join(temp_dir, "dataset-metadata.json"), "w") as f:
            json.dump(meta, f, indent=2)

        # 5) Upload (best effort: failures are reported, not raised)
        try:
            if dataset_exists:
                api.dataset_create_version(
                    temp_dir,
                    version_notes=f"Append {folder_basename} @ {datetime.now()}",
                    convert_to_csv=False,
                    dir_mode="zip"
                )
            else:
                api.dataset_create_new(
                    temp_dir,
                    convert_to_csv=False,
                    dir_mode="zip"
                )
            print("✅ Upload successful.")
        except Exception as e:
            print(f"❌ Upload failed: {e}")
    finally:
        # Always remove the staging directory, including on early return or error.
        shutil.rmtree(temp_dir, ignore_errors=True)


In [None]:
# Upload this author's whole output folder as its own dataset, one dataset per
# target ID. version_append=False means files from any existing dataset version
# are not downloaded and carried over.
upload_to_kaggle_dataset(
    dataset_name=f"binary-style-classifier-models-{TARGET_ID}",
    folder_path=f"evaluation/ar_style_classifier/sc_{TARGET_ID}_{TARGET_EN}", # "/kaggle/working/output/evaluation/ar_style_classifier/results", 
    dataset_title=f"BSC Model author {TARGET_ID} {TARGET_EN}",
    override=True,
    version_append=False
)

## Inference

In [None]:
# # --- Inference on full validation texts ---
# def predict_full_text(text):
#     enc = tokenizer(
#         text,
#         truncation=True,
#         padding="max_length",
#         max_length=512,
#         stride=256,
#         return_overflowing_tokens=True,
#         return_tensors="pt"
#     ).to(device)
#     logits = model(**enc).logits  # [num_chunks, num_labels]
#     avg_logits = logits.mean(dim=0)
#     probs = torch.softmax(avg_logits, dim=0)
#     return torch.argmax(probs).item(), probs.cpu().tolist()

# # --- Evaluate across validation set ---
# preds, refs = [], []
# for example in val_df.itertuples():
#     pred, _ = predict_full_text(example.text_in_author_style)
#     preds.append(pred)
#     refs.append(author2id[example.author])

# from sklearn.metrics import accuracy_score, f1_score
# print("Validation Accuracy:", accuracy_score(refs, preds))
# print("Validation F1‑macro:", f1_score(refs, preds, average="macro"))



In [None]:
# import numpy as np
# import evaluate

# # Load metrics
# metric_acc = evaluate.load("accuracy")
# metric_f1 = evaluate.load("f1")

# # Sanity check for labels
# EXPECTED_NUM_CLASSES = len(unique_authors)  # make sure this is defined correctly


# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)

#     # Defensive check: Are labels within expected range?
#     if np.max(preds) >= EXPECTED_NUM_CLASSES or np.min(preds) < 0:
#         raise ValueError(f"Predictions contain invalid class indices: {np.unique(preds)}")

#     if np.max(p.label_ids) >= EXPECTED_NUM_CLASSES or np.min(p.label_ids) < 0:
#         raise ValueError(f"Label IDs contain invalid class indices: {np.unique(p.label_ids)}")

#     acc = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    
#     # Use macro average for multiclass tasks
#     # f1 = metric_f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"]

#     # binary F1 (pos_label=1)
#     f1    = metric_f1.compute(
#                 predictions=preds,
#                 references=p.label_ids,
#                 average="binary",
#                 pos_label=1
#              )["f1"]
    
#     return {"accuracy": acc, "f1": f1}



In [None]:
# # Dummy sanity check
# dummy_preds = np.random.randint(0, EXPECTED_NUM_CLASSES, size=100)
# dummy_labels = np.random.randint(0, EXPECTED_NUM_CLASSES, size=100)

# test_acc = metric_acc.compute(predictions=dummy_preds, references=dummy_labels)["accuracy"]
# test_f1 = metric_f1.compute(predictions=dummy_preds, references=dummy_labels, average="macro")["f1"]

# print(f"Sanity metric test - Accuracy: {test_acc:.4f}, F1 (macro): {test_f1:.4f}")
