# Kaggle Setup 🛠️
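The cells below copy the project files from the attached Kaggle dataset into `/kaggle/working` (only when that directory is still nearly empty) and change into it; a later cell adds the project root to `sys.path`. The Kaggle login/download and unzip steps are kept commented out.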

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# import kagglehub
# kagglehub.login()


In [None]:
# # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# # THEN FEEL FREE TO DELETE THIS CELL.
# # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# # NOTEBOOK.

# hafsousaalilou_arageneval2025_task1_sc_path = kagglehub.dataset_download('hafsousaalilou/arageneval2025-task1-sc')
# kaggelone_facebook_mbart_large_50_many_to_many_mmt_model_path = kagglehub.dataset_download('kaggelone/facebook-mbart-large-50-many-to-many-mmt-model')
# nojoomcolab_kaggle_json_path = kagglehub.dataset_download('nojoomcolab/kaggle-json')

# print('Data source import complete.')


In [None]:
# !unzip -o /kaggle/input/arageneval2025-task1-dataset/AraGenEval2025.zip -d /kaggle/working/output/
# %cd /kaggle/working/output/AraGenEval2025
# ------------------------------------------
# ------------------------------------------
# ------------------------------------------


import os

# Target directory that should end up holding the project structure.
path = "/kaggle/working"
# The directory counts as "empty" when it holds at most 3 entries.
# NOTE(review): presumably the tolerance covers files the platform creates on
# its own in the working directory — confirm the threshold is still right.
is_empty = (len(os.listdir(path)) <= 3)

# !cp -r /kaggle/input/arageneval2025-task1-sc/* /kaggle/working/

# Only copy the project from the input dataset on a fresh session so that an
# already-populated working directory is not overwritten.
if is_empty:
    print(f"✅ '{path}' is empty. copying project strcuture into it ...")
    !mkdir -p /kaggle/working/
    !cp -r /kaggle/input/arageneval2025-task1-sc/* /kaggle/working/
else:
    print(f"❌ '{path}' is not empty — contains {len(os.listdir(path))} items.")

In [None]:
# Change to project directory
%cd /kaggle/working/

# Confirm
!ls

# Colab Setup

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# # Navigate to project folder
# %cd /content/drive/MyDrive/AraGenEval2025
# !ls

In [None]:
# Add project root to Python path
import os
import sys
# Put the current working directory at the front of the import search path so
# that project modules (e.g. `config`) resolve before anything else.
project_root = os.getcwd()
sys.path.insert(0, project_root)
print("✅ Project root added to sys.path:", project_root)

# Fine-tuning mBART for Arabic Authorship Style Transfer (AraGenEval2025)
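This notebook loads the dataset, fine-tunes the seq2seq model selected in the **Model Config** section (mBART is one of the supported options), and saves the results. Mounting Google Drive is only needed on Colab; that cell is commented out above.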

## Dependencies & ENV

In [None]:
!pip uninstall -y transformers tokenizers sentencepiece
!pip install \
  transformers==4.52.4 protobuf==3.20.1 \
  tokenizers==0.21.2 \
  sentencepiece==0.1.99 --quiet

%pip install datasets==2.19.1 openpyxl==3.1.2 psutil==5.9.8 python-dotenv==1.0.1 --quiet

In [None]:
!pip install evaluate rouge_score sacrebleu --quiet

In [None]:
!rm -rf ~/.cache/huggingface

In [None]:
# import os
# os.kill(os.getpid(), 9)

In [None]:
import transformers
print(transformers.__version__)

## Reproducibility Configs

In [None]:
import random
import numpy as np
import torch
from transformers import set_seed

SEED = 42  # Or any number you prefer

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)  # Huggingface-specific for full reproducibility

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


## Device Check

In [None]:
import torch
import gc


print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

# --- Step 1: release cached CUDA memory (skipped when a TPU is in use) ------
# The whole step is best-effort: any failure is printed and execution continues.
try:
    # Check if CUDA is available AND torch_xla is NOT available (or not in TPU mode)
    # This prevents trying to clear CUDA memory when on TPU, which might cause issues
    using_tpu = False
    try:
        import torch_xla.runtime as xr
        # process_count() > 0 is used here as the "TPU present" signal.
        # NOTE(review): confirm this is actually False on hosts where torch_xla
        # is installed but no TPU is attached.
        if xr.process_count() > 0:
            using_tpu = True
            print("TPU detected — no need for CUDA memory clear")
        else:
             print("torch_xla installed but no TPU devices found.")
    except ImportError:
        print("torch_xla not installed.")


    if torch.cuda.is_available() and not using_tpu:
        torch.cuda.empty_cache()
        print("CUDA memory cleared.")

except Exception as e:
    # Catch potential errors during detection or clearing
    print(f"Error during pre-training memory cleanup: {e}")

gc.collect()

# --- Step 2: choose the global `device` used by the rest of the notebook ----
# Preference order: XLA device (when torch_xla imports and reports processes),
# then CUDA, then CPU.
try:
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr
    # Same availability heuristic as in step 1 (see the review note there).
    if xr.process_count() > 0:
         device = xm.xla_device()
         print(f'Using device: {device} (TPU)')
    else:
        # Fallback if torch_xla is installed but no TPU is connected/detected
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using device: {device}')
        if device.type == 'cuda':
            print(f'GPU: {torch.cuda.get_device_name(0)}')
except (ImportError, RuntimeError) as e:
    # Fallback if torch_xla is not installed or encounters a fundamental error
    print(f"Could not initialize TPU. Error: {e}")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    if device.type == 'cuda':
        print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
if device == 'cuda':
  !nvidia-smi

## Loading Dataset

In [None]:
from config import Config

print(Config.TRAIN_FILE)
print(Config.VAL_FILE)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import gc

def load_and_sample_dataset(
    file_path,
    sample_mode="all",    # Options: "all", "random", "stratified"
    sample_size=None,     # For "random"/"stratified": float (proportion) or int (absolute count)
    random_state=42,
    stratify_col="author",
):
    """
    Load a dataset file and optionally down-sample it.

    Files whose name ends in ``.csv`` are read with ``pandas.read_csv``;
    everything else is read as Excel through ``openpyxl`` (the original
    behaviour).

    Args:
        file_path (str): Path to the dataset file.
        sample_mode (str): "all" (no sampling), "random", or "stratified".
        sample_size (float|int): Proportion (between 0 and 1) or absolute row
            count. Required for "random" and "stratified"; ignored for "all".
        random_state (int): Random seed for reproducibility.
        stratify_col (str): Column whose distribution is preserved in
            "stratified" mode. Defaults to "author".

    Returns:
        pd.DataFrame: The full or sampled dataset. Sampled frames keep the
        row index of the source file.

    Raises:
        ValueError: If ``sample_mode`` is unknown, or ``sample_size`` is
            missing or out of range for the chosen mode.
    """
    print(f"📂 Loading dataset from: {file_path}")
    if str(file_path).lower().endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path, engine='openpyxl')

    if sample_mode == "all":
        # `df` is local to this function, so no defensive copy is needed.
        sampled_df = df
        print(f"✅ Loaded full dataset with {len(sampled_df)} samples.")

    elif sample_mode == "random":
        if sample_size is None:
            raise ValueError("For 'random' mode, SAMPLE_SIZE must be set.")
        # Integers are absolute counts; anything else is a proportion.
        if isinstance(sample_size, int):
            n_rows = sample_size
        else:
            n_rows = int(len(df) * sample_size)
        if not 0 <= n_rows <= len(df):
            raise ValueError(
                f"For 'random' mode, SAMPLE_SIZE resolves to {n_rows} rows "
                f"but the dataset has {len(df)} rows."
            )
        sampled_df = df.sample(n=n_rows, random_state=random_state)
        print(f"✅ Randomly sampled {len(sampled_df)} samples.")

    elif sample_mode == "stratified":
        # train_test_split accepts either a proportion or an absolute count,
        # and needs at least one row left over for the discarded split.
        is_count = isinstance(sample_size, int) and not isinstance(sample_size, bool)
        upper_bound = len(df) if is_count else 1
        if sample_size is None or not (0 < sample_size < upper_bound):
            raise ValueError(
                "For 'stratified' mode, SAMPLE_SIZE must be a proportion between 0 and 1 "
                "or a row count between 1 and len(dataset) - 1."
            )
        sampled_df, _ = train_test_split(
            df,
            train_size=sample_size,
            stratify=df[stratify_col],
            random_state=random_state
        )
        print(f"✅ Stratified sampled {len(sampled_df)} samples ({stratify_col} distribution preserved).")

    else:
        raise ValueError(f"Invalid sample_mode: {sample_mode}")

    # Drop the local reference and collect so the unsampled frame can be freed.
    del df
    gc.collect()

    return sampled_df


In [None]:
from datasets import Dataset

# === CONFIG ===
# NOTE: these two settings are currently NOT applied — the keyword arguments
# that would pass them to load_and_sample_dataset are commented out below, so
# the function defaults are used (sample_mode="all", i.e. the full dataset).
SAMPLE_MODE = "stratified"  # "all", "random", "stratified"
SAMPLE_SIZE = 0.1           # 10% for stratified or random. Set to None for full dataset.

# === LOAD AND SAMPLE ===
train_df = load_and_sample_dataset(
    Config.TRAIN_FILE,
    # sample_mode=SAMPLE_MODE,
    # sample_size=SAMPLE_SIZE
)

# === ADD REQUIRED COLUMNS ===
# Model input is "<author> [SEP] <MSA text>"; the target is the same passage in the author's style.
# astype(str) means missing cells become literal strings rather than NaN.
train_df['input_text'] = train_df['author'].astype(str) + ' [SEP] ' + train_df['text_in_msa'].astype(str)
train_df['target_text'] = train_df['text_in_author_style'].astype(str)

# === CONVERT TO HF DATASET ===
# Only the two text columns are carried over into the Hugging Face dataset.
train_dataset = Dataset.from_pandas(train_df[['input_text', 'target_text']])

print(f"✅ Final dataset ready with {len(train_dataset)} samples.")


In [None]:
import os

print(Config.MODEL_WEIGHTS_FOLDER)

Config.MODEL_WEIGHTS_FOLDER = 'models/fine_tuned_model/model_weights' # os.path.dirname(Config.MODEL_WEIGHTS_FOLDER)

print(Config.MODEL_WEIGHTS_FOLDER)

In [None]:
from config import Config

!ls -R {Config.MODEL_WEIGHTS_FOLDER}

## Model Config

In [None]:
def safe_model_name(model_name):
    """Return `model_name` with every "/" turned into "_", so the id can be used as one path component."""
    return "_".join(model_name.split("/"))

In [None]:
import os
from config import Config
# MODEL CONFIGURATION CELL — Modify only this block to switch models
# Other model ids used with this notebook:
#   "UBC-NLP/AraT5v2-base-1024", "UBC-NLP/AraT5-base", google/mt5-small,
#   flax-community/arabic-t5-small, CAMeL-Lab/araT5-base, CAMeL-Lab/araT5-large,
#   facebook/mbart-large-50-many-to-many-mmt, "moussaKam/AraBART"
MODEL_CONFIG = {
    # The separating comma must sit before any trailing comment; with the comma
    # inside the comment this dict literal is a SyntaxError.
    "name": "sultan/ArabicT5-49GB-base",
    "type": "arabic-t5",               # Options: mt5, arabic-t5, mbart
    "input_max_len": 1024,             # max source length passed to the tokenizer
    "target_max_len": 1024,            # max target length passed to the tokenizer
    "legacy": False                    # intended tokenizer `legacy` flag
}

# Set to True to resume from the latest checkpoint; False starts fresh.
# NOTE: the training-launch cell assigns RESUME_TRAINING again before using it.
RESUME_TRAINING = False
# Flag meant to trigger clearing of the model-weights directory before training.
START_FRESH= True
# Per-model checkpoint directory (same path that is used as the trainer's output_dir).
CHECKPOINT_DIR = os.path.join(Config.MODEL_WEIGHTS_FOLDER, safe_model_name(MODEL_CONFIG["name"]))
print(CHECKPOINT_DIR)

In [None]:
!ls -R {CHECKPOINT_DIR}

In [None]:
print(MODEL_CONFIG["name"])
print(CHECKPOINT_DIR)

START_FRESH = 0
if START_FRESH :
  pass
    # !rm -r models/fine_tuned_model/model_weights/*
    # !rm -r models/fine_tuned_model/model_weights_final
    # !rm *.zip

    # !cp -r /kaggle/input/facebook-mbart-large-50-many-to-many-mmt-model/* {CHECKPOINT_DIR}

In [None]:
from config import Config

!ls -R models/fine_tuned_model/

## Tokenization

In [None]:
from transformers import AutoTokenizer

tokenizer_args = {"use_fast": False}
tokenizer_args["legacy"] = False  # MODEL_CONFIG["legacy"]

tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG["name"], **tokenizer_args)

if MODEL_CONFIG["type"] == "mbart":
    tokenizer.src_lang = 'ar_AR'
    tokenizer.tgt_lang = 'ar_AR'

input_max_len = MODEL_CONFIG["input_max_len"]
target_max_len = MODEL_CONFIG["target_max_len"]

print(f"✅ Loaded tokenizer for {MODEL_CONFIG['name']}")

def tokenize_data(example):
    """
    Tokenize a single example into model inputs with a `labels` entry.

    Reads the notebook-level `tokenizer`, `input_max_len` and `target_max_len`.
    `example['input_text']` becomes the encoder input and
    `example['target_text']` becomes the labels, with pad token ids replaced
    by -100.
    """
    # Source side: keeps whatever fields the tokenizer returns.
    encoded = tokenizer(
        example['input_text'],
        max_length=input_max_len,
        truncation=True,
        padding='longest'
    )

    # Target side: only the token ids are needed.
    target_ids = tokenizer(
        example['target_text'],
        max_length=target_max_len,
        truncation=True,
        padding='longest'
    )['input_ids']

    # Swap pad ids for -100 in the labels.
    pad_id = tokenizer.pad_token_id
    encoded['labels'] = [-100 if tok == pad_id else tok for tok in target_ids]

    return encoded

train_dataset = train_dataset.map(
    tokenize_data,
    batched=False,            # one example at a time
    num_proc=4,
    remove_columns=train_dataset.column_names
)
print('Tokenization complete.')


In [None]:
# grab one example from the mapped dataset
example = train_dataset[0]
print("type(input_ids)  ▶",  type(example["input_ids"]))
print("len(input_ids)   ▶",  len(example["input_ids"]))
print("type(labels)     ▶",  type(example["labels"]))
print("unique label ids ▶", set(example["labels"]))


In [None]:
# 1. What columns do we actually have?
print("Columns:", train_dataset.column_names)

# 2. Peek at the very first example
ex0 = train_dataset[0]
print("Example 0 keys  ▶", ex0.keys())
print("Example 0 input_ids  ▶", ex0["input_ids"][:10], "…", len(ex0["input_ids"]))
print("Example 0 labels     ▶", ex0["labels"][:10], "…", len(ex0["labels"]))

# # 3. Decode to human‐readable
# from transformers import logging
# logging.set_verbosity_error()  # silence warnings
# print("Decoded input  ▶", tokenizer.decode(ex0["input_ids"], skip_special_tokens=True))
# print("Decoded target ▶", tokenizer.decode(
#     [tok for tok in ex0["labels"] if tok != -100],
#     skip_special_tokens=True
# ))


In [None]:
print(input_max_len)

## Evaluation

In [None]:
import evaluate

chrf_metric = evaluate.load("chrf")
bleu_metric = evaluate.load("bleu")

def compute_metrics(eval_preds):
    """
    Decode predictions and labels and score them with BLEU and chrF.

    Uses the notebook-level `tokenizer`, `bleu_metric` and `chrf_metric`.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id sequences. If
            ``preds`` is a tuple, its first element is used.
            NOTE(review): this decodes token ids, so it presumably needs
            generated sequences rather than logits — confirm the trainer is
            configured accordingly before enabling it.

    Returns:
        dict: ``{"bleu": <BLEU value>, "chrf": <chrF score>}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 is not a real token id and cannot be decoded; map it back to the pad id.
        return [[(tok if tok != -100 else pad_id) for tok in seq] for seq in sequences]

    decoded_preds = tokenizer.batch_decode(_restore_padding(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_restore_padding(labels), skip_special_tokens=True)
    references = [[ref] for ref in decoded_labels]

    bleu = bleu_metric.compute(predictions=decoded_preds, references=references)
    chrf = chrf_metric.compute(predictions=decoded_preds, references=references)

    # The "bleu" metric reports its value under the "bleu" key (there is no
    # "score" key); fall back to "score" in case a sacrebleu-style metric is
    # loaded into `bleu_metric` instead.
    bleu_value = bleu["bleu"] if "bleu" in bleu else bleu["score"]

    return {"bleu": bleu_value, "chrf": chrf["score"]}


## Fine tuning

In [None]:
import torch
import gc
gc.collect()
torch.cuda.empty_cache()
# torch.cuda.ipc_collect()
# torch.cuda.empty_cache()

In [None]:
# Model loading

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import psutil
from datetime import datetime
import time
from pathlib import Path
import os


from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CONFIG["name"])
model.to(device)

print(f"✅ Loaded model {MODEL_CONFIG['name']} on {device}")

# ⚙️ Disable use_cache and enable gradient checkpointing
model.config.use_cache = False
# model.gradient_checkpointing_disable()
model.gradient_checkpointing_enable()


## batch size calculation
# Gradient accumulation makes up the difference between the per-device batch
# and the desired effective batch.
effective_batch_size = 32
per_device_train_batch_size = 4
gradient_accumulation_steps = effective_batch_size // per_device_train_batch_size
# → 32 // 4 = 8


## training args
training_args = Seq2SeqTrainingArguments(
    # Same per-model directory the notebook uses as CHECKPOINT_DIR.
    output_dir=os.path.join(Config.MODEL_WEIGHTS_FOLDER,safe_model_name(MODEL_CONFIG["name"])),
    ## Batch Size & Accumulation
    num_train_epochs=3,               # Number of passes over the training set
    max_steps=-1,                     # -1: let num_train_epochs determine the total number of steps
    per_device_train_batch_size=per_device_train_batch_size,    # Keep batch size low due to long sequences
    gradient_accumulation_steps=gradient_accumulation_steps,    # Accumulate up to effective_batch_size (32)
    learning_rate=5e-5,               # Standard fine-tuning LR
    ## Checkpointing & Logging
    # NOTE: save_steps / logging_steps are reassigned on trainer.args by a later cell.
    logging_strategy="steps",     # log every `logging_steps` steps
    logging_steps=1,                 # Log every step
    logging_first_step=True,
    save_total_limit=1,               # Keep only the most recent checkpoint
    save_strategy="steps",
    save_steps=1000,
    report_to='none',
    ## Evaluation
    predict_with_generate=False,
    # eval_accumulation_steps=2,
    # eval_strategy="steps",
    # eval_steps=40,
    # per_device_eval_batch_size=4,
    ## optimization
    fp16=False,                       # Mixed-precision (fp16) training disabled

    dataloader_num_workers=4,         # Some parallelism for speed
    dataloader_pin_memory=True,
    # NOTE(review): gradient checkpointing is switched on directly on the model
    # earlier in this cell (model.gradient_checkpointing_enable()); confirm that
    # passing False here is intended and does not undo it.
    gradient_checkpointing = False,
    optim='adamw_torch',
    max_grad_norm=1.0,
    # auto_find_batch_size=True  # Let trainer avoid memory issues
)

print(f'Memory before training: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB')


from transformers import DataCollatorForSeq2Seq

# Use proper data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100  # <— make sure pad tokens in labels become -100
)


from transformers import TrainerCallback, TrainerState, TrainerControl
import shutil
import os


# Then pass it to your trainer:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    # eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
    # callbacks=[SaveToDriveCallback]
    # callbacks=[PrintLossCallback]
)


print(f'Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')

In [None]:
# verify some batches strcuture before training

from torch.utils.data import DataLoader
loader = DataLoader(train_dataset, batch_size=2, collate_fn=data_collator)
batch = next(iter(loader))
# print("Unique labels:", torch.unique(batch['labels']))
print("Count of -100:", (batch['labels']==-100).sum().item())
print("Count of pad_id=1:", (batch['labels']==tokenizer.pad_token_id).sum().item())

In [None]:
from math import ceil

# Read the values back from the training arguments actually in use
num_samples = len(train_dataset)
per_device_train_batch_size = training_args.per_device_train_batch_size
gradient_accumulation_steps = training_args.gradient_accumulation_steps
num_epochs = training_args.num_train_epochs

# Compute effective batch size and total steps
# NOTE(review): the number of devices is not factored in here — confirm the
# estimate for multi-GPU sessions.
effective_bs = per_device_train_batch_size * gradient_accumulation_steps
steps_per_epoch = ceil(num_samples / effective_bs)
total_steps = steps_per_epoch * num_epochs

# # Compute save_steps programmatically
# try:
#     total_steps = training_args.max_steps
# except NameError:
#     total_steps = 0  # If not defined yet

print(f"📊 {steps_per_epoch=} steps/epoch, {total_steps=} total")

# Heuristic: a tenth of the total steps (at least 100) for runs longer than
# 100 steps, otherwise 1000.
if total_steps > 100:
    computed_save_steps = max(100, total_steps // 10)
else:
    computed_save_steps = 1000

# Manual override: this fixed value replaces the heuristic result above.
computed_save_steps = 500

# trainer.args is updated in place, so the new interval applies to the
# already-constructed trainer.
trainer.args.save_steps = computed_save_steps

print(f"💾 Checkpoints every {trainer.args.save_steps} steps")

#--------------------------------
#--------------------------------

# Ensure logging during training: log once every 10 accumulation cycles' worth of steps
trainer.args.logging_strategy = "steps"
trainer.args.logging_steps = trainer.args.gradient_accumulation_steps * 10

In [None]:
print(next(model.parameters()).device)  # Should be 'cuda:0'
#print(model.trainer)  # Should be True


## Training Launch

In [None]:
from datetime import datetime
import time
import os
from config import Config

# torch.cuda.empty_cache()

print(f"📢 Starting training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

start_time = time.time()


# Detect checkpoint
CHECKPOINT_DIR = os.path.join(Config.MODEL_WEIGHTS_FOLDER,safe_model_name(MODEL_CONFIG["name"]))
# NOTE: this assignment overrides the RESUME_TRAINING value from the Model Config cell.
RESUME_TRAINING=1


def find_latest_checkpoint(checkpoint_dir):
    """
    Return the path of the checkpoint to resume from, or None.

    Entries of `checkpoint_dir` named ``checkpoint-<int>`` are preferred and
    the one with the highest step wins. If none exists, the
    ``checkpoint-interrupted`` entry (the name this cell's KeyboardInterrupt
    handler saves under) is used when present.

    Args:
        checkpoint_dir (str): Directory holding the ``checkpoint-*`` folders.

    Returns:
        str | None: Full path of the chosen checkpoint, or None when the
        directory is missing or contains no usable checkpoint.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    numbered = []
    has_interrupted = False
    for entry in os.listdir(checkpoint_dir):
        if not entry.startswith("checkpoint-"):
            continue
        if entry == "checkpoint-interrupted":
            has_interrupted = True
            continue
        try:
            numbered.append((int(entry.split("-")[-1]), entry))
        except ValueError:
            # Suffix is not a step number; ignore this entry.
            continue

    if numbered:
        return os.path.join(checkpoint_dir, max(numbered, key=lambda item: item[0])[1])
    if has_interrupted:
        return os.path.join(checkpoint_dir, "checkpoint-interrupted")
    return None


last_checkpoint = None

if RESUME_TRAINING:
    if os.path.isdir(CHECKPOINT_DIR):
        last_checkpoint = find_latest_checkpoint(CHECKPOINT_DIR)
        if last_checkpoint:
            print(f"✅ Found checkpoint at {last_checkpoint}. Will resume training from there.")
        else:
            print("🆕 No valid checkpoint found. Starting fresh training.")
    else:
        print("⚠️ Checkpoint directory doesn't exist. Starting from scratch.")
else:
    print("🆕 Starting fresh training (no checkpoint resume).")

# Start training
# Resumes from `last_checkpoint` when one was selected above, otherwise trains
# from the freshly loaded model. A manual interrupt (Ctrl-C / "stop" button) is
# caught so the current model can still be written to disk.
try:
    if last_checkpoint:
        train_output = trainer.train(
            resume_from_checkpoint=last_checkpoint,
            # ignore_keys_for_eval=["optimizer", "scheduler"]
        )
    else:
        train_output = trainer.train()

    # Wall-clock duration measured from `start_time` set at the top of this cell.
    total_time = time.time() - start_time
    print(f"✅ Training completed in {total_time/3600:.2f} hours")

    if train_output.metrics:
        print("\n📊 Final Training Metrics:")
        # The ':.4f' format assumes every metric value is numeric.
        for key, value in train_output.metrics.items():
            print(f" {key}: {value:.4f}")

except KeyboardInterrupt:
    print("\n⚠️ Training interrupted by user. Saving checkpoint...")
    # Saved next to the regular checkpoints under a fixed, non-numeric name.
    # NOTE(review): save_model presumably stores the model only (no optimizer /
    # scheduler state) — confirm before relying on it for an exact resume.
    interrupted_path = os.path.join(CHECKPOINT_DIR, "checkpoint-interrupted")
    trainer.save_model(interrupted_path)
    print(f"💾 Checkpoint saved at {interrupted_path}")


## Saving Results

In [None]:
# Save final model to Drive
# from pathlib import Path
import os

MODEL_WEIGHT_FINAL_DIR = os.path.join( Config.MODEL_WEIGHTS_FOLDER , safe_model_name(MODEL_CONFIG["name"]) , 'final' )
trainer.save_model(MODEL_WEIGHT_FINAL_DIR)
print(f'✅ Model saved TO {MODEL_WEIGHT_FINAL_DIR}')

## Backup

In [None]:
# !zip -r'{MODEL_WEIGHT_FINAL_DIR}.zip' MODEL_WEIGHT_FINAL_DIR

In [None]:
# from IPython.display import FileLink
# FileLink(F'{MODEL_WEIGHT_FINAL_DIR}.zip')

In [None]:
!ls /kaggle/input/kaggle-json

In [None]:
!pip install -q kaggle

# Install the Kaggle API credentials from the attached `kaggle-json` dataset.
!mkdir -p ~/.kaggle
# SECURITY: a literal Kaggle API key used to be embedded in a commented-out
# command on this line. It has been removed; the exposed key must be
# revoked/rotated. Never paste credentials into the notebook.
!cp /kaggle/input/kaggle-json/kaggle.json ~/.kaggle/  # Assuming you uploaded it as kaggle-json dataset
!chmod 600 ~/.kaggle/kaggle.json
# Second copy under /root/.config, also restricted to the owner.
!mkdir -p /root/.config
!cp /kaggle/input/kaggle-json/kaggle.json /root/.config/  # Assuming you uploaded it as kaggle-json dataset
!chmod 600 /root/.config/kaggle.json

In [None]:
import os
import json
import shutil
from kaggle.api.kaggle_api_extended import KaggleApi
from datetime import datetime
import zipfile

def upload_to_kaggle_dataset(dataset_name, folder_path, dataset_title=None,
                             override=True, version_append=True,
                             local_backup_path=None):
    """
    Zip a folder (or single file) and upload it to a Kaggle dataset.

    The archive is always laid out as ``<basename>/<relative path>`` whether
    or not ``folder_path`` ends with a path separator. The staging directory
    under /tmp is removed on every exit path.

    Args:
        dataset_name (str): Name used to build the dataset slug (lower-cased,
            "_" and spaces replaced by "-").
        folder_path (str): Folder or file to upload.
        dataset_title (str | None): Dataset title; derived from
            ``dataset_name`` when omitted.
        override (bool): If an archive with the same name already exists in
            the staging directory, replace it (True) or skip the upload (False).
        version_append (bool): When the dataset already exists, download its
            current files first so the new version keeps them.
        local_backup_path (str | None): Folder copied into the staging
            directory when downloading the existing dataset fails.

    Returns:
        None. Upload failures are reported by printing, not by raising.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Nothing to upload: '{folder_path}' does not exist.")

    # Username precedence: KAGGLE_USERNAME env var > kaggle.json > built-in default.
    kaggle_username = "kaggelone"
    kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")
    if os.path.exists(kaggle_json_path):
        try:
            with open(kaggle_json_path) as f:
                creds = json.load(f)
            kaggle_username = creds.get("username", kaggle_username)
        except (OSError, ValueError, AttributeError) as e:
            # Best effort: keep the default username, but say why.
            print(f"⚠️ Could not read username from '{kaggle_json_path}': {e}")
    kaggle_username = os.environ.get("KAGGLE_USERNAME", kaggle_username)
    dataset_slug = dataset_name.lower().replace("_", "-").replace(" ", "-")
    dataset_id = f"{kaggle_username}/{dataset_slug}"
    if dataset_title is None:
        dataset_title = dataset_name.replace("-", " ").title()

    api = KaggleApi()
    api.authenticate()

    # Detect existence
    # NOTE(review): this only inspects what a single dataset_list call returns;
    # confirm it covers accounts with many datasets.
    dataset_exists = any(ds.ref.lower() == dataset_id.lower()
                         for ds in api.dataset_list(user=kaggle_username))

    temp_dir = f"/tmp/{dataset_slug}_upload"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    try:
        # 1) Download old version if required
        if dataset_exists and version_append:
            print(f"⬇️ Downloading existing dataset '{dataset_id}'…")
            try:
                api.dataset_download_files(dataset_id, path=temp_dir, unzip=True)
            except Exception as e:
                print(f"⚠️ dataset_download_files failed: {e}")
                if local_backup_path and os.path.exists(local_backup_path):
                    print(f"ℹ️ Falling back to local backup at '{local_backup_path}'")
                    shutil.copytree(local_backup_path, temp_dir, dirs_exist_ok=True)
                else:
                    print("❌ Cannot preserve old files—no download and no local backup.")

        # Debug: show what's in temp_dir before adding new zip
        print("📂 temp_dir contents BEFORE zipping:")
        for name in sorted(os.listdir(temp_dir)):
            print("   ", name)

        # 2) Create new folder zip
        # Normalise once so a trailing separator cannot change the archive layout.
        source_path = os.path.normpath(folder_path)
        folder_basename = os.path.basename(source_path)
        zip_path = os.path.join(temp_dir, f"{folder_basename}.zip")

        # Pre-zip conflict check
        if os.path.exists(zip_path):
            if override:
                os.remove(zip_path)
            else:
                print(f"⚠️ '{folder_basename}.zip' already exists—skipping.")
                return

        print(f"📦 Zipping '{folder_path}' → '{zip_path}'…")
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            if os.path.isdir(source_path):
                for root, _, files in os.walk(source_path):
                    for fname in files:
                        abs_p = os.path.join(root, fname)
                        # Relative to the folder itself, then re-rooted under its name.
                        rel_p = os.path.relpath(abs_p, source_path)
                        zf.write(abs_p, arcname=os.path.join(folder_basename, rel_p))
            else:
                zf.write(source_path, arcname=folder_basename)

        # 3) Final temp_dir debug
        print("📂 temp_dir contents BEFORE upload:")
        for name in sorted(os.listdir(temp_dir)):
            print("   ", name)

        # 4) Metadata
        meta = {"title": dataset_title, "id": dataset_id, "licenses":[{"name":"CC0-1.0"}]}
        with open(os.path.join(temp_dir, "dataset-metadata.json"), "w") as f:
            json.dump(meta, f, indent=2)

        # 5) Upload
        try:
            if dataset_exists:
                api.dataset_create_version(
                    temp_dir,
                    version_notes=f"Append {folder_basename} @ {datetime.now()}",
                    convert_to_csv=False,
                    dir_mode="zip"
                )
            else:
                api.dataset_create_new(
                    temp_dir,
                    convert_to_csv=False,
                    dir_mode="zip"
                )
            print("✅ Upload successful.")
        except Exception as e:
            print(f"❌ Upload failed: {e}")
    finally:
        # Always remove the staging directory, including on early return or error.
        shutil.rmtree(temp_dir, ignore_errors=True)


In [None]:
!ls .

In [None]:
# Upload one model folder as its own Kaggle dataset (no merge with older versions).
# NOTE(review): the dataset name and folder below are hard-coded for a
# UBC-NLP/AraT5v2-base-1024 run, while MODEL_CONFIG["name"] in the Model Config
# cell points at "sultan/ArabicT5-49GB-base" — confirm the folder exists and the
# dataset name is the intended one before running.
upload_to_kaggle_dataset(
    dataset_name="UBC-NLP-raT5v2-base-1024-model",
    folder_path="/kaggle/working/models/fine_tuned_model/model_weights/UBC-NLP_AraT5v2-base-1024/", # "/kaggle/working/output/evaluation/ar_style_classifier/results",
    dataset_title="UBC-NLP_AraT5v2-base-1024 AST model",
    override=True,
    version_append=False
)

In [None]:
# !zip -r checkpoint-3000.zip /kaggle/working/output/models/fine_tuned_model/model_weights/google_mt5-small/facebook_mbart-large-50-many-to-many-mmt/checkpoint-3000

In [None]:
# from IPython.display import FileLink
# FileLink(r'checkpoint-3000.zip')


In [None]:
# ! mv /kaggle/working/output/models/fine_tuned_model/model_weights/google_mt5-small/facebook_mbart-large-50-many-to-many-mmt/checkpoint-3000 /kaggle/working/output/models/fine_tuned_model/model_weights/facebook_mbart-large-50-many-to-many-mmt/checkpoint-3000

In [None]:
# !rm -r /kaggle/working/output/models/fine_tuned_model/model_weights/google_mt5-small/facebook_mbart-large-50-many-to-many-mmt/checkpoint-2000

In [None]:
# pwd

In [None]:
# from transformers import snapshot_download
# folder = snapshot_download(MODEL_CONFIG["name"])
# !ls {folder}

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
# model     = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# print("pad_token_id:", tokenizer.pad_token_id)
# print("eos_token_id:", tokenizer.eos_token_id)


In [None]:
# from config import Config

# val_df = load_and_sample_dataset(Config.VAL_FILE, sample_mode="all")
# val_df['input_text'] = val_df['author'].astype(str) + ' [SEP] ' + val_df['text_in_msa'].astype(str)
# val_df['target_text'] = val_df['text_in_author_style'].astype(str)

# val_dataset = Dataset.from_pandas(val_df[['input_text', 'target_text']])

# # 1. Update tokenize_data to work in batched mode:
# def tokenize_data(examples):
#     # examples['input_text'] is a list of str
#     src = tokenizer(
#         examples['input_text'],
#         max_length=input_max_len,
#         truncation=True,
#         padding='max_length'
#     )
#     tgt = tokenizer(
#         examples['target_text'],
#         max_length=target_max_len,
#         truncation=True,
#         padding='max_length'
#     )['input_ids']
#     # replace pad tokens in each sequence
#     labels = [
#         [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
#         for seq in tgt
#     ]
#     return {'input_ids': src['input_ids'],
#             'attention_mask': src['attention_mask'],
#             'labels': labels}

# # 2. Map on validation dataset using batched mapping:
# val_dataset = val_dataset.map(
#     tokenize_data,
#     batched=True,         # process batches at once
#     batch_size=64,        # adjust as RAM/Vram allows
#     num_proc=4,           # parallel processes
#     remove_columns=val_dataset.column_names
# )
# print("✅ Validation tokenization complete")


In [None]:
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# tokeniz = AutoTokenizer.from_pretrained(MODEL_CONFIG["name"], use_fast=False)
# model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CONFIG["name"]).cuda()
# torch.cuda.empty_cache()

# def peak_mem(batch_size):
#     input_ids = torch.randint(
#         low=0,
#         high=tokeniz.vocab_size,
#         size=(batch_size, MODEL_CONFIG["input_max_len"]),
#         dtype=torch.long,
#         device="cuda"
#     )
#     attention_mask = torch.ones_like(input_ids)

#     # For T5, decoder inputs must be provided; we can reuse input_ids as a stand‑in
#     decoder_input_ids = input_ids.clone()

#     try:
#         with torch.no_grad():
#             _ = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 decoder_input_ids=decoder_input_ids
#             )
#         peak = torch.cuda.max_memory_allocated() / (1024**3)
#         return f"{peak:.2f} GB"
#     except RuntimeError as e:
#         return f"OOM at BS={batch_size}"

# for bs in [2,4,8, 16, 24, 32, 48]:
#     torch.cuda.reset_peak_memory_stats()
#     print(f"BS={bs:<2} → peak GPU used:", peak_mem(bs))


In [None]:
# from torch.utils.data import DataLoader
# dl = DataLoader(
#   train_dataset,
#   batch_size=2,
#   collate_fn=data_collator
# )
# batch = next(iter(dl))

# print("input_ids:", batch["input_ids"].shape)
# print("labels:   ", batch["labels"].shape)
# print("unique labels:", torch.unique(batch["labels"]))



In [None]:
# outputs = model(
#   input_ids      = batch["input_ids"].to(device),
#   attention_mask = batch["attention_mask"].to(device),
#   labels         = batch["labels"].to(device),
#   return_dict=True
# )
# print("Model-reported loss:", outputs.loss)
# print("Logits shape:", outputs.logits.shape)


In [None]:
# batch = next(iter(dl))
# model.train()
# outputs = model(
#   input_ids      = batch["input_ids"].to(device),
#   attention_mask = batch["attention_mask"].to(device),
#   labels         = batch["labels"].to(device),
# )
# loss = outputs.loss
# loss.backward()
# print("Manual step loss:", loss.item())


In [None]:
# batch = next(iter(train_dataloader))
# print("Unique label IDs:", set(batch["labels"].tolist()))
# # You should see {-100, … actual token IDs …}, not {0, …}
# print("Count of -100 positions:", (batch["labels"] == -100).sum())

## Save Callback

In [None]:
# !pip install -q kaggle

# !mkdir -p ~/.kaggle
# # SECURITY: a literal Kaggle API key used to be embedded here; it was removed and must be revoked/rotated. Provide credentials through the kaggle-json dataset (below) instead.
# !cp /kaggle/input/kaggle-json/kaggle.json ~/.kaggle/  # Assuming you uploaded it as kaggle-json dataset
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
# import os
# import shutil
# import zipfile
# from datetime import datetime

# def is_valid_checkpoint_folder(folder_path):
#     """Check if a folder is a valid HuggingFace checkpoint."""
#     required_files = {"config.json", "pytorch_model.bin", "trainer_state.json"}
#     files_present = set(os.listdir(folder_path))
#     return required_files.issubset(files_present)

# def get_most_recent_checkpoint(parent_folder):
#     """Return the path to the most recently modified valid checkpoint directory."""
#     checkpoint_folders = [
#         os.path.join(parent_folder, d) for d in os.listdir(parent_folder)
#         if os.path.isdir(os.path.join(parent_folder, d)) and d.startswith("checkpoint-")
#     ]
#     valid_folders = [f for f in checkpoint_folders if is_valid_checkpoint_folder(f)]
#     if not valid_folders:
#         return None
#     # Pick the folder with the latest modification time
#     return max(valid_folders, key=os.path.getmtime)

# #
# # Callback function
# def checkpoint_save_callback(checkpoint_root_dir, kaggle_dataset_name, kaggle_dataset_title):
#     """
#     Finds latest checkpoint and uploads it to Kaggle dataset.
#     Call this after a checkpoint is saved.
#     """
#     checkpoint_path = get_most_recent_checkpoint(checkpoint_root_dir)
#     if checkpoint_path:
#         print(f"📦 Found latest valid checkpoint at: {checkpoint_path}")
#         upload_to_kaggle_dataset(
#             dataset_name=kaggle_dataset_name,
#             folder_path=checkpoint_path,
#             dataset_title=kaggle_dataset_title,
#             override=True,
#             version_append=False
#         )
#     else:
#         print("⚠️ No valid checkpoint found to upload.")


In [None]:
# from transformers import TrainerCallback, TrainerState, TrainerControl, TrainingArguments
# import os
# import time
# import traceback

# class SaveToKaggleCallback(TrainerCallback):
#     def __init__(self, checkpoint_root_dir, kaggle_dataset_name, kaggle_dataset_title):
#         self.checkpoint_root_dir = checkpoint_root_dir
#         self.kaggle_dataset_name = kaggle_dataset_name
#         self.kaggle_dataset_title = kaggle_dataset_title

#     def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
#         try:
#             print(f"🛠️ Running SaveToKaggleCallback at step {state.global_step}")
#             latest_ckpt = self._get_latest_complete_checkpoint(self.checkpoint_root_dir)
#             if latest_ckpt:
#                 print(f"📦 Found checkpoint: {latest_ckpt}")
#                 upload_to_kaggle_dataset(
#                     dataset_name=self.kaggle_dataset_name,
#                     folder_path=latest_ckpt,
#                     dataset_title=self.kaggle_dataset_title,
#                     override=True,
#                     version_append=False
#                 )
#                 print(f"✅ Uploaded {latest_ckpt} to Kaggle")
#             else:
#                 print("⚠️ No complete checkpoint found to upload.")
#         except Exception as e:
#             print(f"❌ [SaveToKaggleCallback] Error occurred:\n{traceback.format_exc()}")
#             print("⚠️ Continuing training despite callback failure.")

#     def _get_latest_complete_checkpoint(self, root_dir):
#         if not os.path.isdir(root_dir):
#             return None

#         candidates = sorted([
#             os.path.join(root_dir, d) for d in os.listdir(root_dir)
#             if d.startswith("checkpoint") and os.path.isdir(os.path.join(root_dir, d))
#         ], key=os.path.getmtime, reverse=True)

#         # required_files = {"pytorch_model.bin", "config.json", "tokenizer_config.json", "tokenizer.json"}
#         required_files = {
#             # "pytorch_model.bin",
#             "config.json",
#             "training_args.bin",
#             "trainer_state.json",
#             "optimizer.pt",
#             "scheduler.pt",
#             # "rng_state.pth",                 # optional for reproducibility
#             "tokenizer_config.json",
#             "tokenizer.json",
#             # "special_tokens_map.json",
#             # "spiece.model"                   # if using SentencePiece
#         }


#         for path in candidates:
#             files = set(os.listdir(path))
#             if required_files.issubset(files):
#                 return path
#         return None
