# pipeline_deberta-v3-small.ipynb
This notebook contains some template code to help you with loading/preprocessing the data.

We start with some imports and constants.
The training data is found in the `data` subfolder.
There is also a tokenizer I've trained for you which you can use for the project.

Can be executed once — does not depend on the execution environment.

In [None]:
# ========================================
# 1. Clone the Repository (if required)
# ========================================
# Reads a GitHub token from the environment, clones the project repository with it,
# and changes into the clone so that the relative dataset paths below resolve.
import os
TOKEN = os.getenv('GITHUB_TOKEN')  # Set this as environment variable: export GITHUB_TOKEN="your_token_here"
if not TOKEN:
    # Stop here with a clear message rather than attempting the clone without credentials.
    raise ValueError("Please set GITHUB_TOKEN environment variable")
# NOTE(review): the token is interpolated directly into the clone URL. Git normally records
# the remote URL inside the clone, so the token may persist on disk — confirm, and avoid
# sharing the cloned folder or this cell's output.
!git clone https://{TOKEN}@github.com/KatsuhitoArasaka/BabyLM-Tiny.git  # your repository link  ⚠️
# NOTE(review): the absolute /content path looks Colab-specific — confirm before running elsewhere.
%cd /content/BabyLM-Tiny

# ===============
# 2. Set Paths
# ===============
# Both paths are relative, so they are resolved against the directory entered by %cd above.
TRAIN_PATH = './datasets/PATH_TO_TRAINING_DATA.train'       # Path to your training data  ⚠️
DEV_PATH = './datasets/PATH_TO_VALIDATION_DATA.train'           # Path to your validation data  ⚠️

Execute after restarting the environment -- when changing the device (CPU ↔ GPU) you need to restart the kernel.

In [None]:
# ==========================
# 3. Install Dependencies
# ==========================
# Installs the training stack into the kernel's environment (quiet output).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases
# than the ones this notebook was written against — consider pinning.
%pip install transformers datasets wandb trl bitsandbytes huggingface_hub --quiet


# =====================================
# 4. Import Libraries and Set Device
# =====================================
import subprocess
import json
import torch
import datasets
from functools import partial
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, set_seed
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from trl import SFTTrainer, SFTConfig

# Seed the random number generators through the transformers helper so runs are repeatable.
from transformers import set_seed

set_seed(42)

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

In [None]:
# ======================================
# 5. Authentication with wandb and hf
# ======================================
import wandb
from huggingface_hub import login

# Weights & Biases first, then the Hugging Face Hub.
wandb.login()  # Enter your API key when prompted ⚠️
login()  # Paste your HF token from https://huggingface.co/settings/tokens  ⚠️


In [None]:
# ================================================
# 6. Start a new wandb run to track this script
# ================================================
# The `config` values are descriptive metadata attached to the run. They must mirror the
# hyperparameters actually passed to the trainer in step 10, otherwise the dashboard
# reports settings that were never used.
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="Low-Resource_Pretraining",
    # Set the wandb project where this run will be logged.
    project="NLP_LRP_BabyLM",
    # Track hyperparameters and run metadata.
    config={
        "learning_rate": 2e-4,  # same value as `learning_rate` in the training arguments (step 10)
        "architecture": "DeBERTa",  # which model family this run trains
        "epochs": 10,  # same value as `num_train_epochs` in the training arguments (step 10)

        # Optional extra metadata:
        # "dataset": "<name of the training corpus>",
        # "batch_size": 64,
    },
)

Here we load the dataset:

In [None]:
# ===================
# 7. Load Datasets
# ===================

# Install and load spaCy with the English model
!pip install -q spacy
!python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 10_000_000  # Safe limit to handle long texts; can be adjusted if needed

# For using the 1M spoken dataset, split by sentences using spaCy
# (instead of splitting by '.' to avoid incorrect sentence breaks in abbreviations like 'Dr.', 'U.S.', 'Mr.', etc.)

# Function to split long text into smaller chunks for spaCy processing
def split_into_chunks(text, chunk_size=200_000):
    """Yield consecutive pieces of `text`, each at most `chunk_size` characters long.

    Chunks are cut at a natural boundary whenever one exists inside the window: right
    after the last newline, or failing that right after the last space. This keeps a
    word or sentence from being sliced in half just because it straddles a chunk edge.
    If the window contains no such boundary, the chunk is cut at exactly `chunk_size`.

    Concatenating the yielded chunks always reproduces `text` exactly.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be at least 1.

    Yields:
        Non-empty substrings of `text`, in order.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Look for the best place to cut inside [start, end): a line break first,
            # then any space. A boundary sitting at `start` itself is ignored so that
            # every chunk carries real content.
            cut = text.rfind("\n", start, end)
            if cut <= start:
                cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut + 1  # keep the separator at the end of the current chunk
        yield text[start:end]
        start = end

# Function to split text into sentences using spaCy on smaller chunks
def spacy_sent_tokenize(text):
    """Return the sentences spaCy detects in `text`, stripped of surrounding whitespace.

    The text is fed to the `nlp` pipeline one chunk at a time (see `split_into_chunks`),
    and sentences that are empty after stripping are discarded.
    """
    collected = []
    for piece in split_into_chunks(text):
        for span in nlp(piece).sents:
            cleaned = span.text.strip()
            if cleaned:
                collected.append(cleaned)
    return collected

# Function to filter out very short sentences (set min words ⚠️)
def filter_short_sentences(sentences, min_words=4):
    """Keep only the sentences that contain at least `min_words` whitespace-separated words.

    Args:
        sentences: Iterable of sentence strings.
        min_words: Minimum word count a sentence needs in order to be kept.

    Returns:
        A new list with the surviving sentences, in their original order and unmodified.
    """
    kept = []
    for sentence in sentences:
        if len(sentence.split()) < min_words:
            continue
        kept.append(sentence)
    return kept

# Read a raw text file and turn it into sentence-level records
def load_sentence_records(path):
    """Load `path` and return its sentences as a list of `{"text": sentence}` dicts.

    The file is read in full as UTF-8, split into sentences with `spacy_sent_tokenize`,
    and very short sentences are removed with `filter_short_sentences`.

    Args:
        path: Location of the plain-text corpus file.

    Returns:
        A list of single-key dicts, one per kept sentence.

    Raises:
        ValueError: If no sentence survives filtering, since an empty split would only
            fail later during training with a much less helpful error.
    """
    with open(path, 'r', encoding='utf-8') as f:
        full_text = f.read().strip()
    # The file is already closed here; sentence splitting can take a while and does not
    # need the handle to stay open.
    kept_sentences = filter_short_sentences(spacy_sent_tokenize(full_text))
    if not kept_sentences:
        raise ValueError(f"No usable sentences found in {path}")
    return [{"text": s} for s in kept_sentences]


# Load and sentence-split the training and validation corpora
train_data = load_sentence_records(TRAIN_PATH)
val_data = load_sentence_records(DEV_PATH)

# Create DatasetDict object for HuggingFace datasets
from datasets import Dataset, DatasetDict
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data)
})

In [None]:
# =====================================================================================
# 8. Setup Model, Training, and Logging (This is the part that may change per model)
# =====================================================================================

# Model-specific settings — replace this block when training a different model  ⚠️
model_name = "microsoft/deberta-v3-small"  # Example: DeBERTa model for MLM

# The checkpoint name supplies the architecture configuration and the tokenizer.
# The model itself is instantiated from that configuration via `from_config`.
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_config(config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Enable gradient checkpointing to save GPU memory
# model.gradient_checkpointing_enable()  --  causes 'RuntimeError: Trying to backward through the graph a second time'

# Data collator to support masked language modeling

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     seed=0,
# )
# Passing a custom data collator is not supported when using padding-free.


# ===============================
# 9. Setup Training Arguments
# ===============================
# train.py
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are accumulated.

    Keeping only the arg-max over the last (vocabulary) dimension means the evaluation
    loop stores one integer per position instead of a full score vector.

    Args:
        logits: Score tensor whose last dimension is reduced, or a tuple/list whose
            first element is that tensor (some models return extra outputs alongside
            the scores).
        labels: Unused; accepted because the trainer passes it to this hook.

    Returns:
        A tensor of predicted ids with the last dimension of `logits` removed.
    """
    if isinstance(logits, (tuple, list)):
        # Extra outputs may follow the scores; the scores come first.
        logits = logits[0]
    return torch.argmax(logits, dim=-1)


def compute_metrics(eval_pred):
    """Compute token-level accuracy over the positions that carry a real label.

    Args:
        eval_pred: A pair `(predictions, labels)`. `predictions` already holds predicted
            token ids (the output of `preprocess_logits_for_metrics`), not raw scores.
            Positions whose label is -100 are excluded from scoring.

    Returns:
        `{"acc": value}` where `value` is a plain Python float in [0, 1]. When no
        position carries a label the accuracy is reported as 0.0 instead of dividing
        by zero.
    """
    pred_ids, labels = eval_pred
    pred_ids = pred_ids.flatten()
    labels = labels.flatten()

    scored = labels != -100  # -100 marks positions that must not be scored
    total = int(scored.sum())
    if total == 0:
        return {"acc": 0.0}

    hits = int((pred_ids[scored] == labels[scored]).sum())
    return {"acc": hits / total}


# ======================
# 10. Train the Model
# ======================
# The model is a masked language model, so it has to be trained on a masked-token
# objective. SFTTrainer prepares batches for supervised fine-tuning where the labels are
# the unmasked input ids; a MaskedLM head scores each position against the label at the
# SAME position, so that setup only teaches the network to copy its input and never shows
# it a [MASK] token. The plain `Trainer` with `DataCollatorForLanguageModeling(mlm=True)`
# masks random tokens and sets the label to -100 everywhere else, which is also what
# `compute_metrics` expects.
from transformers import Trainer, TrainingArguments

MAX_SEQ_LENGTH = 64  # ⚠️


def tokenize_batch(batch):
    """Tokenize a batch of raw sentences, truncating each to MAX_SEQ_LENGTH tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# Tokenize once up front; the raw "text" column is dropped because the model cannot consume it.
tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=["text"],
    num_proc=12,
)

# Pads each batch dynamically and applies random masking for the MLM objective.
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    data_collator = mlm_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    args = TrainingArguments(
        output_dir = "trainer_output",  # where checkpoints are written every `save_steps`
        remove_unused_columns = True,
        label_names = ["labels"],
        eval_strategy = "steps",
        per_device_train_batch_size = 16,  # 64 / N  ⚠️
        gradient_accumulation_steps = 4,  # N  →  16 * 4 = 64 effective batch  ⚠️
        warmup_ratio = 0.05,
        num_train_epochs = 10,
        learning_rate = 2e-4,
        fp16 = True,
        bf16 = False,
        logging_steps = 10,
        eval_steps = 100,
        save_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 0,
        report_to = "wandb",  # send metrics to the wandb run opened in step 6
        eval_accumulation_steps=1,
        include_for_metrics=[],
        max_grad_norm=1,
    ),
)

# torch.cuda.empty_cache()
trainer.train()

In [None]:
# ===========================
# 11. Finish wandb Logging
# ===========================
# Closes the run that was opened with wandb.init() in step 6.
wandb.finish()  # log the final metrics and mark the run as complete

In [None]:
# ============================================================
# 12. Save the trained model locally (for later evaluation)
# ============================================================

# `save_path` identifies the trained model for the evaluation step further down, where it
# is passed to the evaluation scripts as the model argument.
#
# Option A — save to a local folder inside the Colab environment.
# The folder will be deleted when the session ends unless uploaded elsewhere (e.g., Hugging Face, Google Drive)

# save_path = "./trained_models/deberta-v3-small-dataset"  # ⚠️ Change `model_name` if training multiple models in one session
# trainer.save_model(save_path)        # Save model weights, config, etc.
# tokenizer.save_pretrained(save_path) # Save tokenizer (required for evaluation)

# Option B — point at a Hugging Face repository id ("<username>/<repo>") instead.
# NOTE(review): this is a placeholder value; replace it before running the evaluation.  ⚠️
save_path = "your_hf_username/model_name"

In [None]:
# ===============================================
# 13. Upload Trained Model to Hugging Face Hub
# ===============================================
from huggingface_hub import create_repo, upload_folder

hf_repo_name = "deberta-v3-small-dataset"  # Change this to a unique name for your model       ⚠️
hf_username = "your_hf_username"          # Replace with your actual Hugging Face username    ⚠️
repo_id = f"{hf_username}/{hf_repo_name}"

# `upload_folder` needs a real directory on disk, so export the trained model and the
# tokenizer to a local folder first and upload that folder. (`save_path` may hold a Hub
# repository id rather than a local path, so it is not used as the upload source.)
local_export_dir = f"./trained_models/{hf_repo_name}"
trainer.save_model(local_export_dir)         # model weights, config, etc.
tokenizer.save_pretrained(local_export_dir)  # tokenizer files (required for evaluation)

# Create the repo as private (pass private=False for a public repo)
create_repo(repo_id, exist_ok=True, private=True)

# Upload the entire exported model folder
upload_folder(
    repo_id=repo_id,
    folder_path=local_export_dir,
    path_in_repo=".",  # Upload everything from the folder
    repo_type="model"
)

print(f"Model uploaded to: https://huggingface.co/{repo_id}")

In [None]:
# ===============================================================
# 14. Evaluate the model using BabyLM evaluation pipeline 2025
# ===============================================================

# run chunk "1. Clone the Repository" again before starting the evaluation ⚠️

import os
import pathlib
import shutil
import subprocess
# !pip install transformers --quiet


!cp -r BabyLM-Tiny/evaluation_data evaluation-pipeline-2025/
!rm -r evaluation-pipeline-2025/evaluation_data/fast_eval/ewok_fast*

# --- Configuration ---
EVAL_PIPELINE_REPO = "evaluation-pipeline-2025"

ARCHITECTURE = "mlm"  # "mlm", "causal", or "mntp"  # ⚠️
EVAL_MODE = "zero-shot-fast"  # "zero-shot", "zero-shot-fast", or "finetune"  # ⚠️
EVAL_DATA_FOLDER = "fast_eval"  # "full_eval" or "fast_eval" for "zero-shot-fast" # ⚠️
# EVAL_DATA_FOLDER is ignored when using EVAL_MODE = "finetune"
# because finetuning tasks load datasets directly from HuggingFace (e.g., GLUE), not from evaluation_data/.

DATASET_NAME = "name-dataset"  # change 'name' in the "name-dataset" to the dataset your model was trained on  # ⚠️
ENABLE_GIT_PUSH = True  # set to True to automatically commit and push the result  # ⚠️

model_id_path = pathlib.Path(save_path).name.replace("/", "_")  # for results directory structure

# --- Step 1: Clone evaluation pipeline if not present ---
if not os.path.exists(EVAL_PIPELINE_REPO):
    subprocess.run(["git", "clone", "https://github.com/babylm/evaluation-pipeline-2025.git"], check=True)

# --- Step 2: Run the appropriate evaluation script ---
# Reject unsupported modes up front, then assemble the command line for the chosen mode.
if EVAL_MODE not in ("zero-shot", "zero-shot-fast", "finetune"):
    raise ValueError(f"Unknown EVAL_MODE: {EVAL_MODE}")

if EVAL_MODE == "finetune":
    # Fine-tuning only takes the model path; it does not read from evaluation_data/.
    eval_script = "eval_finetune.sh"
    cmd = ["bash", eval_script, save_path]
else:
    # Both zero-shot variants read their tasks from evaluation_data/<EVAL_DATA_FOLDER>.
    eval_dir = os.path.join("evaluation_data", EVAL_DATA_FOLDER)
    if EVAL_MODE == "zero-shot":
        eval_script = "eval_zero_shot.sh"
        cmd = ["bash", eval_script, save_path, ARCHITECTURE, eval_dir]
    else:
        eval_script = "eval_zero_shot_fast.sh"
        # Use "main" as default unless you have custom branches/tags in your HF repo,
        # for example "checkpoint_1M", "final", or a commit hash like "e5a72b3".
        # revision_name = "checkpoint_1M"
        revision_name = "main"
        cmd = ["bash", eval_script, save_path, revision_name, ARCHITECTURE, eval_dir]

# Every script is launched from inside the pipeline checkout.
working_dir = EVAL_PIPELINE_REPO

# --- Run the script and capture output ---
result = subprocess.run(
    cmd,
    cwd=working_dir,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Echo both streams for debugging
for banner, stream in (("=== STDOUT ===", result.stdout), ("\n=== STDERR ===", result.stderr)):
    print(banner)
    print(stream)

# --- Keep a copy of stdout as a fallback in case no results.txt is produced ---
manual_results_path = f"{EVAL_PIPELINE_REPO}/results/{model_id_path}/manual_results.txt"
results_dir = pathlib.Path(manual_results_path).parent
results_dir.mkdir(parents=True, exist_ok=True)
pathlib.Path(manual_results_path).write_text(result.stdout)

print(f"\n✅ Saved results to: {manual_results_path}")

# Only now fail on a non-zero exit code, so the captured output above is kept either way
if result.returncode != 0:
    raise RuntimeError(f"❌ Evaluation script failed with exit code {result.returncode}")


# --- Step 3: Locate the results.txt file ---
if EVAL_MODE == "finetune":
    eval_subfolder = "finetune"
elif EVAL_MODE.startswith("zero-shot"):
    eval_subfolder = "zero_shot"
else:
    raise RuntimeError("Unexpected EVAL_MODE value")

results_root = pathlib.Path(EVAL_PIPELINE_REPO, "results", model_id_path, "main", eval_subfolder)

# Walk the whole results tree looking for results.txt.
# The walk is not stopped at the first hit, so when several directories contain a
# results.txt the one visited last is the one that is kept.
candidate = None
for root, _, files in os.walk(results_root):
    if "results.txt" in files:
        candidate = os.path.join(root, "results.txt")

# Fall back to the manually saved stdout when no results.txt was found
if candidate is None:
    manual_results_path = f"{EVAL_PIPELINE_REPO}/results/{model_id_path}/manual_results.txt"
    if not os.path.exists(manual_results_path):
        raise FileNotFoundError("Evaluation completed, but neither results.txt nor manual_results.txt was found.")
    candidate = manual_results_path
    print("ℹ️ results.txt not found — using manually saved output instead.")

In [None]:
# --- Step 4: Save results using <dataset>_<eval_mode>.txt format ---
TARGET_REPO_DIR = "BabyLM-Tiny"

# Results live under <repo>/models_evaluation_results/<model folder>/
output_dir = pathlib.Path(
    TARGET_REPO_DIR,
    "models_evaluation_results",
    "deberta-v3-small_results",  # ⚠️ Change this folder name to match the name of the model being evaluated
)
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the located results file into place under its dataset/mode-specific name.
result_file_path = output_dir.joinpath(f"{DATASET_NAME}_{EVAL_MODE}.txt")
shutil.copy(candidate, result_file_path)

print(f"✅ Evaluation complete. Results saved to {result_file_path}")

# --- Step 5: Optionally commit and push the result to GitHub ---
"# --- Step 5: Optionally commit and push the result to GitHub ---
",
        "!git config --global user.name "Your Name"
",
        ""!git config --global user.email "your.email@example.com"
",
",

if ENABLE_GIT_PUSH:
    rel_path = result_file_path.relative_to(TARGET_REPO_DIR)

    subprocess.run(["git", "add", str(rel_path)], cwd=TARGET_REPO_DIR)
    subprocess.run(
        ["git", "commit", "-m", f"Add {EVAL_MODE} evaluation results for {DATASET_NAME}"],
        cwd=TARGET_REPO_DIR,
    )

    subprocess.run(["git", "pull", "origin", "main", "--rebase"], cwd=TARGET_REPO_DIR)  # This prevents a push error when there are new commits on GitHub.
    result = subprocess.run(["git", "push"], cwd=TARGET_REPO_DIR, capture_output=True, text=True)
    print(result.stdout)
    print(result.stderr)