In [2]:
# =========================================
# patRoBERTa Toy MLM — Simple & Stable
# =========================================
import math
from pathlib import Path
from datasets import load_dataset, load_from_disk
from transformers import (
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizerFast,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)

# ---------- CONSTANTS (edit) ----------
# Inputs: one text file per split, the tokenizer directory, and the on-disk
# location of the tokenized dataset.
TRAIN_TXT = "../data/ep-b1-claim1-corpus/ep-b1-claim1-cpc_train.txt"
VAL_TXT = "../data/ep-b1-claim1-corpus/ep-b1-claim1-cpc_val.txt"
TOKENIZER_DIR = "../artifacts/patroberta-tokenizers/vs8000"
ENCODINGS_DIR = Path("../data/patroberta-encoded-128-vs8000")

# Sequence length and masking
SEQ_LEN = 128  # keep small on 4GB
ARCH_MAX_POSITIONS = 514  # capacity (OK for 512 later)
MLM_PROB = 0.15  # handed to the collator as mlm_probability

# Tiny model
HIDDEN_SIZE = 128
NUM_LAYERS = 2
NUM_HEADS = 2
INTER_SIZE = 512
DROPOUT = 0.1  # used for both hidden and attention dropout

# Training knobs (keep tiny to avoid OOM)
PER_DEVICE_TRAIN_BS = 16  # fits 4GB with fp16 + checkpointing
PER_DEVICE_EVAL_BS = 8
GRAD_ACCUM_STEPS = 2  # effective batch = 16 * 2 = 32 sequences
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
FP16 = True
NUM_EPOCHS = 2  # or set MAX_STEPS instead of epochs

# Where the Trainer writes its output
OUT_DIR = "../artifacts/patroberta-mlm-128-simple"

# ---------- Tokenizer & data ----------
# Load the tokenizer from TOKENIZER_DIR and cap its maximum length at SEQ_LEN.
tok = RobertaTokenizerFast.from_pretrained(TOKENIZER_DIR)
tok.model_max_length = SEQ_LEN


def enc(b):
    """Tokenize the "text" field of a batch, truncating to SEQ_LEN.

    Used as the callback for the batched `map` over the raw dataset; returns
    whatever the tokenizer call returns.
    """
    texts = b["text"]
    return tok(texts, truncation=True, max_length=SEQ_LEN)

# Encode the text files only when no encoded dataset exists at ENCODINGS_DIR;
# the result is written there so later runs can load it straight from disk.
if not ENCODINGS_DIR.exists():
    text_splits = load_dataset(
        "text",
        data_files={"train": TRAIN_TXT, "validation": VAL_TXT},
    )
    ds = text_splits.map(enc, batched=True, remove_columns=["text"])
    ds.save_to_disk(str(ENCODINGS_DIR))
else:
    ds = load_from_disk(str(ENCODINGS_DIR))

# ---------- Collator ----------
# Masked-LM collation: MLM is switched on and MLM_PROB is passed through as
# the masking probability.
collator = DataCollatorForLanguageModeling(
    tokenizer=tok,
    mlm_probability=MLM_PROB,
    mlm=True,
)

# ---------- Model ----------
# The vocabulary size comes from len(tok) rather than tok.vocab_size:
# vocab_size reports only the base vocabulary, while len(tok) also counts any
# added tokens, so every id the tokenizer can emit has an embedding row.
# The two values are equal when the tokenizer carries no added tokens.
cfg = RobertaConfig(
    vocab_size=len(tok),
    hidden_size=HIDDEN_SIZE,
    num_hidden_layers=NUM_LAYERS,
    num_attention_heads=NUM_HEADS,
    intermediate_size=INTER_SIZE,
    hidden_dropout_prob=DROPOUT,
    attention_probs_dropout_prob=DROPOUT,
    max_position_embeddings=ARCH_MAX_POSITIONS,
    # Special-token ids are taken from the tokenizer so model and tokenizer agree.
    pad_token_id=tok.pad_token_id,
    bos_token_id=tok.bos_token_id,
    eos_token_id=tok.eos_token_id,
)
# Built from the config alone (no from_pretrained call), i.e. trained from scratch.
model = RobertaForMaskedLM(cfg)
model.gradient_checkpointing_enable()  # big saver on 4GB

# ---------- Auto-scale logging by epoch fraction ----------
# Number of training examples divided by the sequences consumed per step
# (per-device batch x accumulation steps), rounded up and floored at 1.
steps_per_epoch = math.ceil(len(ds["train"]) / (GRAD_ACCUM_STEPS * PER_DEVICE_TRAIN_BS))
steps_per_epoch = max(steps_per_epoch, 1)

EVAL_STEPS = max(steps_per_epoch // 4, 1)      # evaluate 4x per epoch
LOGGING_STEPS = max(steps_per_epoch // 10, 1)  # log 10x per epoch
SAVE_STEPS = EVAL_STEPS                        # checkpoint at the same cadence as evaluation

# ---------- Training args ----------
args = TrainingArguments(
    output_dir=OUT_DIR,
    report_to="none",
    # Schedule & optimiser
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    # Batch sizes
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    # Precision
    fp16=FP16,
    fp16_full_eval=True,  # NOTE(review): hard-coded, not tied to the FP16 knob
    # Step-based cadence for logging, evaluation and checkpoints
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    torch_empty_cache_steps=LOGGING_STEPS,
    # Best-checkpoint selection: the lowest eval loss wins
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# ---------- Train & evaluate ----------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tok,
    data_collator=collator,
)

trainer.train()
out = trainer.evaluate()

# Perplexity is exp(eval_loss). Uses the `math` module imported at the top of
# the cell (no second aliased import), and reports inf instead of raising when
# the loss is too large for exp() to represent.
try:
    perplexity = math.exp(out["eval_loss"])
except OverflowError:
    perplexity = float("inf")
print(out, "Perplexity:", perplexity)


Step,Training Loss,Validation Loss
2638,5.478,5.027371
5276,3.8482,3.417055
7914,3.3625,2.95989
10552,3.0887,2.8151
13190,2.9835,2.651377
15828,2.873,2.563684
18466,2.8207,2.503295
21104,2.7828,2.484285


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


{'eval_loss': 2.4812066555023193, 'eval_runtime': 6.727, 'eval_samples_per_second': 512.116, 'eval_steps_per_second': 64.07, 'epoch': 2.0} Perplexity: 11.95568210710622


In [4]:
# =========================================
# patRoBERTa Toy MLM — Simple & Stable
# =========================================
import math
from pathlib import Path
from datasets import load_dataset, load_from_disk
from transformers import (
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizerFast,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, 
    EarlyStoppingCallback
)

# ---------- CONSTANTS (edit) ----------
# Inputs: one text file per split, the tokenizer directory, and the on-disk
# location of the tokenized dataset.
TRAIN_TXT = "../data/1-corpus-all-claims/corpus_train.txt"
VAL_TXT = "../data/1-corpus-all-claims/corpus_val.txt"
TOKENIZER_DIR = "../data/2-tokenizers/vs8000"
ENCODINGS_DIR = Path("../data/3-encodings/sl128-v8000")  # created on first run if the encoding doesn't exist

# Sequence length and masking
SEQ_LEN = 128  # keep small on 4GB
ARCH_MAX_POSITIONS = 514  # capacity (OK for 512 later)
MLM_PROB = 0.15  # handed to the collator as mlm_probability

# Tiny model
HIDDEN_SIZE = 128
NUM_LAYERS = 2
NUM_HEADS = 2
INTER_SIZE = 512
DROPOUT = 0.1  # used for both hidden and attention dropout

# Training knobs
PER_DEVICE_TRAIN_BS = 64
PER_DEVICE_EVAL_BS = 8
GRAD_ACCUM_STEPS = 2  # effective batch = 64 * 2 = 128 sequences
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
FP16 = True
NUM_EPOCHS = 4

# Early stopping
PATIENCE = 3  # evals with no improvement
THRESH = 1e-4  # min improvement in eval_loss

# Where the Trainer writes its output
OUT_DIR = "../data/3-pretaining/mlm-sl128-v8000"

# ---------- Tokenizer & data ----------
# Load the tokenizer from TOKENIZER_DIR and cap its maximum length at SEQ_LEN.
tok = RobertaTokenizerFast.from_pretrained(TOKENIZER_DIR)
tok.model_max_length = SEQ_LEN


def enc(b):
    """Tokenize the "text" field of a batch, truncating to SEQ_LEN.

    Used as the callback for the batched `map` over the raw dataset; returns
    whatever the tokenizer call returns.
    """
    texts = b["text"]
    return tok(texts, truncation=True, max_length=SEQ_LEN)

# Encode the text files only when no encoded dataset exists at ENCODINGS_DIR;
# the result is written there so later runs can load it straight from disk.
if not ENCODINGS_DIR.exists():
    text_splits = load_dataset(
        "text",
        data_files={"train": TRAIN_TXT, "validation": VAL_TXT},
    )
    ds = text_splits.map(enc, batched=True, remove_columns=["text"])
    ds.save_to_disk(str(ENCODINGS_DIR))
else:
    ds = load_from_disk(str(ENCODINGS_DIR))

# ---------- Collator ----------
# Masked-LM collation: MLM is switched on and MLM_PROB is passed through as
# the masking probability.
collator = DataCollatorForLanguageModeling(
    tokenizer=tok,
    mlm_probability=MLM_PROB,
    mlm=True,
)

# ---------- Model ----------
# The vocabulary size comes from len(tok) rather than tok.vocab_size:
# vocab_size reports only the base vocabulary, while len(tok) also counts any
# added tokens, so every id the tokenizer can emit has an embedding row.
# The two values are equal when the tokenizer carries no added tokens.
cfg = RobertaConfig(
    vocab_size=len(tok),
    hidden_size=HIDDEN_SIZE,
    num_hidden_layers=NUM_LAYERS,
    num_attention_heads=NUM_HEADS,
    intermediate_size=INTER_SIZE,
    hidden_dropout_prob=DROPOUT,
    attention_probs_dropout_prob=DROPOUT,
    max_position_embeddings=ARCH_MAX_POSITIONS,
    # Special-token ids are taken from the tokenizer so model and tokenizer agree.
    pad_token_id=tok.pad_token_id,
    bos_token_id=tok.bos_token_id,
    eos_token_id=tok.eos_token_id,
)
# Built from the config alone (no from_pretrained call), i.e. trained from scratch.
model = RobertaForMaskedLM(cfg)
model.gradient_checkpointing_enable()  # big saver on 4GB

# ---------- Auto-scale logging by epoch fraction ----------
# Cadence per epoch, named once so the numbers cannot drift from the comments
# (the previous comments said 4x / 10x while the code divided by 10 / 20).
EVALS_PER_EPOCH = 10   # evaluations (and checkpoints) per epoch
LOGS_PER_EPOCH  = 20   # training-loss log lines per epoch

# Training examples divided by sequences consumed per optimizer step
# (per-device batch x accumulation steps), rounded up and floored at 1.
# NOTE(review): this does not account for multiple devices — confirm single-GPU use.
steps_per_epoch = max(1, math.ceil(len(ds["train"]) / (PER_DEVICE_TRAIN_BS * GRAD_ACCUM_STEPS)))
EVAL_STEPS     = max(1, steps_per_epoch // EVALS_PER_EPOCH)
SAVE_STEPS     = EVAL_STEPS   # checkpoint at the same cadence as evaluation
LOGGING_STEPS  = max(1, steps_per_epoch // LOGS_PER_EPOCH)

# ---------- Training args ----------
args = TrainingArguments(
    output_dir=OUT_DIR,
    report_to="none",
    # Schedule & optimiser
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    # Batch sizes
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    # Precision
    fp16=FP16,
    fp16_full_eval=True,  # NOTE(review): hard-coded, not tied to the FP16 knob
    # Step-based cadence for logging, evaluation and checkpoints
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=4,
    torch_empty_cache_steps=LOGGING_STEPS,
    # Best-checkpoint selection: the lowest eval loss wins
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# ---------- Train & evaluate ----------
# Early stopping is configured with PATIENCE evaluations and a minimum
# improvement of THRESH.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tok,
    data_collator=collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE,
                                         early_stopping_threshold=THRESH)],
)

trainer.train()
out = trainer.evaluate()

# Perplexity is exp(eval_loss). Uses the `math` module imported at the top of
# the cell (no second aliased import), and reports inf instead of raising when
# the loss is too large for exp() to represent.
try:
    perplexity = math.exp(out["eval_loss"])
except OverflowError:
    perplexity = float("inf")
print(out, "Perplexity:", perplexity)


Step,Training Loss,Validation Loss
3384,5.4357,4.855713
6768,3.6396,3.119433
10152,2.9926,2.6148
13536,2.7112,2.381478
16920,2.5747,2.265131
20304,2.4921,2.197931
23688,2.4326,2.140914
27072,2.3924,2.105449
30456,2.3569,2.08331
33840,2.3184,2.042159


KeyboardInterrupt: 

# Push model to HF

In [None]:
from transformers import RobertaForMaskedLM, RobertaTokenizerFast

# Checkpoint at step 33840 under the training output directory (1 epoch).
model_dir = "../data/3-pretaining/mlm-sl128-v8000/checkpoint-33840"
repo_name = "mhurhangee/patroberta-mlm-sl128-v8000"

# Tokenizer and model are both loaded from the same checkpoint directory.
tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)
model = RobertaForMaskedLM.from_pretrained(model_dir)

# Push the model first, then the tokenizer. The tokenizer push stays the last
# expression of the cell, so its return value is what the notebook displays.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp5nrkud6u/model.safetensors    :   9%|9         |  554kB / 6.05MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mhurhangee/patroberta-mlm-sl128-v8000/commit/d3ea4e75a17a328f5b9135cbfb594fd67be1af5b', commit_message='Upload tokenizer', commit_description='', oid='d3ea4e75a17a328f5b9135cbfb594fd67be1af5b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mhurhangee/patroberta-mlm-sl128-v8000', endpoint='https://huggingface.co', repo_type='model', repo_id='mhurhangee/patroberta-mlm-sl128-v8000'), pr_revision=None, pr_num=None)

In [None]:
# Push the whole checkpoint to HF so that training can be resumed from it

In [3]:
from huggingface_hub import upload_folder

# Trainer checkpoint directory to upload, and the Hub repo that receives it.
checkpoint_dir = "../data/3-pretaining/mlm-sl128-v8000/checkpoint-33840"
repo_name = "mhurhangee/patroberta-mlm-sl128-v8000"

# Upload the contents of the checkpoint folder to the model repo under a
# single commit message. Left as the last expression so the cell shows the
# call's return value.
upload_folder(
    repo_id=repo_name,
    repo_type="model",
    folder_path=checkpoint_dir,
    commit_message="Upload full Trainer checkpoint for resume",
)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  .../checkpoint-33840/model.safetensors: 100%|##########| 6.05MB / 6.05MB            

  ...v8000/checkpoint-33840/optimizer.pt:   5%|4         |  588kB / 12.1MB            

  ...8000/checkpoint-33840/rng_state.pth:   5%|4         |   710B / 14.6kB            

  ...28-v8000/checkpoint-33840/scaler.pt:   5%|4         |  67.0B / 1.38kB            

  ...v8000/checkpoint-33840/scheduler.pt:   5%|4         |  71.0B / 1.47kB            

  .../checkpoint-33840/training_args.bin:   5%|4         |   280B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/mhurhangee/patroberta-mlm-sl128-v8000/commit/08a406030cbf8b53c8d52bbf99634dbdbe96b8ab', commit_message='Upload full Trainer checkpoint for resume', commit_description='', oid='08a406030cbf8b53c8d52bbf99634dbdbe96b8ab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mhurhangee/patroberta-mlm-sl128-v8000', endpoint='https://huggingface.co', repo_type='model', repo_id='mhurhangee/patroberta-mlm-sl128-v8000'), pr_revision=None, pr_num=None)