In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from tqdm import tqdm
from itertools import islice
import json
import os
import gzip
import string
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def page_iter(pages_file):
    """Lazily yield one decoded JSON value per line of a gzipped JSON Lines file.

    Args:
        pages_file: Path to a gzip-compressed, UTF-8 encoded JSONL file.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.
    """
    with gzip.open(pages_file, mode='rt', encoding='utf-8') as stream:
        # The file handle stays open only while the generator is being consumed.
        yield from map(json.loads, stream)


def pages_generator(file, allowed_ids):
    """Yield ``{"text": ...}`` records for the pages of selected works.

    Streams a gzip-compressed, UTF-8 JSON Lines file and keeps a page when its
    ``"work_id"`` is in ``allowed_ids`` and its text is non-empty after
    stripping surrounding whitespace.

    Args:
        file: Path to the gzipped JSONL pages file.
        allowed_ids: Container of work ids to keep; membership is tested with
            ``in`` once per page.

    Yields:
        dict: ``{"text": <stripped page text>}`` for each retained page.
    """
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            page = json.loads(line)
            # Filter on the work id first so pages of unwanted works are never
            # inspected any further.
            if page.get("work_id") not in allowed_ids:
                continue
            # "text" may be missing *or* an explicit JSON null; the default of
            # dict.get only covers the former, so normalise both to "".
            text = (page.get("text") or "").strip()
            if text:
                yield {"text": text}

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One checkpoint id supplies both the tokenizer and the masked-LM weights.
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model = model.to(DEVICE)


# Collection names a work must match at least once to be selected ...
TARGET_COLLECTIONS = {"Literary"}
# ... and collection names that disqualify a work when present.
EXCLUSION_COLLECTIONS = {
    "Dictionary",
    "Word Lists",
    "Typographically Unique",
}

In [4]:
# Load the corpus metadata. The encoding is given explicitly so decoding does
# not depend on the platform's locale default; this matches how the gzipped
# pages file is opened (UTF-8) elsewhere in this notebook.
with open("Data/ppa_corpus_2025-02-03_1308/ppa_metadata.json", encoding="utf-8") as f:
    metadata = json.load(f)


# Map work_id -> metadata entry for every work that lists at least one target
# collection and none of the excluded ones. Entries without a "collections"
# key are skipped.
metadata_index = {
    entry["work_id"]: entry
    for entry in metadata
    if "collections" in entry
       and any(c in TARGET_COLLECTIONS for c in entry["collections"])
       and not any(c in EXCLUSION_COLLECTIONS for c in entry["collections"])
}


In [6]:
# Build a small Dataset from the page stream. The lambda defers creation of
# the stream until the Dataset builder calls it; islice then stops after the
# first 50 records yielded by pages_generator, i.e. the first 50 non-empty
# pages whose work_id is a key of metadata_index.
dataset = Dataset.from_generator(
    lambda: islice(
        pages_generator("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz", metadata_index.keys()), 
        50
    )
)

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Fixed sequence length (in tokens) of every training example.
block_size = 512


def tokenize_and_chunk(examples):
    """Tokenize a batch of page texts into fixed-length model inputs.

    Each text is truncated to ``block_size`` tokens and shorter texts are
    padded up to that length. Note that, despite the name, no chunking takes
    place: tokens beyond ``block_size`` are discarded by the truncation.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The tokenizer output for the batch, with attention masks requested
        and token type ids suppressed.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=block_size,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    return tokenizer(examples["text"], **tokenizer_options)


# Tokenize in batches, drop the raw "text" column, then flatten the result.
tokenized_dataset = dataset.map(
    tokenize_and_chunk,
    batched=True,
    remove_columns=["text"],
).flatten()

# Collator for masked-language-model training; tokens are selected for
# masking with probability 0.15 each time a batch is assembled.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)  # PARAM


training_args = TrainingArguments(
    output_dir="./modernbert-literary-mlm",
    per_device_train_batch_size=2,  # PARAM
    # PARAM -- has no effect while max_steps is positive: max_steps takes
    # precedence (the recorded run stopped at step 20, epoch 0.8).
    num_train_epochs=1,
    learning_rate=5e-5,  # PARAM
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    max_steps=20,  # PARAM
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer and emits a warning on every run;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=collator,
)


trainer.train()

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,2.8636
20,2.7727


TrainOutput(global_step=20, training_loss=2.818165969848633, metrics={'train_runtime': 20.3266, 'train_samples_per_second': 1.968, 'train_steps_per_second': 0.984, 'total_flos': 13636314071040.0, 'train_loss': 2.818165969848633, 'epoch': 0.8})