In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from datasets import load_dataset
import json

In [None]:
# Hugging Face Hub identifier of the checkpoint to fine-tune
MODEL_NAME = 'dbmdz/bert-base-italian-xxl-cased'

# JSON files holding the three dataset splits (paths are relative to the working directory)
TRAIN_DATASET_PATH = "italian_poems_train.json"
VAL_DATASET_PATH = "italian_poems_val.json"
TEST_DATASET_PATH = "italian_poems_test.json"

# Checkpoints, logs, result files and the final model are all written below this directory
OUTPUT_DIR = "."

In [None]:
# --- Batching ---
BATCH_SIZE = 32   # per-device training batch size (evaluation uses twice this)
GRAD_ACCUM = 4    # gradient accumulation steps per optimizer update
EPOCHS = 30       # upper bound on training epochs

# --- Tokenization ---
MAX_LENGTH = 512  # maximum number of tokens per tokenized window
STRIDE = 128      # stride passed to the tokenizer for overflowing windows

# --- Evaluation / checkpoint / logging cadence ---
EVAL_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
LOGGING_STRATEGY = "epoch"
SAVE_TOTAL_LIMIT = 2

# --- Optimization ---
LEARNING_RATE = 5e-5
LR_SCHEDULER_TYPE = "cosine"
OPTIM = "adamw_torch"
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01

# Mixed precision is only requested when a CUDA device is present; a hard-coded
# True makes TrainingArguments reject the configuration on CPU-only machines.
FP16 = torch.cuda.is_available()

In [None]:
# Every JSON file is loaded on its own, so each one exposes its rows under the
# default "train" split name regardless of which split it actually holds.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("json", data_files=split_path, split="train")
    for split_path in (TRAIN_DATASET_PATH, VAL_DATASET_PATH, TEST_DATASET_PATH)
)
print(f"Dataset sizes: Train={len(train_dataset)}, Val={len(valid_dataset)}, Test={len(test_dataset)}")

In [None]:
# Fetch the masked-LM checkpoint named by MODEL_NAME together with its tokenizer
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Total number of scalar weights across all parameter tensors
num_parameters = sum(weights.numel() for weights in model.parameters())
print(f"Size of the model: {num_parameters} parameters")

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: A batch as handed over by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch. Sequences are truncated and padded
        to ``MAX_LENGTH``; overflowing tokens are requested as well (with
        ``STRIDE`` passed as the stride), so the output can contain more
        entries than the input batch. Attention and special-token masks are
        requested too.
    """
    tokenizer_options = {
        "padding": "max_length",
        "max_length": MAX_LENGTH,
        "truncation": True,
        "stride": STRIDE,
        "return_overflowing_tokens": True,
        "return_special_tokens_mask": True,
        "return_attention_mask": True,
    }
    return tokenizer(examples["text"], **tokenizer_options)

def tokenize_dataset(dataset):
    """Tokenize every row of ``dataset`` with ``tokenize_function``.

    All of the dataset's original columns are dropped from the result. The
    tokenizer is asked for overflowing tokens, so one input row can turn into
    several output rows, and a batched ``map`` cannot keep a source column
    whose length no longer matches the output. The column list is read from
    ``dataset.column_names`` rather than hard-coded as ``["text", "url"]``,
    so splits that lack ``url`` or carry additional metadata columns are
    handled as well.

    Args:
        dataset: A ``datasets.Dataset`` with at least a ``"text"`` column.

    Returns:
        A new dataset that contains only the tokenizer's output columns.
    """
    return dataset.map(
        tokenize_function,
        batched=True,
        batch_size=1000,
        remove_columns=dataset.column_names,
        num_proc=4
    )

# Run the same tokenization over the train, validation and test splits (in that order)
train_tokenized, valid_tokenized, test_tokenized = (
    tokenize_dataset(raw_split)
    for raw_split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Row counts after tokenization, one line per split
print(
    f"Train dataset size: {len(train_tokenized)}",
    f"Valid dataset size: {len(valid_tokenized)}",
    f"Test dataset size: {len(test_tokenized)}",
    sep="\n",
)

In [None]:
# Collator for the masked-language-modelling objective, with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=OUTPUT_DIR,
    logging_dir=f"{OUTPUT_DIR}/logs",
    report_to="none",

    # Batching and run length
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=GRAD_ACCUM,

    # Optimizer and learning-rate schedule
    optim=OPTIM,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    fp16=FP16,

    # Evaluation, checkpointing and logging cadence
    eval_strategy=EVAL_STRATEGY,
    save_strategy=SAVE_STRATEGY,
    logging_strategy=LOGGING_STRATEGY,
    save_total_limit=SAVE_TOTAL_LIMIT,

    # Best-checkpoint selection: the lowest "eval_loss" wins and is reloaded at the end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Early-stopping callback: patience of 3 evaluations, improvement threshold of 0.01
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01,
)

# Wire model, arguments, data and callback into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the base (not yet fine-tuned) model on the test set.
# The MLM collator picks the masked positions at random each time a batch is
# collated, so the measured loss depends on the RNG state. Reseeding right
# before the evaluation makes the masks, and therefore the reported numbers,
# reproducible from run to run.
torch.manual_seed(training_args.seed)
base_test_results = trainer.evaluate(test_tokenized)

base_test_loss = base_test_results["eval_loss"]
# Perplexity is exp(loss); computed once and reused for printing and saving.
base_test_perplexity = torch.exp(torch.tensor(base_test_loss)).item()
print(f"Base model test loss: {base_test_loss:.4f}")
print(f"Base model test perplexity: {base_test_perplexity:.2f}")

# Save base model results
with open(f"{OUTPUT_DIR}/base_model_results.txt", "w", encoding="utf-8") as f:
    f.write(f"Base Model Test Loss: {base_test_loss:.4f}\n")
    f.write(f"Base Model Test Perplexity: {base_test_perplexity:.2f}\n")

In [None]:
# Run fine-tuning with the trainer configured above
trainer.train()

# Write the trainer's model and the tokenizer into the same directory so the
# result can be loaded from a single path. Because load_best_model_at_end is
# enabled, this is intended to be the checkpoint with the best validation loss.
best_model_dir = f"{OUTPUT_DIR}/best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

In [None]:
# Evaluate the trained model on the test set.
# The MLM collator picks the masked positions at random each time a batch is
# collated, so the measured loss depends on the RNG state. Reseeding right
# before the evaluation makes the masks, and therefore the reported numbers,
# reproducible from run to run.
torch.manual_seed(training_args.seed)
test_results = trainer.evaluate(test_tokenized)

test_loss = test_results["eval_loss"]
# Perplexity is exp(loss); computed once and reused for printing and saving.
test_perplexity = torch.exp(torch.tensor(test_loss)).item()
print(f"Final test loss: {test_loss:.4f}")
print(f"Final test perplexity: {test_perplexity:.2f}")

# Save test results
with open(f"{OUTPUT_DIR}/test_results.txt", "w", encoding="utf-8") as f:
    f.write(f"Test Loss: {test_loss:.4f}\n")
    f.write(f"Test Perplexity: {test_perplexity:.2f}\n")