In [2]:
import json
import math
import torch
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_from_disk
from datasets import load_from_disk
from transformers import (
    PreTrainedTokenizerFast,
    DataCollatorForLanguageModeling,
    BertConfig,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    GPT2Config,
    GPT2LMHeadModel,
    MobileBertConfig,
    MobileBertForMaskedLM,
)
from torch.optim import AdamW, SGD
from optimizers import Lion, Sophia, SignSGD 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def load_dataset(path, name, model):
    """Load a saved tokenized dataset and rebuild its fast tokenizer.

    Everything is read from ``./save/{path}/{name}``: the special-token map
    and ``tokenizer.json`` from the ``tokenizer/{model}`` sub-directory, and
    the dataset from ``datasets/{model}``.

    Args:
        path: First directory level under ``./save``.
        name: Second directory level under ``./save``.
        model: Sub-directory name used for both tokenizer and dataset files.

    Returns:
        A ``(tokenized_datasets, tokenizer)`` tuple.
    """
    save_root = f"./save/{path}/{name}"
    tokenizer_root = f"{save_root}/tokenizer/{model}"
    token_keys = ("sep_token", "cls_token", "mask_token", "unk_token", "pad_token")

    with open(f"{tokenizer_root}/special_tokens_map.json") as map_file:
        token_map = json.load(map_file)
        print("Loaded ", map_file)

    tokenized_datasets = load_from_disk(f"{save_root}/datasets/{model}")

    # TODO: make sure these are set for MASKED models
    # https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast
    special_kwargs = {key: token_map[key] for key in token_keys}
    tokenizer = PreTrainedTokenizerFast(
        **special_kwargs,
        tokenizer_file=f"{tokenizer_root}/tokenizer.json",
    )

    # Echo the special tokens the tokenizer actually ended up with.
    print(*(getattr(tokenizer, key) for key in token_keys))
    return tokenized_datasets, tokenizer

In [5]:
def compute_metric_with_tokenizer(tokenizer):
    """Build a ``compute_metrics`` callback reporting loss and perplexity.

    Args:
        tokenizer: Kept for interface compatibility. The vocabulary width is
            now read from the logits themselves, because
            ``tokenizer.vocab_size`` can differ from the model's output width
            (the model in this notebook is sized with ``len(tokenizer)``).

    Returns:
        A function taking a prediction object with ``predictions`` and
        ``label_ids`` NumPy arrays and returning a dict with keys
        ``"perplexity"`` (float) and ``"calculated_loss"`` (0-dim tensor).
    """
    def compute_custom_metric(pred):
        logits = torch.from_numpy(pred.predictions)
        labels = torch.from_numpy(pred.label_ids)
        # Flatten to (tokens, vocab) using the logits' own last dimension so
        # the reshape is always consistent with what the model produced.
        # `reshape` (rather than `view`) also tolerates non-contiguous input.
        vocab_width = logits.size(-1)
        loss = F.cross_entropy(
            logits.reshape(-1, vocab_width),
            labels.reshape(-1),
        )
        return {"perplexity": math.exp(loss.item()), "calculated_loss": loss}
    return compute_custom_metric

In [6]:
def set_optimizer(model , i):
    match i:
        case 1:
            optimizer = SignSGD(model.parameters())
        case 2:
            optimizer = Lion(model.parameters())
        case 3:
            optimizer = optim.AdamW(model.parameters())
        case 4:
            optimizer = Sophia(model.parameters())
        case _:
            print("Invalid optimizer")        
    return optimizer


In [7]:
import gc

def train(tokenizer, tokenized_datasets, optimizer, model, data_collator, training_args):
    """Train ``model`` with the given optimizer, save it, and evaluate it.

    Args:
        tokenizer: Tokenizer handed to the ``Trainer`` and used to build the
            metrics callback.
        tokenized_datasets: Mapping with ``"train"`` and ``"validation"``
            splits.
        optimizer: Already-constructed optimizer; its class name is used for
            the save directory and the printed summary.
        model: Model to train.
        data_collator: Collator handed to the ``Trainer``.
        training_args: ``TrainingArguments`` for the run.

    Returns:
        The metrics returned by ``trainer.evaluate()``.
    """
    compute_custom_metric = compute_metric_with_tokenizer(tokenizer)

    # Collect garbage first: empty_cache() can only give back cached blocks
    # that are no longer occupied, so objects from a previous run must be
    # freed before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_custom_metric,
        # Custom optimizer; the scheduler slot is left as None.
        optimizers=(optimizer, None),
    )

    trainer.train()

    optimizer_name = optimizer.__class__.__name__
    trainer.save_model(f"./bert/output/{optimizer_name}")

    # Evaluate the trained model and report the metrics next to the
    # optimizer name so runs can be told apart in the cell output.
    eval_results = trainer.evaluate()
    print(f"{optimizer_name} results: {eval_results}")
    return eval_results

In [8]:
# Pull the settings for this experiment from the shared training configs.
from huggingface.configs import SEED, TRAINING_CONFIGS

config = TRAINING_CONFIGS["bert-wikitext"]
tokenizer_name, path, name, model = (
    config[key]
    for key in ("tokenizer_name", "dataset_path", "dataset_name", "model")
)

# Load the tokenized dataset together with its tokenizer.
tokenized_datasets, tokenizer = load_dataset(path, name, model)

# learning_rate, warmup_steps and weight_decay are not set here, so
# TrainingArguments' own defaults apply to them.
training_args = TrainingArguments(
    # Where outputs and logs go (named after the configured model).
    output_dir=f"./{model}/output/",
    logging_dir=f"./{model}/logs/",
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    eval_accumulation_steps=50,
    # Batch size of 1 per device, with gradients accumulated over 16 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    # Reproducibility and precision.
    seed=SEED,
    bf16=True,
)




Loaded  <_io.TextIOWrapper name='./save/wikitext/wikitext-103-raw-v1/tokenizer/special_tokens_map.json' mode='r' encoding='UTF-8'>
[SEP] [CLS] [MASK] [UNK] [PAD]


In [9]:
import os

# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): this presumably has to be set before the first CUDA
# allocation to take effect — confirm this cell runs early enough.
# NOTE(review): confirm the installed PyTorch accepts a 4 MB split size.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:4"})

In [None]:
# Train one freshly initialised tiny BERT per optimizer and report its metrics.
# Each entry is a callable that takes the model parameters and returns an
# optimizer built with default hyper-parameters.
optimizers = [
    SignSGD,
    Sophia,
    AdamW,
    Lion,
    # SGD gets an explicit learning rate: some PyTorch versions have no
    # default for it and reject SGD(params).
    lambda params: SGD(params, lr=1e-3),
]

# Loop-invariant setup: the device and the collator do not depend on the
# optimizer being tested.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
data_collator = DataCollatorForLanguageModeling(tokenizer)

for optimizer_func in optimizers:
    # Re-seed before building the model so every optimizer starts from the
    # same initial weights and the comparison is like-for-like.
    torch.manual_seed(training_args.seed)

    # NOTE: this rebinds the notebook-level names `config` and `model`
    # (previously the training-config dict and the model name string).
    config = BertConfig(
        # Sized from the tokenizer, so no resize_token_embeddings call is needed.
        vocab_size=len(tokenizer),
        hidden_size=128,
        num_hidden_layers=2,
        # hidden_size must be divisible by the number of attention heads;
        # 12 does not divide 128, so use 2 heads of width 64.
        num_attention_heads=2,
        intermediate_size=3072,
    )
    model = BertForMaskedLM(config)
    model = model.to(device)

    optimizer = optimizer_func(model.parameters())
    train(tokenizer, tokenized_datasets, optimizer, model, data_collator, training_args)