In [14]:
import json
import math
import torch
import torch.nn.functional as F
import torch.optim as optim
from lion import Lion
from datasets import load_from_disk
from sophia import SophiaG
from sgd.sgd import signSGD
from transformers import (
    PreTrainedTokenizerFast,
    DataCollatorForLanguageModeling,
    BertConfig,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
)

In [23]:
def load_dataset(path, name):
    with open(f"./save/{path}/{name}/tokenizer/special_tokens_map.json") as f:
        special_tokens = json.load(f)

        tokenized_datasets = load_from_disk(f"./save/{path}/{name}/datasets/")
        tokenizer = PreTrainedTokenizerFast(
            # TODO: make sure these are set for MASKED models
            # https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast
            sep_token=special_tokens["sep_token"],
            cls_token=special_tokens["cls_token"],
            mask_token=special_tokens["mask_token"],
            unk_token=special_tokens["unk_token"],
            pad_token=special_tokens["pad_token"],
            tokenizer_file=f"./save/{path}/{name}/tokenizer/tokenizer.json",
        )
        print(
            tokenizer.sep_token,
            tokenizer.cls_token,
            tokenizer.mask_token,
            tokenizer.unk_token,
            tokenizer.pad_token,
        )
        return tokenized_datasets, tokenizer

In [90]:
#get the configs
# Import training configs
from configs import SEED, TRAINING_CONFIGS
from torch.cuda import empty_cache


config = TRAINING_CONFIGS["bert-wikitext"]
tokenizer_name = config["tokenizer_name"]
path = config["dataset_path"]
name = config["dataset_name"]

# load the dataset
tokenized_datasets, tokenizer = load_dataset(path, name)

# load the model
config = BertConfig(vocab_size=len(tokenizer))
model = BertForMaskedLM(config)  # model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

def compute_metric_with_tokenizer(tokenizer):
    def compute_custom_metric(pred):
        logits = torch.from_numpy(pred.predictions)
        labels = torch.from_numpy(pred.label_ids)
        loss = F.cross_entropy(logits.view(-1, tokenizer.vocab_size), labels.view(-1))
        return {"perplexity": math.exp(loss), "calculated_loss": loss}
    return compute_custom_metric

compute_custom_metric = compute_metric_with_tokenizer(tokenizer)

#set the optimizers 
var1 = torch.tensor([1.0], requires_grad=True)
var2 = torch.tensor([2.0], requires_grad=True)
adam_optimizer = optim.Adam([var1, var2])
lion_optimizer = Lion(model.parameters())
sgd_optimizer = signSGD(model.parameters())
sophia_optimizer = SophiaG(model.parameters())
# train on optimizers
#optimizers = [sgd_optimizer, adam_optimizer, lion_optimizer, sophia_optimizer]
optimizers = [sgd_optimizer, adam_optimizer]
import gc

def train(tokenizer, tokenized_datasets, optimizers, model, data_collator):

    training_args = TrainingArguments(
        output_dir="./bert/output/",
        evaluation_strategy="epoch",
        # learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # warmup_steps=500,
        # weight_decay=0.01,
        logging_dir="./bert/logs/",
        seed=SEED,
        fp16=True,
        eval_accumulation_steps=50,
    )
    i = 0
    for optimizer in optimizers:
        # train the model
        empty_cache()
        gc.collect()
        print("Optimizer ", i)
        i+=1
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_custom_metric,
            optimizers=(optimizer, None),
        )
        
        trainer.train()
        trainer.save_model(f"./bert/output/{optimizer.__class__.__name__}")
        # evaluate the model
        eval_results = trainer.evaluate()
        #print eval results + name of optimizer
        print(f"{optimizer.__class__.__name__} results: {eval_results}")

[SEP] [CLS] [MASK] [UNK] [PAD]


In [91]:
#get the configs
# Import training configs
from configs import SEED, TRAINING_CONFIGS

config = TRAINING_CONFIGS["bert-wikitext"]
tokenizer_name = config["tokenizer_name"]
path = config["dataset_path"]
name = config["dataset_name"]

In [92]:
with open(f"./save/{path}/{name}/tokenizer/special_tokens_map.json") as f:
    special_tokens = json.load(f)


tokenized_datasets = load_from_disk(f"./save/{path}/{name}/datasets/")
tokenizer = PreTrainedTokenizerFast(
    # TODO: make sure these are set for MASKED models
    # https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast
    sep_token=special_tokens["sep_token"],
    cls_token=special_tokens["cls_token"],
    mask_token=special_tokens["mask_token"],
    unk_token=special_tokens["unk_token"],
    pad_token=special_tokens["pad_token"],
    tokenizer_file=f"./save/{path}/{name}/tokenizer/tokenizer.json",
)

In [93]:
# load the model
config = BertConfig(vocab_size=len(tokenizer))
model = BertForMaskedLM(config)  # model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

#initialize all the optimizers
var1 = torch.tensor([1.0], requires_grad=True)
var2 = torch.tensor([2.0], requires_grad=True)
adam_optimizer = optim.Adam([var1, var2])
lion_optimizer = Lion(model.parameters())
sgd_optimizer = signSGD(model.parameters())
sophia_optimizer = SophiaG(model.parameters())
optimizers = [lion_optimizer, sgd_optimizer, adam_optimizer, sophia_optimizer]

In [95]:
# set the training args
training_args = TrainingArguments(
    output_dir="./bert/output/",
    evaluation_strategy="epoch",
    # learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # warmup_steps=500,
    # weight_decay=0.01,
    logging_dir="./bert/logs/",
    seed=SEED,
    fp16=True,
    eval_accumulation_steps=50,
)

#train the model
for optimizer in optimizers:
    # train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_custom_metric,  
        optimizers=(optimizer, None),
    )
    trainer.train()
    trainer.save_model(f"./bert/output/{optimizer.__class__.__name__}")
    # evaluate the model
    eval_results = trainer.evaluate()
    print(eval_results)

OutOfMemoryError: CUDA out of memory. Tried to allocate 154.00 MiB (GPU 0; 5.80 GiB total capacity; 4.67 GiB already allocated; 30.81 MiB free; 4.69 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [88]:
!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8