In [None]:
%pip install -r requirements.txt

In [None]:
import os

import torch
import torch.nn as nn
import wandb
import yaml
from huggingface_hub import hf_hub_download
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from transformers import GPTNeoXForCausalLM, get_cosine_schedule_with_warmup, TrainingArguments

from NoShuffleTrainer import NoShuffleTrainer
from PythiaBinaryDataset import PythiaBinaryDataset
from SynonymCrossEntropy import SynonymCrossEntropy

# Log in to Weights & Biases before any run is created. No key is passed in
# code, so credentials are expected to come from outside the notebook
# (environment / cached login / prompt) — never hardcode an API key here.
wandb.login()

def load_config(path: str):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        path: Filesystem path of the YAML file to load.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document — a ``dict``
        for a mapping-style config file.

    Raises:
        FileNotFoundError: If ``path`` does not exist. A hint is printed
            first, then the original exception is re-raised so the caller
            fails here instead of later on a ``None`` config.
    """
    try:
        with open(path, 'r') as config_file:
            return yaml.safe_load(config_file)
    except FileNotFoundError:
        print("Config File Not Found. Check your path!!")
        # Re-raise: silently returning None only defers the crash to the
        # first `config_dict[...]` lookup, far from the real cause.
        raise

# Location of the training config. The default is the original local path;
# set the PYTHIA_CONFIG_PATH environment variable to run this notebook on
# another machine without editing the cell.
config_path = os.environ.get(
    "PYTHIA_CONFIG_PATH",
    '/Users/nathan/Documents/Development/project2/pythia_160m_deduped_config.yaml',
)
config_dict = load_config(config_path)

# Every later cell indexes config_dict by key, so stop here (before a W&B run
# is created) if the file was missing or did not contain a mapping.
if not isinstance(config_dict, dict):
    raise ValueError(
        f"Expected a YAML mapping in {config_path!r}, "
        f"got {type(config_dict).__name__}"
    )

# Create the W&B run and attach the loaded config to it.
wandb.init(
    project="pythia-160m-test-training-run",
    name="Test-Run-01",
    config=config_dict
)

In [None]:
# Run length and context length are taken from the YAML config.
train_iters = config_dict['train-iters']
seq_len = config_dict['seq-length']

# Target batch size per optimizer step. It is not processed in one pass;
# it is approximated by accumulating gradients over smaller micro-batches.
global_batch_size = 1024
micro_batch_size = 8  # size of each micro-batch used for gradient accumulation

# Floor division would silently shrink the effective batch size if the two
# values stopped dividing evenly, so fail loudly instead.
if micro_batch_size <= 0 or global_batch_size % micro_batch_size != 0:
    raise ValueError(
        f"global_batch_size ({global_batch_size}) must be a positive multiple "
        f"of micro_batch_size ({micro_batch_size})"
    )
grad_accum_steps = global_batch_size // micro_batch_size

# Choose the compute device: CUDA if present, otherwise Apple's MPS backend,
# otherwise CPU.
# torch.backends.mps.is_available() is the long-standing documented check;
# torch.mps.is_available() is absent from older PyTorch releases and raises
# AttributeError there.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Load the "step0" revision of Pythia-160M-deduped (cached under
# ./pythia-160m-deduped/step0) and move it onto the selected device.
# No tokenizer is loaded: document.bin is assumed to be pre-tokenized and to
# already carry the training targets — TODO confirm.
model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/pythia-160m-deduped",
    revision="step0",
    cache_dir="./pythia-160m-deduped/step0",
).to(device)


    
# Adam hyperparameters come from the YAML config. The explicit float()/tuple()
# conversions guard against PyYAML returning *strings* for exponent literals
# written without a decimal point (e.g. `1e-8`), which Adam would reject.
adam_params = config_dict['optimizer']['params']
optimizer = Adam(
    model.parameters(),
    lr=float(adam_params['lr']),
    betas=tuple(float(beta) for beta in adam_params['betas']),
    eps=float(adam_params['eps']),
    weight_decay=float(config_dict['weight-decay']),
)

# Cosine learning-rate schedule over the whole run. The config's `warmup`
# value is used as a fraction of the total iterations, truncated to an int.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(train_iters * float(config_dict['warmup'])),
    num_training_steps=train_iters,
)

# Build the training dataset from the binary document file, passing the
# sequence length from the config.
# NOTE(review): presumably yields fixed-length token windows of `seq_len` —
# confirm against PythiaBinaryDataset.
dataset = PythiaBinaryDataset("./pythia_data/document.bin", seq_len)


# Trainer settings. The per-device batch size reuses `micro_batch_size` so it
# can never drift from the value `grad_accum_steps` was derived from.
# NOTE(review): bf16 is switched on whenever MPS is available; whether that is
# accepted depends on the installed transformers/accelerate/macOS versions —
# confirm on the target machine.
training_arguments = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    max_steps=train_iters,
    logging_steps=1,
    report_to="wandb",
    save_steps=1000,
    fp16=torch.cuda.is_available(),
    # torch.backends.mps.is_available() instead of torch.mps.is_available():
    # the latter is missing on older PyTorch and would raise AttributeError
    # here even on CUDA-only machines.
    bf16=torch.backends.mps.is_available(),
)

# Assemble the trainer from the model, arguments and dataset built above.
# NOTE(review): NoShuffleTrainer is project-local; going by its name it
# presumably feeds the dataset in stored order — confirm in NoShuffleTrainer.
trainer = NoShuffleTrainer(
    model=model, 
    args=training_arguments,
    train_dataset=dataset,
    # Hand over the pre-built optimizer and scheduler as a pair.
    optimizers=(optimizer, scheduler),
    # Optional custom loss, currently disabled:
    # compute_loss_func = SynonymCrossEntropy()
)

# Kick off the training run.
trainer.train()

