## Yakut: a simple example of model training

In [1]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup


In [2]:
class TextDataset(Dataset):
    """Fixed-length token windows cut from a single plain-text file.

    The whole file is read and encoded with one call to ``tokenizer.encode``.
    The resulting token list is split into consecutive, non-overlapping
    windows of ``seq_length`` tokens; a trailing remainder shorter than
    ``seq_length`` is dropped.

    Args:
        path: Path of the text file to load. The file is decoded as UTF-8.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token ids.
        seq_length: Number of tokens per sample. Must be at least 1.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, path, tokenizer, seq_length=512):
        if seq_length < 1:
            raise ValueError(f"seq_length must be positive, got {seq_length}")
        # Explicit encoding: without it Python falls back to the locale's
        # default, which may garble or reject a non-ASCII corpus.
        with open(path, encoding="utf-8") as f:
            data = f.read()
        tokens = tokenizer.encode(data)
        # Keep only whole windows; the incomplete tail is discarded.
        num_samples = len(tokens) // seq_length
        usable = tokens[:num_samples * seq_length]
        self.samples = torch.tensor(usable, dtype=torch.long).view(num_samples, seq_length)
        print('Loaded samples:', len(self.samples))

    def __len__(self):
        """Return the number of fixed-length samples."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return the token ids of sample ``item`` as a ``LongTensor``."""
        return self.samples[item]

In [3]:
# Tokenizer that belongs to the mGPT checkpoint fine-tuned in this notebook.
PRETRAINED_NAME = "sberbank-ai/mGPT"
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)



In [4]:
# The tokenizer warns about an over-long sequence here because the whole file
# is encoded in one call; TextDataset then cuts the token stream into
# seq_length-sized samples, so the model itself never receives that sequence.
corpus_path = './pelevin.txt'
dataset = TextDataset(corpus_path, tokenizer)

# One sample per batch, reshuffled on every pass over the data.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    drop_last=True,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (6989546 > 2048). Running this sequence through the model will result in indexing errors


Loaded samples: 13651


In [5]:
# Load the pretrained mGPT weights that will be fine-tuned.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path="sberbank-ai/mGPT",
)

  return torch.load(checkpoint_file, map_location="cpu")


In [6]:
# Use the GPU when one is visible; otherwise the model stays on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch the model to training mode before optimisation starts.
model.train()

# AdamW over every model parameter with a peak learning rate of 3e-4.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [7]:
# --- Training configuration ---------------------------------------------------
NUM_EPOCHS = 1        # full passes over the dataloader
WARMUP_STEPS = 100    # optimizer steps of linear learning-rate warmup
LOSS_WINDOW = 10      # trailing steps averaged for the progress-bar loss
REPORT_WINDOW = 300   # trailing steps averaged for the end-of-epoch report

# The scheduler is stepped once per batch across ALL epochs, so its horizon has
# to cover every epoch. Sizing it to a single epoch would leave the learning
# rate at zero for every epoch after the first.
total_steps = len(dataloader) * NUM_EPOCHS

# Linear warmup followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
)

for epoch in range(NUM_EPOCHS):
    print('Epoch', epoch)
    progressbar = tqdm(dataloader)
    losses = []  # per-step loss values for this epoch
    for batch in progressbar:
        batch = batch.to(model.device)
        # The inputs double as labels; the loss is read from `outputs.loss`.
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule by one step
        optimizer.zero_grad()
        losses.append(loss.item())
        progressbar.set_description(f"Loss: {np.mean(losses[-LOSS_WINDOW:]):.3f}, LR: {scheduler.get_last_lr()[0]:.6f}")

    # Note: this is the mean over the last REPORT_WINDOW steps, not the whole epoch.
    print(f"Epoch {epoch} finished. Average loss: {np.mean(losses[-REPORT_WINDOW:]):.3f}")

Epoch 0


  0%|          | 0/13651 [00:00<?, ?it/s]

Loss: 2.868, LR: 0.000000: 100%|██████████| 13651/13651 [33:44<00:00,  6.74it/s]

Epoch 0 finished. Average loss: 2.518





In [8]:
# Experiment log: loss noted for each output folder in earlier runs.
# (Folder names appear to encode the learning rate, e.g. 3e4 <-> lr=3e-4 — confirm.)
#   models/1e5/   2.338  (no decay)
#   models/3e5/   2.119  (last 300)
#   models/1e4/   1.991  (last 300)
#   models/3e4/   2.518  (last 300)
#
# Only the folder for the current run is assigned; the earlier entries are kept
# as notes instead of dead assignments that were immediately overwritten.
folder = 'models/3e4/'

In [9]:
# Write the tokenizer first, then the fine-tuned model, into the same folder.
for artifact in (tokenizer, model):
    artifact.save_pretrained(folder)