In [None]:
from datasets import load_from_disk
import sys

# Make the parent directory importable (presumably for project-local packages
# imported further down — confirm). The membership check keeps re-runs of this
# cell from appending the same entry to sys.path over and over.
if "../" not in sys.path:
    sys.path.append("../")

# Load the dataset that was previously saved to disk under data/tinystories.
ds = load_from_disk("data/tinystories")

In [2]:
# Pull the two splits used below out of the loaded dataset in one step.
train, val = ds["train"], ds["validation"]

In [3]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer registered under the name "gpt2"; it is used
# below both to tokenize the text and to size the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Tokenów: 471_872_517
Dokumentów: 2_119_719

In [11]:
import torch
from torch.utils.data import IterableDataset, DataLoader
from datasets.arrow_dataset import Dataset
from typing import Generator
from transformers import AutoTokenizer

class StreamingTokenDataset(IterableDataset):
    """Stream fixed-length next-token-prediction samples from a text dataset.

    Every example's ``"text"`` field is tokenized, the token ids of all
    examples are concatenated into one stream with a separator id after each
    example, and the stream is cut into ``(input, target)`` pairs of
    ``context_size`` tokens each, where ``target`` is ``input`` shifted left
    by one position. Consecutive samples overlap by exactly one token (the
    last target of a sample is the first input of the next one). Trailing
    tokens that do not fill a complete sample are dropped.

    Args:
        dataset: Iterable of mappings that each provide a ``"text"`` string.
        tokenizer: Object with ``encode(text, add_special_tokens=False)``
            returning a list of token ids.
        context_size: Number of tokens in each input/target tensor (>= 1).
        buffer_size: Stored on the instance but not used by the streaming
            logic; kept for interface compatibility.
        sep_token_id: Id emitted after every document. When ``None`` it is
            taken from ``tokenizer.sep_token_id``, then
            ``tokenizer.eos_token_id``, and finally falls back to ``0``.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """

    def __init__(
            self,
            dataset: "Dataset",
            tokenizer: "AutoTokenizer",
            context_size=128,
            buffer_size=10_000,
            sep_token_id=None,
        ) -> None:
        super().__init__()

        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")

        self.dataset = dataset
        self.tokenizer = tokenizer

        self.context_size = context_size
        self.buffer_size = buffer_size
        self.sep_token_id = self._resolve_separator(tokenizer, sep_token_id)

    @staticmethod
    def _resolve_separator(tokenizer, explicit):
        """Pick the document-separator id: explicit > sep > eos > legacy 0."""
        if explicit is not None:
            return explicit
        for attr in ("sep_token_id", "eos_token_id"):
            candidate = getattr(tokenizer, attr, None)
            if candidate is not None:
                return candidate
        # Legacy fallback: id 0 is usually an ordinary vocabulary entry, so
        # pass sep_token_id explicitly for tokenizers without sep/eos tokens.
        return 0

    def _token_stream(self) -> Generator[int, None, None]:
        """Yield token ids document by document, each followed by the separator.

        Inside a multi-worker ``DataLoader`` each worker only processes the
        documents whose index maps to it, so samples are not duplicated
        across workers.
        """
        worker = torch.utils.data.get_worker_info()
        num_shards = worker.num_workers if worker is not None else 1
        shard = worker.id if worker is not None else 0

        for index, example in enumerate(self.dataset):
            if index % num_shards != shard:
                continue
            yield from self.tokenizer.encode(example["text"], add_special_tokens=False)
            yield self.sep_token_id

    def _chunk_stream(self):
        """Cut the token stream into ``(input_tokens, pred_tokens)`` tensor pairs."""
        window = self.context_size + 1  # inputs plus one extra token for the shifted target
        buf = []
        for token in self._token_stream():
            buf.append(token)
            if len(buf) == window:
                input_tokens = torch.tensor(buf[:-1], dtype=torch.long)
                pred_tokens = torch.tensor(buf[1:], dtype=torch.long)
                yield input_tokens, pred_tokens
                # Keep the last token: it is the first input of the next sample.
                buf = buf[-1:]

    def __iter__(self):
        yield from self._chunk_stream()

In [6]:
# Wrap both splits in streaming token datasets (default context size).
train_dataset = StreamingTokenDataset(train, tokenizer)
val_dataset = StreamingTokenDataset(val, tokenizer)


train_loader = DataLoader(train_dataset, batch_size=4)
# Held-out evaluation must read the validation split, not the training data.
test_loader = DataLoader(val_dataset, batch_size=4)

In [None]:
from lab1.architectures.gpt import GPTDecoder

# Model hyperparameters, kept as named module-level values.
vocab_size = tokenizer.vocab_size  # vocabulary size reported by the tokenizer
embed_dim = 256
num_heads = 8
ff_hidden_dim = 2048
num_layers = 6
context_length = 128
dropout = 0.1

# Collect the constructor arguments in one mapping, then build the model from it.
gpt_kwargs = {
    "vocab_size": vocab_size,
    "embed_dim": embed_dim,
    "num_heads": num_heads,
    "ff_hidden_dim": ff_hidden_dim,
    "num_layers": num_layers,
    "context_length": context_length,
    "dropout": dropout,
}
gpt = GPTDecoder(**gpt_kwargs)

In [8]:
def choose_device() -> str:
    """Return the name of the preferred torch device.

    CUDA is preferred when available, then MPS, and "cpu" otherwise.
    """
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        return "cuda"
    # MPS is only queried when CUDA is unavailable.
    return "mps" if torch.backends.mps.is_available() else "cpu"

In [9]:
import math

import torch.nn as nn
from tqdm import tqdm

epochs = 3
grad_clip = 1.0
device = torch.device(choose_device())

print(f"Training on device: {device}")

gpt.to(device)
# Targets equal to 0 are excluded from the loss.
# NOTE(review): id 0 may be an ordinary vocabulary token for this tokenizer —
# confirm that masking it is intended.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(gpt.parameters())

for epoch in range(1, epochs + 1):
    # ---- training pass -------------------------------------------------
    gpt.train()
    total_loss = 0.0
    avg_loss = float("nan")  # stays NaN if the loader yields no batches

    # `total` is only a progress-bar estimate; the streaming loader has no length.
    progress = tqdm(enumerate(train_loader), total=500_000, desc=f"Epoch {epoch}/{epochs}")

    for i, (batch_x, batch_y) in progress:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        out = gpt(batch_x)
        # Flatten the leading dimensions so every position is one classification example.
        loss = criterion(out.view(-1, out.size(-1)), batch_y.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)

        optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / (i + 1)  # running mean of per-batch losses

        progress.set_postfix({"loss": f"{avg_loss:.4f}", "lr": optimizer.param_groups[0]["lr"]})

    torch.save(gpt.state_dict(), f"gpt_epoch_{epoch}.pt")
    print(f"Epoch {epoch} done | Average training loss: {avg_loss:.4f}")
    print(f"Perplexity on training data: {math.exp(avg_loss)}\n")

    # ---- evaluation pass on `test_loader` -------------------------------
    # Switch to eval mode so dropout is disabled; the next epoch's
    # gpt.train() call switches it back.
    gpt.eval()
    # Use a separate accumulator: reusing `total_loss` would fold the training
    # loss sum into the evaluation average.
    val_total_loss = 0.0
    val_loss = float("nan")

    with torch.no_grad():
        progress = tqdm(enumerate(test_loader), total=5000, desc=f"Validation {epoch}/{epochs}")

        for i, (batch_x, batch_y) in progress:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            out = gpt(batch_x)
            loss = criterion(out.view(-1, out.size(-1)), batch_y.view(-1))

            val_total_loss += loss.item()
            val_loss = val_total_loss / (i + 1)

    print(f"Average loss on held-out_dataset: {val_loss:.4f}")
    print(f"Perplexity on held-out data: {math.exp(val_loss)}\n")


torch.save(gpt.state_dict(), "gpt_final.pt")
print("Training complete. Model saved to gpt_final.pt")


Training on device: mps


Epoch 1/3:   0%|          | 966/500000 [00:32<4:39:26, 29.76it/s, loss=7.7872, lr=0.001] Token indices sequence length is longer than the specified maximum sequence length for this model (1106 > 1024). Running this sequence through the model will result in indexing errors
Epoch 1/3:   1%|          | 2818/500000 [01:35<4:41:02, 29.48it/s, loss=5.3946, lr=0.001]


KeyboardInterrupt: 