In [1]:
# Name of the tokenizer used throughout the notebook. The tokenizer-selection
# cell accepts "sp", "gpt" or "wp" (any other value raises ValueError), and the
# same name is interpolated into the checkpoint path that is loaded later.
TOKENIZER = "wp" # sp, gpt, wp

In [2]:
from datasets import load_from_disk
import sys
# Add the parent directory to the module search path. The path is relative, so
# it is resolved against the kernel's current working directory.
# NOTE(review): presumably needed for the later `lab1...` import — confirm.
sys.path.append("../")

# Load the two corpora previously saved to disk.
ds = load_from_disk("data/tinystories")
wiki_ds = load_from_disk("data/wiki")

# Only the TinyStories corpus is split; pull out its train/validation parts.
train, val = ds["train"], ds["validation"]

In [3]:
from tokenizer import GPTTokenizer, Tokenizer, SPTokenizer, WPTokenizer

# Build the tokenizer named by TOKENIZER; an unrecognised name fails immediately.
if TOKENIZER == "sp":
    tokenizer = SPTokenizer('data/tokenizer_bpe.model')
elif TOKENIZER == "gpt":
    tokenizer = GPTTokenizer()
elif TOKENIZER == "wp":
    tokenizer = WPTokenizer.from_json("data/custom_vocab.json")
else:
    raise ValueError(f"Unknown tokenizer: {TOKENIZER}")

# Round-trip smoke test: encode a sample string and decode it back for display.
tokenizer.decode(tokenizer.encode("Test sentence"))

'test sentence'

In [4]:
# Display the tokenizer object's repr to confirm which implementation was built.
tokenizer

<tokenizer.tokenizer.WPTokenizer at 0x157908470>

Tokenów: 471_872_517
Dokumentów: 2_119_719

In [5]:
import torch
from torch.utils.data import IterableDataset, DataLoader
from datasets.arrow_dataset import Dataset
from typing import Generator

class StreamingTokenDataset(IterableDataset):
    """Iterable dataset that turns a text corpus into next-token training pairs.

    Every example's ``"text"`` field is encoded with ``tokenizer`` and the
    resulting ids are concatenated into one continuous stream, with a ``0``
    emitted after each example as a separator. The stream is cut into windows
    of ``context_size + 1`` tokens; each window yields ``(inputs, targets)``
    where ``targets`` is ``inputs`` shifted one position to the right.

    Consecutive windows share exactly one token (the last target of a window
    is the first input of the next one). Tokens left over at the end of the
    corpus that do not fill a whole window are never yielded.

    Args:
        dataset: Iterable of mapping-like examples exposing a ``"text"`` key.
        tokenizer: Object whose ``encode(text)`` returns an iterable of token ids.
        context_size: Number of tokens in each input/target tensor.
        buffer_size: Stored on the instance; not read by any method of this class.
    """

    def __init__(
            self,
            dataset: Dataset,
            tokenizer: Tokenizer,
            context_size=128,
            buffer_size=10_000
        ) -> None:
        # Corpus and encoder.
        self.dataset = dataset
        self.tokenizer = tokenizer

        # Windowing configuration.
        self.context_size = context_size
        self.buffer_size = buffer_size

    def _token_stream(self) -> Generator[int, None, None]:
        """Yield token ids for the whole corpus, one example after another."""
        for record in self.dataset:
            yield from self.tokenizer.encode(record["text"])
            # Separator id emitted after every example.
            yield 0

    def _chunk_stream(self):
        """Yield ``(inputs, targets)`` long tensors cut from the token stream."""
        pending = []
        for token_id in self._token_stream():
            pending.append(token_id)

            size = self.context_size
            if len(pending) <= size:
                # Not enough tokens yet for an input plus its shifted target.
                continue

            window = pending[:size + 1]
            inputs = torch.tensor(window[:size], dtype=torch.long)
            targets = torch.tensor(window[1:], dtype=torch.long)
            yield inputs, targets

            # Keep the final token so it becomes the first input of the next window.
            pending = pending[self.context_size:]

    def __iter__(self):
        """Return a fresh iterator over the ``(inputs, targets)`` pairs."""
        return self._chunk_stream()

In [6]:
# Wrap each TinyStories split in a streaming next-token dataset.
train_dataset = StreamingTokenDataset(dataset=train, tokenizer=tokenizer)
val_dataset = StreamingTokenDataset(dataset=val, tokenizer=tokenizer)

# Batch the streams; all loaders use the same batch size of 4.
train_loader = DataLoader(dataset=train_dataset, batch_size=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)

# The wiki corpus goes through the same dataset wrapper and batch size.
wiki_loader = DataLoader(
    dataset=StreamingTokenDataset(dataset=wiki_ds, tokenizer=tokenizer),
    batch_size=4,
)

In [7]:
from lab1.architectures.gpt import GPTDecoder

# Model hyperparameters. The vocabulary size comes from the selected tokenizer,
# so the model's shape depends on TOKENIZER.
# NOTE(review): these values presumably have to match the checkpoint that is
# loaded in the next cell — confirm before changing any of them.
vocab_size = tokenizer.vocab_size()
embed_dim = 256
num_heads = 8
ff_hidden_dim = 2048
num_layers = 6
# Same value as the default context_size of StreamingTokenDataset (128).
context_length = 128
dropout = 0.1

# Instantiate the decoder with the hyperparameters above.
gpt = GPTDecoder(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_hidden_dim=ff_hidden_dim,
    num_layers=num_layers,
    context_length=context_length,
    dropout=dropout
)

In [8]:
# The checkpoint file is chosen by tokenizer name, one file per tokenizer.
model_path = f"trained_models/{TOKENIZER}_tokenizer.pt"
# Load the saved state dict onto the CPU (map_location="cpu") with
# weights_only=True, then copy it into the model. As the last expression of the
# cell, the result of load_state_dict is displayed as the cell output.
gpt.load_state_dict(torch.load(model_path, weights_only=True, map_location="cpu"))

<All keys matched successfully>

In [9]:
def choose_device() -> str:
    """Return the name of the preferred torch device.

    Preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        One of ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"
    # No CUDA: fall back to MPS when present, otherwise the CPU.
    return "mps" if torch.backends.mps.is_available() else "cpu"

In [10]:
# import torch.nn as nn
# from tqdm import tqdm

# epochs = 1
# grad_clip = 10.0
# device = torch.device(choose_device())

# print(f"Training on device: {device}")

# gpt.to(device)
# criterion = nn.CrossEntropyLoss(ignore_index=0)
# optimizer = torch.optim.AdamW(gpt.parameters())

# for epoch in range(1, epochs + 1):
#     gpt.train()
#     total_loss = 0.0

#     progress = tqdm(enumerate(train_loader), total=900_000, desc=f"Epoch {epoch}/{epochs}")

#     for i, (batch_x, batch_y) in progress:
#         batch_x = batch_x.to(device)
#         batch_y = batch_y.to(device)

#         optimizer.zero_grad()
#         out = gpt(batch_x)
#         loss = criterion(out.view(-1, out.size(-1)), batch_y.view(-1))
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)

#         optimizer.step()

#         total_loss += loss.item()
#         avg_loss = total_loss / (i + 1)

#         progress.set_postfix({"loss": f"{avg_loss:.4f}", "lr": optimizer.param_groups[0]["lr"]})

#     torch.save(gpt.state_dict(), f"data/{TOKENIZER}_epoch_{epoch}.pt")
#     print(f"Epoch {epoch} done | Average training loss: {avg_loss:.4f}")
#     print(f"Perplexity on training data: {torch.math.exp(avg_loss)}\n")


# torch.save(gpt.state_dict(), f"data/{TOKENIZER}_final.pt")
# print("Training complete. Model saved to gpt_final.pt")

In [11]:
# import torch.nn as nn
# from tqdm import tqdm


# gpt.eval()

# criterion = nn.CrossEntropyLoss(ignore_index=0)
# device = choose_device()
# gpt.to(device)

# total_loss = 0.0
# token_count = 0

# with torch.no_grad():
#     progress = tqdm(val_loader, desc="Evaluating")

#     for batch_x, batch_y in progress:
#         batch_x = batch_x.to(device)
#         batch_y = batch_y.to(device)

#         out = gpt(batch_x)
#         loss = criterion(out.view(-1, out.size(-1)), batch_y.view(-1))
#         total_loss += loss.item() * batch_x.shape[0] * batch_x.shape[1]
#         token_count += batch_x.shape[0] * batch_x.shape[1]

In [12]:
import time
from functools import wraps

def measure_inference_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The duration is measured with ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported interval cannot be skewed (or turn
    negative) when the system wall clock is adjusted during the call.

    The measurement covers only the time spent inside the Python call; no
    device synchronisation is performed here.

    Args:
        func: The callable to wrap. Its name, docstring and other metadata are
            preserved on the wrapper via ``functools.wraps``.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``[<name>] Inference time: <seconds> seconds`` and
        returns ``func``'s result unchanged. If ``func`` raises, the exception
        propagates and nothing is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"[{func.__name__}] Inference time: {elapsed:.4f} seconds")
        return result
    return wrapper

In [13]:
# word_count = 0
# token_count = 0

# for text in wiki_ds['text']:
#     word_count += len(text.strip().split(" "))
#     token_count += len(tokenizer.encode(text))

# word_count, token_count, token_count / word_count

In [14]:
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

@measure_inference_time
@torch.no_grad()
def generate_text(model, tokenizer, prompt, device="cpu",
                  max_new_tokens=100, context_length=128, top_k= 50, temperature=0.7):
    """Extend ``prompt`` token by token using top-k sampling with temperature.

    Args:
        model: Callable that receives a ``(1, seq_len)`` long tensor of token
            ids. Its output is indexed as ``[:, -1, :]``, i.e. it is treated as
            ``(batch, position, vocab)`` scores. The model is switched to eval
            mode and moved to ``device`` as a side effect.
        tokenizer: Object providing ``encode(str)`` (token ids) and
            ``decode(list_of_ids)`` (text).
        prompt: Text used as the initial context.
        device: Torch device on which the model and tokens are placed.
        max_new_tokens: Number of tokens to append to the prompt.
        context_length: Only the last ``context_length`` tokens are fed to the
            model at each step.
        top_k: Number of highest-scoring candidates to sample from. Values
            larger than the vocabulary dimension are clamped to it, since
            ``torch.topk`` rejects a ``k`` greater than the dimension size.
        temperature: Divisor applied to the scores before sampling. A value of
            exactly ``0`` selects the highest-scoring token deterministically
            instead of dividing by zero.

    Returns:
        The decoded text of the prompt tokens followed by the generated tokens.

    Raises:
        ValueError: If the prompt encodes to zero tokens while new tokens are
            requested, because there is nothing to condition the model on.
    """
    model.eval()
    model.to(device)

    tokens = tokenizer.encode(prompt)
    tokens = torch.tensor(tokens, dtype=torch.long).to(device)

    if max_new_tokens > 0 and tokens.numel() == 0:
        raise ValueError("prompt encoded to zero tokens; nothing to condition on")

    for _ in range(max_new_tokens):
        # Condition only on the most recent tokens that fit the context window.
        tokens_cond = tokens[-context_length:]

        logits = model(tokens_cond.reshape(1, -1))
        logits = logits[:, -1, :]  # scores for the next position, shape (1, vocab)

        if temperature == 0:
            # Greedy decoding: the limit of sampling as temperature approaches 0.
            next_token = logits.argmax(dim=-1)
        else:
            logits = logits / temperature  # apply temperature

            # Never ask topk for more candidates than the vocabulary holds.
            k = min(top_k, logits.size(-1))
            top_logits, indices = torch.topk(logits, k)
            distribution = Categorical(logits=top_logits)
            # Map the sampled position within the top-k back to a vocabulary id.
            next_token = indices.flatten()[distribution.sample()]

        tokens = torch.cat([tokens, next_token])

    return tokenizer.decode(tokens.tolist())

In [None]:
# oov_count = 0
# overall_count = 0

# for text in val["text"]:
#     encoded = torch.tensor(tokenizer.encode(text))
#     oov_count += (encoded == tokenizer.word2id["<UNK>"]).sum().item()
#     overall_count += len(encoded)

In [None]:
# oov_count, overall_count, oov_count / overall_count

(345, 4534219, 7.608807602808775e-05)

In [17]:
# prompts = [
#     "One morning, a child woke up and felt that something exciting might happen.",
#     "A quiet day began in a small town where everyone was getting ready for a new adventure.",
#     "In a sunny field, a young friend looked around and wondered what the day would bring.",
#     "At the edge of a simple village, a child decided to explore just a little farther than usual.",
#     "On a calm afternoon, two friends met and talked about what they should do next.",
#     "Inside a cozy house, a child found something they had not noticed before.",
#     "Under a clear blue sky, a young explorer took their first step outside.",
#     "By the old oak tree, a child paused, sensing that a story was about to begin.",
#     "During a peaceful morning, a friend noticed something small but interesting nearby.",
#     "As the day started, a young hero wondered what new things they might learn."
# ]

In [18]:
# import textwrap

# ans = generate_text(gpt, tokenizer, prompts[1], max_new_tokens=200, device="mps", temperature=0.001)
# print(textwrap.fill(ans, width=80), "\n")

In [19]:
# samples = [
# """
# A young boy wandered through the quiet forest, imagining great adventures unfolding beneath the tall trees. 
# He carried a small wooden sword and believed he was destined to become a brave explorer.
# """,
# """
# Modern artificial intelligence systems rely on vast amounts of data, efficient optimization algorithms, 
# and powerful hardware accelerators. These components together enable rapid progress in natural language processing and machine learning.
# """,
# """
# Scientific discoveries often begin with small, unexpected observations that challenge existing theories. 
# Researchers must remain curious and persistent, carefully examining results that initially appear unimportant.
# """
# ]

In [20]:
# which = 2

In [21]:
# encoded = tokenizer.sentence_piece.encode(samples[which], out_type=str)
# for out in encoded:
#     print(out, end=" ")

In [22]:
# encoded = tokenizer.model.tokenize(samples[which].strip())
# for out in encoded:
#     print(out, end=" ")

In [23]:
# encoded = list(map(tokenizer.id2word.get, tokenizer.encode(samples[which].strip())))
# for out in encoded:
#     print(f"|{out}", end=" ")