<a href="https://colab.research.google.com/github/pashok3d/RemarqueGPT/blob/main/RemarqueGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
!pip install -Uqq ipdb
import ipdb

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
!pip install tqdm -q
!pip install wandb -qqq

In [2]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
!mkdir dataset

In [4]:
"""
Building GPT from scratch and training it on all books of Erich Maria Remarque

Available tools: python, pytorch

Tasks:
1. Load data and tokenize to characters
2. Implement GPT model using pytorch
3. Train and evaluate the model

GPT model structure:
1. embedding layer
2. positional encoding
3. blocks
    .1 attention
    .2 feedforward
4. projection
"""

import wandb
import torch
import math
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

WINDOW_SIZE = 64
BATCH_SIZE = 16
EPOCHS = 10
LR = 5e-4

device = "cuda" if torch.cuda.is_available() else "cpu"

config = {
    "learning_rate": LR,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "window_size": WINDOW_SIZE,
}

In [5]:
run = wandb.init(project="remark-gpt", config=config)

[34m[1mwandb[0m: Currently logged in as: [33mcrush_tarash[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Load dataset
with open("dataset/The_Dream_Room_1920_AST_978-5-17-071518-3.txt", "r") as f:
    lines = f.readlines()

text = "\n".join(lines)
tokens = sorted(set(text))

# Load train dataset
with open("dataset/The_Dream_Room_1920_AST_978-5-17-071518-3-train.txt", "r") as f:
    lines = f.readlines()

train_text = "\n".join(lines)

# Load dev dataset
with open("dataset/The_Dream_Room_1920_AST_978-5-17-071518-3-dev.txt", "r") as f:
    lines = f.readlines()

dev_text = "\n".join(lines)

# Load dev dataset
with open("dataset/The_Dream_Room_1920_AST_978-5-17-071518-3-test.txt", "r") as f:
    lines = f.readlines()

test_text = "\n".join(lines)

id_to_token = {i: token for i, token in enumerate(tokens)}
token_to_id = {token: i for i, token in enumerate(tokens)}


def tokenize(text) -> list[int]:
    return [token_to_id[ch] for ch in text]


def decode(token_ids: list[int]) -> str:
    return "".join([id_to_token[token_id] for token_id in token_ids])


class TextDataset(Dataset):
    def __init__(self, text, context_window_size):
        self.tokens = tokenize(text)

        self.x = []
        self.y = []
        for i in range(len(self.tokens) - context_window_size):
            self.x.append(self.tokens[i : i + context_window_size])
            self.y.append(self.tokens[i + 1 : i + context_window_size + 1])

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])


class GPTBlock(nn.Module):
    def __init__(self, embedding_dim: int, max_len: int, dropout: float = 0.1):
        # Attention
        super().__init__()
        self.Q = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.K = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.V = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.kv_softmax = nn.Softmax(dim=-1)
        self.attn_dropout = nn.Dropout(dropout)

        # Feedforward
        self.f1 = nn.Linear(embedding_dim, embedding_dim * 4)
        self.f_act = nn.ReLU()
        self.f2 = nn.Linear(embedding_dim * 4, embedding_dim)
        self.ff_dropout = nn.Dropout(dropout)

        self.register_buffer("tril", torch.tril(torch.ones(max_len, max_len)))

        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)


    def forward(self, inputs):

        B, T, C = inputs.shape

        norm_inputs = self.ln1(inputs)

        q = self.Q(norm_inputs)
        k = self.K(norm_inputs)
        v = self.V(norm_inputs)

        attention_weights = (q @ k.transpose(-1, -2)) / math.sqrt(C)  # shape: (B, T, T)
        # ipdb.set_trace()
        attention_weights_masked = attention_weights.masked_fill(
            self.tril[:T, :T] == 0, -torch.inf
        )
        attention_scores = self.kv_softmax(attention_weights_masked)
        attention_scores = self.attn_dropout(attention_scores)

        new_v = attention_scores @ v + inputs

        x = self.ln2(new_v)
        x = self.ff_dropout(self.f2(self.f_act(self.f1(x)))) + x
        return x


class GPT(nn.Module):
    def __init__(
        self, vocab_size: int, max_len: int, embedding_dim: int = 16, blocks_num: int = 4
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos = nn.Embedding(max_len, embedding_dim)
        self.blocks = nn.Sequential(
            *[GPTBlock(embedding_dim, max_len) for _ in range(blocks_num)]
        )
        self.proj = nn.Linear(embedding_dim, vocab_size)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, inputs, labels=None):
        B, T = inputs.shape
        embs = self.emb(inputs)
        pos_embs = self.pos(torch.arange(T, device=device))  # (T,C)
        blocks_output = self.blocks(embs + pos_embs)
        logits = self.proj(blocks_output)  # (B,T,vocab_size)
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.vocab_size), labels.view(-1))
            return logits, loss
        else:
            return logits, None




In [7]:
train_ds = TextDataset(train_text, WINDOW_SIZE)
train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

dev_ds = TextDataset(dev_text, WINDOW_SIZE)
dev_dataloader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False)

dev_ds = TextDataset(dev_text, WINDOW_SIZE)
dev_dataloader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False)

test_ds = TextDataset(test_text, WINDOW_SIZE)
test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

model = GPT(vocab_size=len(tokens), max_len=WINDOW_SIZE)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

In [8]:
wandb.watch(model, log_freq=5000)

In [11]:
# model.train()
# epoch_loss = 0
# steps_n = 0
# with torch.no_grad():
#     for batch in tqdm(train_dataloader):
#         input, labels = batch[0].to(device), batch[1].to(device)
#         output, loss = model(input, labels)
#         epoch_loss += loss.item()
#         steps_n += 1
#     avg_loss = epoch_loss / steps_n
# expected_init_loss = -math.log(1 / 74)
# print(f"initial train loss: {avg_loss:.3f}, with expected of {expected_init_loss:.3f}")

100%|██████████| 12836/12836 [00:45<00:00, 283.16it/s]

initial train loss: 4.459, with expected of 4.304





In [23]:
LR = 0.00001

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

In [25]:
for epoch in range(2):
    model.train()
    epoch_loss = 0
    val_epoch_loss = 0
    steps_n = 0
    val_steps_n = 0
    test_epoch_loss = 0
    test_steps_n = 0
    for batch in tqdm(train_dataloader):
        input, labels = batch[0].to(device), batch[1].to(device)
        output, loss = model(input, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss += loss.item()
        steps_n += 1
        run.log({"train_loss": loss.item()})

    avg_loss = epoch_loss / steps_n
    print(f"epoch {epoch} train loss: {avg_loss:.3f}")

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dev_dataloader):
            input, labels = batch[0].to(device), batch[1].to(device)
            output, loss = model(input, labels)
            val_epoch_loss += loss.item()
            val_steps_n += 1

        for batch in tqdm(test_dataloader):
            input, labels = batch[0].to(device), batch[1].to(device)
            output, loss = model(input, labels)
            test_epoch_loss += loss.item()
            test_steps_n += 1

    avg_val_loss = val_epoch_loss / val_steps_n
    avg_test_loss = test_epoch_loss / test_steps_n
    print(f"epoch {epoch} val loss: {avg_val_loss:.3f}")
    print(f"epoch {epoch} test loss: {avg_test_loss:.3f}")
    run.log({"epoch_train_loss": avg_loss, "epoch_val_loss": avg_val_loss, "epoch_test_loss": avg_test_loss})


100%|██████████| 12836/12836 [02:13<00:00, 96.26it/s]


epoch 0 train loss: 2.086


100%|██████████| 1588/1588 [00:05<00:00, 295.58it/s]
100%|██████████| 1618/1618 [00:05<00:00, 296.03it/s]


epoch 0 val loss: 2.010
epoch 0 test loss: 2.019


100%|██████████| 12836/12836 [02:13<00:00, 96.47it/s]


epoch 1 train loss: 2.083


100%|██████████| 1588/1588 [00:05<00:00, 294.44it/s]
100%|██████████| 1618/1618 [00:05<00:00, 296.07it/s]

epoch 1 val loss: 2.009
epoch 1 test loss: 2.017





In [28]:

artifact = wandb.Artifact('model', type='model')
artifact.add_file('model/gpt.pt')
run.log_artifact(artifact)

<Artifact model>

In [26]:
torch.save(model.state_dict(), "model/gpt.pt")

In [14]:
# wandb.finish()

0,1
epoch_train_loss,█▃▃▂▂▁▁▁
epoch_val_loss,█▆▅▄▅▂▁▁
train_loss,█▄▄▂▄▃▂▃▂▂▂▂▂▂▃▂▂▃▂▂▂▃▂▂▂▂▂▂▁▂▁▁▂▂▂▁▂▁▂▁

0,1
epoch_train_loss,2.59363
epoch_val_loss,2.57107
train_loss,2.52198


In [29]:
def generate_text(model, prompt: str, max_tokens: int = 10, temperature: float = 1.0) -> str:
   """Generate text using the trained GPT model."""
   model.eval()
   context = tokenize(prompt)
   generated = list(context)

   with torch.no_grad():
       for _ in range(max_tokens):
           x = torch.tensor(context[-WINDOW_SIZE:]).unsqueeze(0).to(device)
           logits, _ = model(x)
           logits = logits[0, -1, :] / temperature
           probs = torch.softmax(logits, dim=-1)
           next_token = torch.multinomial(probs, num_samples=1).item()
           generated.append(next_token)
           context = generated

   return decode(context)

In [31]:
model.eval()  # Ensure model is in evaluation mode
prompt = "Привет, любовь моя "
generated_text = generate_text(
    model=model,
    prompt=prompt,
    max_tokens=1000,
    temperature=1.0
)
print(generated_text)

Привет, любовь моя визние, Фод.

– Де вересть обийных обы дыхойство что их оприцамх мыливых Боткрыв промусь пожолму, незра намолья даже знамет от дишией.

– О онужды все кой. Постить Тенщиесназыщите тьмом стрить Эрнст убыть ас познажалась села не уловки. Не ту не можный, вот растсяен.

– И очесь гро ей. Гкацен все товал. На. К не лушлоенна леречишь скусстно в шелщинул уголкот звт подумиднаясь ее остельнымую, мназывает.

– Зечу. Оздно ччто таки вля часть ма не мне жедиловое…

Качким се горда…

Посольчитесть.

Пото несковь зведивать жизнь и он усливпыски.

– Оцм кулоподность: «А При замел побирител дерь из умит чень глишую и отповолизвиица на цвей припобил – дешие убыроскался чумоную разможенный но его госвесли он это кудно сказорния.

– В и тотвь дружение ренята все тевя дил просинклия на эткс!

Гупесни теуда вылской сеашь доржала тразной же чутро головодость И ткаль. Сшорния? Слен взгласло, как опустинчах то сна скорумы. Я, потения, когда Фридам подолжастился. А кото. Всего него убыннт

In [32]:
wandb.finish()

0,1
epoch_test_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁
epoch_train_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁
epoch_val_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁
train_loss,█▇▆▄▄▄▄▃▃▂▂▂▃▃▂▄▃▂▃▃▂▃▁▂▃▁▃▂▂▂▂▁▃▂▂▂▂▂▂▁

0,1
epoch_test_loss,2.01742
epoch_train_loss,2.08324
epoch_val_loss,2.00906
train_loss,1.99332
