In [3]:
# v notebooku použij %pip (magic) pro instalace do aktuálního kernelu
%pip install --upgrade pip
%pip install -U tiktoken tqdm

# Pokud máte GPU s CUDA 11.8 (jen pokud je to kompatibilní):
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Nebo CPU-only (odkomentujte místo předchozího řádku, pokud nemáte kompatibilní CUDA):
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Po instalaci: Kernel -> Restart (pokud je třeba)

Note: you may need to restart the kernel to use updated packages.

^C
Note: you may need to restart the kernel to use updated packages.


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.5.1%2Bcu118-cp311-cp311-win_amd64.whl (4.0 MB)
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.5.0%2Bcu118-cp311-cp311-win_amd64.whl (4.0 MB)
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl (4.0 MB)
  Using cached https://download.pytorch.org/whl/cu118/torchau

  You can safely remove it manually.


In [4]:
from pathlib import Path

data_dir = Path(r"C:\Users\lucie\OneDrive\Desktop\MUNI\data\github\M7DataSP\textdata")
txt_paths = list(data_dir.glob("*.txt"))  # nebo **/*.txt pokud jsou v podadresářích

print(f"Nalezeno {len(txt_paths)} .txt souborů")
if len(txt_paths) == 0:
    raise FileNotFoundError("Žádné .txt soubory nenašly! Zkontroluj cestu a příponu.")

# spoj všechny texty
all_texts = []
for p in txt_paths:
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        all_texts.append(f.read())
text = "\n\n".join(all_texts)
print("Spojeno, délka:", len(text))


Nalezeno 1 .txt souborů
Spojeno, délka: 2656671


In [5]:
import tiktoken
import numpy as np

# použij GPT-2 encoding (kompatibilní s tiktoken + Karpathy style)
enc = tiktoken.get_encoding("gpt2")

# tokenizuj (pro velké korpusy může chvíli trvat)
tokens = enc.encode(text)
tokens = np.array(tokens, dtype=np.int32)
print("Počet tokenů:", tokens.shape[0])

# ulož tokeny pro rychlejší opětovné spuštění
np.save(data_dir / "tokens.npy", tokens)
print("Tokens saved to:", data_dir / "tokens.npy")

# bezpečný způsob získat slovník (vocab) -- použiju velikost z dat
vocab_size = int(tokens.max()) + 1
print("Vypočtené vocab_size z dat:", vocab_size)
# Pozn: pokud chceš přímo nastavit GPT-2 slovník, jeho velikost je 50257.


Počet tokenů: 1533710
Tokens saved to: C:\Users\lucie\OneDrive\Desktop\MUNI\data\github\M7DataSP\textdata\tokens.npy
Vypočtené vocab_size z dat: 50229


In [6]:
import numpy as np
import torch

tokens = np.load(data_dir / "tokens.npy")
n = len(tokens)
split = int(0.9 * n)
train_tokens = tokens[:split]
val_tokens   = tokens[split:]

block_size = 128  # můžeš zvýšit na 256/512 pokud máš paměť
def get_batch(arr, batch_size=16, block_size=128, device='cpu'):
    ix = np.random.randint(0, len(arr) - block_size, (batch_size,))
    x = np.stack([arr[i:i+block_size] for i in ix])
    y = np.stack([arr[i+1:i+1+block_size] for i in ix])
    x = torch.tensor(x, dtype=torch.long, device=device)
    y = torch.tensor(y, dtype=torch.long, device=device)
    return x, y

print("Train tokens:", len(train_tokens), "Val tokens:", len(val_tokens))


Train tokens: 1380339 Val tokens: 153371


In [10]:
import torch.nn as nn
import math

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embed, n_head, dropout, max_T=1024):
        super().__init__()
        assert n_embed % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embed // n_head
        self.qkv = nn.Linear(n_embed, 3 * n_embed)
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)
        # mask buffer (max_T × max_T)
        mask = torch.tril(torch.ones(max_T, max_T)).unsqueeze(0).unsqueeze(0)
        self.register_buffer("mask", mask)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv(x).view(B, T, 3, self.n_head, self.head_dim).permute(2,0,3,1,4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        att = (q @ k.transpose(-2,-1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)
        att = self.dropout(att)
        y = att @ v
        y = y.transpose(2,1).contiguous().view(B, T, C)
        y = self.proj(y)
        y = self.dropout(y)
        return y

class Block(nn.Module):
    def __init__(self, n_embed, n_head, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embed)
        self.attn = CausalSelfAttention(n_embed, n_head, dropout)
        self.ln2 = nn.LayerNorm(n_embed)
        self.mlp = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.GELU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class SimpleGPT(nn.Module):
    def __init__(self, vocab_size, n_embed=256, n_layer=6, n_head=8, block_size=128, dropout=0.1):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embed)
        self.pos_emb = nn.Embedding(block_size, n_embed)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([Block(n_embed, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size, bias=False)
        self.block_size = block_size
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)
        if isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        b, t = idx.size()
        assert t <= self.block_size
        tok = self.tok_emb(idx)
        pos = self.pos_emb(torch.arange(t, device=idx.device))[None,:,:]
        x = self.drop(tok + pos)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        if targets is None:
            return logits
        loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


In [11]:
import torch.optim as optim
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

# z tokenů získáme slovník
tokens = np.load(data_dir / "tokens.npy")
vocab_size = int(tokens.max()) + 1  # bezpečné
print("Vocab size from data:", vocab_size)

model = SimpleGPT(
    vocab_size = max(vocab_size, 50257),  # alespoň GPT-2 base; pokud chceš prostě 50257, změň
    n_embed = 256,    # zmenši pokud nemáš paměť: 128
    n_layer = 4,      # malý model: 2-6
    n_head = 8,
    block_size = block_size,
    dropout = 0.1
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.1)

# trénink (lehké nastavení pro rychlé experimenty)
epochs = 3
batch_size = 32

for epoch in range(epochs):
    model.train()
    # když máš velký dataset, iteruj přes pevný počet kroků
    iters_per_epoch = 200
    for it in range(iters_per_epoch):
        xb, yb = get_batch(train_tokens, batch_size=batch_size, block_size=block_size, device=device)
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if it % 20 == 0:
            print(f"Epoch {epoch} iter {it} loss {loss.item():.4f}")
    # validace
    model.eval()
    with torch.no_grad():
        xb, yb = get_batch(val_tokens, batch_size=64, block_size=block_size, device=device)
        _, val_loss = model(xb, yb)
        print(f"=== Epoch {epoch} VAL loss: {val_loss.item():.4f} ===")
    torch.save(model.state_dict(), data_dir / f"gpt_checkpoint_epoch{epoch}.pth")
    print("Saved checkpoint for epoch", epoch)


Device: cpu
Vocab size from data: 50229
Epoch 0 iter 0 loss 10.8846
Epoch 0 iter 20 loss 8.1002
Epoch 0 iter 40 loss 6.1550
Epoch 0 iter 60 loss 5.3563
Epoch 0 iter 80 loss 4.8688
Epoch 0 iter 100 loss 4.4320
Epoch 0 iter 120 loss 4.1812
Epoch 0 iter 140 loss 4.0603
Epoch 0 iter 160 loss 3.7803
Epoch 0 iter 180 loss 3.6673
=== Epoch 0 VAL loss: 3.9678 ===
Saved checkpoint for epoch 0
Epoch 1 iter 0 loss 3.5756
Epoch 1 iter 20 loss 3.6041
Epoch 1 iter 40 loss 3.3667
Epoch 1 iter 60 loss 3.3526
Epoch 1 iter 80 loss 3.4398
Epoch 1 iter 100 loss 3.3524
Epoch 1 iter 120 loss 3.4228
Epoch 1 iter 140 loss 3.1860
Epoch 1 iter 160 loss 3.1059
Epoch 1 iter 180 loss 3.0368
=== Epoch 1 VAL loss: 3.6257 ===
Saved checkpoint for epoch 1
Epoch 2 iter 0 loss 3.1819
Epoch 2 iter 20 loss 3.0608
Epoch 2 iter 40 loss 2.9322
Epoch 2 iter 60 loss 2.9671
Epoch 2 iter 80 loss 2.9538
Epoch 2 iter 100 loss 2.8934
Epoch 2 iter 120 loss 2.8168
Epoch 2 iter 140 loss 2.8312
Epoch 2 iter 160 loss 2.9420
Epoch 2 iter

In [13]:
import torch.nn.functional as F

@torch.no_grad()
def generate(model, idx, max_new_tokens=200, temperature=1.0, top_k=None):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -model.block_size:]
        logits = model(idx_cond)  # (B, T, V)
        logits = logits[:, -1, :] / (temperature if temperature>0 else 1.0)
        if top_k is not None:
            v, _ = torch.topk(logits, top_k)
            minv = v[:, -1].unsqueeze(1)
            logits = torch.where(logits < minv, torch.full_like(logits, -1e10), logits)
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, next_token), dim=1)
    return idx

# příklad — načti checkpoint (pokud potřebuješ)
# model.load_state_dict(torch.load(data_dir / "gpt_checkpoint_epoch2.pth", map_location=device))

prompt = "Začátek kapitoly:\n"
prompt_tokens = enc.encode(prompt)
x = torch.tensor([prompt_tokens], dtype=torch.long, device=device)
out = generate(model, x, max_new_tokens=300, temperature=0.8, top_k=40)
generated = enc.decode(out[0].cpu().tolist())
print("=== GENERATED ===\n")
print(generated)


=== GENERATED ===

Začátek kapitoly:
2
−2
(1) = π
− (1)2
−1)2 x−2 2
−4
−2
−2
−3 5 2 3 4 x
−1
U: Výborne. Zlomky vzorca tak výraz tgx+2 −x+y.
Ž: Výborne. Výborne, že priamky, tým roviny. Výborne. Keďže to budú už vieme
Ž: Zhrť na polomer, takto:
1
3
b ,
7 2
4 1
y
y = −3
0
1
0
−0
a
a·| =
1
F
1
1 1
−1
y = −1
−4 −1 0 x 3 x
−n,1 −1 0 −2 −1 2 2 3 5 x 1 1 0,4.
−2
−1 1 3 3 6
−1
−1
−1
−1
−2
−1
−0
−1
−0
−1
− = = 2 1
−1 1
n
−1
−2
−→
2
U: Správne rovn
