In [1]:
import os, sys
import ipdb
from tqdm import tqdm
from datetime import datetime
import requests, zipfile, io

import torch
import torch.nn as nn
from torch.nn import functional as F

# tokenizer
import sentencepiece as spm

# Allow TF32 in CUDA matmuls and cuDNN ops; these flags improve performance on the Ampere architecture.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Release unused cached CUDA memory before the run starts.
torch.cuda.empty_cache()

In [2]:
# files_url = "https://ideami.com/llm_train"
# print("Downloading dataset...")
# response = requests.get(files_url)
# zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")

In [3]:
# architecture parameters
batch_size = 8  # sequences sampled per call to get_batch
context = 512  # tokens per sequence; also the size of the position-embedding table
embed_size = 384  # width of token and position embeddings
n_layers = 7  # number of Block layers (the block stack in GPT is currently commented out)
# NOTE(review): 384 is not divisible by 7, so embed_size // n_heads truncates to 54
# and the concatenated heads span 7 * 54 = 378 features, not 384.
n_heads = 7
BIAS = True  # passed as bias= to every nn.Linear in the model

# hyperparameters
# lr, weight_decay and grad_clip are not referenced in the visible cells;
# presumably they are consumed by the optimizer / training loop further down.
lr = 3e-4
dropout = 0.05  # probability used by the nn.Dropout layers
weight_decay = 0.01
grad_clip = 1.0

# training parameters
# None of these are referenced in the visible cells except dtype.
train_iters = 100000
eval_interval = 50
eval_iters = 10
# NOTE(review): this name shadows the builtin compile(); presumably it toggles torch.compile — confirm.
compile = True
checkpoint_dir = 'models/'
checkpoint_fn = 'latest.pt'
checkpoint_load_fn = 'latest.pt'
dtype = torch.bfloat16  # the model is cast to this dtype after construction

# MODE
inference = False

# DEVICE
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)



cuda


In [4]:
# logging
# Weights & Biases run configuration; the run name carries a timestamp so every run is unique.
wandb_log = True
wandb_project = 'llm1'
wandb_run_name = 'llm1-' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# wandb is imported lazily so the notebook still runs without it when wandb_log is False.
if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)

[34m[1mwandb[0m: Currently logged in as: [33mmaciejej[0m ([33mmaciejej-uniwersytet-dzki[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Read the whole training corpus into memory as one UTF-8 string.
with open("wiki.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Peek at a 300-character slice to sanity-check the contents.
print(text[10000:10300])

 that was used to represent a team in an old TV show, The A-Team. A capital a is written "A". Use a capital A at the start of a sentence if writing.

A is also a musical note, sometimes referred to as "La".

The letter 'A' was in the Phoenician alphabet's aleph. This symbol came from a simple pictur


In [6]:
# tokenizer
sp = spm.SentencePieceProcessor(model_file="wiki_tokenizer.model")

vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")

Tokenizer vocab_size: 4096


In [7]:
def encode(s):
    """Tokenize the string ``s`` with the SentencePiece processor ``sp`` and return its token ids."""
    return sp.Encode(s)


def decode(l):
    """Turn the token ids ``l`` back into text with the SentencePiece processor ``sp``."""
    return sp.Decode(l)


# Round-trip check on a short Polish sentence ("the sky is blue").
zdanie = "niebo jest niebieskie"
print(encode(zdanie))
print(decode(encode(zdanie)))

[316, 428, 4052, 4037, 599, 395, 316, 428, 4052, 412, 4055, 428]
niebo jest niebieskie


In [8]:
# Tokenizing the full corpus is slow, so the encoded tensor is cached on disk
# and reused on later runs.
# NOTE(review): the cache is not invalidated when wiki.txt or the tokenizer changes;
# delete encoded_data.pt manually in that case.
if os.path.exists("encoded_data.pt"):
    print("Loading encoding")
    data = torch.load("encoded_data.pt")
else:
    # First run: encode the whole text into a 1-D tensor of int64 token ids and cache it.
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, "encoded_data.pt")


Loading encoding


In [9]:
# Contiguous 90/10 split: the first 90% of the tokens are used for training,
# the remaining tail for validation.
data_size = len(data)
splt = int(0.9 * data_size)
train_data = data[:splt]
val_data = data[splt:]

print(f"Total data: {data_size / 1e6:.2f} Million | Training: {len(train_data) / 1e6:.2f} Million | Validation {len(val_data) / 1e6:.2f} Million ")

Total data: 59.21 Million | Training: 53.29 Million | Validation 5.92 Million 


In [10]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: "train" selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors shaped (batch_size, context) on ``device``.
        ``y`` is ``x`` shifted one token to the right, i.e. the next-token targets.
    """
    source = val_data if split != "train" else train_data
    # The upper bound leaves room for the one-token shift of the targets.
    starts = torch.randint(len(source) - context, (batch_size,))
    inputs = torch.stack([source[s: s + context] for s in starts])
    labels = torch.stack([source[s + 1: s + context + 1] for s in starts])
    return inputs.to(device), labels.to(device)

# Smoke test: draw one training batch and confirm that y is x shifted by one token.
x, y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([4031, 4062, 4059, 4056, 4085, 4053,  666, 1906, 2132,  959],
       device='cuda:0')
tensor([4062, 4059, 4056, 4085, 4053,  666, 1906, 2132,  959,  339],
       device='cuda:0')


In [11]:
class GPT(nn.Module):
    """Language-model skeleton: token + learned position embeddings, a final
    LayerNorm and a linear projection to vocabulary logits.

    The stack of transformer blocks is still commented out, so each position is
    currently predicted from its own token and position only.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)  # vocab_size x embed_size
        self.positions = nn.Embedding(context, embed_size)  # context x embed_size
        # self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)])
        self.layer_normalisation = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS)  # embed_size -> vocab_size
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input: integer token ids of shape (BS, SL), with SL <= context.
            targets: optional integer token ids of shape (BS, SL).

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (BS, SL, vocab_size) and loss is None. With targets, logits is
            flattened to (BS * SL, vocab_size) and loss is the cross-entropy.
        """
        loss = None
        BS, SL = input.shape
        emb = self.embeddings(input)  # BS x SL x embed_size
        # Build position ids on the input's own device so the module does not
        # depend on the notebook-level `device` variable.
        pos = self.positions(torch.arange(SL, device=input.device))  # SL x embed_size
        x = emb + pos  # BS x SL x embed_size (pos broadcasts over the batch)
        # x = self.blocks(x) # BS x SL x embed_size
        x = self.layer_normalisation(x)  # BS x SL x embed_size
        logits = self.final_linear(x)  # BS x SL x vocab_size

        if targets is not None:
            BS, SL, vocabsize = logits.shape
            # cross_entropy expects (N, C) logits and (N,) class indices.
            logits = logits.view(BS * SL, vocabsize)
            targets = targets.view(BS * SL)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, input, max=500):
        """Sample ``max`` new tokens autoregressively and append them to ``input``.

        Args:
            input: integer token ids of shape (B, T) used as the prompt.
            max: number of tokens to sample.

        Returns:
            Tensor of shape (B, T + max): the full prompt followed by the
            sampled tokens.
        """
        for _ in range(max):
            # Only the conditioning window is cropped to the last `context`
            # tokens; `input` itself keeps growing so the returned sequence is
            # never truncated.
            window = input[:, -context:]
            logits, _ = self(window)  # (B, window length, vocab_size)
            logits = logits[:, -1, :]  # logits for the last position: (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
            input = torch.cat((input, next_token), dim=1)
        return input


In [None]:
class Block(nn.Module):
    """Pre-norm transformer block: multi-head attention and a feed-forward
    network, each applied to a LayerNorm of the input and added back as a
    residual.
    """

    def __init__(self, n_heads):
        super().__init__()
        # Integer division: any remainder of embed_size / n_heads is dropped.
        head_size = embed_size // n_heads
        self.multi_attention = Multihead(n_heads, head_size)
        self.feed_forward = ForwardLayer(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return a tensor of the same shape."""
        # Attention must receive the normalised activations, not the LayerNorm module itself.
        x = x + self.multi_attention(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x

In [None]:
class ForwardLayer(nn.Module):
    """Feed-forward sub-layer: expand to 6x the embedding width, apply GELU,
    project back to the embedding width, then apply dropout.
    """

    def __init__(self, embed_size):
        super().__init__()
        hidden_size = 6 * embed_size
        # Same layer order as before, so the Sequential indices (and therefore
        # the state_dict keys) are unchanged.
        layers = [
            nn.Linear(embed_size, hidden_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(hidden_size, embed_size, bias=BIAS),
            nn.Dropout(dropout),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward network; the last dimension is preserved."""
        return self.network(x)

In [None]:
class Multihead(nn.Module):
    """Runs ``n_heads`` attention heads in parallel, concatenates their outputs
    along the feature axis and projects the result back to ``embed_size``.
    """

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        # The input width is head_size * n_heads, which may be smaller than
        # embed_size when embed_size is not divisible by n_heads.
        self.combine = nn.Linear(head_size * n_heads, embed_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the combined head outputs, projected to ``embed_size`` on the last axis."""
        # Concatenate on the last (feature) axis so the width is
        # head_size * n_heads, matching `combine`. Concatenating on dim=1 would
        # stack the heads along the sequence axis instead.
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.combine(x)  # (..., embed_size)
        x = self.dropout(x)
        return x


In [None]:
class Head(nn.Module):
    """Single causal self-attention head.

    Projects the input to queries, keys and values of width ``head_size`` and
    applies scaled dot-product attention in which each position may only attend
    to itself and to earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
        self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
        self.values = nn.Linear(embed_size, head_size, bias=BIAS)

    def forward(self, x):
        """Attend over the sequence axis of ``x``.

        Args:
            x: activations of shape (BS, SL, embed_size).

        Returns:
            Tensor of shape (BS, SL, head_size).
        """
        q = self.queries(x)  # (BS, SL, head_size)
        k = self.keys(x)  # (BS, SL, head_size)
        v = self.values(x)  # (BS, SL, head_size)
        # The fused kernel applies the 1/sqrt(head_size) scaling and the causal
        # mask itself, so no mask buffer is registered and the state_dict is
        # unchanged. Attention dropout is active only in training mode.
        return F.scaled_dot_product_attention(
            q,
            k,
            v,
            dropout_p=dropout if self.training else 0.0,
            is_causal=True,
        )


In [14]:
# Same per-head width that Block computes internally. Integer division truncates,
# so with embed_size=384 and n_heads=7 this is 54 and the heads span 378 features.
head_size = embed_size // n_heads
print(f"embed: {embed_size} n_heads: {n_heads} head_size: {head_size}")

embed: 384 n_heads: 7 head_size: 54


In [12]:
# Draw a fresh training batch for the forward-pass smoke test.
x, y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

# Build the model, cast its floating-point parameters to `dtype`, then move it to `device`.
model = GPT()
model = model.to(dtype)
model = model.to(device)

# Sanity check: an untrained model with near-uniform predictions should give a
# loss close to ln(vocab_size).
logits, loss = model(x, y)
print(loss.item())

torch.Size([8, 512]) torch.Size([8, 512])
tensor([4036, 4053,  347,  305,  999, 4053, 3272, 4043, 4051,   13],
       device='cuda:0')
tensor([4053,  347,  305,  999, 4053, 3272, 4043, 4051,   13,   13],
       device='cuda:0')
8.375


In [13]:
@torch.no_grad()
def generate_sample(input):
    """Print a 64-token continuation of the text prompt ``input`` sampled from ``model``.

    The model is switched to eval mode while sampling so that dropout is
    disabled, and its previous train/eval mode is restored afterwards.

    Args:
        input: prompt string; it is tokenized with ``encode``.
    """
    was_training = model.training
    model.eval()
    try:
        prompt_ids = torch.tensor(encode(input), dtype=torch.long, device=device)
        prompt_ids = prompt_ids[None, :]  # add the batch dimension: (1, T)
        generated_ids = model.generate(prompt_ids, max=64)[0].tolist()
    finally:
        # Restore the caller's mode even if generation fails.
        model.train(was_training)
    result = decode(generated_ids)
    print(f"{result}")

# Sample from the untrained model; the output is expected to be incoherent at this stage.
generate_sample("Once upon a time")


Once upon a time recognchester project Jes Julyrightso Fox^ moreoung Polish insp prot internationalheast Court ro Re meellaapt Rockrel English main Lee Flor producer She theseade video night Char esc Fore dayouthmanig Bro Dou Ind Met South bet enough evO Aï¿½eld bas Jamesootball claim lawyer Korean resth Polight
