In [1]:
# Read the whole corpus into memory as one string. The encoding is passed
# explicitly so decoding the Japanese text does not depend on the platform's
# default locale encoding (assumes the corpus file is UTF-8 -- TODO confirm).
with open('../data/data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# total number of characters in the corpus
len(text)

230213515

In [5]:
# peek at the first 100 characters of the corpus
text[:100]

'深いおどろきにうたれて、\n名高いウェストミンスターに\n真鍮や石の記念碑となって\nすべての王侯貴族が集まっているのをみれば、\n今はさげすみも、ほこりも、見栄もない。\n善にかえった貴人の姿、\n華美と俗世の'

In [6]:
# vocabulary: every distinct character in the corpus, as a sorted list
chars = sorted(set(text))

In [7]:
# vocabulary size = number of distinct characters found in the corpus
vocab_size = len(chars)

In [8]:
# display the vocabulary size
vocab_size

9505

In [9]:
# Character <-> integer id lookup tables built from the sorted vocabulary:
# a character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to a list of integer token ids, one id per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)

In [10]:
# encode a sample string into its list of token ids
encode("深いおどろきにうたれて")

[4339, 486, 492, 523, 559, 495, 525, 488, 513, 558, 520]

In [11]:
# round trip: decoding the ids from the previous cell should return the original string
decode([4339, 486, 492, 523, 559, 495, 525, 488, 513, 558, 520])

'深いおどろきにうたれて'

In [12]:
import torch

In [13]:
# Encode the entire corpus into a 1-D int64 tensor of token ids.
# NOTE(review): encode() first builds a Python list with one int per character,
# which is memory-hungry for a corpus of this size.
data = torch.tensor(encode(text),dtype=torch.long)

In [14]:
# sanity check: shape and dtype of the encoded corpus
print(data.shape,data.dtype)

torch.Size([230213515]) torch.int64


In [15]:
print(data[:100]) # first 100 token ids

tensor([4339,  486,  492,  523,  559,  495,  525,  488,  513,  558,  520,  455,
           1, 1546, 8793,  486,  579,  580,  598,  613,  636,  656,  598,  604,
         665,  525,    1, 5283, 8187,  550, 5368,  528, 7291, 2768, 5411,  522,
         524,  517,  520,    1,  507,  539,  520,  528, 4889, 1010, 7528, 3432,
         494, 8409,  544,  517,  520,  486,  557,  528,  564,  545,  558,  530,
         455,    1,  905,  529,  503,  500,  507,  545,  548,  455,  541,  501,
         556,  548,  455, 7242, 3700,  548,  524,  486,  456,    1, 1698,  525,
         493,  490,  517,  513, 7528,  897,  528, 2133,  455,    1, 6642, 6158,
         522, 1024,  819,  528])


In [17]:
# training and validation
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [18]:
# maximum length of chunk we give to transformer
block_size = 8
train_data[:block_size+1]

tensor([4339,  486,  492,  523,  559,  495,  525,  488,  513])

In [19]:
# Every prefix of a chunk is a training example: the first t+1 tokens of x
# form the context and y[t] (the token right after them) is the target.
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} target is {target}")

when input is tensor([4339]) target is 486
when input is tensor([4339,  486]) target is 492
when input is tensor([4339,  486,  492]) target is 523
when input is tensor([4339,  486,  492,  523]) target is 559
when input is tensor([4339,  486,  492,  523,  559]) target is 495
when input is tensor([4339,  486,  492,  523,  559,  495]) target is 525
when input is tensor([4339,  486,  492,  523,  559,  495,  525]) target is 488
when input is tensor([4339,  486,  492,  523,  559,  495,  525,  488]) target is 513


In [20]:
# seed the RNG so the sampled batch below is reproducible, then set the
# module-level sizes that get_batch reads
torch.manual_seed(42)
batch_size = 4 # how many independent sequences to process in parallel
block_size = 8 # maximum context length for predictions

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    `split` selects the source tensor: "train" uses `train_data`, any other
    value uses `val_data`. Reads the module-level `batch_size` and
    `block_size`.

    Returns a pair (x, y), each of shape (batch_size, block_size), where y is
    x shifted one position to the right in the source tensor.
    """
    source = train_data if split == "train" else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# draw one training batch and show the inputs and targets side by side
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('------')
# spell out every (context, target) example packed into the batch
for b in range(batch_size):  # batch dim
    for t in range(block_size):  # block dim
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} target is {target}")

inputs:
torch.Size([4, 8])
tensor([[ 521,  525,  455,  484,  557,  528,  521,  484],
        [2368,  548, 6208,  548, 2049,  509,  513,  554],
        [ 511,  518,  522, 2722,  564,  518,  499,  520],
        [ 522,  518,  513, 3579, 7898, 8219,  521, 1550]])
targets:
torch.Size([4, 8])
tensor([[ 525,  455,  484,  557,  528,  521,  484,  517],
        [ 548, 6208,  548, 2049,  509,  513,  554,  488],
        [ 518,  522, 2722,  564,  518,  499,  520, 7105],
        [ 518,  513, 3579, 7898, 8219,  521, 1550,  532]])
------
when input is [521] target is 525
when input is [521, 525] target is 455
when input is [521, 525, 455] target is 484
when input is [521, 525, 455, 484] target is 557
when input is [521, 525, 455, 484, 557] target is 528
when input is [521, 525, 455, 484, 557, 528] target is 521
when input is [521, 525, 455, 484, 557, 528, 521] target is 484
when input is [521, 525, 455, 484, 557, 528, 521, 484] target is 517
when input is [2368] target is 548
when input is [2368, 548]

In [32]:
import torch.nn as nn
from torch.nn import functional as F
# reseed the RNG so the random operations that follow are reproducible
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one (vocab_size, vocab_size) embedding table; the row
    looked up for a token holds the unnormalized scores of the token that
    follows it.
    """

    def __init__(self,vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) tensor of integer token ids.
            targets: optional (B,T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B,T,C) and
            loss is None. With targets, logits is flattened to (B*T,C) and
            loss is the cross-entropy between those logits and the flattened
            targets.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # cross_entropy expects (N,C) logits and (N,) class indices
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        loss = F.cross_entropy(logits,targets.view(B*T))
        return logits,loss

    @torch.no_grad()  # sampling needs no gradients; skip building autograd graphs
    def generate(self,idx,max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B,T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions for the current context
            logits,_ = self(idx)
            # focus only on the last time step
            logits = logits[:,-1,:] # becomes (B,C)
            # apply softmax to get probabilities
            probs = F.softmax(logits,dim=-1) # (B,C)
            # sample from the distribution
            idx_next = torch.multinomial(probs,num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx,idx_next),dim=1) # (B,T+1)
        return idx



In [33]:
# instantiate the model and run one forward pass on the sample batch;
# with targets supplied, the returned logits are flattened to (B*T, vocab_size)
m = BigramLanguageModel(vocab_size)
out,loss = m(xb,yb)
print(out.shape,loss)

torch.Size([32, 9505]) tensor(9.5254, grad_fn=<NllLossBackward0>)


In [35]:
idx = torch.zeros((1,1),dtype=torch.long) # start from a single token with id 0
# the sample is garbage because the model has not been trained yet
print(decode(m.generate(idx,max_new_tokens=10)[0].tolist()))

	萎酒塢𠵅J俎迓彙厚瞽


In [36]:
# optimizer
# AdamW optimizer over all model parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [43]:
# Training loop. Rebinding batch_size changes what get_batch samples, because
# get_batch reads the module-level batch_size.
batch_size = 32
for steps in range(1000):
    # sample a batch of training data
    xb,yb = get_batch('train')
    # forward pass: compute the loss on this batch
    logits,loss=m(xb,yb)
    # clear the previous step's gradients before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # report the loss every 100 steps
    if steps %100 == 0:
        print(loss.item())

9.63690185546875


KeyboardInterrupt: 

In [45]:
idx = torch.zeros((1,1),dtype=torch.long) # start from a single token with id 0
# still garbage: a bigram model only conditions on the previous token
print(decode(m.generate(idx,max_new_tokens=10)[0].tolist()))

	淳詳韮㗅腔禪綷鱣v﹆


# math trick in self attention

In [46]:
# toy input for the averaging examples: B sequences, T positions, C channels
torch.manual_seed(42)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [54]:
# A row-normalized lower-triangular matrix turns a matmul into a running
# average: row i of `c` is the mean of rows 0..i of `b`.
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
c = torch.matmul(a, b)

In [55]:
# display the product a @ b computed in the previous cell
c

tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])

In [57]:
# row-normalized lower-triangular weights: row t puts equal weight on positions 0..t
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1,keepdim=True)
xbow2 = wei @ x # (T,T) is broadcast over the batch: (B,T,T) @ (B,T,C) --> (B,T,C)

In [69]:
# self attention
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# head
head_size = 16
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear(C,head_size,bias=False)
k = key(x) # (B,T,16)
q = query(x) # (B,T,16)
# affinities
wei = q @ k.transpose(-2,-1) # transpose last two dims # (B,T,16) @ (B,16,T) -> (B,T,T)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0,float('-inf'))
wei = F.softmax(wei,dim=-1)

v = value(x)
out = wei @ v
out.shape

TypeError: unsupported operand type(s) for @: 'Tensor' and 'Linear'

In [68]:
# attention weights of the first batch element: row t holds the weights over positions 0..t
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)