## In this Colab we:
1. import Shakespeare's works (docs/shakespear.txt)
2. prepare the data:
    - vanilla tokenizer char -> index | ex. a -> 1
    - prepare batch loader with block_size = 8 | ex. [15, 32, 7 .. 19] -> [13]
3. create Neuro-based BigramLM
    - forward ~ nn.Embedding(idx)
    - generate next token
4. train BLM

   

---------------------
### Result 1 : sampling from Bi-Gram ~= sampling from Neuro approach.

Neuro approach : train only character embeddings matrix W : output = SoftMax(OHE @ W)

**Sampling from the two forward passes is the same! (the probabilities are literally the same)**

Just as we manually count bigrams, we update the character embedding matrix so that its WEIGHTS account for the statistical co-occurrence of adjacent characters.

----------------------

### Result 2 : bounds on the negative log-likelihood (loss)
Upper bound for the negative log-likelihood (uniform random guess)
- 3.2958

Lower bounds for the negative log-likelihood based on the k previous chars

- k = 1 : 2.454 
- k = 2 : 2.092 
- k = 3 : 1.963
----------------------


# 1. Import Shakespeare text

In [1]:
import numpy as np
import torch

# Read the whole corpus into memory as one string.
with open('../docs/shakespear.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Preview the first 173 characters, then report the corpus length in characters.
print(text[:173], f'\n\n{len(text)=}')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved. 

len(text)=1115390


# 2. Pre-process Shakespeare
## 2.1 Vanilla Tokenizer

In [2]:
# Vocabulary: the sorted distinct characters of the corpus and how often each occurs.
chars, counts = np.unique(list(text), return_counts=True)
counts = dict(zip(chars, counts))
vocab_size = len(chars)

# Lookup tables in both directions: character -> index and index -> character.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into the list of its character indices."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a list of character indices back into a string."""
    return ''.join(itos[i] for i in l)


# Round-trip check on a short sample.
print('Text: hii there')
print('Encoding:', encode('hii there'))
print('Decoding:', decode(encode('hii there')))

# Encode the whole corpus once; the first 90% is used for training, the rest for validation.
data = torch.tensor(encode(text), dtype=torch.long)

split_ration = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:split_ration], data[split_ration:]

block_size = 8
print(f'\n{block_size=}\t train block:{train_data[:block_size+1]}')


# A window of block_size + 1 tokens holds block_size (context -> next token) pairs.
for t in range(block_size):
    context = train_data[:t+1]
    target = train_data[t+1]
    print(f"{context} -> {target}")


Text: hii there
Encoding: [44, 45, 45, 1, 56, 44, 41, 54, 41]
Decoding: hii there

block_size=8	 train block:tensor([16, 45, 54, 55, 56,  1, 13, 45, 56])
tensor([16]) -> 45
tensor([16, 45]) -> 54
tensor([16, 45, 54]) -> 55
tensor([16, 45, 54, 55]) -> 56
tensor([16, 45, 54, 55, 56]) -> 1
tensor([16, 45, 54, 55, 56,  1]) -> 13
tensor([16, 45, 54, 55, 56,  1, 13]) -> 45
tensor([16, 45, 54, 55, 56,  1, 13, 45]) -> 56


## 2.2 Batch Loader

In [3]:
batch_size = 4
def get_batch(data, batch_size, block_size=None):
    """Sample a random batch of (input, target) windows from a token tensor.

    Each input row is a contiguous slice of ``block_size`` tokens taken along
    the first dimension of ``data``; the matching target row is the same slice
    shifted one position to the right, so ``y[b, t]`` is the token that follows
    ``x[b, t]`` in ``data``.

    Args:
        data: tensor of token ids, sliced along its first dimension.
        batch_size: number of windows to sample.
        block_size: window length. When omitted, the notebook-level
            ``block_size`` variable is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        Tuple ``(x, y)`` of stacked windows; for 1-D ``data`` both have shape
        ``(batch_size, block_size)``.

    Raises:
        ValueError: if ``data`` has fewer than ``block_size + 1`` tokens, i.e.
            not even one input window plus its shifted target fits.
    """
    if block_size is None:
        # Fall back to the module/notebook global of the same name.
        block_size = globals()['block_size']

    # Valid start offsets are 0 .. len(data) - block_size - 1, so that the
    # shifted target slice still ends inside `data`.
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f'data has {len(data)} tokens but at least {block_size + 1} '
            f'are required for block_size={block_size}'
        )

    ix = torch.randint(n_starts, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# Draw one small batch and show the raw input / target tensors.
xb, yb = get_batch(train_data, batch_size)
print(f'Batch of {batch_size}: \n\t{xb.shape=}\n\t{xb=}\n\t{yb.shape=}\n\t{yb=}')

# Every row of the batch packs block_size (context -> target) training examples:
# the context grows one token at a time and the target is the token that follows.
# The whole batch therefore holds batch_size * block_size examples; only the
# first row is unrolled below.
print(f'Each sequence in the batch will produce {block_size} examples:')
for t in range(block_size):
    context = xb[0, :t+1]
    target = yb[0, t]
    print(f'\t{context}->{target}')

Batch of 4: 
	xb.shape=torch.Size([4, 8])
	xb=tensor([[ 1, 37, 50, 40,  1, 49, 51, 54],
        [37, 39, 47,  1, 37, 42, 56, 41],
        [ 1, 45, 50,  1, 44, 41, 48, 48],
        [44,  1, 38, 51, 61,  4,  1, 56]])
	yb.shape=torch.Size([4, 8])
	yb=tensor([[37, 50, 40,  1, 49, 51, 54, 41],
        [39, 47,  1, 37, 42, 56, 41, 54],
        [45, 50,  1, 44, 41, 48, 48,  9],
        [ 1, 38, 51, 61,  4,  1, 56, 44]])
Each batch will produce 8 examples:
	tensor([1])->37
	tensor([ 1, 37])->50
	tensor([ 1, 37, 50])->40
	tensor([ 1, 37, 50, 40])->1
	tensor([ 1, 37, 50, 40,  1])->49
	tensor([ 1, 37, 50, 40,  1, 49])->51
	tensor([ 1, 37, 50, 40,  1, 49, 51])->54
	tensor([ 1, 37, 50, 40,  1, 49, 51, 54])->41


# 3. Create Neuro-BigramLM

In [6]:
class NeuroBigramLM(torch.nn.Module):
    """Bigram language model backed by a single embedding table.

    The only parameters are a ``(vocab_size, vocab_size)`` embedding matrix:
    looking up token ``i`` returns row ``i``, which is used directly as the
    logits over the next token.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size x vocab_size`` logit table."""
        super().__init__()
        self.token_emb_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            (B, T, C) and ``loss`` is ``None``. With ``targets`` the logits are
            returned flattened to (B*T, C) -- the layout cross_entropy expects --
            together with the scalar mean loss.
        """
        logits = self.token_emb_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # Flatten batch and time for torch cross_entropy. `reshape` (rather than
        # `view`) also accepts non-contiguous inputs such as transposed targets.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)    # (B*T, C)
        targets = targets.reshape(B * T)     # (B*T)
        loss = torch.nn.functional.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_size):
        """Autoregressively sample ``max_new_size`` tokens after the context ``idx``.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_size: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_size): the original context
            followed by the sampled tokens.
        """
        for _ in range(max_new_size):
            # The logits at a position depend only on the token at that
            # position, so embedding the last token alone gives the same
            # distribution as embedding the whole context and slicing.
            step_logits, _loss = self(idx[:, -1:])                    # (B, 1, C)
            step_logits = step_logits[:, -1, :]                       # (B, C)
            probs = torch.nn.functional.softmax(step_logits, dim=-1)  # softmax over C
            idx_next = torch.multinomial(probs, num_samples=1)        # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)                   # (B, T+1)
        return idx
            
    
# Instantiate the model and run one forward pass on the batch drawn earlier.
blm = NeuroBigramLM(vocab_size)
out, loss = blm(xb, yb)
print(f'{out.shape=}\nCE_loss={loss}')

# generate: continue a single validation context by 100 sampled characters
xval, yval = get_batch(val_data, batch_size=1)
new = blm.generate(xval, max_new_size=100)[0].tolist()
print(decode(new))

out.shape=torch.Size([32, 63])
CE_loss=4.35011100769043
yman, I AwdzD,?zGbeBBBBKonkcWrctIq R-aqniJdHkr LkKU?scpc!JGs,yLi?md-qo,Lplzl,wsdTe
QFcz,KIj-nkpMDTK qnaY-uUs


# 4. Train

In [17]:
optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)

batch_size = 32
# The loop counter is named `step` rather than `_`: it is read below, and in
# IPython assigning to `_` shadows the "last output" variable.
for step in range(10000):
    xb, yb = get_batch(train_data, batch_size)

    logits, loss = blm(idx=xb, targets=yb)

    # Clear gradients from the previous step, backpropagate, then update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss of the current mini-batch every 1000 steps.
    if step % 1000 == 0:
        print(loss.item())

2.436866283416748
2.428391456604004
2.4383668899536133
2.423478603363037
2.440695285797119
2.4636313915252686
2.441060781478882
2.517704963684082
2.4908270835876465
2.3524813652038574


In [18]:
# generate
xval, yval = get_batch(val_data, batch_size=1)
new = blm.generate(xval, max_new_size=100)[0].tolist()
print(f'{decode(new)}')

ir, now poual, athit gour:
Tore; m y manththol thend ar mm, ssin sure hair m parou hom!
I ayomer cthine SAs!


In [3]:
# Lower-triangular part of a 5x5 all-zeros matrix.
# NOTE(review): tril of zeros is still all zeros, so the triangular structure is
# not visible here -- torch.ones(5, 5) may have been intended; confirm.
torch.tril(torch.zeros(5, 5))

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])