In [30]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Get Data and Preprocess 

In [2]:
# download data:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-01-03 19:09:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-01-03 19:09:29 (71.3 KB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# len of data i.e. num of chars
len(text)

1115394

In [5]:
# first 1k chars
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [8]:
# let's get vocab
chars = sorted(list(set(text)))
vocab_len = len(chars)
print(''.join(chars))
print(vocab_len)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [9]:
# let's tokenize text at the char level
# build mappings
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for ch,i in stoi.items()}
print(stoi, itos)

encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode('hellow world!&!'))
print(decode(encode('hellow world!&!')))

# there are many tokenization schemes eg google uses SentencePiece (sub-word tokenizer), openAI uses tiktoken

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64} {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

In [10]:
# so now we can tokenize the input corpus
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
text[:1000]

torch.Size([1115394]) torch.int64


"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [11]:
# let's split data in train/test/val
n = int(.9*len(data))
train_data = data[:n] # 90%
val_data = data[n:] # 10%

In [12]:
ctx_len = 8
train_data[:ctx_len+1] # a first example of input data
# here we have that 47 comes after 18, 56 comes after 18 and 47, etc

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [13]:
x = train_data[:ctx_len]
y = train_data[1:ctx_len+1]

In [14]:
x, y

(tensor([18, 47, 56, 57, 58,  1, 15, 47]),
 tensor([47, 56, 57, 58,  1, 15, 47, 58]))

In [17]:
for t in range(ctx_len):
    ctx = x[:t+1]
    target = y[t]
    print(f"Sample {t}, Context: {ctx}, target: {target}") # so given a single chunk of the train data within a contex block we have 8 samples
    # it is important to train with all data with context between 1 and ctx_size cuz transformer must be able to adapt to any input size
    # thus we wrap up all these samples in a single batch

Sample 0 Context: tensor([18]), target: 47
Sample 1 Context: tensor([18, 47]), target: 56
Sample 2 Context: tensor([18, 47, 56]), target: 57
Sample 3 Context: tensor([18, 47, 56, 57]), target: 58
Sample 4 Context: tensor([18, 47, 56, 57, 58]), target: 1
Sample 5 Context: tensor([18, 47, 56, 57, 58,  1]), target: 15
Sample 6 Context: tensor([18, 47, 56, 57, 58,  1, 15]), target: 47
Sample 7 Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]), target: 58


In [27]:
data = train_data
ix = torch.randint(len(data) - ctx_len, (4,)) # up to last char - ctx block s.t. have a complete block even at the end of dataset
x = torch.stack([data[i:i+ctx_len]  for i in ix])
y = torch.stack([data[i+1:i+ctx_len+1]  for i in ix])
x, y 

# so we here we have 32 samples (bs * ctx_len) cuz for each (x[i, 0:j] for j from 0 to ctx_len) we have a yij (look above)

(tensor([[39, 57,  1, 44, 53, 52, 42,  1],
         [56, 50,  1, 53, 44,  1, 35, 47],
         [61, 47, 44, 43, 11,  0, 21,  5],
         [43,  1, 46, 53, 52, 43, 57, 58]]),
 tensor([[57,  1, 44, 53, 52, 42,  1, 44],
         [50,  1, 53, 44,  1, 35, 47, 50],
         [47, 44, 43, 11,  0, 21,  5, 50],
         [ 1, 46, 53, 52, 43, 57, 58,  1]]))

In [28]:
torch.manual_seed(1337)
bs = 4 
ctx_len = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - ctx_len, (bs,)) # up to last char - ctx block s.t. have a complete block even at the end of dataset
    x = torch.stack([data[i:i+ctx_len]  for i in ix])
    y = torch.stack([data[i+1:i+ctx_len+1]  for i in ix])
    return x, y

# Baseline: BigramLanguageModel

In [49]:

class BigramLanguageModel(nn.Module):
    '''
    The model learns each entry of a (vocab_len, vocab_len) table 
    where each entry is the probability dist of the following char given an input char at row
    '''
    def __init__(self, vocab_len):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table (bigram model lecture 2)    
        self.token_embedding_table = nn.Embedding(vocab_len, vocab_len)

    def forward(self, idx, targets=None):

        # idx and targets are int tensors of shape (bs, ctx_len)
        logits = self.token_embedding_table(idx)

        if targets == None: return logits, None        
            
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)                
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is a (bs, ctx_len) array of int idxs that defines the context
        for _ in range(max_new_tokens):
            # predict i.e. get unnormalized probs
            logits, _ = self(idx)
            # get last time step
            logits = logits[:, -1, :] # (bs, out_classes)
            # normalize probs
            probs = F.softmax(logits, dim=-1)
            # sample
            idx_next = torch.multinomial(probs, num_samples=1) # (bs,1)
            idx = torch.cat((idx, idx_next), dim=1) # (bs, T+1)
        return idx


In [50]:
xb, yb = get_batch('train')
print(xb.shape, yb.shape)
xb, yb

torch.Size([4, 8]) torch.Size([4, 8])


(tensor([[43,  1, 17, 42, 61, 39, 56, 42],
         [39, 58, 58, 43, 51, 54, 58,  0],
         [42,  1, 43, 52, 58, 56, 43, 39],
         [47, 58, 63, 11,  0, 35, 46, 47]]),
 tensor([[ 1, 17, 42, 61, 39, 56, 42,  1],
         [58, 58, 43, 51, 54, 58,  0, 31],
         [ 1, 43, 52, 58, 56, 43, 39, 58],
         [58, 63, 11,  0, 35, 46, 47, 50]]))

In [51]:
m = BigramLanguageModel(vocab_len=vocab_len)
xb, yb = get_batch('train')
logits, loss = m(xb, yb)
logits.shape # for each chunk of text selected (bs = 4) of size ctx_len (8), we deconstruct the text in a sequential manner s.t. create
# 8 samples so for each one of the 8*4=32 samples we get a vocab_size tensor that represent the prob dist of over the next char
# all of these given that we are using directly embeddings is just as indexing into the token_embedding_table 

loss.item() # we know that the initial loss  must be -math.log(1/vocab_len) = 4.174387269895637

4.416069030761719

In [52]:
decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist())

"\nEcpLzY JXv3CL.vzM;:cCjMo$buqCQnhGqgRtqFqC,NOA3ROh$Bq,M;JOjxcKs;&SoSoIanJmZxOlGrX.lT?A;:VWyXg'\njXI-yU"

In [None]:
32:18