<a href="https://colab.research.google.com/github/moutard/llm-practice/blob/main/Shakespear_ChatGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load DataSet
Shakespeare dataset

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-14 06:52:46--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-05-14 06:52:46 (173 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)


<torch._C.Generator at 0x7d4252e3b0b0>

In [4]:
with open('input.txt', 'r', encoding='utf-8') as shakespeare_file:
    shakespeare_dataset = shakespeare_file.read()
    print("length of dataset in characters:", len(shakespeare_dataset))


length of dataset in characters: 1115394


# Hyperparameters (Global variables)

In [5]:
EMDEDDING_DIMENSION = 32 # size of the embedding that encodes a token
BATCH_SIZE = 32
BLOCK_SIZE = 8
MAX_ITER = 5000
EVAL_INTERVAL = 500
EVAL_ITERATIONS = 200
LEARNING_RATE = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Extract the Char Set

In [6]:
# here are all the unique characters that occur in this text
charset = sorted(list(set(shakespeare_dataset)))
vocab_size = len(charset)
print(charset)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [7]:
encode_letter = lambda char : charset.index(char)
decode_letter = lambda index : charset[index]
encode_string = lambda text : [encode_letter(char) for i, char in enumerate(text)] # need to take into account when the char input not present in the original dataset
decode_string = lambda list_of_indexes : [decode_letter(index) for index in list_of_indexes]


# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(charset) }
itos = { i:ch for i,ch in enumerate(charset) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


# openAI use ticktoken
# Google uses

In [8]:
print(encode("hello world"))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]


In [9]:
shakespeare_tensor = torch.tensor(encode(shakespeare_dataset), dtype=torch.long)

In [10]:
train_data = shakespeare_tensor[:int(0.9*len(shakespeare_tensor))]
val_data = shakespeare_tensor[int(0.9*len(shakespeare_tensor)):]

example_training_batch = train_data[:BLOCK_SIZE]
example_target_batch = train_data[1:BLOCK_SIZE+1]

for i in range(BLOCK_SIZE):
  context = example_training_batch[:i]
  target = example_target_batch[i]
  print(f"when input is {context} the target: {target}")

when input is tensor([], dtype=torch.int64) the target: 47
when input is tensor([18]) the target: 56
when input is tensor([18, 47]) the target: 57
when input is tensor([18, 47, 56]) the target: 58
when input is tensor([18, 47, 56, 57]) the target: 1
when input is tensor([18, 47, 56, 57, 58]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 58


In [11]:
batch_size = BATCH_SIZE # how many independent sequences will we process in parallel?
block_size = BLOCK_SIZE # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y


xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([32, 8]) torch.Size([32, 8])


In [12]:
class Head(nn.Module):
  """
  one head of self-attention
  """

  def __init__(self, head_size, embedding_dimension, block_size):
    super().__init__()
    self.key = nn.Linear(embedding_dimension, head_size, bias=False)
    self.query = nn.Linear(embedding_dimension, head_size, bias=False)
    self.value = nn.Linear(embedding_dimension, head_size, bias=False)
    # where is block_size defined?
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    # input of size (batch, time-step, channels)
    # output of size (batch, time-step, head size)
    B, T, C = x.shape
    k = self.key(x) # (B, T, 16)
    q = self.query (x) # (B, T, 16)
    wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, 16) @ (B, 16, T) -> (B, T, T)
    # query what I am looking for and key what I contain
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # you don't want future tokens to impact the current token
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    out = wei @ x
    return out # (B, T, T) @ (B, T, C)




In [19]:
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size, embedding_dimension, block_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size, embedding_dimension, block_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)



In [22]:
# bigram language model

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size, embedding_dimension=EMDEDDING_DIMENSION, block_size=BLOCK_SIZE):
    super().__init__()
    # each token directly read off the logits
    self.token_embedding_table = nn.Embedding(vocab_size, embedding_dimension)
    self.position_embedding_table = nn.Embedding(block_size, embedding_dimension)
    import pdb
    pdb.set_trace()
    self.sa_heads = MultiHeadAttention(4, embedding_dimension/4, embedding_dimension, block_size) # 4 heads of 8-dimensional self-attention
    # self.self_attention_head = Head(embedding_dimension, embedding_dimension, block_size)
    self.lm_head = nn.Linear(embedding_dimension, vocab_size)

  """
    idx:
     and targets are both (B,T) tensor of integers
  """
  def forward(self, idx: torch.Tensor, targets=None):
    B, T = idx.shape
    # encore the idx
    token_embeddings = self.token_embedding_table(idx) # (B,T,C) (Batch size, Time, Channels = vocab_size)

    # encore position of idx
    position_embeddings = self.position_embedding_table(torch.arange(T)) # (T, C)
    # logits are scores based on the token.
    x = token_embeddings + position_embeddings # (B, T, C)
    # Rq: for now the position doesn't matter because we are using a bigram
    x = self.sa_heads(x) # apply one head of self-attention of (B, T, C)


    logits = self.lm_head(token_embeddings) # (B, T, vocab_size)

    if targets is None:
        loss = None
    else:
        B, T, C = logits.shape
        # the Channels C needs to the the second dimension expecting B * C *T for F.cross_entropy
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        # negative cross livelyhood (using it's functional form so we don't need to create a module for it)
        # compare how good we predict logits vs targets
        loss = F.cross_entropy(logits, targets)

    return logits, loss

  """
    idx: current context by batch
    max_new_tokens : number of token to generate
  """
  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -BLOCK_SIZE:]
        # get the predictions
        logits, loss = self(idx_cond) # get to the forward option (so targets can be None, and no need to create a loss)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

m = BigramLanguageModel(vocab_size)
out,loss = m(xb, yb)
print(out.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))




sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/bdb.py", line 336, in set_trace
    sys.settrace(self.trace_dispatch)



> [0;32m<ipython-input-22-5cc0f0995cf6>[0m(12)[0;36m__init__[0;34m()[0m
[0;32m     10 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     11 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m    [0mself[0m[0;34m.[0m[0msa_heads[0m [0;34m=[0m [0mMultiHeadAttention[0m[0;34m([0m[0;36m4[0m[0;34m,[0m [0membedding_dimension[0m[0;34m/[0m[0;36m4[0m[0;34m,[0m [0membedding_dimension[0m[0;34m,[0m [0mblock_size[0m[0;34m)[0m [0;31m# 4 heads of 8-dimensional self-attention[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m    [0;31m# self.self_attention_head = Head(embedding_dimension, embedding_dimension, block_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m    [0mself[0m[0;34m.[0m[0mlm_head[0m [0;34m=[0m [0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0membedding_dimension[0m[0;34m,[0m [0mvocab_size[0m[0;34m)[0m[0


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/bdb.py", line 361, in set_quit
    sys.settrace(None)


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/debugger.py", line 1075, in cmdloop
    sys.settrace(None)



--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


- what's the difference between self-attention and cross-attention. Self-attention all come from x (k and q both come from x. cross-attention where k comes from x and q comes from another vector (for instance for translation we use cross-attention where k where from the target language and q the original language)

- wei need to be diffuse so it needs to be divided by squared root of head_size
` * head_size**-0.5` if you don't do that, the variance is going to change.

In [15]:
# create a PyTorch Optimizer
optimiser = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)

In [16]:
for steps in range(EVAL_ITERATIONS):
  # sample a batch of data
  xb, yb = get_batch('train')
  # evaluate the loss
  logits, loss = m(xb, yb)
  optimiser.zero_grad(set_to_none=True)
  loss.backward()
  optimiser.step()
  print(loss.item())

4.376278400421143
4.321561336517334
4.254098892211914
4.295942306518555
4.269122123718262
4.283777236938477
4.235698223114014
4.2078142166137695
4.2039361000061035
4.286467552185059
4.240995407104492
4.186239242553711
4.265632152557373
4.216305255889893
4.211254596710205
4.123067855834961
4.193725109100342
4.142827987670898
4.150857448577881
4.180758953094482
4.117696285247803
4.097591400146484
4.171906471252441
4.084340572357178
4.112943172454834
4.146841049194336
4.0683393478393555
4.072709083557129
4.102790355682373
4.0087504386901855
4.063634872436523
4.007064342498779
4.080782413482666
4.020970821380615
3.9313879013061523
4.007304668426514
4.005529403686523
3.9916744232177734
4.052687644958496
3.914335250854492
3.9284307956695557
3.981619119644165
4.011439323425293
3.9609692096710205
3.950890064239502
4.0291972160339355
3.8926281929016113
3.878202199935913
3.9470181465148926
3.8863375186920166
3.86076283454895
3.8825433254241943
3.9164633750915527
3.93204927444458
3.91183090209960