<a href="https://colab.research.google.com/github/mehek-niwas/learning_transformer/blob/main/mehek_gpt_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NanoTransformer Notes**
*using Andrej Karpathy's video*: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=1334s

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-01-09 21:13:30--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-01-09 21:13:30 (130 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [None]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
print("training dataset: ", train_data.shape)
print("validation dataset: ", val_data.shape)

training dataset:  torch.Size([1003854])
validation dataset:  torch.Size([111540])


In [None]:
block_size = 8
train_data[:block_size+1] # showing first block (batch) --> represents 8 training samples

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# all from the first batch
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [None]:
print("training dataset: ", train_data.shape)
print("validation dataset: ", val_data.shape)

torch.manual_seed(1337) # random number generator for reproducable code

batch_size = 4 # ---> THIS IS THE NUMBER OF SEQUENCES PER BATCH
block_size = 8 # ---> (MAXIMUM TOKENS PER BATCH PER SEQUENCE) or maximum context length

# disclaimer: this code does not ensure block_size seperation in between starting indicies
def get_batch(split):
    # generate a small batch of data of inputs x and targets y

    # if in the train split, we are going to look at the training data
    # if in test split, we are going to look at the testing data
    data = train_data if split == 'train' else val_data

    # generate (batch_size) random starting indexes anywhere from 0 to data.length - block_size
    ix = torch.randint(len(data) - block_size, (batch_size,)) # torch.randint(low=0, high, size) --> where size is tuple (rows, columns) --> for rows, 1 column --> (rows,)

    # create batches from random starting indexes
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

training dataset:  torch.Size([1003854])
validation dataset:  torch.Size([111540])
inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] th

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

# BigramLanguageModel inherits from nn.Module
# nn.Module --> base class for all neural network modules. user models should also subclass the Module class

class BigramLanguageModel(nn.Module):

  # constructor
  def __init__(self, vocab_size): #
    # ensures the parents class (nn.Module) is properly initialized before adding additional functionality specific to the BigramLanguageModel

    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table

    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # SYNTAX: nn.Embedding(num_embeddings, embedding_dim) --> (size of dictionary of embeddings, size of each embedding vector)
    #  --> OUTPUT:  random initialized embedding matrix of vocab_size x vocab_size

  def forward(self, inputs, targets=None):
    # idx and targets are both (B,T) tensor of integers

    # THIS WORKS FOR THE INPUTS BECAUSE THE INPUTS ARE ALREADY IN INDEX FORM. SO INPUT OF 23 WILL OUTPUT THE EMBEDDING ROW 23
    # THIS WOULD NOT WORK IF THE INPUTS WERE STILL IN STRING FORM. --> WE HAD TO SPLIT THEM AND INDEX THEM FIRST(SERVING A LOOKUP TABLE OR WHATEVER METHOD) --> SO NOW WE ARE SWITCHING TO ITS EMBEDDING VECTOR


    logits = self.token_embedding_table(inputs) # --> WE NEED TO ARRANGE THIS AS A (B, T, C) # ==== the tensors/embedding vectors related to the idx(s) [from the embedding matrix]
    # B = batch size (number of input sequences)  x   T = sequence length  x  C = vocabulary size (logits for each possible token??) --> logits --> interpreted as the scores for the next character in the sequence

    if targets is None:
      loss = None
    else:
    # --> SO pytorch expects a B, C, T matrix as the input if the input is multidimensional (btc = 4x8x65)
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # making it into a 2 dimensional array!! (was previously B,T,C )
      targets = targets.view(B*T) # making into 1 dimensional array!! (was previously B,T)
      loss = F.cross_entropy(logits, targets) # loss is the cross entropy of the logits and targets. measures the quality of the logits with respect to the targets
      # but targets is not in 4 x 8 x 65 format? it is just in 4 x 8 format?

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx) # --> self(idx) goes to the forward function --> self.function(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) # AUTOREGRESSIVE PART OF THE BIGRAM MODEL!!

    return idx

m = BigramLanguageModel(vocab_size) # vocab size is the number of different characters in the input
logits, loss = m(xb, yb) # same as m.forward(xb, yb) because PyTorch automatically directs m(xb, yb) to the forward method
# ---> the scores/logits of a probabilities of each of the 65 character being the next part of the sequence (for every one of the 4 x 8 positions).  ---> 4 x 8 x 65 (imagine a very dense cube)

print(logits.shape)
print(loss)
inputs = torch.zeros((1, 1), dtype=torch.long) # zero is kind of like the SOS token since it is the newline character for this project
print(decode(m.generate(inputs, max_new_tokens=100)[0].tolist()))

NameError: name 'xb' is not defined

think of "T" in the BTC as the "time dimension" or the "time step"

# <font color = "blue"> remember the target in transformers is ALWAYS only 1 token

# <font color = "blue"> the input in transformers can be a RANGE of tokens up to the CONTEXT WINDOW/SIZE

### **PyTorch Embedding Notes**
*key component in transformers. used to convert input tokens into continuous representations*

"an embedding layer is a simple lookup table that stores embeddings of a fixed dictionary size"
- the embedding layer = lookup table that maps an index value to a weight matrix of (user-defined) dimension
- weight matrix is optimized to produce more useful vectors
--> embedding matrix is initialized: `num_embeddings x embedding_dim`
- NUM_EMBEDDINGS IS THE DICTIONARY SIZE
- given input word or token... (represented by index in the vocabulary)... the index is passed to the embedding layer which looks up the corresponding row in the embedding matrix


In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

# Training Loop

In [None]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5580098628997803


In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


Bur y ang wod, t se atperares, m:
Frerf y g, ncorparthe owns?

Tire
I trd odeloutes, e couklonsthendf.
PESh thovenofieaye kellar we gr mpanou s this?

Noflp
Ron:
OME trdovy
IO:
Or so waco is. he s!
CI I w'd Pordervet ld, temofostrurd tiomy Proute we g gary outhend le, le, yed rat y ay m.

nd, yorix$Jave ed thais, mea l herirnonais ire mo par,
OFOnealle qu f t atary meee dd il thy fitito-the ssd ovel y hathe, qun s;
LO tond t, oullke CIARicousangh'dlke, y lorveloveninde searslly otharomeligse ang


# GPU Check


In [None]:
!nvidia-smi # making sure GPU is running

Thu Jan  9 21:14:10 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

numGPUs = len(tf.config.experimental.list_physical_devices('GPU'))

print('Num GPUs Available: ', numGPUs)

if numGPUs > 0:
  print(tf.test.gpu_device_name())
  print(device_lib.list_local_devices()[1].physical_device_desc)

# check if cuda is available
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
else:
    print("CUDA is not available. Using CPU.")

Num GPUs Available:  1
/device:GPU:0
device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
CUDA is available!


# Script-Version

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ------------

torch.manual_seed(1337)

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    # model.eval() is important for disabling droupout layers and adjusting batch normalization behavior
    # model.eval() does not automatically disable local gradient computation
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

--2025-01-09 21:14:19--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-01-09 21:14:19 (209 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 2.8110, val loss 2.8249
step 600: train loss 2.5434, val loss 2.5682
step 900: train loss 2.4932, val loss 2.5088
step 1200: train loss 2.4863, val loss 2.5035
step 1500: train loss 2.4665, val loss 2.4921
step 1800: train loss 2.4683, val loss 2.4936
step 2100: train loss 2.4696, val loss 2.4846
step 2400: train loss 2.4638, val loss 2.4879
step 2700: train loss 2.4738, val loss 2.4911



CEThik brid owindakis

# "The mathematical trick in self-attention"

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time (max sequence/context length), channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
print(x)

tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])


we want the 8 tokens (T = 6) to be able to talk to each other

the tokens should not be able to talk to tokens in the future

information only flows from the previous context to the current time step. and we can't get any information from the future

so if i was token 5, id want all the channels from token time steps of 4, 3, 2, and 1 ---> averaging them up --> so that it becomes like a feature vector that sort of summarizes me in the context of my history. just an average will be lossy, so we can do some more special stuff later


# An inefficient way to get averages of the previous tokens

In [None]:
xbow = torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] # size: (1, t, C) ---> so technically (t, C) since u can ignore 1 at that point
    xbow[b,t] = torch.mean(xprev, 0) # dim=0 means that the mean is computed along the "first axis"( --> meaning rows) of the tensor
    # so for each column in xprev, the function calculates the mean of all rows (time steps)
    # torch.mean(xprev, 0) --> averaging each time step by channel --> (C, )

In [None]:
print(x[0])
print(x.shape)

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
torch.Size([4, 8, 2])


In [None]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# write code that prints which inputs in x that a row of x bow is an average of
for minibatch in range(B):
  print("***********************minibatch start******************************")
  for tokenStep in range(T):
    print(xbow[minibatch, tokenStep])
    print("is an average of: ")
    print(x[minibatch, :tokenStep+1])
    print("--------------------------------------------------")

***********************minibatch start******************************
tensor([ 0.1808, -0.0700])
is an average of: 
tensor([[ 0.1808, -0.0700]])
--------------------------------------------------
tensor([-0.0894, -0.4926])
is an average of: 
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152]])
--------------------------------------------------
tensor([ 0.1490, -0.3199])
is an average of: 
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255]])
--------------------------------------------------
tensor([ 0.3504, -0.2238])
is an average of: 
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643]])
--------------------------------------------------
tensor([0.3525, 0.0545])
is an average of: 
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679]])
--------------------------------------------------
tensor([ 0.0688, -0.039

# An efficient way to get averages of the previous tokens

In [None]:
wei = torch.tril(torch.ones(T,T))
print(wei)
print(wei.sum(1, keepdim = True))
# NORMALIZING THE MATRIX
wei = wei / wei.sum(1, keepdim=True) # wei / (summing across columns --> so sum each row) = average of
print(wei)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250,

In [None]:
# My method
for minibatch in range(B):
  currX = x[minibatch]
  print(wei @ currX)

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])
tensor([[-0.6631, -0.2513],
        [ 0.1735, -0.0649],
        [ 0.1685,  0.3348],
        [-0.1621,  0.1765],
        [-0.2312, -0.0436],
        [-0.1015, -0.2855],
        [-0.2593, -0.1630],
        [-0.3015, -0.2293]])
tensor([[ 1.6455, -0.8030],
        [ 1.4985, -0.5395],
        [ 0.4954,  0.3420],
        [ 1.0623, -0.1802],
        [ 1.1401, -0.4462],
        [ 1.0870, -0.4071],
        [ 1.0430, -0.1299],
        [ 1.1138, -0.1641]])


In [None]:
# PyTorch method
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
print(xbow2)

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])


# An even more efficient way to get averages of the previous tokens

In [None]:
tril = torch.tril(torch.ones(T,T))
print(tril)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


In [None]:
wei = torch.zeros((T,T))
print(wei)
wei = wei.masked_fill(tril == 0, float('-inf')) # make all elements where tril = 0, --> replace with -inf
print(wei)
wei = F.softmax(wei, dim=-1) # dim=-1 --> means take softmax along every single row
print(wei)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

### in attention, we use weights similar to the above matrix. the mask will remain, but instead of normalized average weights, the weights will be based on the amount of affinity a token has for the next token prediction.

### when you multiply the weights by the token embeddings, you are multiply the weights by each channel to get a token embeddings matrix. so the same weight will be applied all of the channels for a certain sample (feature-vector)



In [None]:
x

tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])

In [None]:
wei @ x

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [None]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # since -inf = 0 in softmmax
wei = F.softmax(wei, dim=-1) # ---> softmax is a normalization function
xbow3 = wei @ x
print(xbow3)

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])


# The self attention way

In [None]:
#torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single head of attention
head_size = 16

key = nn.Linear(C, head_size, bias=False) # linear layer to convert each token embedding into a lower dimension embedding (dim = head_size) --> key *embedding* vector
# ^^ so (B, T, embed_dim) will be able to become (B, T, head_size) = KEY

query = nn.Linear(C, head_size, bias=False) # linear layer to convert each token embedding into a lower dimension embedding (dim = head_size) --> value *embedding* vector
# ^^ (B, T, embed_dim) will be able to become (B, T, head_size) = QUERY

value = nn.Linear(C, head_size, bias=False) # linear layer to convert each token embedding into a lower dimension embedding (dim = head_size) --> value *embedding* vector
# ^^ (B, T, embed_dim) will be able to become (B, T, head_size) = VALUE

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)

wei = q @ k.transpose(-2, -1) # (B, T, head_size) x (B, head_size, T) = (B, T, T) --> token_length x token_length x num_minibatches
# WEI IS NO LONGER JUST ZEROS
print("after qk dot product")
print(wei)
print("\n")
tril = torch.tril(torch.ones(T,T))

wei = wei.masked_fill(tril == 0, float('-inf'))
print("after masking")
print(wei)
print("\n")

v = value(x) # (B, T, head_size) wei = q @ k.transpose(-2, -1) # (B, T, head_size) x (B, head_size, T) = (B, T, T) --> token_length x token_length x num_minibatches

wei = F.softmax(wei, dim=-1)
print("after softmax")
print(wei)
print("\n")

out = wei @ v  # (B, T, T) x (B, T, head_size) = (B, T, head_size)
print("after wei @ v")
print(out)
print("\n")

out.shape

after qk dot product
tensor([[[-3.2516e-01,  7.5986e-02, -8.7015e-01,  1.3635e+00, -1.8130e-01,
          -7.6186e-01, -1.9567e-01,  1.3711e+00],
         [ 2.2350e-01,  3.7087e-01, -9.1976e-01,  1.2680e+00,  5.4802e-01,
          -8.2141e-01, -3.2949e-01,  6.7529e-01],
         [-3.5427e-01, -1.0413e+00,  1.5438e+00,  4.9150e-01, -2.7205e-01,
           1.4554e+00, -3.2128e+00,  3.1941e-01],
         [ 2.1174e-01, -2.9837e-01,  6.0922e-01, -1.4986e+00,  5.5533e-01,
           1.7037e+00, -9.9851e-01, -2.4456e+00],
         [-3.2672e-01, -1.9602e+00,  2.9103e-01, -7.8674e-01, -1.3949e+00,
           1.2709e+00,  5.1601e-01,  3.3495e-01],
         [ 1.7287e-01, -4.2323e-01, -5.1888e-01,  1.1365e+00,  2.2165e-01,
           2.9291e-02, -3.1094e-01,  1.2146e+00],
         [-1.8122e-01,  9.5256e-01, -2.1602e+00,  1.0454e-01, -5.0251e-01,
           6.0112e-01,  4.0828e+00, -7.1607e-01],
         [-7.4617e-01, -2.5025e-01,  1.1514e+00, -1.0002e-01, -1.8712e+00,
           4.2339e-01, -1.069

torch.Size([4, 8, 16])

now `wei` has different values for each batch, since each batch has different tokens... so we know that `wei` is data dependent.

- query --> heres what im interested in
-  key --> heres what i have
- value --> if u find anything interesting, heres what i will communicate to u

value is what gets aggregated



<font color = "blue">*for classification (e.g sentiment analysis), we dont need to mask because we can use all future and past information*

**in encoder attention:** no masking --- all tokens are able to communicate
- the input of an encoder would be the entire sequence at once

**in decoder attention:** masking --- future tokens cannot communicate with past
- the input of a decoder would be using the sequence in time steps --> so then the target of the decoder is also the sequence (next token prediction)

- **self-attention:** key, query, and values come from same tokens

- **cross-attention:** key, values come from different tokens (ex: for encoder) and queries come from decoder input tokens ---> think about in translation

- **scaled-attention:** if u scale/divide the attention by the sqrt(`head_size`) (dimension of key, value. query) then the variance of the attention will be 1 again. since it will feed into softmax, it is important for the attention to be scaled so that softmax doesnt cause a dying token/neuron type problem where highest values dominant too much


In [None]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4632, 0.5368, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1223, 0.0615, 0.8162, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3059, 0.1837, 0.4552, 0.0553, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2485, 0.0485, 0.4608, 0.1568, 0.0854, 0.0000, 0.0000, 0.0000],
         [0.1518, 0.0836, 0.0760, 0.3978, 0.1594, 0.1315, 0.0000, 0.0000],
         [0.0126, 0.0390, 0.0017, 0.0167, 0.0091, 0.0275, 0.8933, 0.0000],
         [0.0557, 0.0915, 0.3716, 0.1063, 0.0181, 0.1794, 0.0403, 0.1370]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6934, 0.3066, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6564, 0.1684, 0.1752, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5565, 0.2045, 0.2175, 0.0215, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2075, 0.5960, 0.0928, 0.0342, 0.0695, 0.0000, 0.0000, 0.0000],
         [0.0558, 0.274

# Updated Script

- no longer passing in `vocab_size` for the model methods because we made it as a global variable in the beginning

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


--2025-01-15 20:47:55--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-01-15 20:47:56 (110 MB/s) - ‘input.txt’ saved [1115394/1115394]

0.209729 M parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5091, val loss 2.5058
step 300: train loss 2.4197, val loss 2.4336
step 400: train loss 2.3501, val loss 2.3562
step 500: train loss 2.2963, val loss 2.3125
step 600: train loss 2.2407, val loss 2.2496
step 700: train loss 2.2054, val loss 2.2187
step 800: train loss 2.1633, val loss 2.1866
step 900: train loss 2.1241, val loss 2.1504
step 1000: t