<a href="https://colab.research.google.com/github/norflin321/ml/blob/main/nano_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.nn import functional as fun

# Load the entire training corpus into memory as one string.
with open("tiny_shakespeare.txt", "r", encoding="utf-8") as file:
    text = file.read()

print("-- dataset len:", len(text))

# Vocabulary: every distinct character that occurs in the text, sorted so the
# character <-> id assignment is the same on every run.
chars = sorted(set(text))
vocab_size = len(chars)

# Character-level tokenizer lookup tables.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string *s*.

    Raises:
        KeyError: if *s* contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids *l*.

    Raises:
        KeyError: if *l* contains an id that is not in ``itos``.
    """
    return "".join([itos[i] for i in l])


print(f"-- dataset has {vocab_size} unique characters:", itos)

# Encode the whole corpus once and keep it as a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Train on the first 90% of the tokens; hold out the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Batch geometry.
batch_size = 4  # how many independent sequences are processed in parallel
block_size = 8  # maximum context length for predictions

# Fix the RNG seed so that the random sampling that follows is reproducible.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: "train" selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked tensors, each with ``batch_size`` rows of
        ``block_size`` tokens. ``y`` is the same window as ``x`` shifted one
        position to the right, so ``y[b, t]`` is the token that follows
        ``x[b, t]`` in the source data.
    """
    source = val_data if split != "train" else train_data
    # Start offsets are capped so that the target window, which reaches one
    # token further than the input window, still fits inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    windows, shifted = [], []
    for start in starts:
        windows.append(source[start:start + block_size])
        shifted.append(source[start + 1:start + block_size + 1])
    return torch.stack(windows), torch.stack(shifted)

# Draw one training batch and show it.
inputs, targets = get_batch("train")
print("-- inputs:", inputs)
print("-- targets:", targets)

# Spell out every (context, target) example packed into the batch: the target
# at position t of a row is paired with that row's tokens 0..t as context.
for b in range(batch_size):  # batch dimension
    row_in, row_out = inputs[b], targets[b]
    for t in range(block_size):  # time dimension
        context = row_in[:t + 1]
        target = row_out[t]
        print(f"-- when input is {context.tolist()} the target: {target}")

# Reseed so the randomly initialised embedding table is reproducible.
torch.manual_seed(1337)


class BigramLanguageModel(torch.nn.Module):
    """Character model in which each token id directly looks up a row of logits.

    Args:
        vocab_size: number of distinct tokens. The embedding table has shape
            (vocab_size, vocab_size).
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Embedding table of shape (vocab_size, vocab_size), filled with
        # random numbers at construction time. Row i is the vector returned
        # for token id i.
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        """Return the logits for a batch of token ids.

        Args:
            inputs: integer tensor of token ids, e.g. of shape (batch, time).
            targets: accepted so existing ``model(inputs, targets)`` calls keep
                working, but not used by this version of the model; may be
                omitted.

        Returns:
            Tensor of shape ``inputs.shape + (vocab_size,)``: every id in
            ``inputs`` is replaced by its row of the embedding table.
        """
        # Raw character indices cannot be trained on directly; each one is
        # first embedded, i.e. swapped for a vector of vocab_size numbers.
        logits = self.token_embedding_table(inputs)
        return logits

# Build the model, then run a single forward pass on the sample batch.
model = BigramLanguageModel(vocab_size=vocab_size)
logits = model(inputs, targets)

print("-- logits:", logits.shape)