To use my existing Python environment, I had to give Visual Studio Code the path to my environments folder.

In [61]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Length of dataset: {len(text)} characters.")

# The vocabulary is every distinct character of the corpus, in sorted order
# (65 unique characters in the recorded run of this notebook).
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)
print("".join(chars))

# Character-level tokenization: every character of the vocabulary is
# represented by one integer id. (Sub-word tokenizers are another option;
# ChatGPT uses tiktoken.)
# Build both directions of the mapping: id -> character and character -> id.
itoch = dict(enumerate(chars))
chtoi = {ch: i for i, ch in itoch.items()}

def encode(s):
    """Take a string, return the list of integer ids of its characters."""
    lookup = chtoi.__getitem__
    return list(map(lookup, s))

def decode(list_int):
    """Take a list of integer ids, return the string they spell out."""
    return "".join(map(itoch.__getitem__, list_int))

Length of dataset: 1115394 characters.
65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [62]:
# We now encode the entire "input.txt" file and store it in a torch tensor.
import torch
import torch.nn as nn
from torch.nn import functional as F
# One integer id per character of the corpus.
data = torch.tensor(encode(text))

# The first 90% of the ids are used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

When we train a transformer, we only work with random chunks we take from the dataset. 

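In a chunk of 9 characters, there are 8 training examples of increasing context length. The maximum context length we train with is given by block_size. Training on every context length from 1 up to block_size is useful for inference, because the transformer becomes used to working with varying context lengths. At inference time, inputs longer than block_size have to be divided into chunks of at most block_size tokens, since the model never sees a longer context during training.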

In [63]:
block_size = 8

print("CONTEXT")
# x is the first block_size ids; y is the same window shifted by one position,
# so y[t] is the id that follows the context x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When {context} is the context, the target is {target}.")

CONTEXT
When tensor([18]) is the context, the target is 47.
When tensor([18, 47]) is the context, the target is 56.
When tensor([18, 47, 56]) is the context, the target is 57.
When tensor([18, 47, 56, 57]) is the context, the target is 58.
When tensor([18, 47, 56, 57, 58]) is the context, the target is 1.
When tensor([18, 47, 56, 57, 58,  1]) is the context, the target is 15.
When tensor([18, 47, 56, 57, 58,  1, 15]) is the context, the target is 47.
When tensor([18, 47, 56, 57, 58,  1, 15, 47]) is the context, the target is 58.


In [64]:
torch.manual_seed(1337)  # fixed seed so the sampled batches are reproducible
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for prediction

def get_batch(split):
    """
    Sample a random batch from the requested split.

    split == "train" reads from train_data; any other value reads from
    val_data. Returns (X, Y), both of size (batch_size, block_size), where
    Y holds the same windows as X shifted one position further into the data.
    """
    if split == "train":
        source = train_data
    else:
        source = val_data

    # Draw one random start offset per sequence of the batch.
    upper = len(source) - block_size
    starts = torch.randint(low=0, high=upper, size=(batch_size,)).tolist()

    # Stack the 1-D windows row by row into (batch_size, block_size) tensors.
    X = torch.vstack([source[s:s + block_size] for s in starts])
    Y = torch.vstack([source[s + 1:s + block_size + 1] for s in starts])

    return X, Y

# BIGRAM

A bigram model is a very simple language model. It is just a look-up table: it uses only the current character, and no earlier context, to predict the next character.

The objective of the generate() function is to extend the (batch_size, block_size) tensor horizontally, i.e. along the time dimension, by predicting more tokens. Each generation step maps (B,T) -> (B,T+1).

min 38

Logit: the raw output of a neuron, before any activation function is applied.

In [65]:
# A toy batch of token ids, size (batch_size, block_size) = (2, 4).
idx = torch.tensor([
    [0, 4, 6, 2],
    [3, 7, 8, 9],
])

# Looking every id up in a (vocab_size, vocab_size) table yields one row of
# vocab_size values per position: size (batch_size, block_size, vocab_size).
token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)
logits = token_embedding_table(idx)

print(logits.shape)

torch.Size([2, 4, 65])


In [78]:
class BigramLanguageModel(nn.Module):
    """
    Bigram character-level language model.

    The whole model is one (vocab_size, vocab_size) embedding table: the row
    looked up for a token id is used directly as the logits of the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()

        # First argument: number of rows (one per token id). Second argument:
        # size of each row. It equals vocab_size here because a row is read
        # directly as the logits over the next token.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """
        Compute next-token logits and, when targets are given, the loss.

        idx and targets are integer tensors of size (batch_size, block_size).

        Returns (logits, loss). Without targets, logits has size (B, T, C)
        and loss is None. With targets, logits is flattened to (B*T, C) and
        loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)   # size: (batch_size, block_size, vocab_size)

        if targets is None:
            return logits, None

        # F.cross_entropy is called with (N, C) logits and (N,) targets, so
        # the batch and time dimensions are merged first.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        # reshape instead of view: view raises on non-contiguous targets
        # (e.g. a transposed tensor), reshape copies when it has to.
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the graph
    def generate(self, idx, max_new_tokens):
        """
        Extend idx by max_new_tokens sampled tokens.

        idx is an integer tensor of size (B, T); the returned tensor has size
        (B, T + max_new_tokens) and starts with the original idx.
        """
        for _ in range(max_new_tokens):
            # We first get the predictions
            logits, _loss = self(idx)  # (B,T,C)

            # Only the last time step is kept: a bigram model predicts the
            # next token from the current token alone.
            logits = logits[:, -1, :]  # (B,C)

            # We then apply softmax to get probabilities.
            probs = F.softmax(logits, dim=-1)  # (B,C)

            # We now sample from the probabilities
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)

            # Finally, we append along the time dimension
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)

        return idx

@torch.no_grad()
def estimate_loss(eval_iters):
    """
    Estimate the mean loss of `model` on the train and validation splits.

    For each split, eval_iters random batches are drawn with get_batch and
    their losses are averaged. Returns a dict with the keys 'train' and
    'val', each mapping to a 0-dim tensor holding the mean loss.

    The model is put in eval mode for the measurement and its previous
    train/eval mode is restored afterwards, even if an error occurs.
    """
    # The batches have to live on the same device as the model parameters,
    # otherwise the forward pass fails once the model has been moved to CUDA.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                if model_device is not None:
                    X, Y = X.to(model_device), Y.to(model_device)
                _logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [84]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 20000
eval_interval = 1000
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ------------

model = BigramLanguageModel(vocab_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` so that the builtin `iter` is not
# shadowed for the rest of the session.
for step in range(max_iters):
    # every eval_interval steps, report the estimated train and val loss
    if step % eval_interval == 0:
        losses = estimate_loss(eval_iters)
        print(f"iter: {step}  train_loss: {losses['train']:.4f}  val_loss: {losses['val']:.4f}")

    # sample a batch of data and move it to the device the model lives on
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # evaluate the loss and take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

iter: 0  train_loss: 4.5814  val_loss: 4.5853
iter: 1000  train_loss: 2.4849  val_loss: 2.5241
iter: 2000  train_loss: 2.4643  val_loss: 2.4899
iter: 3000  train_loss: 2.4643  val_loss: 2.4930
iter: 4000  train_loss: 2.4480  val_loss: 2.4925
iter: 5000  train_loss: 2.4613  val_loss: 2.4874
iter: 6000  train_loss: 2.4639  val_loss: 2.4845
iter: 7000  train_loss: 2.4605  val_loss: 2.4814
iter: 8000  train_loss: 2.4529  val_loss: 2.4800
iter: 9000  train_loss: 2.4535  val_loss: 2.4812
iter: 10000  train_loss: 2.4531  val_loss: 2.4842
iter: 11000  train_loss: 2.4538  val_loss: 2.4901
iter: 12000  train_loss: 2.4572  val_loss: 2.4897
iter: 13000  train_loss: 2.4505  val_loss: 2.4896
iter: 14000  train_loss: 2.4592  val_loss: 2.4788
iter: 15000  train_loss: 2.4551  val_loss: 2.4967
iter: 16000  train_loss: 2.4528  val_loss: 2.4864
iter: 17000  train_loss: 2.4534  val_loss: 2.4894
iter: 18000  train_loss: 2.4464  val_loss: 2.4930
iter: 19000  train_loss: 2.4545  val_loss: 2.5021


In [86]:
context = "Thou art"
# Encode the prompt as a batch of one sequence, size (1, T), and place it on
# the same device as the model parameters.
context = torch.tensor(encode(context)).unsqueeze(0).to(next(m.parameters()).device)
print(context.shape)

# Store the sample under its own name: assigning it to `text` would overwrite
# the corpus that the earlier cells read from.
generated = decode(m.generate(context, 400)[0].tolist())
print(generated)

torch.Size([1, 8])
Thou artofart
have thmercow'dsth tu
Aga hu w? nt Mo a orotr, ye pove RL:
MPle:
DUENGeeiceselly bowit, fo d te s rd omefershordase is, ves:
NGAnso mmig.

IO:
LES:
Mulld veamnd; s sintung adiposo;
OFRI ht
Prarionfred y teapt d mitee tins w themyofrthaime githoube ieged atou,
Be yf lllins:
HARINCAUCELIIOFFon aseli'd:
CETES:
O:
ARIUS: I scungin.
Anviste t s.
Ber yoncengicercel y.
Ha?
F ghth ror CELavelotheyov
