## Read input

In [1]:
# Read the whole corpus into memory as a single string; the 'utf-8-sig'
# codec drops a leading byte-order mark if the file has one.
with open('dataset/holmes.txt', mode='r', encoding='utf-8-sig') as corpus_file:
    text = corpus_file.read()

# Number of characters read (displayed as the cell's output)
len(text)

581863

## Tokenisation and training split

In [2]:
import torch
import tiktoken
# GPT-2 encoding provided by tiktoken
enc = tiktoken.get_encoding("gpt2")

# Encode the whole corpus once and keep the token ids as an int64 tensor
data = torch.tensor(enc.encode(text), dtype=torch.long)

# The first 90% of the tokens form the training set, the rest the validation set
training_data_length = int(len(data) * 0.9)
train_data, validation_data = data[:training_data_length], data[training_data_length:]

print(f'Training data: {len(train_data)}, Validation data: {len(validation_data)}')

Training data: 135485, Validation data: 15054


In [13]:
# Number of consecutive tokens in each sampled input sequence
context_length = 8

# Number of sequences drawn per batch; each one is context_length tokens long
batch_size = 32

# Fix the RNG seed so that every run produces the same output
torch.manual_seed(12341)

def get_batch(split):
    """Sample a random batch of (input, target) token sequences.

    `split` selects the token stream: 'train' reads from `train_data`, any
    other value reads from `validation_data`. Returns a pair `(x, y)` with
    `batch_size` rows of `context_length` tokens each, where `y` is `x`
    shifted by one position in the stream, i.e. the next token for every
    input position.
    """
    source = train_data if split == 'train' else validation_data
    # One random start offset per row, bounded so the shifted target window still fits
    starts = torch.randint(len(source) - context_length, (batch_size,))
    # Row b holds starts[b], starts[b] + 1, ..., starts[b] + context_length - 1
    positions = starts.unsqueeze(1) + torch.arange(context_length)
    x = source[positions]
    # Targets are the same windows moved one token to the right
    y = source[positions + 1]
    return x, y

In [14]:
# Total number of tokens in the vocabulary of the tiktoken encoding
vocab_size = enc.n_vocab
# Length of the embedding vector that each token id is mapped to
embedding_dim = 16

from torch import nn
from torch.nn import functional as F

class SherlockModel(nn.Module):
    """Next-token model made of an embedding lookup followed by a linear projection.

    Every token id is embedded on its own and projected to one logit per
    vocabulary entry, so the prediction at a position depends only on the
    token at that position.
    """

    def __init__(self, n_vocab=None, n_embd=None):
        """Create the embedding and projection layers.

        Args:
            n_vocab: Number of distinct token ids. When omitted, the
                notebook-level `vocab_size` is used.
            n_embd: Length of each token's embedding vector. When omitted,
                the notebook-level `embedding_dim` is used.
        """
        super().__init__()
        # Resolve the defaults at call time so `SherlockModel()` keeps working as before
        if n_vocab is None:
            n_vocab = vocab_size
        if n_embd is None:
            n_embd = embedding_dim

        # Token id -> embedding vector
        self.embedding_layer = nn.Embedding(n_vocab, n_embd)
        # Embedding vector -> one logit per vocabulary entry
        self.linear_layer = nn.Linear(n_embd, n_vocab)

    def forward(self, x, targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: Integer tensor of token ids with shape (B, T).
            targets: Optional integer tensor of next-token ids, shape (B, T).

        Returns:
            `(logits, loss)`. Without targets, `logits` has shape (B, T, V)
            and `loss` is None. With targets, `logits` is flattened to
            (B*T, V) and `loss` is the scalar cross-entropy over all positions.
            V is the vocabulary size.
        """
        token_embeddings = self.embedding_layer(x)  # (B, T, E), E = embedding size
        logits = self.linear_layer(token_embeddings)  # (B, T, V), V = vocabulary size

        if targets is None:
            return logits, None

        # cross_entropy expects (N, V) logits and (N,) class indices, so fold
        # the batch and time dimensions together; V is read from the logits
        # themselves rather than from a module-level global.
        B, T, V = logits.shape
        logits = logits.view(B * T, V)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, x, max_new_tokens = 10):
        """Extend each sequence in `x` by sampling `max_new_tokens` tokens.

        Args:
            x: Integer tensor of token ids with shape (B, T).
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the input
            sequences followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(x)
            # Only the logits at the last position are used to pick the next token
            last_logits = logits[:, -1, :]  # (B, V)
            # Softmax turns the logits into a probability distribution over the vocabulary
            probs = F.softmax(last_logits, dim=-1)  # (B, V)
            # Sample one token id per sequence from that distribution
            next_token_indices = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token so it becomes part of the next step's input
            x = torch.cat((x, next_token_indices), dim=1)

        return x

## Training

In [19]:
# Fresh model and an Adam optimiser over all of its parameters
model = SherlockModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for steps in range(1000):
    # Drop the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)
    # Forward pass on a freshly sampled training batch
    x_batch, y_batch = get_batch('train')
    logits, loss = model(x_batch, targets=y_batch)
    # Backpropagate, then update the parameters
    loss.backward()
    optimizer.step()

# Loss of the final training batch only
print(loss.item())

7.298051834106445


## Generation

In [20]:
# Seed generation with one sequence containing a single token (id 0).
# The tensor is named `start_tokens` rather than `input` so that Python's
# built-in `input()` is not shadowed for the rest of the kernel session.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
# Sample a continuation and decode the first (only) sequence back to text
output = enc.decode(model.generate(start_tokens)[0].tolist())
print(output)

! River Fishingprototype residency Military DinosaurprEditorBenef Contact
