# Overview

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer

## Download Dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("roneneldan/TinyStories")
dataset

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [5]:
sample = 20

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer.pad_token = tokenizer.eos_token  

subset_dataset = dataset['train'][:sample]['text']

# Tokenize the text data in the new subset dataset with padding and truncation
tokenized_dataset = tokenizer(
    subset_dataset,
    return_tensors='pt',
    padding=True,  # Enable padding
    truncation=True  # Enable truncation
)


In [6]:
for example in tokenized_dataset['input_ids'][:3]:
    # Decode the list of token IDs
    decoded_text = tokenizer.decode(example, skip_special_tokens=True)
    print(decoded_text)

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.

One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. B

In [7]:
data = tokenized_dataset['input_ids']
data.shape

torch.Size([20, 219])

In [8]:
x = data[:, :-1].contiguous()
y = data[:, 1:].contiguous()
x.shape, y.shape

(torch.Size([20, 218]), torch.Size([20, 218]))

In [9]:
for i in range(5):
    print(f"Input: {x[0, :i+1]} --> Labels: {y[0,i]}")

Input: tensor([3198]) --> Labels: 1110
Input: tensor([3198, 1110]) --> Labels: 11
Input: tensor([3198, 1110,   11]) --> Labels: 257
Input: tensor([3198, 1110,   11,  257]) --> Labels: 1310
Input: tensor([3198, 1110,   11,  257, 1310]) --> Labels: 2576


In [10]:
vocab_size = tokenizer.vocab_size
sequence_len = x.size(1)

print(vocab_size, sequence_len)

50257 218


## Embedding

In [11]:
# Token Embedding
n_embd = 36
wte = nn.Embedding(vocab_size, n_embd) # word to embedding

token_embd = wte(x)
token_embd.shape

torch.Size([20, 218, 36])

In [12]:
# Position Embedding
position = nn.Embedding(sequence_len, n_embd)

position_embd = position(torch.arange(sequence_len))
position_embd.shape

torch.Size([218, 36])

In [13]:
x_embd = token_embd + position_embd
x_embd.shape

torch.Size([20, 218, 36])

## MLP

In [14]:
ln_hidden_states = nn.LayerNorm(n_embd)
x_embd_ln = ln_hidden_states(x_embd)

In [15]:
fc1 = nn.Linear(n_embd, 4 * n_embd)
fc2 = nn.Linear(4 * n_embd, n_embd)

act = nn.ReLU()

In [16]:
# Feed forward output
ffwd_out = fc1(x_embd_ln)
ffwd_out = act(ffwd_out)
ffwd_out = fc2(ffwd_out)

ffwd_out.shape

torch.Size([20, 218, 36])

## Attension

In [17]:
n_head = 4

head_size = n_embd // n_head
opt_size = n_head * head_size # output size
head_size, opt_size

(9, 36)

In [18]:
Wqkv = nn.Linear(n_embd, 3 * opt_size)
qkv = Wqkv(x_embd_ln)
qkv.shape

torch.Size([20, 218, 108])

In [19]:
from einops import rearrange
qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, h = n_head)
qkv.shape

torch.Size([20, 218, 3, 4, 9])

In [20]:
q, k, v = qkv.unbind(dim=2)
q.shape, k.shape, v.shape

(torch.Size([20, 218, 4, 9]),
 torch.Size([20, 218, 4, 9]),
 torch.Size([20, 218, 4, 9]))

In [21]:
import math
softmax_scale = 1.0 / math.sqrt(q.shape[-1])

scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
scores.shape

torch.Size([20, 4, 218, 218])

In [22]:
mask = torch.triu(torch.full((sequence_len, sequence_len), -10000.0), 1)
scores = scores + mask
scores.shape

torch.Size([20, 4, 218, 218])

In [23]:
attention = torch.softmax(scores, dim=-1)

attn_out = torch.einsum("bhts,bshd->bthd", attention, v)

attn_out = rearrange(attn_out, "... h d -> ... (h d)")
attn_out.shape

torch.Size([20, 218, 36])

In [24]:
out_proj = nn.Linear(opt_size, n_embd)
attn_out = out_proj(attn_out)
attn_out.shape

torch.Size([20, 218, 36])

## Transformer Block

In [25]:
residual = x_embd

output = ffwd_out + attn_out + residual
output.shape

torch.Size([20, 218, 36])

In [26]:
linear = nn.Linear(n_embd, vocab_size)
ln_output = nn.LayerNorm(n_embd)

output = ln_output(output)

logits = linear(output)
logits.shape

torch.Size([20, 218, 50257])

## Loss

In [27]:
loss_fct = nn.CrossEntropyLoss()

logits  = logits.view(-1, logits.shape[-1])
labels = y.view(-1)

loss = loss_fct(logits, labels)
loss

tensor(11.0906, grad_fn=<NllLossBackward0>)