# Lecture 24: GPT Architecture --> refer to model.py 

In [1]:
import matplotlib.pyplot as plt
import tiktoken
import torch
from torch import nn

from model_with_feature_classes import GPTModel

In [2]:
# Hyperparameters handed to GPTModel(...) when the model is built.
# How each key is interpreted is decided inside GPTModel, which is not
# defined in this notebook; the notes below only state what is visible here.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # equals the first dim of the embedding / output weights printed later
    context_length=1024,   # reused as `context_size` when generating text
    emb_dim=768,           # equals the second dim of the embedding / output weights printed later
    n_heads=12,            # presumably attention heads per block -- confirm in GPTModel
    n_layers=12,           # presumably number of transformer blocks -- confirm in GPTModel
    drop_rate=0.1,         # presumably dropout probability -- confirm in GPTModel
    qkv_bias=False,        # presumably toggles bias on the Q/K/V projections -- confirm in GPTModel
)

In [3]:
# GPTModel is imported in the notebook's import cell at the top, so that all
# dependencies are visible in one place and a fresh "Run All" fails early.

# Seed torch's global RNG before the model is constructed.
torch.manual_seed(123)

# Two sequences of four token ids each -> shape (2, 4).
batch = torch.tensor([[6109, 3626, 6100, 345],
                      [6109, 1110, 6622, 257]])

model = GPTModel(GPT_CONFIG_124M)

# NOTE(review): model.eval() is only called in the generation cell further
# down and this call is not wrapped in torch.no_grad(), so the result carries
# a grad_fn and -- if GPTModel applies dropout -- includes dropout noise.
# Confirm that is intended for this demonstration.
out = model(batch)

print(f"Input batch:\n{batch}\nInput batch shape:\n{batch.shape}")
print(f"Output:\n{out}\nOutput shape:\n{out.shape}")

Input batch:
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Input batch shape:
torch.Size([2, 4])
Output:
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)
Output shape:
torch.Size([2, 4, 50257])


In [4]:
# Show the weight shapes of the token-embedding layer and the output head
# side by side (one per line).
print(
    f"Shape Token Embedding Layer: {model.input_emb.weight.shape}",
    f"Shape Output Layer: {model.out_head.weight.shape}",
    sep="\n",
)

Shape Token Embedding Layer: torch.Size([50257, 768])
Shape Output Layer: torch.Size([50257, 768])


In [5]:
# Add up the element counts of every tensor yielded by model.parameters().
# NOTE: no requires_grad filter is applied, so this is the count of all
# parameters the model exposes.
total_params = sum(map(torch.numel, model.parameters()))
print(f"Total number of trainable parameters in model: {total_params:,}")

Total number of trainable parameters in model: 163,009,536


In [6]:
# Subtract the output head's parameters from the overall count, i.e. the
# number that remains when the output head does not contribute its own weights.
total_params_gpt2 = total_params - sum(map(torch.numel, model.out_head.parameters()))
print(f"Total number of trainable parameters in gpt2: {total_params_gpt2:,}\n\nUses weight tying")

Total number of trainable parameters in gpt2: 124,412,160

Uses weight tying


In [7]:
# Memory taken by the parameters: each tensor contributes
# numel() * element_size() bytes, so the figure follows the parameters'
# actual dtype instead of assuming 4 bytes per weight.
# Only tensors from model.parameters() are counted (no buffers).
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 * 1024)  # bytes -> units of 2**20 bytes

print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


# Lecture 25: Next word prediction from logits

In [8]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    On every step the last ``context_size`` columns of the running sequence
    are passed to ``model``, the logits at the final position are selected,
    and the index with the highest score is appended as the next token.

    Args:
        model: Callable that takes a 2-D tensor of token ids
            ``(batch, n_tokens)`` and returns logits indexed as
            ``[batch, position, vocab]``.
        idx: 2-D integer tensor ``(batch, n_tokens)`` with the prompt ids.
        max_new_tokens: Number of ids to append; with 0 the input tensor is
            returned unchanged.
        context_size: Maximum number of trailing tokens fed to the model on
            each step. Must be at least 1.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)``.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        # ``idx[:, -0:]`` selects the *whole* sequence and a negative value
        # would drop the oldest tokens instead of limiting the window, so
        # both are rejected rather than silently mis-slicing.
        raise ValueError(f"context_size must be >= 1, got {context_size}")

    for _ in range(max_new_tokens):
        # Keep only the most recent `context_size` tokens as model input.
        idx_cond = idx[:, -context_size:]

        # No gradients are needed while generating.
        with torch.no_grad():
            logits = model(idx_cond)

        # Only the prediction at the last position decides the next token.
        logits = logits[:, -1, :]

        # Softmax keeps the ordering of the logits; it is retained here so the
        # intermediate values can be read as probabilities.
        probas = torch.softmax(logits, dim=-1)

        # Greedy choice; keepdim=True keeps shape (batch, 1) for the concat.
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)

        idx = torch.cat((idx, idx_next), dim=1)

    return idx


In [9]:
# Tokenize a short prompt with tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)

# unsqueeze(0) adds a leading batch dimension: (n_tokens,) -> (1, n_tokens).
encoded_tensor = torch.tensor(encoded).unsqueeze(0)

print(f"encoded: {encoded}")
print(f"encoded_tensor shape: {encoded_tensor.shape}")


encoded: [15496, 11, 314, 716]
encoded_tensor shape: torch.Size([1, 4])


In [10]:
# Switch the model to evaluation mode before generating (for a torch
# nn.Module this turns off train-only behaviour such as dropout).
model.eval()

# Append 6 greedily chosen token ids to the encoded prompt, letting the model
# see at most `context_length` trailing tokens per step.
out = generate_text_simple(
    model,
    encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)

# Show the generated ids together with the resulting sequence length.
print(out, out.size(-1))

tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]]) 10


In [11]:
# Remove the leading batch dimension (size 1 here), turn the ids into a plain
# Python list and map them back to text with the tokenizer.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
decoded_text  # bare last expression so the notebook displays the string

'Hello, I am Featureiman Byeswickattribute argue'