In [3]:
# Show the kernel's current working directory before any directory change.
%pwd

'c:\\Users\\kgn\\OneDrive - PowerSchool\\PowerSchool\\Release Script\\Python\\Data Scientist\\LLMScratch\\notebook'

In [5]:
import os
os.chdir('../')
%pwd

'c:\\Users\\kgn\\OneDrive - PowerSchool\\PowerSchool\\Release Script\\Python\\Data Scientist\\LLMScratch'

In [6]:
from importlib.metadata import version


# Report the installed version of each package this notebook depends on.
for pkg in ("torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

torch version: 2.7.1
tiktoken version: 0.9.0


In [7]:
# Hyperparameters for the 124M-parameter GPT configuration used below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of entries in the token vocabulary
    context_length=1024,  # maximum number of positions per input
    emb_dim=768,          # width of token / position embeddings
    n_heads=12,           # attention heads per transformer block
    n_layers=12,          # number of stacked transformer blocks
    drop_rate=0.0,        # dropout probability (0.0 turns dropout off)
    qkv_bias=True,        # use a bias term in the query/key/value projections
)

In [None]:
import torch
import torch.nn as nn

from notebook.modules.supplementary_architecture import TransformerBlock, LayerNorm


class GPTModel(nn.Module):
    """GPT-style language model: embeddings, transformer blocks, output head.

    Token ids are embedded, summed with learned positional embeddings, passed
    through ``cfg["n_layers"]`` ``TransformerBlock`` modules and a final
    ``LayerNorm``, then projected to one logit per vocabulary entry.

    Args:
        cfg: Mapping providing ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"n_layers"`` and ``"drop_rate"``. The same
            mapping is handed to every ``TransformerBlock``, which may read
            additional keys.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        # One learned vector per position: inputs longer than
        # cfg["context_length"] tokens cannot be embedded.
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        # Bias-free projection from embedding width to vocabulary size.
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            in_idx: 2-D tensor of token ids, shape (batch_size, num_tokens).

        Returns:
            Tensor of logits, shape (batch_size, num_tokens, vocab_size).

        Raises:
            IndexError: If ``num_tokens`` exceeds the number of positions in
                the positional-embedding table (``cfg["context_length"]``).
        """
        _, seq_len = in_idx.shape

        # Fail early with a readable message instead of an opaque embedding
        # index error (or a device-side assert on GPU) for over-long inputs.
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            raise IndexError(
                f"input has {seq_len} tokens but the model supports at most "
                f"{max_len} positions; crop the input to the context length"
            )

        tok_embeds = self.tok_emb(in_idx)
        # Positions 0..seq_len-1, created on the same device as the input.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [9]:
import torch
import tiktoken

# GPT-2 byte-pair-encoding tokenizer.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor of token ids, then stack the two
# equally long sequences along a new leading batch axis.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [10]:
# Seed PyTorch's global RNG *before* building the model: the statement order
# matters here, since the freshly constructed weights depend on the seed.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# Forward pass on the two-prompt batch. Autograd is not disabled at this
# point, which is why the printed tensor carries a grad_fn.
out = model(batch)
print("Input batch:\n", batch)
# One logit per vocabulary entry for every token position of every prompt.
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.8523, -0.4563, -0.3349,  ..., -0.0809, -0.2644,  0.1384],
         [ 0.3884, -0.4947,  0.7741,  ..., -0.5099, -0.9245,  0.2568],
         [ 0.2801, -0.1876, -0.5661,  ..., -0.1883, -0.3576, -0.1122],
         [ 0.4540,  0.3971,  0.4569,  ..., -0.0255,  0.4539,  0.4159]],

        [[-0.8523, -0.4563, -0.3349,  ..., -0.0809, -0.2644,  0.1384],
         [ 0.1889, -0.9119,  1.4292,  ..., -0.7277, -0.5452,  0.0713],
         [-0.1624, -0.1354, -0.5259,  ..., -0.2078, -1.0084,  0.5181],
         [-0.0725,  1.2847,  0.2356,  ..., -0.5090, -0.4796, -0.3735]]],
       grad_fn=<UnsafeViewBackward0>)


In [11]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend a batch of token sequences, one token per step.

    Args:
        model: Callable mapping a (batch, n_tokens) tensor of token ids to
            logits of shape (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor of token ids forming the prompt.
        max_new_tokens: Number of tokens to append to every sequence.
        context_size: Maximum number of trailing tokens passed to the model
            at each step.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens) holding the prompt
        followed by the generated ids. The tensor passed in is not modified.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Keep only the most recent `context_size` ids; e.g. with a limit of
        # 5 and 10 ids so far, just the last 5 are fed to the model.
        window = tokens[:, -context_size:]

        # Inference only: no gradients needed. Of the per-position logits,
        # only the final position predicts the next token.
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]  # (batch, vocab_size)

        # Convert to probabilities and take the most likely entry (greedy
        # choice, not random sampling).
        probs = torch.softmax(last_logits, dim=-1)  # (batch, vocab_size)
        next_token = torch.argmax(probs, dim=-1, keepdim=True)  # (batch, 1)

        # Grow the running sequence by the chosen id.
        tokens = torch.cat((tokens, next_token), dim=1)  # (batch, n_tokens+1)

    return tokens

In [12]:
# Switch the model to evaluation mode before generating text. The trailing
# semicolon keeps the notebook from echoing the returned module.
model.eval();  # disable dropout

In [13]:
start_context = "Hello, I am"

# Token ids of the prompt as a plain Python list.
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# Wrapping the ids in an outer list gives the tensor a leading batch axis,
# i.e. shape (1, num_tokens).
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [14]:
# Greedily append six tokens to the encoded prompt, cropping the context to
# the model's configured context length.
out = generate_text_simple(
    model,
    encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)

print("Output:", out)
# Size of the token axis: prompt ids plus the newly generated ones.
print("Output length:", out.shape[1])

Output: tensor([[15496,    11,   314,   716,  3127, 29991,  6539, 21826, 18530,  6276]])
Output length: 10


In [15]:
# Take the single sequence in the batch and map its ids back to text.
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

Hello, I am network BEL Afghan postp aired technical


In [23]:
# Encode a short sample string into a 1-D tensor of token ids.
text="some sample text"
enc_txt = torch.tensor(tokenizer.encode(text))
print("Encoded text:", enc_txt) 

Encoded text: tensor([11246,  6291,  2420])


In [24]:
# A rank of 1 means a flat sequence of ids without a batch axis.
enc_txt.ndim  # Number of dimensions (rank) of the tensor

1

In [25]:
# Indexing with None inserts a new leading axis: (num_tokens,) -> (1, num_tokens).
enc_txt_batch = enc_txt[None, :]
print("Encoded text with batch dimension:", enc_txt_batch)

Encoded text with batch dimension: tensor([[11246,  6291,  2420]])


In [26]:
# Confirm the batch axis comes first: (1, num_tokens).
enc_txt_batch.shape

torch.Size([1, 3])

In [28]:

# Greedily generate 10 more tokens from the sample text. The context size is
# read from the model config rather than hard-coded, so it stays in sync with
# the model that was actually built.
out_ids = generate_text_simple(
    model=model,
    idx=enc_txt_batch,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
# Drop only the batch axis before decoding the ids back to text.
tokenizer.decode(out_ids.squeeze(0).tolist())

'some sample text potentiallyython Heaven▄ ratePlace Face corrections blame Additionally'