In [1]:
from letter_tokenizer import tokenize, detokenize

text_example = "ali ata bak. ali ata bak."
tokens = tokenize(text_example)
print(tokens)

text = detokenize(tokens)
print(text)

[0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30, 29, 0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30]
ali ata bak. ali ata bak.


In [2]:
from gpt_config import GPTConfig

test_config = GPTConfig(
    vocab_size=32,
    n_layer=1,  
    n_head=1,
    n_embd=3,
    seq_len=12,
)

print(test_config.vocab_size)


32


In [3]:
import torch

device = 'cpu'

if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'

print(device)


mps


In [4]:
from gpt_model import GPTModel

torch.manual_seed(42)
model = GPTModel(test_config, device)

parameters_count = 0

for p in model.parameters():
    parameters_count += p.numel()

print(parameters_count)
model

341


GPTModel(
  (token_embedding): Embedding(32, 3)
  (blocks): Sequential(
    (0): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0): CausalSelfAttention()
        )
        (projection): Linear(in_features=3, out_features=3, bias=True)
      )
      (ln1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
      (ffn): Sequential(
        (0): Linear(in_features=3, out_features=12, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=12, out_features=3, bias=True)
      )
      (ln2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
    )
  )
  (ln_f): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=3, out_features=32, bias=True)
)

In [5]:
def inference(prompt, max_new_tokens):
    tokens = tokenize(prompt)
    for _ in range(max_new_tokens):
        num_tokens = len(tokens)
        tokens_padded = tokens + [0] * (test_config.seq_len - num_tokens)
        tokens_padded = torch.tensor(tokens_padded).unsqueeze(0).to(device)
        logits = model(tokens_padded)
        predicted_token = torch.argmax(logits[0, num_tokens-1, :]).item()
        tokens.append(predicted_token)
    return detokenize(tokens)

print("Original: ", text_example[:test_config.seq_len])
row_model_prediction = inference(text_example[0], max_new_tokens=test_config.seq_len)
print("Predicted:", row_model_prediction)

Original:  ali ata bak.
Predicted: ajjjjfffjjjff


In [6]:
with open("tr_texts_400.txt", "r", encoding="utf-8") as file:
    tr_texts = file.read()

# text_example = tr_texts

tokenized_text = tokenize(text_example)

def get_dataset(num_examples, context_window_length, test_split=0.1):
    input_blocks = [] # List to store input sequences
    target_blocks = [] # List to store target sequences

    # Use a sliding window to create input/target sequences
    for i in range(0, len(tokenized_text), context_window_length + 1):
        block = tokenized_text[i:i+context_window_length+ 1]
        
        # Skip blocks that are too short
        if len(block) < context_window_length + 1:
            continue

        input_seq = block[:-1]  
        target_seq = block[1:]  

        input_blocks.append(input_seq)
        target_blocks.append(target_seq)
        
        # Stop if we have enough examples
        if len(input_blocks) >= num_examples:
            break

    # Convert to tensors for pytorch and move to gpu
    inputs = torch.tensor(input_blocks, dtype=torch.long).to(device)
    targets = torch.tensor(target_blocks, dtype=torch.long).to(device)

    # Calculate train/test split point
    split_idx = int(num_examples * (1 - test_split))

    # Split into train/test
    train_inputs = inputs[:split_idx]
    train_targets = targets[:split_idx]
    test_inputs = inputs[split_idx:]
    test_targets = targets[split_idx:]
    return train_inputs, train_targets, test_inputs, test_targets

# Get a small dataset
i, o, _, _ = get_dataset(4, test_config.seq_len, 0)
print("Input Shape", i.shape)
print("Output Shape", o.shape)
print("Input Example:")
print(i)
print("Output Example:")
print(o)

Input Shape torch.Size([1, 12])
Output Shape torch.Size([1, 12])
Input Example:
tensor([[ 0, 14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30]], device='mps:0')
Output Example:
tensor([[14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30, 29]], device='mps:0')


In [7]:
import torch.nn.functional as F

batch_size = 1
num_steps = 2000

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

# Define Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=20, min_lr=5e-6, threshold=1e-4)

# Training loop
i = 1
losses = []

train_inputs, train_targets, _, _ = get_dataset(100, test_config.seq_len, 0)

while i < num_steps:
    for j in range(0, len(train_inputs), batch_size):
        x = train_inputs[j:j+batch_size]
        y = train_targets[j:j+batch_size]

        # Forward pass
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        losses.append(loss.item())
        
        optimizer.step()
        optimizer.zero_grad()
    

        loss = loss.item()
        scheduler.step(loss)

   
        # Print the average loss for the epoch
        lr = optimizer.param_groups[0]["lr"]
        if i % 150 == 1:
            print(f"Step {i+1}/{num_steps}\t\tLoss: {loss:.6f}\t\tLR: {lr}")
            print(f"Original: {text_example[:test_config.seq_len]}\tPredicted: {inference(text_example[0], max_new_tokens=test_config.seq_len)}\tRow: {row_model_prediction}")

        i += 1


Step 2/2000		Loss: 3.790171		LR: 0.0005
Original: ali ata bak.	Predicted: ajjjjfffjjjff	Row: ajjjjfffjjjff
Step 152/2000		Loss: 3.199797		LR: 0.0005
Original: ali ata bak.	Predicted: affjuffffffff	Row: ajjjjfffjjjff
Step 302/2000		Loss: 2.604598		LR: 0.0005
Original: ali ata bak.	Predicted: alfkultllk.tl	Row: ajjjjfffjjjff
Step 452/2000		Loss: 2.063809		LR: 0.0005
Original: ali ata bak.	Predicted: a  khit    it	Row: ajjjjfffjjjff
Step 602/2000		Loss: 1.697719		LR: 0.0005
Original: ali ata bak.	Predicted: a i      i i 	Row: ajjjjfffjjjff
Step 752/2000		Loss: 1.425092		LR: 0.0005
Original: ali ata bak.	Predicted: a i     tk.  	Row: ajjjjfffjjjff
Step 902/2000		Loss: 1.281379		LR: 0.0005
Original: ali ata bak.	Predicted: aai  aa tk. a	Row: ajjjjfffjjjff
Step 1052/2000		Loss: 1.130798		LR: 0.0005
Original: ali ata bak.	Predicted: aaiaka  tk. a	Row: ajjjjfffjjjff
Step 1202/2000		Loss: 0.994100		LR: 0.0005
Original: ali ata bak.	Predicted: ali ata tk. a	Row: ajjjjfffjjjff
Step 1352/2000		Los