In [1]:
from letter_tokenizer import tokenize, detokenize, letters

# Toy sentence reused by later cells as both the prompt source and the
# training text.
text_example = "ali ata bak. ali ata bak."
# Encode the sentence into a list of integer token ids (the printed list has
# one id per character of the sentence).
tokens = tokenize(text_example)
print(tokens)

# Decode the ids back to text; the printed result should equal the input,
# which confirms that tokenize/detokenize round-trip this sentence.
text = detokenize(tokens)
print(text)

[0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30, 29, 0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30]
ali ata bak. ali ata bak.


In [2]:
from gpt_config import GPTConfig

# Tiny model configuration used by every later cell.
test_config = GPTConfig(
    vocab_size=32,  # number of token ids; the plotting cell labels 32 embedding rows
    n_layer=1,  # a single GPT block
    n_head=1,  # a single attention head
    n_embd=3,  # 3-dimensional embeddings, so the table can be drawn as a 3D scatter plot
    seq_len=12,  # fixed input length: inference pads to it, get_dataset cuts blocks of it
)

print(test_config.vocab_size)


32


In [3]:
import torch

# Choose the compute backend in order of preference: CUDA first, then Apple's
# MPS backend, and plain CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

print(device)


mps


In [4]:
from gpt_model import GPTModel

# Fix the RNG seed before building the model so repeated runs start from the
# same state.
torch.manual_seed(42)
model = GPTModel(test_config, device)

# Total number of scalar entries across all parameter tensors of the model.
parameters_count = sum(param.numel() for param in model.parameters())
print(parameters_count)

# Bare last expression: the notebook renders the module tree as the cell output.
model

248


GPTModel(
  (token_embedding): Embedding(32, 3)
  (blocks): Sequential(
    (0): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0): CausalSelfAttention()
        )
        (projection): Linear(in_features=3, out_features=3, bias=True)
      )
      (ln1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
    )
  )
  (ln_f): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=3, out_features=32, bias=True)
)

In [5]:
# Snapshot of the token-embedding table as a NumPy array on the host
# (one row per token id, one column per embedding dimension).
# `.detach()` is used instead of `.data`: it yields the same values without
# autograd history, and is the supported way to step outside the graph.
weights = model.token_embedding.weight.detach().cpu().numpy()
weights

array([[ 1.926915  ,  1.4872842 ,  0.9007172 ],
       [-2.1055214 ,  0.67841846, -1.234545  ],
       [-0.04306748, -1.604667  , -0.7521362 ],
       [ 1.6487229 , -0.39247864, -1.4036068 ],
       [-0.72788125, -0.5594299 , -0.76883894],
       [ 0.7624454 ,  1.6423169 , -0.15959732],
       [-0.49739748,  0.43958923, -0.75813115],
       [ 1.0783176 ,  0.8008005 ,  1.6806206 ],
       [ 1.2791244 ,  1.2964228 ,  0.6104665 ],
       [ 1.3347378 , -0.23162432,  0.04175949],
       [-0.25157526,  0.8598585 , -1.3846742 ],
       [-0.87123615, -0.22336593,  1.7173611 ],
       [ 0.31887972, -0.42451897,  0.30572033],
       [-0.7745925 , -1.5575722 ,  0.9956361 ],
       [-0.87978584, -0.60114294, -1.2741514 ],
       [ 2.122785  , -1.2346534 , -0.48791388],
       [-0.913823  , -0.65813726,  0.07802387],
       [ 0.52580875, -0.48799172,  1.1913692 ],
       [-0.81400764, -0.73599285, -1.4032478 ],
       [ 0.03600382, -0.06347727,  0.6756149 ],
       [-0.09780689,  1.844594  , -1.184

In [6]:
# NOTE(review): this cell repeats the previous cell verbatim (same statement,
# same output). It looks like a leftover from re-running; consider removing it.
# Copies the token-embedding table to the host as a NumPy array and displays it.
weights = model.token_embedding.weight.data.cpu().numpy()
weights

array([[ 1.926915  ,  1.4872842 ,  0.9007172 ],
       [-2.1055214 ,  0.67841846, -1.234545  ],
       [-0.04306748, -1.604667  , -0.7521362 ],
       [ 1.6487229 , -0.39247864, -1.4036068 ],
       [-0.72788125, -0.5594299 , -0.76883894],
       [ 0.7624454 ,  1.6423169 , -0.15959732],
       [-0.49739748,  0.43958923, -0.75813115],
       [ 1.0783176 ,  0.8008005 ,  1.6806206 ],
       [ 1.2791244 ,  1.2964228 ,  0.6104665 ],
       [ 1.3347378 , -0.23162432,  0.04175949],
       [-0.25157526,  0.8598585 , -1.3846742 ],
       [-0.87123615, -0.22336593,  1.7173611 ],
       [ 0.31887972, -0.42451897,  0.30572033],
       [-0.7745925 , -1.5575722 ,  0.9956361 ],
       [-0.87978584, -0.60114294, -1.2741514 ],
       [ 2.122785  , -1.2346534 , -0.48791388],
       [-0.913823  , -0.65813726,  0.07802387],
       [ 0.52580875, -0.48799172,  1.1913692 ],
       [-0.81400764, -0.73599285, -1.4032478 ],
       [ 0.03600382, -0.06347727,  0.6756149 ],
       [-0.09780689,  1.844594  , -1.184

In [7]:
# Create an interactive visualization using plotly
import plotly.graph_objects as go
import numpy as np
import plotly.offline

# Get the weights from the token embedding layer as a host-side NumPy array
# (one row per token id). `.detach()` is the supported replacement for `.data`.
weights = model.token_embedding.weight.detach().cpu().numpy()

# One label per embedding row. The count is taken from the table itself rather
# than a hard-coded 32, so the plot stays correct if vocab_size changes.
labels = [letters[i] for i in range(weights.shape[0])]

# Create the data for the 3D scatter plot.
# The three embedding dimensions are used directly as x/y/z coordinates, so
# this cell requires an embedding width of at least 3 (n_embd=3 in the config).
data = [go.Scatter3d(
    x=weights[:, 0],
    y=weights[:, 1],
    z=weights[:, 2],
    mode='markers+text',
    marker=dict(
        size=8,
        color='blue',
    ),
    text=labels,
    hoverinfo='text'
)]

# Create the layout
layout = go.Layout(
    scene = dict(
        xaxis_title='X-axis',
        yaxis_title='Y-axis',
        zaxis_title='Z-axis'
    ),
    width=800,
    height=800,
    showlegend=False,
    title="Embedding Uzayı"
)


# Create and render the plot
plot_figure = go.Figure(data=data, layout=layout)
plotly.offline.iplot(plot_figure)


In [8]:
def inference(prompt, max_new_tokens):
    """Greedily extend ``prompt`` by ``max_new_tokens`` tokens and return the text.

    Each step feeds the model a window of exactly ``test_config.seq_len`` token
    ids, reads the logits at the position of the last real token, and appends
    the arg-max token id to the sequence.

    Args:
        prompt: Text to continue; it is converted to token ids with ``tokenize``.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The detokenized string for the prompt tokens followed by the generated
        tokens.

    Raises:
        ValueError: If the prompt tokenizes to an empty sequence, since there
            would be no position to predict from.
    """
    tokens = tokenize(prompt)
    if not tokens:
        raise ValueError("prompt must contain at least one token")

    seq_len = test_config.seq_len

    # Generate in eval mode and restore the previous mode afterwards, because
    # this function is also called from inside the training loop.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for generation; skipping them avoids
        # building an autograd graph on every forward pass.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Keep only the most recent seq_len tokens so the input never
                # grows past the fixed window the model is fed.
                context = tokens[-seq_len:]
                last_pos = len(context) - 1
                # Right-pad with id 0 up to seq_len. The padding sits after the
                # position that is read; the attention module is named
                # CausalSelfAttention, so it presumably cannot influence that
                # position - confirm against the model code.
                padded = context + [0] * (seq_len - len(context))
                batch = torch.tensor(padded).unsqueeze(0).to(device)
                logits = model(batch)
                predicted_token = torch.argmax(logits[0, last_pos, :]).item()
                tokens.append(predicted_token)
    finally:
        model.train(was_training)

    return detokenize(tokens)

# Reference text: the first seq_len characters of the toy sentence.
print("Original: ", text_example[:test_config.seq_len])
# Ask the model for one token after the first three characters. The result is
# kept in a global because the training cell prints it in its progress log.
row_model_prediction = inference(text_example[:3], max_new_tokens=1)
print("Predicted:", row_model_prediction)

Original:  ali ata bak.
Predicted: aliy


In [9]:
# Read the text corpus from disk (presumably Turkish text, judging by the
# file name). NOTE(review): while the assignment below stays commented out,
# `tr_texts` is loaded but not used by the cells shown here, yet the file
# must still exist for this cell to run.
with open("tr_texts_400.txt", "r", encoding="utf-8") as file:
    tr_texts = file.read()

# Uncomment to tokenize (and train on) the corpus instead of the toy sentence.
# text_example = tr_texts

# Token ids of the active text; get_dataset() reads this module-level name.
tokenized_text = tokenize(text_example)

def get_dataset(num_examples, context_window_length, test_split=0.1,
                tokens=None, target_device=None):
    """Build next-token-prediction examples and split them into train/test.

    The token stream is cut into consecutive, non-overlapping blocks of
    ``context_window_length + 1`` tokens. For each block the input is the
    first ``context_window_length`` tokens and the target is the same block
    shifted left by one token. A trailing block that is too short is dropped.

    Args:
        num_examples: Maximum number of examples to build. Fewer are returned
            when the token stream is too short.
        context_window_length: Number of tokens in each input/target row.
        test_split: Fraction (0..1) of the built examples placed in the test
            set; the test rows are taken from the end.
        tokens: Sequence of token ids to slice. Defaults to the notebook-level
            ``tokenized_text``.
        target_device: Device the tensors are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        ``(train_inputs, train_targets, test_inputs, test_targets)`` as
        ``torch.long`` tensors with ``context_window_length`` columns. Any of
        them may have zero rows.

    Raises:
        ValueError: If ``test_split`` is outside ``[0, 1]``.
    """
    if not 0 <= test_split <= 1:
        raise ValueError(f"test_split must be in [0, 1], got {test_split}")
    if tokens is None:
        tokens = tokenized_text
    if target_device is None:
        target_device = device

    block_len = context_window_length + 1
    input_blocks = []   # rows of input token ids
    target_blocks = []  # rows of target token ids (inputs shifted by one)

    # Walk the stream in non-overlapping steps of one full block.
    for start in range(0, len(tokens), block_len):
        block = tokens[start:start + block_len]

        # Skip blocks that are too short (only the tail of the stream can be).
        if len(block) < block_len:
            continue

        input_blocks.append(block[:-1])
        target_blocks.append(block[1:])

        # Stop if we have enough examples
        if len(input_blocks) >= num_examples:
            break

    # Explicit 2-D shape so an empty result is (0, context_window_length)
    # instead of a 1-D empty tensor.
    shape = (len(input_blocks), context_window_length)
    inputs = torch.tensor(input_blocks, dtype=torch.long).reshape(shape).to(target_device)
    targets = torch.tensor(target_blocks, dtype=torch.long).reshape(shape).to(target_device)

    # Split on the number of examples actually built. Using the requested
    # num_examples would push the split point past the end whenever the text
    # is short, silently leaving the test set empty.
    split_idx = int(len(input_blocks) * (1 - test_split))

    train_inputs = inputs[:split_idx]
    train_targets = targets[:split_idx]
    test_inputs = inputs[split_idx:]
    test_targets = targets[split_idx:]
    return train_inputs, train_targets, test_inputs, test_targets

# Preview the dataset: request up to 4 examples of seq_len tokens with no test
# split. With the 25-token toy sentence only one full block of seq_len + 1
# tokens fits, so a single example comes back (see the printed shapes).
i, o, _, _ = get_dataset(4, test_config.seq_len, 0)
print("Input Shape", i.shape)
print("Output Shape", o.shape)
# Each target row is its input row shifted left by one token.
print("Input Example:")
print(i)
print("Output Example:")
print(o)

Input Shape torch.Size([1, 12])
Output Shape torch.Size([1, 12])
Input Example:
tensor([[ 0, 14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30]], device='mps:0')
Output Example:
tensor([[14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30, 29]], device='mps:0')


In [13]:
import torch.nn.functional as F

batch_size = 1
num_steps = 2000
log_every = 150  # print a progress line once per this many optimisation steps

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

# Define Scheduler: multiply the learning rate by `factor` when the loss it is
# given has not improved for `patience` consecutive scheduler steps. It is
# stepped after every batch below, so patience is counted in batches.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.2, patience=20, min_lr=5e-6, threshold=1e-4
)

# Per-step training loss, in order.
losses = []

train_inputs, train_targets, _, _ = get_dataset(100, test_config.seq_len, 0)
if len(train_inputs) == 0:
    # Without this guard the while-loop below would never advance.
    raise ValueError(
        "get_dataset returned no training examples; the text is shorter than seq_len + 1 tokens"
    )

# Training loop: exactly `num_steps` optimiser updates, cycling over the
# training batches as many times as needed.
model.train()
step = 0
while step < num_steps:
    for start in range(0, len(train_inputs), batch_size):
        x = train_inputs[start:start + batch_size]
        y = train_targets[start:start + batch_size]

        # Clear gradients first so nothing left over from an earlier cell or
        # an interrupted run leaks into this update.
        optimizer.zero_grad()

        # Forward pass: flatten (batch, position) so every position is one
        # classification example for cross-entropy.
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)
        scheduler.step(loss_value)
        step += 1

        # Print the loss of the current step together with a sample generation.
        if step % log_every == 1:
            lr = optimizer.param_groups[0]["lr"]
            print(f"Step {step}/{num_steps}\t\tLoss: {loss_value:.6f}\t\tLR: {lr}")
            print(f"Original: {text_example[:test_config.seq_len]}\tPredicted: {inference(text_example[0], max_new_tokens=test_config.seq_len)}\tRow: {row_model_prediction}")

        # Stop mid-epoch once the step budget is used up.
        if step >= num_steps:
            break


Step 2/2000		Loss: 0.499055		LR: 0.005
Original: ali ata bak.	Predicted: ali k lltt l 	Row: aliy
Step 152/2000		Loss: 0.448248		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 302/2000		Loss: 0.400409		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 452/2000		Loss: 0.350008		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 602/2000		Loss: 0.310131		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 752/2000		Loss: 0.282662		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 902/2000		Loss: 0.260312		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    li	Row: aliy
Step 1052/2000		Loss: 0.246964		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    ll	Row: aliy
Step 1202/2000		Loss: 0.226912		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    ll	Row: aliy
Step 1352/2000		Loss: 0.208438		LR: 0.001
Original: ali ata bak.	Predicted: ali ata    ll	Row: aliy
Step 1502

In [14]:
# Generate 10 tokens after the prompt "ali" with the current model weights;
# the bare expression makes the notebook display the returned string.
inference("ali", max_new_tokens=10)

'ali ata bak. '