In [38]:
from letter_tokenizer import tokenize, detokenize, letters

text_example = "ali ata bak. ali ata bak."
tokens = tokenize(text_example)
print(tokens)

text = detokenize(tokens)
print(text)
print(letters)

[0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30, 29, 0, 14, 11, 29, 0, 23, 0, 29, 1, 0, 13, 30]
ali ata bak. ali ata bak.
['a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', ' ', '.', ',']


In [39]:
embedding = {
  "a": [0.1, 0.2, 0.3],
  "b": [0.4, 0.5, 0.6],
  "c": [0.7, 0.8, 0.9]
}

embedding["b"]

[0.4, 0.5, 0.6]

In [40]:
from gpt_config import GPTConfig

test_config = GPTConfig(
    vocab_size=32,
    n_layer=1,
    n_head=1,
    n_embd=3,
    seq_len=12,
)

print(test_config.vocab_size)


32


In [41]:
import torch

# Pick the compute device: CUDA when available, otherwise Apple's MPS backend,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

print(device)


mps


In [42]:
from gpt_model import GPTModel

# Seed the global RNG before building the model so that any random
# initialisation done inside GPTModel is repeatable between runs.
torch.manual_seed(42)
model = GPTModel(test_config, device)

# Total number of scalar values across all parameter tensors of the model.
parameters_count = sum(param.numel() for param in model.parameters())

print(parameters_count)
model

248


GPTModel(
  (token_embedding): Embedding(32, 3)
  (blocks): Sequential(
    (0): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0): CausalSelfAttention()
        )
        (projection): Linear(in_features=3, out_features=3, bias=True)
      )
      (ln1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
    )
  )
  (ln_f): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=3, out_features=32, bias=True)
)

In [43]:
# Snapshot the embedding table before training.
# `.clone()` makes the snapshot independent of the live parameter: when the model
# lives on the CPU, `.numpy()` shares memory with the tensor, so without a copy
# the optimizer's later in-place updates would silently change this "untrained"
# array as well. `.detach()` replaces the discouraged `.data` access.
untrained_weights = model.token_embedding.weight.detach().clone().cpu().numpy()
untrained_weights

array([[ 1.926915  ,  1.4872842 ,  0.9007172 ],
       [-2.1055214 ,  0.67841846, -1.234545  ],
       [-0.04306748, -1.604667  , -0.7521362 ],
       [ 1.6487229 , -0.39247864, -1.4036068 ],
       [-0.72788125, -0.5594299 , -0.76883894],
       [ 0.7624454 ,  1.6423169 , -0.15959732],
       [-0.49739748,  0.43958923, -0.75813115],
       [ 1.0783176 ,  0.8008005 ,  1.6806206 ],
       [ 1.2791244 ,  1.2964228 ,  0.6104665 ],
       [ 1.3347378 , -0.23162432,  0.04175949],
       [-0.25157526,  0.8598585 , -1.3846742 ],
       [-0.87123615, -0.22336593,  1.7173611 ],
       [ 0.31887972, -0.42451897,  0.30572033],
       [-0.7745925 , -1.5575722 ,  0.9956361 ],
       [-0.87978584, -0.60114294, -1.2741514 ],
       [ 2.122785  , -1.2346534 , -0.48791388],
       [-0.913823  , -0.65813726,  0.07802387],
       [ 0.52580875, -0.48799172,  1.1913692 ],
       [-0.81400764, -0.73599285, -1.4032478 ],
       [ 0.03600382, -0.06347727,  0.6756149 ],
       [-0.09780689,  1.844594  , -1.184

In [44]:
# Snapshot of the embedding table at the moment this cell is executed.
# NOTE(review): the output recorded below is identical to `untrained_weights`, so
# this cell appears to have been run before the training cell (In [69]). Re-run
# it after training for the array to actually hold trained values.
# `.clone()` keeps the snapshot independent of the live parameter (on the CPU,
# `.numpy()` would otherwise share memory with it); `.detach()` replaces `.data`.
trained_weights = model.token_embedding.weight.detach().clone().cpu().numpy()
trained_weights

array([[ 1.926915  ,  1.4872842 ,  0.9007172 ],
       [-2.1055214 ,  0.67841846, -1.234545  ],
       [-0.04306748, -1.604667  , -0.7521362 ],
       [ 1.6487229 , -0.39247864, -1.4036068 ],
       [-0.72788125, -0.5594299 , -0.76883894],
       [ 0.7624454 ,  1.6423169 , -0.15959732],
       [-0.49739748,  0.43958923, -0.75813115],
       [ 1.0783176 ,  0.8008005 ,  1.6806206 ],
       [ 1.2791244 ,  1.2964228 ,  0.6104665 ],
       [ 1.3347378 , -0.23162432,  0.04175949],
       [-0.25157526,  0.8598585 , -1.3846742 ],
       [-0.87123615, -0.22336593,  1.7173611 ],
       [ 0.31887972, -0.42451897,  0.30572033],
       [-0.7745925 , -1.5575722 ,  0.9956361 ],
       [-0.87978584, -0.60114294, -1.2741514 ],
       [ 2.122785  , -1.2346534 , -0.48791388],
       [-0.913823  , -0.65813726,  0.07802387],
       [ 0.52580875, -0.48799172,  1.1913692 ],
       [-0.81400764, -0.73599285, -1.4032478 ],
       [ 0.03600382, -0.06347727,  0.6756149 ],
       [-0.09780689,  1.844594  , -1.184

In [45]:
# Create an interactive visualization using plotly
import plotly.graph_objects as go
import plotly.offline

In [46]:
def plot_dots(dots_data, title, axis_titles=('Meyve', 'Teknoloji', 'Diğer')):
  """Render one or more labelled 3-D point clouds in a single plotly figure.

  Args:
    dots_data: iterable of dicts, one per point cloud. Each dict provides
      "dots" (a 2-D array whose columns 0, 1 and 2 are used as x, y and z),
      "color" (marker colour) and "labels" (text shown next to / on hover of
      each point).
    title: title of the figure.
    axis_titles: (x, y, z) axis titles. The default reproduces the titles that
      used to be hard-coded, so existing two-argument calls render unchanged.

  Returns:
    None. The figure is displayed through `plotly.offline.iplot`.
  """
  x_title, y_title, z_title = axis_titles

  # One Scatter3d trace per point cloud so each keeps its own colour and labels.
  data = [
      go.Scatter3d(
          x=dot_data["dots"][:, 0],
          y=dot_data["dots"][:, 1],
          z=dot_data["dots"][:, 2],
          mode='markers+text',
          marker=dict(
              size=8,
              color=dot_data["color"],
          ),
          text=dot_data["labels"],
          hoverinfo='text'
      ) for dot_data in dots_data
  ]
  layout = go.Layout(
    scene=dict(
      xaxis_title=x_title,
      yaxis_title=y_title,
      zaxis_title=z_title
    ),
    width=800,
    height=800,
    showlegend=False,
    title=title
  )
  plot_figure = go.Figure(data=data, layout=layout)
  plotly.offline.iplot(plot_figure)
  

In [47]:
# Compare the two embedding snapshots in one figure (blue vs. red).
# The number of labels is taken from the plotted array itself instead of a
# hard-coded 32, so every row always gets exactly one label even if the
# vocabulary size changes.
dots_data = [
  {
    "dots": untrained_weights,
    "color": "blue",
    "labels": [letters[i] for i in range(len(untrained_weights))]
  },
  {
    "dots": trained_weights,
    "color": "red",
    "labels": [letters[i] for i in range(len(trained_weights))]
  }
]

plot_dots(dots_data, "Embedding Uzayı yani sözlük uzayı")


In [48]:
# Tokenize a short prompt and right-pad it to the configured sequence length.
prompt = "ali"
tokens = tokenize(prompt)
num_tokens = len(tokens)
# 19 is the index of 'p' in `letters`; it is used here as a filler token so the
# list always has `test_config.seq_len` entries.
tokens_padded = tokens + [19] * (test_config.seq_len - num_tokens)
# Display both the raw and the padded token lists.
tokens, tokens_padded

([0, 14, 11], [0, 14, 11, 19, 19, 19, 19, 19, 19, 19, 19, 19])

In [49]:
cumle = model.token_embedding(torch.tensor([tokens_padded]).to(device))
cumle.shape

torch.Size([1, 12, 3])

In [50]:
cumle[0]

tensor([[ 1.9269,  1.4873,  0.9007],
        [-0.8798, -0.6011, -1.2742],
        [-0.8712, -0.2234,  1.7174],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756],
        [ 0.0360, -0.0635,  0.6756]], device='mps:0',
       grad_fn=<SelectBackward0>)

In [51]:
[letters[i] for i in tokens_padded]

['a', 'l', 'i', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p']

In [52]:

# Plot the token-embedding vectors of the padded prompt (before any position
# encoding is added), labelling each point with its letter.
dots_data = [
  dict(
    dots=cumle[0].detach().cpu().numpy(),
    color="red",
    labels=[letters[token_id] for token_id in tokens_padded],
  )
]

plot_dots(dots_data, "Attention Uzayı yani bağlam uzayı")

In [53]:
from gpt_model import get_position_encoding

position_encoding = get_position_encoding(test_config.seq_len, test_config.n_embd, device=device)
positioned_cumle = cumle + position_encoding
positioned_cumle


tensor([[[ 1.9269,  2.4873,  0.9007],
         [-0.0383, -0.0608, -1.2742],
         [ 0.0381, -0.6395,  1.7174],
         [ 0.1771, -1.0535,  0.6756],
         [-0.7208, -0.7171,  0.6756],
         [-0.9229,  0.2202,  0.6756],
         [-0.2434,  0.8967,  0.6756],
         [ 0.6930,  0.6904,  0.6756],
         [ 1.0254, -0.2090,  0.6756],
         [ 0.4481, -0.9746,  0.6756],
         [-0.5080, -0.9025,  0.6756],
         [-0.9640, -0.0591,  0.6756]]], device='mps:0', grad_fn=<AddBackward0>)

In [54]:
# Overlay the prompt vectors with (red) and without (blue) the position
# encoding; both series carry the same per-token letter labels.
dots_data = [
  {
    "dots": vectors[0].detach().cpu().numpy(),
    "color": color,
    "labels": [letters[token_id] for token_id in tokens_padded]
  }
  for vectors, color in (
    (positioned_cumle, "red"),
    (cumle, "blue"),
  )
]

plot_dots(dots_data, "Position Encoding")

In [55]:
model.blocks[0].mha.attn_heads[0].Wq

tensor([[-0.6890, -1.1267, -0.2858],
        [-1.0935,  1.1351,  0.7592],
        [-3.5945,  0.0192,  0.1052]], device='mps:0',
       grad_fn=<ToCopyBackward0>)

In [56]:
attention_output = model.blocks[0].mha.attn_heads[0](positioned_cumle)
attention_output


tensor([[[-4.5114, -8.5372, -2.8416],
         [-4.5114, -8.5372, -2.8416],
         [ 0.2631, -0.2310,  0.1570],
         [ 0.2031,  0.8307,  0.4115],
         [ 0.4714,  1.0272,  0.4278],
         [ 0.5161,  0.6239,  0.3162],
         [ 0.4467,  0.3796,  0.2770],
         [ 0.4284,  0.4039,  0.2883],
         [ 0.4290,  0.7376,  0.3654],
         [ 0.2090,  1.1697,  0.4555],
         [ 0.3690,  1.2992,  0.4493],
         [ 0.6383,  1.1594,  0.4113]]], device='mps:0',
       grad_fn=<UnsafeViewBackward0>)

In [57]:
# Plot the output of the single attention head for the padded prompt.
dots_data = [
  dict(
    dots=attention_output[0].detach().cpu().numpy(),
    color="red",
    labels=[letters[token_id] for token_id in tokens_padded],
  )
]

plot_dots(dots_data, "Attention")

In [58]:
l_normalized = model.ln_f(attention_output)
l_normalized

tensor([[[ 0.3285, -1.3555,  1.0270],
         [ 0.3285, -1.3555,  1.0270],
         [ 0.9419, -1.3843,  0.4424],
         [-1.0675,  1.3369, -0.2694],
         [-0.6255,  1.4111, -0.7856],
         [ 0.2405,  1.0863, -1.3268],
         [ 1.1298,  0.1696, -1.2994],
         [ 0.8970,  0.4962, -1.3932],
         [-0.5021,  1.3958, -0.8936],
         [-0.9878,  1.3703, -0.3825],
         [-0.8004,  1.4099, -0.6095],
         [-0.3130,  1.3508, -1.0378]]], device='mps:0',
       grad_fn=<NativeLayerNormBackward0>)

In [59]:
# Plot the attention output after it has been passed through `model.ln_f`.
dots_data = [
  dict(
    dots=l_normalized[0].detach().cpu().numpy(),
    color="red",
    labels=[letters[token_id] for token_id in tokens_padded],
  )
]

plot_dots(dots_data, "Normalized")

In [60]:
l_projected = model.blocks[0].mha.projection(l_normalized)
l_projected

tensor([[[ 1.2319, -0.2748,  0.5149],
         [ 1.2319, -0.2748,  0.5149],
         [ 0.8982, -0.6623,  0.3322],
         [-0.5250,  0.8845, -1.2678],
         [-0.8565,  0.6154, -1.4789],
         [-1.0555,  0.0384, -1.4990],
         [-0.7013, -0.6158, -1.0317],
         [-0.8770, -0.4354, -1.2265],
         [-0.9145,  0.5364, -1.5076],
         [-0.6040,  0.8381, -1.3226],
         [-0.7523,  0.7248, -1.4189],
         [-0.9828,  0.4131, -1.5338]]], device='mps:0',
       grad_fn=<LinearBackward0>)

In [61]:
# Overlay every intermediate stage computed above for the padded prompt, one
# colour per stage: projected (red), normalized (blue), position-encoded
# (green) and raw embedding (yellow).
dots_data = [
  {
    "dots": stage[0].detach().cpu().numpy(),
    "color": color,
    "labels": [letters[token_id] for token_id in tokens_padded]
  }
  for stage, color in (
    (l_projected, "red"),
    (l_normalized, "blue"),
    (positioned_cumle, "green"),
    (cumle, "yellow"),
  )
]

plot_dots(dots_data, "Projected")

In [62]:
logits = model(torch.tensor([tokens_padded]).to(device))


In [63]:
logits.shape

torch.Size([1, 12, 32])

In [64]:
letters[torch.argmax(logits[0, 2, :]).item()]

'y'

In [65]:
num_tokens-1, torch.argmax(logits[0, 11, :]).item()

(2, 5)

In [66]:
l1 = torch.nn.Linear(3, 3)
p1 = torch.nn.Parameter(torch.randn(3, 3))
l1.weight, p1

(Parameter containing:
 tensor([[-0.5214, -0.5618,  0.2146],
         [-0.3170, -0.3712, -0.0450],
         [-0.1923, -0.1868,  0.0186]], requires_grad=True),
 Parameter containing:
 tensor([[-0.0909, -0.0337, -1.0090],
         [-1.2315, -1.0470, -1.7461],
         [-0.3742, -0.4117, -0.8997]], requires_grad=True))

In [67]:
def inference(prompt, max_new_tokens):
    """Greedily extend `prompt` by `max_new_tokens` tokens and return the text.

    At every step the most recent tokens (at most `test_config.seq_len` of them)
    are right-padded with a filler token, run through the global `model`, and
    the argmax of the logits at the position of the last real token is appended.

    Args:
        prompt: Text to continue. Must tokenize to at least one token when
            `max_new_tokens` is positive.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The detokenized prompt followed by the generated tokens.

    Raises:
        ValueError: If tokens must be generated but the prompt is empty; there
            would be no "last real token" position to read a prediction from.
    """
    pad_token = 19  # index of 'p' in `letters`; the filler the notebook pads with
    seq_len = test_config.seq_len

    # Copy so that appending below never mutates a list owned by the tokenizer.
    tokens = list(tokenize(prompt))
    if max_new_tokens > 0 and not tokens:
        raise ValueError("prompt must contain at least one token")

    # No gradients are needed for generation; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Keep only the most recent `seq_len` tokens so the model input never
            # grows beyond the configured sequence length.
            context = tokens[-seq_len:]
            num_context = len(context)
            padded = context + [pad_token] * (seq_len - num_context)
            model_input = torch.tensor(padded).unsqueeze(0).to(device)
            logits = model(model_input)
            # The prediction for the next token sits at the last real position.
            predicted_token = torch.argmax(logits[0, num_context - 1, :]).item()
            tokens.append(predicted_token)
    return detokenize(tokens)

print("Original: ", text_example[:test_config.seq_len])
row_model_prediction = inference(text_example[:3], max_new_tokens=1)
print("Predicted:", row_model_prediction)

Original:  ali ata bak.
Predicted: aliy


In [68]:
# Read the whole text file into `tr_texts`.
with open("tr_texts_400.txt", "r", encoding="utf-8") as file:
    tr_texts = file.read()

# Uncomment the next line to train on the file contents instead of the short
# demo sentence. While it stays commented out, `tr_texts` is not used by this
# cell and the tokens below come from the `text_example` defined earlier.
# text_example = tr_texts

# Token ids that `get_dataset` slices into input/target examples.
tokenized_text = tokenize(text_example)

def get_dataset(num_examples, context_window_length, test_split=0.1, stride=None):
    """Cut the global `tokenized_text` into next-token-prediction examples.

    Each example is built from a block of `context_window_length + 1`
    consecutive tokens: the input is the block without its last token and the
    target is the block without its first token (the input shifted by one).

    Args:
        num_examples: Maximum number of examples to collect.
        context_window_length: Number of tokens in every input/target sequence.
        test_split: Fraction of the collected examples placed in the test set.
        stride: Distance between the starts of consecutive blocks. Defaults to
            `context_window_length + 1`, i.e. non-overlapping blocks (the
            original behaviour); pass 1 for a fully overlapping sliding window.

    Returns:
        `(train_inputs, train_targets, test_inputs, test_targets)` as
        `torch.long` tensors moved to the global `device`.
    """
    block_length = context_window_length + 1
    if stride is None:
        stride = block_length

    input_blocks = []   # input sequences
    target_blocks = []  # target sequences (inputs shifted left by one token)

    for start in range(0, len(tokenized_text), stride):
        block = tokenized_text[start:start + block_length]

        # Skip blocks that are too short (only possible near the end of the text)
        if len(block) < block_length:
            continue

        input_blocks.append(block[:-1])
        target_blocks.append(block[1:])

        # Stop if we have enough examples
        if len(input_blocks) >= num_examples:
            break

    # Convert to tensors for pytorch and move to the selected device
    inputs = torch.tensor(input_blocks, dtype=torch.long).to(device)
    targets = torch.tensor(target_blocks, dtype=torch.long).to(device)

    # Split on the number of examples actually collected. Using the requested
    # `num_examples` here would leave the test set empty (or mis-sized) whenever
    # the text is too short to yield that many examples.
    split_idx = int(len(input_blocks) * (1 - test_split))

    train_inputs = inputs[:split_idx]
    train_targets = targets[:split_idx]
    test_inputs = inputs[split_idx:]
    test_targets = targets[split_idx:]
    return train_inputs, train_targets, test_inputs, test_targets

# Get a small dataset
i, o, _, _ = get_dataset(4, test_config.seq_len, 0)
print("Input Shape", i.shape)
print("Output Shape", o.shape)
print("Input Example:")
print(i)
print("Output Example:")
print(o)

Input Shape torch.Size([1, 12])
Output Shape torch.Size([1, 12])
Input Example:
tensor([[ 0, 14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30]], device='mps:0')
Output Example:
tensor([[14, 11, 29,  0, 23,  0, 29,  1,  0, 13, 30, 29]], device='mps:0')


In [69]:
import torch.nn.functional as F

batch_size = 1
num_steps = 1000
log_every = 150  # progress is printed on step 1 and every `log_every` steps after

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-2)

# Define Scheduler: lowers the learning rate when the reported loss stops improving
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=20, min_lr=5e-6, threshold=1e-4)

losses = []  # per-step training loss, in order

train_inputs, train_targets, _, _ = get_dataset(100, test_config.seq_len, 0)

# The step counter only advances inside the batch loop, so an empty dataset
# would make the outer loop spin forever; fail loudly instead.
if len(train_inputs) == 0:
    raise ValueError("get_dataset returned no training examples")

# Training loop: exactly `num_steps` optimizer updates, cycling over the data.
# `i` is the 1-based index of the current step.
i = 0
while i < num_steps:
    for j in range(0, len(train_inputs), batch_size):
        # Stop mid-epoch as soon as the step budget is used up.
        if i >= num_steps:
            break
        i += 1

        x = train_inputs[j:j+batch_size]
        y = train_targets[j:j+batch_size]

        # Forward pass: flatten batch and sequence dimensions for the loss
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        optimizer.zero_grad()

        # Keep the tensor and its Python float under separate names.
        loss_value = loss.item()
        losses.append(loss_value)
        scheduler.step(loss_value)

        # Print the loss of the current step together with a sample generation
        lr = optimizer.param_groups[0]["lr"]
        if i % log_every == 1:
            print(f"Step {i}/{num_steps}\t\tLoss: {loss_value:.6f}\t\tLR: {lr}")
            print(f"Original: {text_example[:test_config.seq_len]}\tPredicted: {inference(text_example[0], max_new_tokens=test_config.seq_len)}\tRow: {row_model_prediction}")


Step 2/1000		Loss: 3.282363		LR: 0.05
Original: ali ata bak.	Predicted: akjk.kekek.ke	Row: aliy
Step 152/1000		Loss: 0.145941		LR: 0.05
Original: ali ata bak.	Predicted: ali ata bak. 	Row: aliy
Step 302/1000		Loss: 0.004908		LR: 0.05
Original: ali ata bak.	Predicted: ali ata bak. 	Row: aliy
Step 452/1000		Loss: 0.013486		LR: 0.05
Original: ali ata bak.	Predicted: ali atab ak. 	Row: aliy
Step 602/1000		Loss: 0.002301		LR: 5e-06
Original: ali ata bak.	Predicted: ali ata bak. 	Row: aliy
Step 752/1000		Loss: 0.002295		LR: 5e-06
Original: ali ata bak.	Predicted: ali ata bak. 	Row: aliy
Step 902/1000		Loss: 0.002288		LR: 5e-06
Original: ali ata bak.	Predicted: ali ata bak. 	Row: aliy


In [79]:
inference("ali ata ba", max_new_tokens=3)

'ali ata bak. '