In [1]:
from english_tokenizer import tokenize, detokenize, vocab_list, text, reverse_vocab_list

tokens = tokenize(text)
print(tokens)

text = detokenize(tokens)
print(text)
print(vocab_list)


Detokenized text:
'the capital of the united states is not london. the capital of france is paris, and berlin is the capital of germany. rome is in italy, madrid is in spain, and lisbon is in portugal. the capital of the united kingdom is not paris, and the capital of the united states is not berlin. although these places are often mentioned together, although these capitals are often mentioned together, although these are often mentioned together, each country has its own capital, and each country has its own city, and each capital has its own identity, and each capital has its own history. washington is the capital of the united states, and london is the capital of the united kingdom. paris is known for art and fashion, and berlin is known for art and history, and rome is known for art and history, and madrid is known for culture and history, and lisbon is known for culture and art. rome is rich with culture, rome is rich with history, rome is rich with art, and madrid is rich with 

In [2]:
from gpt_config import GPTConfig

test_config = GPTConfig(
    vocab_size=64,
    n_layer=6,
    n_head=4,
    n_embd=12,
    seq_len=32,
)

print(test_config.vocab_size)


64


In [3]:
import torch

device = 'cpu'

if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'

print(device)


mps


In [4]:
from gpt_model import GPTModel

torch.manual_seed(42)
model = GPTModel(test_config, device)

parameters_count = 0

for p in model.parameters():
    parameters_count += p.numel()

print(parameters_count)
model

5296


GPTModel(
  (token_embedding): Embedding(64, 12)
  (blocks): Sequential(
    (0): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0-3): 4 x CausalSelfAttention()
        )
        (projection): Linear(in_features=48, out_features=12, bias=True)
      )
      (ln1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
    )
    (1): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0-3): 4 x CausalSelfAttention()
        )
        (projection): Linear(in_features=48, out_features=12, bias=True)
      )
      (ln1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
    )
    (2): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0-3): 4 x CausalSelfAttention()
        )
        (projection): Linear(in_features=48, out_features=12, bias=True)
      )
      (ln1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
    )
    (3): GPTBlock(
      (mha): MultiHeadAttention(

In [5]:
untrained_weights = model.token_embedding.weight.data.cpu().numpy()
untrained_weights

array([[ 1.92691505e+00,  1.48728418e+00,  9.00717199e-01,
        -2.10552144e+00,  6.78418458e-01, -1.23454499e+00,
        -4.30674814e-02, -1.60466695e+00, -7.52136171e-01,
         1.64872289e+00, -3.92478645e-01, -1.40360677e+00],
       [-7.27881253e-01, -5.59429884e-01, -7.68838942e-01,
         7.62445390e-01,  1.64231694e+00, -1.59597322e-01,
        -4.97397482e-01,  4.39589232e-01, -7.58131146e-01,
         1.07831764e+00,  8.00800502e-01,  1.68062055e+00],
       [ 1.27912438e+00,  1.29642284e+00,  6.10466480e-01,
         1.33473778e+00, -2.31624320e-01,  4.17594910e-02,
        -2.51575261e-01,  8.59858513e-01, -1.38467419e+00,
        -8.71236145e-01, -2.23365933e-01,  1.71736109e+00],
       [ 3.18879724e-01, -4.24518973e-01,  3.05720329e-01,
        -7.74592519e-01, -1.55757225e+00,  9.95636106e-01,
        -8.79785836e-01, -6.01142943e-01, -1.27415144e+00,
         2.12278509e+00, -1.23465335e+00, -4.87913877e-01],
       [-9.13823009e-01, -6.58137262e-01,  7.8023873

In [35]:
trained_weights = model.token_embedding.weight.data.cpu().numpy()
trained_weights

array([[ 1.96563876e+00,  1.45459330e+00,  9.25036430e-01,
        -2.10686803e+00,  6.64693117e-01, -1.24020886e+00,
        -3.70992906e-02, -1.62299430e+00, -7.73678422e-01,
         1.59684420e+00, -3.80364358e-01, -1.37007582e+00],
       [-7.11033881e-01, -5.54280221e-01, -8.06302965e-01,
         7.99924374e-01,  1.61554301e+00, -1.52125403e-01,
        -4.69703645e-01,  4.04414564e-01, -7.51436651e-01,
         1.06743884e+00,  8.08465362e-01,  1.66844857e+00],
       [ 1.23820460e+00,  1.31468141e+00,  6.40396476e-01,
         1.33877301e+00, -1.75310016e-01,  6.11681603e-02,
        -3.04339647e-01,  8.26386034e-01, -1.40538216e+00,
        -8.74114335e-01, -2.36074805e-01,  1.72334373e+00],
       [ 2.29158044e-01, -3.98609698e-01,  3.74818802e-01,
        -6.68083608e-01, -1.40524197e+00,  9.54515457e-01,
        -9.36116040e-01, -6.35181487e-01, -1.36811090e+00,
         2.05426049e+00, -1.29738927e+00, -3.75230491e-01],
       [-8.75866830e-01, -7.43966758e-01,  1.1920624

In [7]:
# Create an interactive visualization using plotly
import plotly.graph_objects as go
import plotly.offline

In [47]:
def plot_dots(dots_data, title):
  data = [
      go.Scatter3d(
          x=dot_data["dots"][:, 0],
          y=dot_data["dots"][:, 1],
          z=dot_data["dots"][:, 2],
          mode='markers+text',
          marker=dict(
              size=8,
              color=dot_data["color"],
          ),
          text=dot_data["labels"],
          hoverinfo='text'
      ) for dot_data in dots_data
  ]
  layout = go.Layout(
    scene = dict(
      xaxis_title='Meyve',
      yaxis_title='Teknoloji',
      zaxis_title='Diğer'
    ),
    width=800,
    height=800,
    showlegend=False,
    title=title
  )
  plot_figure = go.Figure(data=data, layout=layout)
  plotly.offline.iplot(plot_figure)
  

In [48]:
dots_data = [
  {
    "dots": untrained_weights,
    "color": "blue",
    "labels": [reverse_vocab_list[i] for i in range(len(reverse_vocab_list))]
  },
  {
    "dots": trained_weights,
    "color": "red",
    "labels": [reverse_vocab_list[i] for i in range(len(reverse_vocab_list))]
  }
]

plot_dots(dots_data, "Embedding Uzayı yani sözlük uzayı")


In [10]:
prompt = "the capital of france is"
tokens = tokenize(prompt)
num_tokens = len(tokens)
tokens_padded = tokens + [vocab_list['<pad>']] * (test_config.seq_len - num_tokens)
tokens, tokens_padded

([0, 61, 1, 61, 2, 61, 8, 61, 5],
 [0,
  61,
  1,
  61,
  2,
  61,
  8,
  61,
  5,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63,
  63])

In [11]:
cumle = model.token_embedding(torch.tensor([tokens_padded]).to(device))
cumle.shape

torch.Size([1, 32, 12])

In [12]:
cumle[0]

tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784, -1.2345, -0.0431, -1.6047,
         -0.7521,  1.6487, -0.3925, -1.4036],
        [ 0.3429, -1.4591, -1.4937, -0.2214,  0.2252, -0.0772,  0.9857,  1.2783,
          0.2882,  0.8690, -0.8097, -1.4299],
        [-0.7279, -0.5594, -0.7688,  0.7624,  1.6423, -0.1596, -0.4974,  0.4396,
         -0.7581,  1.0783,  0.8008,  1.6806],
        [ 0.3429, -1.4591, -1.4937, -0.2214,  0.2252, -0.0772,  0.9857,  1.2783,
          0.2882,  0.8690, -0.8097, -1.4299],
        [ 1.2791,  1.2964,  0.6105,  1.3347, -0.2316,  0.0418, -0.2516,  0.8599,
         -1.3847, -0.8712, -0.2234,  1.7174],
        [ 0.3429, -1.4591, -1.4937, -0.2214,  0.2252, -0.0772,  0.9857,  1.2783,
          0.2882,  0.8690, -0.8097, -1.4299],
        [-2.5095,  0.4880,  0.7846,  0.0286,  0.6408,  0.5832,  1.0669, -0.4502,
         -0.1853,  0.7528,  0.4048,  0.1785],
        [ 0.3429, -1.4591, -1.4937, -0.2214,  0.2252, -0.0772,  0.9857,  1.2783,
          0.2882,  0.8690, -0.

In [13]:

dots_data = [
  {
    "dots": cumle[0].detach().cpu().numpy(),
    "color": "red",
    "labels": [reverse_vocab_list[i] for i in tokens_padded]
  }
]

plot_dots(dots_data, "Attention Uzayı yani bağlam uzayı")

In [14]:
from gpt_model import get_position_encoding

position_encoding = get_position_encoding(test_config.seq_len, test_config.n_embd, device=device)
positioned_cumle = cumle + position_encoding
positioned_cumle


tensor([[[ 1.9269,  2.4873,  0.9007, -1.1055,  0.6784, -0.2345, -0.0431,
          -0.6047, -0.7521,  2.6487, -0.3925, -0.4036],
         [ 1.1843, -0.9188, -1.2799,  0.7555,  0.2716,  0.9217,  0.9957,
           2.2783,  0.2903,  1.8690, -0.8092, -0.4299],
         [ 0.1814, -0.9756, -0.3512,  1.6710,  1.7350,  0.8361, -0.4774,
           1.4394, -0.7538,  2.0783,  0.8017,  2.6806],
         [ 0.4840, -2.4491, -0.8914,  0.5769,  0.3640,  0.9131,  1.0157,
           2.2779,  0.2946,  1.8690, -0.8083, -0.4299],
         [ 0.5223,  0.6428,  1.3695,  1.9858, -0.0470,  1.0246, -0.2116,
           1.8591, -1.3761,  0.1287, -0.2215,  2.7174],
         [-0.6160, -1.1754, -0.6130,  0.2524,  0.4552,  0.8959,  1.0357,
           2.2771,  0.2989,  1.8690, -0.8074, -0.4299],
         [-2.7890,  1.4482,  1.7462,  0.3032,  0.9157,  1.5447,  1.1269,
           0.5480, -0.1723,  1.7527,  0.4075,  1.1785],
         [ 0.9999, -0.7052, -0.4956, -0.1587,  0.5445,  0.8704,  1.0556,
           2.2759,  0.30

In [15]:
dots_data = [
  {
    "dots": positioned_cumle[0].detach().cpu().numpy(),
    "color": "red",
    "labels": [reverse_vocab_list[i] for i in tokens_padded]
  },
  {
    "dots": cumle[0].detach().cpu().numpy(),
    "color": "blue",
    "labels": [reverse_vocab_list[i] for i in tokens_padded]
  }
]

plot_dots(dots_data, "Position Encoding")

In [16]:
model.blocks[0].mha.attn_heads[0].Wq

tensor([[ 0.3444, -3.1016, -1.4587, -1.4318, -0.6071, -0.2597, -0.7190, -0.3858,
          0.5234, -0.8212, -0.4709,  0.6016],
        [-0.2825,  0.7693, -0.7669, -0.9495,  0.0169,  0.0803,  0.7448,  1.3455,
          0.1268, -2.4521,  0.4160,  1.9025],
        [-0.7347,  0.0447, -1.5211,  0.3478,  0.7402,  1.4162,  0.6834, -0.1383,
          0.9213,  0.5282, -0.0082, -1.4493],
        [-0.6052, -0.1792,  0.1996, -1.2462, -0.4146,  1.4559,  0.3317, -1.0001,
         -0.6920, -0.4720, -1.2894,  1.0763],
        [-1.0667, -1.9893,  0.2973,  0.4345,  0.0034, -1.0240,  0.2240, -0.7555,
          1.3676, -0.3197, -0.9131,  1.9192],
        [-1.6515,  2.1477, -0.6604,  0.1135, -0.2206,  0.7118,  0.3416,  1.5886,
         -0.3489, -0.4579, -1.2322, -0.5981],
        [-0.2815,  0.0528,  0.4250,  0.4826,  0.4881,  1.0082, -0.5950,  0.3926,
          0.8230, -0.8860,  1.4801,  0.8392],
        [-0.2000,  0.9950,  0.7202, -0.1341, -1.4068, -2.3610, -0.2905, -0.1335,
         -0.1569,  1.1383, -0.

In [17]:
attention_output = model.blocks[0].mha.attn_heads[0](positioned_cumle)
attention_output


tensor([[[ 6.7951, -1.3240, -4.2024, -9.5275, -1.3352, -3.0350,  6.1411,
           4.5569, -2.3280, -1.6019,  7.6986,  3.6092],
         [ 6.7809, -1.3340, -4.1601, -9.4753, -1.3161, -2.9810,  6.1285,
           4.5397, -2.3490, -1.6296,  7.7097,  3.5860],
         [ 3.4518, -3.6974,  5.7888,  2.8127,  3.1824,  9.7019,  3.1825,
           0.5022, -7.2760, -8.1291, 10.3237, -1.8657],
         [ 4.3968, -3.0169,  2.9829, -0.6498,  1.9056,  6.1096,  4.0104,
           1.6308, -5.8762, -6.2884,  9.5758, -0.3209],
         [ 6.1177, -2.2096,  0.5250, -2.7651, -3.9434,  5.5242,  4.3857,
          -2.6253,  2.3173, -1.4423, 14.7557,  3.8697],
         [ 5.0727, -2.0716,  0.2987, -3.8080,  0.4281,  2.0342,  4.1419,
           2.0372, -4.9573, -4.0329,  8.0598,  1.4006],
         [ 3.8244, -2.9540, -3.3364, -6.5394, -1.0544, -4.3810,  1.4876,
           1.7957, -6.6680,  1.3385,  7.5117,  4.9827],
         [ 6.7779, -1.3355, -4.1514, -9.4644, -1.3125, -2.9704,  6.1257,
           4.5357, -2.35

In [18]:
dots_data = [
  {
    "dots": attention_output[0].detach().cpu().numpy(),
    "color": "red",
    "labels": [reverse_vocab_list[i] for i in tokens_padded]
  }
]

plot_dots(dots_data, "Attention")

In [19]:
l_normalized = model.ln_f(attention_output)
l_normalized

tensor([[[ 1.2606, -0.3534, -0.9257, -1.9843, -0.3557, -0.6936,  1.1306,
           0.8157, -0.5530, -0.4087,  1.4402,  0.6273],
         [ 1.2612, -0.3575, -0.9212, -1.9815, -0.3539, -0.6861,  1.1311,
           0.8141, -0.5600, -0.4165,  1.4465,  0.6239],
         [ 0.3455, -0.9187,  0.7587,  0.2324,  0.2978,  1.4506,  0.2978,
          -0.1761, -1.5515, -1.7023,  1.5606, -0.5948],
         [ 0.7051, -0.9326,  0.3928, -0.4097,  0.1548,  1.0835,  0.6197,
           0.0941, -1.5643, -1.6553,  1.8492, -0.3371],
         [ 0.7986, -0.8332, -0.2973, -0.9421, -1.1730,  0.6822,  0.4592,
          -0.9147,  0.0539, -0.6829,  2.4912,  0.3580],
         [ 1.1511, -0.7369, -0.1105, -1.1958, -0.0763,  0.3481,  0.9051,
           0.3489, -1.4995, -1.2552,  1.9404,  0.1807],
         [ 0.9513, -0.5998, -0.6873, -1.4203, -0.1651, -0.9264,  0.4165,
           0.4870, -1.4497,  0.3824,  1.7950,  1.2163],
         [ 1.2614, -0.3582, -0.9203, -1.9809, -0.3537, -0.6846,  1.1312,
           0.8138, -0.56

In [20]:
dots_data = [
  {
    "dots": l_normalized[0].detach().cpu().numpy(),
    "color": "red",
    "labels": [reverse_vocab_list[i] for i in tokens_padded]
  }
]

plot_dots(dots_data, "Normalized")

In [21]:
logits = model(torch.tensor([tokens_padded]).to(device))


In [22]:
logits.shape

torch.Size([1, 32, 64])

In [23]:
reverse_vocab_list[torch.argmax(logits[0, 2, :]).item()]

'and'

In [24]:
num_tokens-1, torch.argmax(logits[0, 11, :]).item()

(8, 10)

In [25]:
l1 = torch.nn.Linear(3, 3)
p1 = torch.nn.Parameter(torch.randn(3, 3))
l1.weight, p1

(Parameter containing:
 tensor([[-0.0278,  0.4427,  0.0776],
         [ 0.4939,  0.0236, -0.4395],
         [-0.3948,  0.2665,  0.3651]], requires_grad=True),
 Parameter containing:
 tensor([[-0.2618,  1.4711, -1.0085],
         [-1.3464, -0.8184,  0.2786],
         [ 0.7985,  1.1991,  1.3985]], requires_grad=True))

In [26]:
def inference(prompt, max_new_tokens):
    tokens = tokenize(prompt)
    #print("tokens: ", tokens)
    for _ in range(max_new_tokens):
        num_tokens = len(tokens)
        tokens_padded = tokens + [19] * (test_config.seq_len - num_tokens)
        tokens_padded = torch.tensor(tokens_padded).unsqueeze(0).to(device)
        # print("tokens_padded: ", tokens_padded)
        logits = model(tokens_padded)
        predicted_token = torch.argmax(logits[0, num_tokens-1, :]).item()
        tokens.append(predicted_token)
    return detokenize(tokens)

print("Original: ", text[:test_config.seq_len])
row_model_prediction = inference(text[:10], max_new_tokens=4)
print("Predicted:", row_model_prediction)

Original:  the capital of the united states
Predicted: the <unk>a<unk><unk><unk>aeuropetheyinand


In [27]:
# with open("tr_texts_400.txt", "r", encoding="utf-8") as file:
#     tr_texts = file.read()

# text_example = tr_texts

tokenized_text = tokenize(text)

def get_dataset(num_examples, context_window_length, test_split=0.1):
    input_blocks = [] # List to store input sequences
    target_blocks = [] # List to store target sequences

    # Use a sliding window to create input/target sequences
    for i in range(0, len(tokenized_text), context_window_length + 1):
        block = tokenized_text[i:i+context_window_length+ 1]
        
        # Skip blocks that are too short
        if len(block) < context_window_length + 1:
            continue

        input_seq = block[:-1]  
        target_seq = block[1:]  

        input_blocks.append(input_seq)
        target_blocks.append(target_seq)
        
        # Stop if we have enough examples
        if len(input_blocks) >= num_examples:
            break

    # Convert to tensors for pytorch and move to gpu
    inputs = torch.tensor(input_blocks, dtype=torch.long).to(device)
    targets = torch.tensor(target_blocks, dtype=torch.long).to(device)

    # Calculate train/test split point
    split_idx = int(num_examples * (1 - test_split))

    # Split into train/test
    train_inputs = inputs[:split_idx]
    train_targets = targets[:split_idx]
    test_inputs = inputs[split_idx:]
    test_targets = targets[split_idx:]
    return train_inputs, train_targets, test_inputs, test_targets

# Get a small dataset
i, o, _, _ = get_dataset(4, test_config.seq_len, 0)
print("Input Shape", i.shape)
print("Output Shape", o.shape)
print("Input Example:")
print(i)
print("Output Example:")
print(o)

Input Shape torch.Size([4, 32])
Output Shape torch.Size([4, 32])
Input Example:
tensor([[ 0, 61,  1, 61,  2, 61,  0, 61,  3, 61,  4, 61,  5, 61,  6, 61,  7, 59,
         61,  0, 61,  1, 61,  2, 61,  8, 61,  5, 61,  9, 60, 61],
        [61, 11, 61,  5, 61,  0, 61,  1, 61,  2, 61, 12, 59, 61, 13, 61,  5, 61,
         14, 61, 15, 60, 61, 16, 61,  5, 61, 14, 61, 17, 60, 61],
        [61, 18, 61,  5, 61, 14, 61, 19, 59, 61,  0, 61,  1, 61,  2, 61,  0, 61,
          3, 61, 20, 61,  5, 61,  6, 61,  9, 60, 61, 10, 61,  0],
        [ 1, 61,  2, 61,  0, 61,  3, 61,  4, 61,  5, 61,  6, 61, 11, 59, 61, 22,
         61, 23, 61, 24, 58, 61, 25, 61, 26, 61, 27, 57, 61, 28]],
       device='mps:0')
Output Example:
tensor([[61,  1, 61,  2, 61,  0, 61,  3, 61,  4, 61,  5, 61,  6, 61,  7, 59, 61,
          0, 61,  1, 61,  2, 61,  8, 61,  5, 61,  9, 60, 61, 10],
        [11, 61,  5, 61,  0, 61,  1, 61,  2, 61, 12, 59, 61, 13, 61,  5, 61, 14,
         61, 15, 60, 61, 16, 61,  5, 61, 14, 61, 17, 60, 61, 10]

In [56]:
import torch.nn.functional as F

batch_size = 32
num_steps = 150000

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

# Define Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=20, min_lr=5e-5, threshold=1e-4)

# Training loop
i = 1
losses = []

train_inputs, train_targets, _, _ = get_dataset(100, test_config.seq_len, 0)

while i < num_steps:
    for j in range(0, len(train_inputs), batch_size):
        x = train_inputs[j:j+batch_size]
        y = train_targets[j:j+batch_size]

        # Forward pass
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        losses.append(loss.item())
        
        optimizer.step()
        optimizer.zero_grad()
    

        loss = loss.item()
        scheduler.step(loss)

        # Print the average loss for the epoch
        lr = optimizer.param_groups[0]["lr"]
        if i % 150 == 1:
            print(f"Step {i+1}/{num_steps}\t\tLoss: {loss:.6f}\t\tLR: {lr}")
            print(f"Original: {text[:test_config.seq_len]}\tPredicted: {inference(text[0], max_new_tokens=test_config.seq_len)}\tRow: {row_model_prediction}")

        i += 1


Step 2/150000		Loss: 0.606795		LR: 0.0005
Original: the capital of the united states	Predicted: <unk>, italyis states,,history,capital,and,romesis iss,capitalmanyeded,rome oftencapitalcapitaleach	Row: the <unk>a<unk><unk><unk>aeuropetheyinand
Step 152/150000		Loss: 0.988691		LR: 5e-05
Original: the capital of the united states	Predicted: <unk>, italyis  a    ,rich,,rome a a ,  ,europeedownwhile identitymany	Row: the <unk>a<unk><unk><unk>aeuropetheyinand
Step 302/150000		Loss: 0.849405		LR: 5e-05
Original: the capital of the united states	Predicted: <unk>, italyis  a  is ,,capital,europe,,,,,oftens.,capitaleurope,in   	Row: the <unk>a<unk><unk><unk>aeuropetheyinand
Step 452/150000		Loss: 0.790062		LR: 5e-05
Original: the capital of the united states	Predicted: <unk>, italyis is,,is  ,.washington,capitalin each italyidentity. own and isinandidentity	Row: the <unk>a<unk><unk><unk>aeuropetheyinand
Step 602/150000		Loss: 0.757429		LR: 5e-05
Original: the capital of the united states	Predict

KeyboardInterrupt: 

In [66]:
inference("and the capital of the united states", max_new_tokens=14)

'and the capital of the united states rich madrid  rome, cultureromecapitaliss'