In [1]:
from turkish_tokenizer.turkish_tokenizer import tokenize, decode_text, vocab_size

text_example = "Merhabalar"

tokenize(text_example)

{'tokens': ['<uppercase>', 'merhaba', 'lar'], 'ids': [0, 2036, 22268]}

In [2]:
from gpt_config import GPTConfig

test_config = GPTConfig(
    vocab_size=vocab_size,
    n_layer=8,
    n_head=8,
    n_embd=512,
    seq_len=512,
)

config1 = GPTConfig(
    vocab_size=vocab_size,
    n_layer=4,   # fewer layers for a quick demo
    n_head=4,
    n_embd=128,
    seq_len=128,
)

test_config = config1

print(test_config.vocab_size)

31356


In [3]:
import torch

device = 'cpu'

if torch.cuda.is_available():
    device = 'cuda'
""" elif torch.backends.mps.is_available():
    device = 'mps' """

print(device)


cpu


In [4]:
from gpt_model import GPTModel

torch.manual_seed(42)
model = GPTModel(test_config, device)

parameters_count = 0

for p in model.parameters():
    parameters_count += p.numel()

print(parameters_count)
model

9636732


GPTModel(
  (token_embedding): Embedding(31356, 128)
  (blocks): Sequential(
    (0): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0-3): 4 x CausalSelfAttention()
        )
        (projection): Linear(in_features=512, out_features=128, bias=True)
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ffn): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=512, out_features=128, bias=True)
      )
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): GPTBlock(
      (mha): MultiHeadAttention(
        (attn_heads): ModuleList(
          (0-3): 4 x CausalSelfAttention()
        )
        (projection): Linear(in_features=512, out_features=128, bias=True)
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ffn): Sequential(
        (0): Linear(in_features=128, out_fea

In [5]:

def inference(prompt, max_new_tokens = 20):
    tokens = tokenize(prompt)['ids']
    for _ in range(max_new_tokens):
        num_tokens = len(tokens)
        tokens_padded = tokens + [5] * (test_config.seq_len - num_tokens)
        tokens_padded = torch.tensor(tokens_padded).unsqueeze(0).to(device)
        logits = model(tokens_padded)
        predicted_token = torch.argmax(logits[0, num_tokens-1, :]).item()
        tokens.append(predicted_token)
    return decode_text(tokens)

print("Original: ", text_example[:test_config.seq_len])
row_model_prediction = inference("Merhabal", 64)
print("Predicted:", row_model_prediction)

Original:  Merhabalar
Predicted: Merhabal」sıvılimearkınkavaraayalrahmtrasyonңbirliğibıktırhareketlisubtropikschçekirdekligcesaretayarsilahşor-[zonk]-diktenulamaçözherhaldekörelaverajsoluksuzsoluksuzek_22517nazilhidroelektrikröportaizleyiciyeköpükspecial_88göksubasımaksaltafırkateynkatıldıkaradenizçıkartmaek_22359kok_21656stetoskoptefekölçek1974altaastrolojikherhaldetalanbezortakːbizdikakımkıçiğnetmealtaiflahbildinıl


In [6]:
with open("tr_texts_400.txt", "r", encoding="utf-8") as file:
    tr_texts = file.read()

text_example = tr_texts

tokenized_text = tokenize(text_example)
tokenized_text

{'tokens': ['<uppercase>',
  'b',
  '<uppercase>',
  'a',
  '<uppercase>',
  't',
  '<uppercase>',
  'u',
  '<uppercase>',
  'h',
  '<uppercase>',
  'a',
  '<uppercase>',
  'n',
  '<space>',
  '<uppercase>',
  'e',
  '<uppercase>',
  'r',
  '<uppercase>',
  'd',
  '<uppercase>',
  'u',
  '<uppercase>',
  'r',
  '<uppercase>',
  'c',
  '<uppercase>',
  'a',
  '<uppercase>',
  'n',
  '<space>',
  '<newline>',
  '21',
  '30',
  '18',
  '55',
  '<space>',
  '<newline>',
  '<uppercase>',
  't',
  '<uppercase>',
  'u',
  '<uppercase>',
  'r',
  '<uppercase>',
  'k',
  '<space>',
  '101',
  '13',
  '<space>',
  '<newline>',
  '<uppercase>',
  'ö',
  '<uppercase>',
  'd',
  '<uppercase>',
  'e',
  '<uppercase>',
  'v',
  '<space>',
  '3',
  '2',
  '<space>',
  '<newline>',
  '<uppercase>',
  'a',
  '<uppercase>',
  'li̇',
  '<space>',
  '<uppercase>',
  't',
  '<uppercase>',
  'u',
  '<uppercase>',
  'r',
  '<uppercase>',
  'a',
  '<uppercase>',
  'n',
  '<space>',
  '<uppercase>',
  'gö',
  '

In [7]:
from datasets import load_dataset

ds = load_dataset("wikimedia/wikipedia", "20231101.tr")

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
text = ""

for item in ds['train']:
  text += item['title']
  text += "\n"
  text += item['text']
  text += "\n"

tokenized_text = tokenize(text)


In [41]:
# save text to file
with open("tr_wikipedia.txt", "w", encoding="utf-8") as file:
    file.write(text)


In [42]:
tokenized_text = tokenize(text)


In [None]:
len(tokens)

In [None]:
tokens = tokenized_text['ids']

def get_dataset(num_examples, context_window_length, test_split=0.1):
    input_blocks = [] # List to store input sequences
    target_blocks = [] # List to store target sequences

    # Use a sliding window to create input/target sequences
    for i in range(0, len(tokens), context_window_length):
        block = tokens[i: i + context_window_length + 1]
        
        # Skip blocks that are too short
        if len(block) < context_window_length + 1:
            continue

        input_seq = block[:-1]  
        target_seq = block[1:]  

        input_blocks.append(input_seq)
        target_blocks.append(target_seq)
        
        # Stop if we have enough examples
        if len(input_blocks) >= num_examples:
            break

    # Convert to tensors for pytorch and move to gpu
    inputs = torch.tensor(input_blocks, dtype=torch.long).to(device)
    targets = torch.tensor(target_blocks, dtype=torch.long).to(device)

    # Calculate train/test split point
    split_idx = int(num_examples * (1 - test_split))

    # Split into train/test
    train_inputs = inputs[:split_idx]
    train_targets = targets[:split_idx]
    test_inputs = inputs[split_idx:]
    test_targets = targets[split_idx:]
    return train_inputs, train_targets, test_inputs, test_targets

# Get a small dataset
i, o, _, _ = get_dataset(3, test_config.seq_len, 0.1)
print("Input Shape", i.shape)
print("Output Shape", o.shape)
print("Input Example:")
print(i)
print("Output Example:")
print(o)

In [44]:
import torch.nn.functional as F

batch_size = 16
num_steps = 2560

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

# Define Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.2, patience=20, min_lr=5e-6, threshold=1e-4)

train_inputs, train_targets, _, _ = get_dataset(100, test_config.seq_len, 0)

In [None]:

# Training loop
i = 1
losses = []

while i < num_steps:
    for j in range(0, len(train_inputs), batch_size):
        x = train_inputs[j:j+batch_size]
        y = train_targets[j:j+batch_size]

        # Forward pass
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        losses.append(loss.item())
        
        optimizer.step()
        optimizer.zero_grad()
    

        loss = loss.item()
        scheduler.step(loss)

   
        # Print the average loss for the epoch
        lr = optimizer.param_groups[0]["lr"]
        # if i % 150 == 1:
        print(f"Step {i+1}/{num_steps}\t\tLoss: {loss:.6f}\t\tLR: {lr}")
        # print(f"Original: {text_example[:test_config.seq_len]}\tPredicted: {inference(text_example[0], max_new_tokens=96)}\tRow: {row_model_prediction}")

        i += 1


In [None]:
inference("Merhaba", max_new_tokens=96)