<a href="https://colab.research.google.com/github/mohamed-ben-lboukht/Translation/blob/main/translationmachine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:
import sentencepiece as spm
import tqdm


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Load the parallel corpus (columns "English" and "Turkish").
# keep_default_na=False makes pandas read every cell verbatim: with the default
# NA sniffing, sentences such as "None", "NA" or "null" (and empty cells) become
# NaN, which the later str(...) calls would turn into the literal text "nan".
data = pd.read_csv('Cleaned_Sentence_Pairs (1).csv', keep_default_na=False)
# Drop pairs where either side is blank: an empty sentence encodes to zero
# tokens and carries no translation signal.
data = data[
    (data["English"].astype(str).str.strip() != "")
    & (data["Turkish"].astype(str).str.strip() != "")
].reset_index(drop=True)

In [None]:
# Optional: uncomment to keep only the second half of the rows.
# data = data[len(data)//2:]

In [None]:
# Dump the corpus to a plain-text file, one sentence per line: each English
# sentence is followed by its Turkish counterpart.
# The two columns are walked with zip() rather than DataFrame.iterrows();
# iterrows() materialises a Series for every row, which is very slow for a
# corpus with hundreds of thousands of pairs. The file written is identical.
with open("train.txt", "w", encoding="utf-8") as f:
    for english, turkish in zip(data["English"], data["Turkish"]):
        # str() keeps non-string cells (e.g. NaN) from raising on .strip()
        f.write(str(english).strip() + "\n")
        f.write(str(turkish).strip() + "\n")

In [None]:
# Train a single subword tokenizer on train.txt.
# NOTE(review): SentencePiece's documented default ids are <unk>=0, <s>=1,
# </s>=2 with no padding piece, while other cells use 0 as the padding id and
# 1/2 as BOS/EOS. The BOS/EOS choice matches; the 0 = <unk> = padding overlap
# should be confirmed as intended.
spm.SentencePieceTrainer.train(
    input='train.txt',
    model_prefix='tokenizer',         # outputs tokenizer.model, tokenizer.vocab
    vocab_size=16000,                  # number of pieces; Config.vocab_size is hard-coded to the same value
    character_coverage=1.0,           # cover every character that occurs in train.txt (keeps Turkish letters)
    model_type='bpe'                  # Byte-Pair Encoding
)


In [None]:
# Load the trained tokenizer and round-trip one sentence as a sanity check.
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

text = "I love cats."
ids = sp.encode(text, out_type=int)
print("Token IDs:", ids)
print("Decoded:", sp.decode(ids))


Token IDs: [29, 1010, 5854, 15812]
Decoded: I love cats.


In [None]:
# Number of pieces in the loaded tokenizer; expected to equal the vocab_size
# requested at training time (16000).
len(sp)

16000

In [None]:
# Tokenise both sides of the corpus into lists of piece ids.
# Each column is encoded with one batched sp.encode() call instead of looping
# over DataFrame.iterrows(), which is far slower and yields the same ids.
src_encoded = sp.encode([str(sentence) for sentence in data["English"]], out_type=int)
tgt_encoded = sp.encode([str(sentence) for sentence in data["Turkish"]], out_type=int)

vocab_size = len(sp)  # tokenizer size; the model's embedding/output size must match it

In [None]:
# Encode both sides as 1-D LongTensors. Targets are wrapped in BOS (id 1) and
# EOS (id 2); sources are left as-is.
# dtype is pinned to long because torch.tensor([]) (an empty sentence) would
# otherwise come out as float32 and could poison the dtype of the padded batch.
# Batched sp.encode() over the column replaces the per-row iterrows() loop.
src_encoded = [
    torch.tensor(piece_ids, dtype=torch.long)
    for piece_ids in sp.encode([str(sentence) for sentence in data["English"]], out_type=int)
]
tgt_encoded = [
    torch.tensor([1] + piece_ids + [2], dtype=torch.long)  # BOS = 1, EOS = 2
    for piece_ids in sp.encode([str(sentence) for sentence in data["Turkish"]], out_type=int)
]

In [None]:
# Pad every sequence with id 0 up to the longest one on its side, giving one
# batch-first 2-D tensor per language.
src, tgt = (
    pad_sequence(sequences, batch_first=True, padding_value=0)
    for sequences in (src_encoded, tgt_encoded)
)

In [None]:
# Disk cache for the padded tensors.
# Nothing else in this notebook writes src.pt / tgt.pt, so an unconditional
# torch.load() fails on a fresh machine. Load the files when both exist;
# otherwise persist the tensors computed above so the next run can reuse them.
# map_location="cpu" lets a cache written on a GPU box load on a CPU-only one.
if Path("src.pt").exists() and Path("tgt.pt").exists():
    src = torch.load("src.pt", map_location="cpu")
    tgt = torch.load("tgt.pt", map_location="cpu")
else:
    torch.save(src, "src.pt")
    torch.save(tgt, "tgt.pt")

In [None]:
# Place both full tensors on the selected device.
src, tgt = src.to(device), tgt.to(device)

In [None]:
# Size of tgt along its second dimension (the padded target length).
tgt.shape[1]

146

In [None]:
# Size of src along its second dimension (the padded source length).
src.shape[1]

180

In [None]:
# Longest padded length across both languages.
# Note: Config.max_seq_len is hard-coded and does not read this variable.
max_seq_len = max(src.size(1), tgt.size(1))

In [None]:
# Full shape of the source tensor: (number of sentence pairs, padded length).
src.shape

torch.Size([208514, 180])

In [None]:
class Config:
    """Hyperparameters shared by the model, the data loader and the LR schedule."""

    # --- vocabulary / sequence limits ---
    vocab_size = 16000        # rows of the embedding tables and width of the output layer
    max_seq_len = 512         # length of the precomputed positional-encoding table

    # --- model shape ---
    d_model = 128             # embedding / residual width
    num_heads = 4             # attention heads per attention layer
    head_size = d_model // num_heads  # per-head projection width
    n_layer = 4               # blocks in the encoder and in the decoder

    # --- regularisation / optimisation ---
    dropout = 0.1
    batch_size = 32
    warmup_steps = 4000       # warm-up length passed to the LR schedule


In [None]:
class TranslationDataset(Dataset):
    """Index-aligned pairs: item ``i`` is ``(src[i], tgt[i])``.

    Args:
        src: indexable, sized collection of source sequences.
        tgt: indexable, sized collection of target sequences, aligned with ``src``.

    Raises:
        ValueError: if ``src`` and ``tgt`` do not contain the same number of items.
    """

    def __init__(self, src, tgt):
        # The dataset length is taken from src alone, so a mismatch would either
        # silently ignore trailing targets or fail with an IndexError mid-epoch.
        if len(src) != len(tgt):
            raise ValueError(
                f"src and tgt must have the same length, got {len(src)} and {len(tgt)}"
            )
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]




In [None]:
class Head(nn.Module):
    """A single scaled dot-product attention head.

    ``query`` is projected to queries and ``key`` / ``value`` to keys / values,
    each of width ``config.head_size``. No attention mask is applied.
    """

    def __init__(self,config):
        super().__init__()
        self.key = nn.Linear(config.d_model, config.head_size, bias=False)
        self.query = nn.Linear(config.d_model, config.head_size, bias=False)
        self.value = nn.Linear(config.d_model, config.head_size, bias=False)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, query,key,value):
        """Attend from ``query`` positions over ``key`` / ``value`` positions.

        Args:
            query: (B, T_q, d_model) tensor the queries are computed from.
            key:   (B, T_k, d_model) tensor the keys are computed from.
            value: (B, T_k, d_model) tensor the values are computed from.

        Returns:
            (B, T_q, head_size) attention output.
        """
        k = self.key(key)       # (B, T_k, head_size)
        q = self.query(query)   # (B, T_q, head_size)
        v = self.value(value)   # (B, T_k, head_size)
        # Scale by the width of the projected keys (head_size). The previous code
        # used the last dimension of the un-projected input (d_model), which
        # under-scales the scores by a factor of sqrt(num_heads).
        scale = k.size(-1) ** -0.5
        wei = q @ k.transpose(-2, -1) * scale   # (B, T_q, T_k)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ v   # (B, T_q, head_size)
        return out



In [None]:
class MultiHeadAttention(nn.Module):
    """Runs ``config.num_heads`` independent ``Head`` modules and mixes their outputs.

    The per-head results are concatenated along the feature axis, projected back
    to ``config.d_model`` and passed through dropout.
    """

    def __init__(self, config):
        super().__init__()
        concat_width = config.num_heads * config.head_size
        head_modules = [Head(config) for _ in range(config.num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(concat_width, config.d_model)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, query, key, value):
        per_head = [head(query, key, value) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)   # (B, T, num_heads * head_size)
        projected = self.proj(merged)          # (B, T, d_model)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x ``d_model``, ReLU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.d_model
        hidden = 4 * width
        layers = [
            nn.Linear(width, hidden),
            nn.ReLU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)


In [None]:
class Block(nn.Module):
    """One attention sub-layer followed by one feed-forward sub-layer.

    The residual stream starts as ``query``. Each sub-layer's output is
    layer-normalised and then added to the stream (the normalisation is applied
    to the sub-layer output, not to the stream itself).
    """

    def __init__(self, config):
        super().__init__()
        self.sa = MultiHeadAttention(config)
        self.ffwd = FeedFoward(config)
        self.ln1 = nn.LayerNorm(config.d_model)
        self.ln2 = nn.LayerNorm(config.d_model)

    def forward(self, query, key, value):
        attended = self.ln1(self.sa(query, key, value))
        stream = query + attended
        return stream + self.ln2(self.ffwd(stream))

In [None]:
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed sine/cosine positional encodings, precomputed for ``max_len`` positions.

    Args:
        dim: encoding width (even or odd).
        max_len: number of positions to precompute.
    """

    def __init__(self, dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        # One frequency per even column index: 10000^(-i/dim) for i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, dim, 2) * (-torch.log(torch.tensor(10000.0)) / dim))
        angles = position * div_term   # (max_len, ceil(dim / 2))

        pe[:, 0::2] = torch.sin(angles)                  # even columns
        # For an odd dim there is one fewer odd column than even columns, so the
        # cosine part is trimmed to dim // 2 (the untrimmed assignment raised).
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])   # odd columns
        pe = pe.unsqueeze(0)  # (1, max_len, dim)
        # A buffer follows the module across devices and is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Args:
            x: tensor of shape (batch_size, seq_len, dim); only ``seq_len`` is read.

        Returns:
            Tensor of shape (1, seq_len, dim), broadcastable over the batch.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Slicing would silently return fewer positions and the caller would
            # then fail with an opaque broadcasting error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)} "
                "of the positional encoding table"
            )
        return self.pe[:, :seq_len]

In [None]:
class encoder(nn.Module):
    """Stack of self-attention ``Block`` layers over embedded source tokens.

    Token embeddings (id 0 is the padding index) are summed with sinusoidal
    position encodings, passed through ``config.n_layer`` blocks and finished
    with a LayerNorm. No attention mask is applied, so padded positions take
    part in attention like any other position.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.d_model, padding_idx=0)
        self.position_embedding_table = SinusoidalPositionalEncoding(config.d_model, config.max_seq_len)
        layers = [Block(config) for _ in range(config.n_layer)]
        self.blocks = nn.ModuleList(layers)
        self.ln_f = nn.LayerNorm(config.d_model)

    def forward(self, idx):
        """Encode token ids ``idx`` of shape (B, T) into a (B, T, d_model) tensor."""
        _batch, _steps = idx.shape  # unpacking doubles as a check that idx is 2-D
        hidden = self.token_embedding_table(idx)                 # (B, T, d_model)
        hidden = hidden + self.position_embedding_table(hidden)  # add position information

        # Self-attention: every block uses the running state as query, key and value.
        for layer in self.blocks:
            hidden = layer(hidden, hidden, hidden)

        return self.ln_f(hidden)

In [None]:
class decoder(nn.Module):
    """Transformer decoder that maps target token ids to next-token logits.

    Every layer first runs causally masked self-attention over the target
    prefix, then hands the result to a ``Block`` that cross-attends to the
    encoder output and applies the feed-forward sub-layer.

    The previous implementation used the blocks for cross-attention only, so a
    target position never saw the tokens generated before it; without encoder
    output it ran unmasked self-attention, letting positions see future tokens.

    Note: this adds parameters (``self_attn.*`` and ``self_attn_ln.*``), so a
    state_dict saved from the earlier decoder no longer loads with strict=True.
    """

    def __init__(self,config):
        super().__init__()
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.d_model, padding_idx=0)
        self.position_embedding_table = SinusoidalPositionalEncoding(config.d_model, config.max_seq_len)
        # Masked self-attention sub-layer (plus its LayerNorm) for each decoder layer.
        self.self_attn = nn.ModuleList([
            nn.MultiheadAttention(
                config.d_model, config.num_heads, dropout=config.dropout, batch_first=True
            )
            for _ in range(config.n_layer)
        ])
        self.self_attn_ln = nn.ModuleList([nn.LayerNorm(config.d_model) for _ in range(config.n_layer)])
        # Cross-attention + feed-forward for each decoder layer.
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size)

    def forward(self, idx, encoder_output=None):
        """Compute logits for every target position.

        Args:
            idx: (B, T) target token ids.
            encoder_output: optional (B, S, d_model) encoder states. When omitted,
                the cross-attention sub-layer is skipped and only masked
                self-attention and the feed-forward sub-layer are applied.

        Returns:
            (B, T, vocab_size) logits.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B, T, d_model)
        pos_emb = self.position_embedding_table(tok_emb) # (1, T, d_model)
        x = tok_emb + pos_emb

        # Boolean mask, True = "may not attend": position i is blocked from every j > i.
        # With right-padded targets this also keeps real tokens from attending to padding.
        future_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )

        for self_attn, self_ln, block in zip(self.self_attn, self.self_attn_ln, self.blocks):
            attended, _ = self_attn(x, x, x, attn_mask=future_mask, need_weights=False)
            # Same residual convention as Block: normalise the sub-layer output, then add.
            x = x + self_ln(attended)
            if encoder_output is not None:
                x = block(x, encoder_output, encoder_output) # cross-attention + feed-forward
            else:
                x = x + block.ln2(block.ffwd(x)) # feed-forward only

        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder translation model."""

    def __init__(self,config):
        super().__init__()
        self.encoder = encoder(config)
        self.decoder = decoder(config)
        # Longest target prefix that is fed back into the decoder while sampling.
        self.block_size = config.max_seq_len

    def forward(self, src, tgt):
        """Return (B, tgt_len, vocab_size) logits for target ids ``tgt`` given source ids ``src``."""
        enc_output = self.encoder(src)
        logits = self.decoder(tgt, enc_output)  # (B, tgt_len, vocab_size)
        return logits

    @torch.no_grad()
    def translate(self, src, max_new_tokens=50, start_token_id=1, eos_token_id=None):
        """Sample a translation for every row of ``src``.

        Runs without gradient tracking and with the model temporarily in eval
        mode, so dropout does not perturb sampling and no autograd graph is
        kept; the previous train/eval mode is restored afterwards.

        Args:
            src: (B, S) source token ids.
            max_new_tokens: maximum number of tokens to sample per row.
            start_token_id: id that seeds every target sequence.
            eos_token_id: optional end-of-sequence id. When given, a row that has
                produced it is filled with 0 (the padding id) from then on, and
                sampling stops once every row has finished. ``None`` always
                samples exactly ``max_new_tokens`` tokens.

        Returns:
            (B, L) LongTensor of sampled ids without the start token, with
            ``L <= max_new_tokens``.
        """
        was_training = self.training
        self.eval()
        try:
            B = src.size(0)
            tgt = torch.full((B, 1), start_token_id, dtype=torch.long, device=src.device)
            finished = torch.zeros(B, dtype=torch.bool, device=src.device)

            # The source is encoded once and reused for every decoding step.
            enc_output = self.encoder(src)

            for _ in range(max_new_tokens):
                tgt_cond = tgt[:, -self.block_size:]   # crop to the supported prefix length
                logits = self.decoder(tgt_cond, enc_output)[:, -1, :]   # last position: (B, vocab_size)
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)     # (B, 1)

                if eos_token_id is not None:
                    # Rows that already ended only receive padding from now on.
                    idx_next = idx_next.masked_fill(finished.unsqueeze(1), 0)
                    finished = finished | (idx_next.squeeze(1) == eos_token_id)

                tgt = torch.cat((tgt, idx_next), dim=1)

                if eos_token_id is not None and bool(finished.all()):
                    break

            return tgt[:, 1:]  # drop the start token
        finally:
            self.train(was_training)

In [None]:
# Build the encoder-decoder model from the default hyperparameters.
model = Transformer(Config())

In [None]:
# Move parameters and buffers (including the positional-encoding tables) to the device.
model = model.to(device)

In [None]:
# From here on `model` refers to the compiled wrapper returned by torch.compile,
# not to the plain Transformer instance.
model = torch.compile(model)

In [None]:
# Cross-entropy over the vocabulary; targets equal to 0 (the value used to pad
# the sequences) are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split with a fixed seed. For each input array train_test_split returns
# the training portion first and the held-out portion second.
train_src, val_src, train_tgt, val_tgt = train_test_split(src, tgt, test_size=0.2, random_state=42)

# Legacy names kept because later cells refer to them. Despite the "test"
# prefix, test_src / test_tgt are the TRAINING portion (80% of the rows);
# testx / testy are the held-out 20%.
test_src, testx, test_tgt, testy = train_src, val_src, train_tgt, val_tgt

In [None]:
# Display the device selected earlier.
device

device(type='cuda')

In [None]:
# Despite its name, test_src is the first array returned by train_test_split,
# i.e. the larger training portion.
test_src.shape

torch.Size([166811, 180])

In [None]:
# Pair up the first (larger) outputs of train_test_split; these are the rows
# the model is trained on.
dataset = TranslationDataset(test_src, test_tgt)

In [None]:
# Batches of Config.batch_size pairs, reshuffled every epoch.
train_loader = DataLoader(dataset, batch_size=Config().batch_size, shuffle=True)

In [None]:
# Allow float32 matrix multiplications to use faster, lower-precision internal
# formats where the hardware supports them.
torch.set_float32_matmul_precision('high')

In [None]:
from torch.optim.lr_scheduler import LambdaLR

In [None]:
# lr=1.0 is only a base value: LambdaLR multiplies it by the factor returned
# from the schedule function, so the schedule output is the effective rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0,betas=(0.9, 0.98), eps=1e-9)
# Noam learning rate schedule
def get_transformer_lr_lambda(warmup_steps, d_model):
    """Build the step -> learning-rate-factor function of the Noam schedule.

    The factor grows linearly for ``warmup_steps`` steps and then decays with
    the inverse square root of the step; both phases are scaled by
    ``d_model ** -0.5``. Steps below 1 are treated as step 1.
    """
    def lr_lambda(step):
        step = max(1, step)
        decay_phase = step ** -0.5
        warmup_phase = step * (warmup_steps ** -1.5)
        return (d_model ** -0.5) * min(decay_phase, warmup_phase)

    return lr_lambda

# Per-step schedule built from Config.warmup_steps and Config.d_model.
scheduler = LambdaLR(optimizer, lr_lambda=get_transformer_lr_lambda(Config().warmup_steps, Config().d_model))

In [None]:
# Training loop: teacher-forced next-token prediction over train_loader.
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    loop = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for src1, tgt1 in loop:
        src1, tgt1 = src1.to(device), tgt1.to(device)
        # Shift by one position: the decoder reads every token except the last
        # and is scored against every token except the first.
        tgt_input = tgt1[:, :-1]
        tgt_target = tgt1[:, 1:]

        output = model(src1, tgt_input)  # (B, T, vocab_size)
        # Flatten batch and time so the loss sees one row of logits per target token.
        output = output.view(-1, Config().vocab_size)
        tgt_target = tgt_target.reshape(-1)

        loss = loss_fn(output, tgt_target)

        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not once per epoch

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item(), lr=scheduler.get_last_lr()[0])

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

Epoch 1/5:  18%|█▊        | 945/5213 [01:06<04:47, 14.83it/s, loss=1.61, lr=0.000244]

In [None]:
# `model` is the torch.compile wrapper at this point, and its state_dict keys
# carry an "_orig_mod." prefix that a plain Transformer cannot load. Save the
# weights of the wrapped module instead (falls back to `model` itself when it
# was never compiled).
torch.save(getattr(model, "_orig_mod", model).state_dict(), "tre.pth")

In [None]:
# Pickle the underlying Transformer rather than the torch.compile wrapper
# around it. Loading this file still needs the model class definitions to be
# available; the state_dict saved separately is the more portable artifact.
torch.save(getattr(model, "_orig_mod", model), "model.pt")

In [None]:
def top_k_filtering(logits, k=10):
    """
    Keep only the top-k logits along the last dimension; every other entry is
    set to -inf so it receives zero probability after softmax.

    Args:
        logits: tensor of any rank whose last dimension is the vocabulary.
        k: number of entries to keep; values larger than the vocabulary size
            keep everything.

    Returns:
        Tensor with the same shape and dtype as ``logits``.

    Raises:
        ValueError: if ``k`` is smaller than 1 (nothing would be left to sample).
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    k = min(k, logits.size(-1))   # torch.topk raises when k exceeds the dimension
    values, indices = torch.topk(logits, k, dim=-1)
    filtered_logits = torch.full_like(logits, float('-inf'))
    # Scatter along the same (last) dimension topk selected from, so 1-D and
    # batched inputs of any rank are handled alike.
    filtered_logits.scatter_(-1, indices, values)
    return filtered_logits

def translate_sentence(
    sentence, model, tokenizer=sp, max_len=50, bos=1, eos=2, k=10, temperature=1.0
):
    """Translate one sentence with temperature-scaled top-k sampling.

    Args:
        sentence: source text.
        model: callable as ``model(src_ids, tgt_ids)`` returning
            (1, T, vocab_size) logits; it is switched to eval mode.
        tokenizer: object providing ``encode(text, out_type=int)`` and ``decode(ids)``.
        max_len: maximum number of tokens to generate.
        bos: id that seeds the target sequence.
        eos: id that ends generation.
        k: number of highest-scoring candidates kept at every step.
        temperature: divisor applied to the logits before sampling; must be > 0.

    Returns:
        The decoded translation, stripped of surrounding whitespace. An input
        that tokenizes to nothing yields an empty string.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    model.eval()

    # Encode input sentence
    src_ids = tokenizer.encode(sentence, out_type=int)
    if not src_ids:
        # Nothing to attend over: attention across zero source positions is undefined.
        return ""
    src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Start decoder with BOS token
    tgt_ids = torch.tensor([[bos]], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_len):
            output = model(src_tensor, tgt_ids)            # (1, T, vocab_size)
            logits = output[:, -1, :] / temperature        # last position only, scaled

            filtered_logits = top_k_filtering(logits, k=k) # keep the k best candidates
            probs = torch.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            tgt_ids = torch.cat([tgt_ids, next_token], dim=1)

            if next_token.item() == eos:
                break

    # Drop BOS, and drop the final token only when it really is EOS. When the
    # loop ends because max_len was reached, the last token is genuine output
    # (the previous unconditional [1:-1] slice discarded it).
    output_ids = tgt_ids.squeeze(0).tolist()[1:]
    if output_ids and output_ids[-1] == eos:
        output_ids = output_ids[:-1]
    decoded = tokenizer.decode(output_ids)

    return decoded.strip()


In [None]:
# Qualitative check: sample one translation per example sentence.
# Sampling is random; uncomment the seed below for repeatable output.
# torch.manual_seed(12)
examples = [
    "Hello, how are you?",
    "Where is the airport?",
    "I love cats and dogs.",
    "The weather is nice today.",
    "Can you help me please?",
]

for en in examples:
    tr = translate_sentence(en, model, k=10, temperature=0.7)
    print(f"EN: {en}", f"→  {tr}", "-" * 30, sep="\n")


EN: Hello, how are you?
→  Merhaba. Bugün de?
------------------------------
EN: Where is the airport?
→  Havaalanı çalışır.
------------------------------
EN: I love cats and dogs.
→  Büyükannem yalılar köpeklerden vazgeç.
------------------------------
EN: The weather is nice today.
→  Hava bugün Bugün hava bugün hava kadar soğuk hav bugün Bugün hava kadar soğuk.
------------------------------
EN: Can you help me please?
→  Bana yardım eder misin?
------------------------------
