## Install Required Libraries


In [20]:
! pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


## Import Libraries

In [36]:
import os
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AutoTokenizer

## Dataset

In [37]:
dataset = load_dataset('Helsinki-NLP/tatoeba_mt', 'ara-eng', split='test[:5%]')
arabic_sentences = [item['sourceString'] for item in dataset]
english_sentences = [item['targetString'] for item in dataset]

## Tokenizer

In [38]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

pairs = []
for ar, en in zip(arabic_sentences, english_sentences):
    ar_ids = tokenizer.encode(ar, return_tensors='pt', padding=False, truncation=True)[0]
    en_ids = tokenizer.encode(en, return_tensors='pt', padding=False, truncation=True)[0]
    pairs.append((ar_ids, en_ids))

max_ar_len = max(len(p[0]) for p in pairs)
max_en_len = max(len(p[1]) for p in pairs)

# Pad sequences
def pad_sequence(seq, max_len):
    return torch.cat([seq, torch.tensor([tokenizer.pad_token_id] * (max_len - len(seq)))])

X = torch.stack([pad_sequence(p[0], max_ar_len) for p in pairs])
Y = torch.stack([pad_sequence(p[1], max_en_len) for p in pairs])

## Model (LSTM + Attention)

In [39]:
class BahdanauAttention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy.permute(0, 2, 1)).squeeze(1)

        return F.softmax(attention, dim=1)

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim)

    def forward(self, src):
        src = src.long()  # <-- Ensure Long tensor for embedding
        embedded = self.embedding(src).permute(1, 0, 2)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, (hidden, cell)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.attention = BahdanauAttention(hid_dim)
        self.lstm = nn.LSTM(emb_dim + hid_dim, hid_dim)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.long()  # <-- Ensure Long tensor for embedding
        input = input.unsqueeze(0)
        embedded = self.embedding(input)

        attn_weights = self.attention(hidden[-1], encoder_outputs).unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(attn_weights, encoder_outputs).permute(1, 0, 2)

        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        output = output.squeeze(0)
        context = context.squeeze(0)
        embedded = embedded.squeeze(0)

        prediction = self.fc_out(torch.cat((output, context, embedded), dim=1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, (hidden, cell) = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

## Training

In [40]:
EMB_DIM = 256
HID_DIM = 512
BATCH_SIZE = 16
EPOCHS = 30
VOCAB_SIZE = tokenizer.vocab_size
INPUT_DIM = OUTPUT_DIM = VOCAB_SIZE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

val_size = int(0.1 * len(X))
train_size = len(X) - val_size
train_ds, val_ds = random_split(TensorDataset(X, Y), [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

def train():
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for src, trg in train_loader:
            src = src.long().to(device)  # <-- Ensure Long before embedding
            trg = trg.long().to(device)
            optimizer.zero_grad()
            output = model(src, trg)
            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

def validate():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, trg in val_loader:
            src = src.long().to(device)  # <-- Ensure Long before embedding
            trg = trg.long().to(device)
            output = model(src, trg, 0)
            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            total_loss += loss.item()
    print(f"Validation Loss: {total_loss / len(val_loader):.4f}")

train()
validate()

Epoch 1, Loss: 6.6146
Epoch 2, Loss: 4.8713
Epoch 3, Loss: 4.5002
Epoch 4, Loss: 4.2292
Epoch 5, Loss: 4.0755
Epoch 6, Loss: 3.8515
Epoch 7, Loss: 3.6796
Epoch 8, Loss: 3.5118
Epoch 9, Loss: 3.2970
Epoch 10, Loss: 2.9954
Epoch 11, Loss: 2.8104
Epoch 12, Loss: 2.4330
Epoch 13, Loss: 2.1774
Epoch 14, Loss: 1.9171
Epoch 15, Loss: 1.5747
Epoch 16, Loss: 1.3772
Epoch 17, Loss: 1.0652
Epoch 18, Loss: 0.8855
Epoch 19, Loss: 0.7200
Epoch 20, Loss: 0.5919
Epoch 21, Loss: 0.4826
Epoch 22, Loss: 0.3885
Epoch 23, Loss: 0.3050
Epoch 24, Loss: 0.2457
Epoch 25, Loss: 0.1900
Epoch 26, Loss: 0.1506
Epoch 27, Loss: 0.1308
Epoch 28, Loss: 0.1163
Epoch 29, Loss: 0.1014
Epoch 30, Loss: 0.0935
Validation Loss: 7.2940


## Save Model

In [41]:
torch.save(model.state_dict(), "seq2seq_model.pth")
# tokenizer.save_pretrained("hf_tokenizer")


## Inference

In [42]:
def translate_sentence(sentence):
    model.eval()
    tokens = tokenizer.encode(sentence, return_tensors='pt')[0]
    src_tensor = pad_sequence(tokens, max_ar_len).unsqueeze(0).to(device)
    src_tensor = src_tensor.long()  # ensure long for embedding

    with torch.no_grad():
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)

        # Use bos_token_id if available, else pad_token_id as start token
        start_token = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.pad_token_id
        input = torch.tensor([start_token], device=device)

        outputs = []
        for _ in range(max_en_len):
            output, hidden, cell = model.decoder(input, hidden, cell, encoder_outputs)
            top1 = output.argmax(1).item()
            if top1 == tokenizer.eos_token_id:
                break
            outputs.append(top1)
            input = torch.tensor([top1], device=device)

    return tokenizer.decode(outputs, skip_special_tokens=True)


In [45]:
print("\nSample Translation:")
print("AR:", arabic_sentences[3])
print("EN:", translate_sentence(arabic_sentences[3]))


Sample Translation:
AR: استجمع توم ما يكفي من الشجاعة لطلب علاوة .
EN: k summoned up enough courage to ask for a raise.
