In [None]:
# Install NLTK and PyArabic
!pip install nltk pyarabic

# Import libraries
import nltk
import pyarabic.araby as araby

# Download NLTK data (the 'punkt' tokenizer package).
# NOTE(review): nothing in this cell calls an NLTK tokenizer -- the Arabic
# example below goes through PyArabic only -- so this download looks like it is
# here for later cells; confirm before removing.
nltk.download('punkt')

# Function for Arabic tokenization using PyArabic
def tokenize_arabic(text):
    """Split ``text`` into tokens using PyArabic's ``araby.tokenize``."""
    return araby.tokenize(text)

# Example usage
example_text = "مثال على نص عربي لاختبار التحليل اللغوي"
print("Original Text:", example_text)
# Tokenize after the first print so the output order stays the same.
example_tokens = tokenize_arabic(example_text)
print("Tokenized Text:", example_tokens)


Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m122.9/126.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Original Text: مثال على نص عربي لاختبار التحليل اللغوي
Tokenized Text: ['مثال', 'على', 'نص', 'عربي', 'لاختبار', 'التحليل', 'اللغوي']


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import Counter
import nltk
import pyarabic.araby as araby
import random
import math

# Ensure the NLTK tokenizer data is available (for tokenizing English text).
# Newer NLTK releases load the `punkt_tab` resource inside word_tokenize, while
# older ones load `punkt`; requesting both keeps this cell working on either.
# nltk.download() reports an unknown package and returns False instead of
# raising, so the extra request is harmless on old installs.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Tokenization functions
def tokenize_english(text):
    """Tokenize an English string into a list of tokens via ``nltk.word_tokenize``."""
    words = nltk.word_tokenize(text)
    return words

def tokenize_arabic(text):
    """Tokenize an Arabic string via PyArabic's ``araby.tokenize``."""
    tokens = araby.tokenize(text)
    return tokens

def build_vocab(texts, min_freq=1):
    """Build word<->index lookup tables from tokenized sentences.

    Args:
        texts: Iterable of token lists, one list per sentence.
        min_freq: Minimum corpus count a token needs to get its own index.

    Returns:
        A ``(word_to_idx, idx_to_word)`` pair of dicts. ``'<unk>'`` is always
        index 0 and ``'<pad>'`` index 1; kept words are numbered from 2 in
        first-seen order, so the indices form the contiguous range
        ``0 .. len(word_to_idx) - 1``.
    """
    specials = {'<unk>': 0, '<pad>': 1}
    word_freq = Counter(word for sentence in texts for word in sentence)

    # A corpus token that happens to be spelled like a special token must not
    # take a slot of its own: the special entry would overwrite it and leave a
    # hole in the index range, so len(vocab) would no longer bound the largest
    # index handed to an embedding layer.
    kept_words = [
        word
        for word, freq in word_freq.items()
        if freq >= min_freq and word not in specials
    ]

    word_to_idx = {
        word: idx for idx, word in enumerate(kept_words, start=len(specials))
    }
    word_to_idx.update(specials)
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return word_to_idx, idx_to_word


# Function to convert text to indices
def text_to_indices(tokenized_text, vocab):
    """Map each token to its index in ``vocab``.

    Tokens missing from ``vocab`` are mapped to the index stored under
    ``'<unk>'``.
    """
    indices = []
    for token in tokenized_text:
        # The fallback is looked up per token, as an argument to .get().
        indices.append(vocab.get(token, vocab['<unk>']))
    return indices

# Load dataset
df = pd.read_csv('/content/separated_translations.csv')  # Update the path to your dataset

# Tokenize each language column, then derive one vocabulary per language
tokenized_en = list(map(tokenize_english, df['English']))
tokenized_ar = list(map(tokenize_arabic, df['Arabic']))
en_vocab, en_inv_vocab = build_vocab(tokenized_en)
ar_vocab, ar_inv_vocab = build_vocab(tokenized_ar)

# Replace every token with its vocabulary index
indexed_en = [text_to_indices(tokens, en_vocab) for tokens in tokenized_en]
indexed_ar = [text_to_indices(tokens, ar_vocab) for tokens in tokenized_ar]

# Pad sequences
def pad_sequences(sequences, padding_value=0):
    """Right-pad every sequence to the length of the longest one.

    Args:
        sequences: Iterable of lists.
        padding_value: Filler appended to the shorter sequences.

    Returns:
        A new list of new lists, all of equal length. An empty input yields
        ``[]``.
    """
    # Materialise once: the input is traversed twice, which would silently
    # produce nothing on the second pass for a one-shot iterator.
    sequences = list(sequences)
    # default=0 keeps an empty input from raising ValueError inside max().
    max_len = max((len(seq) for seq in sequences), default=0)
    return [seq + [padding_value] * (max_len - len(seq)) for seq in sequences]

# Pad each language with that language's own '<pad>' index
padded_en = pad_sequences(indexed_en, en_vocab['<pad>'])
padded_ar = pad_sequences(indexed_ar, ar_vocab['<pad>'])

# TranslationDataset class
class TranslationDataset(Dataset):
    """Pairs source and target index sequences for use with a ``DataLoader``."""

    def __init__(self, src_data, trg_data):
        self.src_data, self.trg_data = src_data, trg_data

    def __len__(self):
        # The length is taken from the source side only.
        return len(self.src_data)

    def __getitem__(self, idx):
        # Fetch both sides of the pair, then convert each to an int64 tensor.
        pair = (self.src_data[idx], self.trg_data[idx])
        src, trg = (torch.tensor(seq, dtype=torch.long) for seq in pair)
        return src, trg

# Split data and create datasets
# Hold out 20% of the sentence pairs for validation
_split = train_test_split(padded_en, padded_ar, test_size=0.2)
train_src, valid_src, train_trg, valid_trg = _split
train_dataset, valid_dataset = (
    TranslationDataset(train_src, train_trg),
    TranslationDataset(valid_src, valid_trg),
)

# DataLoader
def collate_fn(batch):
    """Combine a batch of ``(src, trg)`` tensor pairs into two padded tensors.

    ``pad_sequence`` is used with its default layout, so for 1-D inputs each
    result is shaped (longest_sequence, batch_size). Source and target are
    padded with their own vocabulary's ``'<pad>'`` index.
    """
    sources, targets = zip(*batch)
    return (
        pad_sequence(sources, padding_value=en_vocab['<pad>']),
        pad_sequence(targets, padding_value=ar_vocab['<pad>']),
    )

# Reshuffle the training pairs every epoch; without shuffle=True every epoch
# would replay the exact same batches in the exact same order. The validation
# loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)  # Reduced from 32
valid_dataloader = DataLoader(valid_dataset, batch_size=16, collate_fn=collate_fn)


# Encoder
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's initial state."""

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=enc_hid_dim, bidirectional=True)
        # Projects the concatenated forward/backward state to the decoder size.
        self.fc = nn.Linear(in_features=2 * enc_hid_dim, out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` (token indices, sequence-first).

        Returns:
            ``(outputs, hidden)``: the GRU outputs for every position, and a
            tanh-squashed linear projection of the two final hidden states
            (one per direction) concatenated along the feature axis.
        """
        token_vectors = self.embedding(src)
        outputs, final_states = self.rnn(self.dropout(token_vectors))
        # final_states[-2] / final_states[-1]: last forward / backward state.
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim=1)
        hidden = torch.tanh(self.fc(both_directions))
        return outputs, hidden

# Attention
class Attention(nn.Module):
    """Additive attention: scores each encoder position against a decoder state."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(in_features=2 * enc_hid_dim + dec_hid_dim, out_features=dec_hid_dim)
        self.v = nn.Linear(in_features=dec_hid_dim, out_features=1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised weights over source positions, per batch row."""
        src_len = encoder_outputs.shape[0]
        # Tile the decoder state so it lines up with every source position.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # Move the batch axis first to match the tiled state.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        features = torch.cat((tiled_hidden, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)

# Decoder
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        # The GRU consumes the token embedding concatenated with the context vector.
        self.rnn = nn.GRU(input_size=2 * enc_hid_dim + emb_dim, hidden_size=dec_hid_dim)
        self.fc_out = nn.Linear(
            in_features=2 * enc_hid_dim + dec_hid_dim + emb_dim,
            out_features=output_dim,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns:
            ``(prediction, hidden)``: scores over the output vocabulary for
            this step and the updated decoder state.
        """
        # Add a length-1 sequence axis in front for the GRU.
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention-weighted sum of the encoder outputs, back in sequence-first layout.
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_outputs.permute(1, 0, 2)).permute(1, 0, 2)

        output, new_hidden = self.rnn(
            torch.cat((embedded, context), dim=2),
            hidden.unsqueeze(0),
        )

        # Drop the length-1 axis again before the output projection.
        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        return self.fc_out(features), new_hidden.squeeze(0)

# Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder over the target length."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return per-step decoder scores shaped (trg_len, batch, vocab).

        Row 0 of the result is left at zero: ``trg[0]`` is only used as the
        first decoder input. At each later step the next input is the true
        target token with probability ``teacher_forcing_ratio``, otherwise
        the decoder's own highest-scoring token.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        outputs = torch.zeros(
            trg_len, batch_size, self.decoder.output_dim, device=self.device
        )
        encoder_outputs, hidden = self.encoder(src)

        step_input = trg[0, :]
        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = step_scores
            use_target = random.random() < teacher_forcing_ratio
            if use_target:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)
        return outputs


# Define hyperparameters
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(ar_vocab)
ENC_EMB_DIM = 128  # Reduce embedding dimension
DEC_EMB_DIM = 128
ENC_HID_DIM = 256  # Reduce hidden layer size
DEC_HID_DIM = 256
ENC_DROPOUT = 0.5  # Dropout rate for the encoder
DEC_DROPOUT = 0.5  # Dropout rate for the decoder

# The device and the attention module have to exist before anything that uses
# them; otherwise this cell raises NameError when run in a fresh kernel.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# Instantiate the model, optimizer, criterion
model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters())
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=ar_vocab['<pad>'])



# Training function
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, trg)`` that returns per-step
            scores shaped (trg_len, batch, vocab), where row ``t`` scores
            target position ``t`` and row 0 is a placeholder.
        iterator: Sized iterable of ``(src, trg)`` tensor batches in
            sequence-first layout.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` scores and ``(N,)`` targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()  # Set the model to training mode
    # Follow the model's own placement instead of relying on a module-level
    # ``device`` name being defined.
    device = next(model.parameters()).device
    epoch_loss = 0

    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()  # Clear the gradients

        # Feed the full target. The model uses trg[0] as the first decoder
        # input and stores its prediction for trg[t] in output[t]. Passing
        # trg[:-1] and scoring the unsliced output against trg[1:] would
        # compare every prediction with the token one step too late.
        output = model(src, trg)

        # Drop position 0 on both sides, then flatten for the loss.
        output_dim = output.shape[-1]
        output = output[1:].reshape(-1, output_dim)
        target = trg[1:].reshape(-1)

        loss = criterion(output, target)  # Compute loss

        loss.backward()  # Backpropagation

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # Gradient clipping

        optimizer.step()  # Update model parameters

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode and run under ``torch.no_grad()``.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = (tensor.to(device) for tensor in batch)

            # The third positional argument (0) turns teacher forcing off.
            scores = model(src, trg, 0)

            # Position 0 is skipped on both sides before flattening.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            total_loss += criterion(flat_scores, flat_targets).item()
    return total_loss / len(iterator)


# Training loop
N_EPOCHS, CLIP = 10, 1
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dataloader, criterion)

    # One header line per epoch followed by the two tab-indented losses
    _report = (
        f'Epoch: {epoch+1}',
        f'\tTrain Loss: {train_loss:.3f}',
        f'\tVal. Loss: {valid_loss:.3f}',
    )
    for _line in _report:
        print(_line)

# Save model (weights as they are after the final epoch)
_final_state = model.state_dict()
torch.save(_final_state, 'translation_model.pt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch: 1
	Train Loss: 8.994
	Val. Loss: 8.607
Epoch: 2
	Train Loss: 7.750
	Val. Loss: 8.636
Epoch: 3
	Train Loss: 6.847
	Val. Loss: 8.976
Epoch: 4
	Train Loss: 6.165
	Val. Loss: 9.365
Epoch: 5
	Train Loss: 5.699
	Val. Loss: 9.446
Epoch: 6
	Train Loss: 5.444
	Val. Loss: 9.683
Epoch: 7
	Train Loss: 5.294
	Val. Loss: 9.718
Epoch: 8
	Train Loss: 5.164
	Val. Loss: 9.860
Epoch: 9
	Train Loss: 5.057
	Val. Loss: 9.965
Epoch: 10
	Train Loss: 4.973
	Val. Loss: 10.127
