In [None]:
!pip install nltk
!pip install sacrebleu
!pip install torch


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
import unicodedata
import re

# Dataset Parsing
SEPARATOR = '<sep>'
# Download the training set.
!wget https://raw.githubusercontent.com/divar167/data/refs/heads/main/train.txt -O train.txt
with open('train.txt') as file:
    train = [line.rstrip() for line in file]

# Unicode normalization
def normalize_unicode(s):
    """Return *s* with combining marks removed.

    The string is decomposed with NFD so that accented letters become a base
    letter followed by combining marks; every character whose Unicode
    category is 'Mn' is then discarded.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(s):
    """Normalize a raw sentence into whitespace-separable tokens.

    Steps, in order: strip combining marks, lowercase, surround each of
    ``? . ! , ¿`` with spaces, collapse runs of spaces and double-quote
    characters into a single space, and trim both ends.
    """
    lowered = normalize_unicode(s).lower()
    # Put a space on both sides of punctuation so it splits into its own token.
    spaced = re.sub(r"([?.!,¿])", r" \1 ", lowered)
    # The character class holds a space and a double quote, so quotes are
    # replaced by a space as well.
    collapsed = re.sub(r'[" "]+', " ", spaced)
    return collapsed.strip()

# Filter and preprocess the dataset
# Only lines that contain the separator are usable as (source, target) pairs.
train_filtered = [pair for pair in train if pair.strip() != '' and SEPARATOR in pair]
# Transpose the split pairs into one column per side, then strip and
# preprocess every sentence of each column.
train_input, train_target = (
    [preprocess_sentence(text.strip()) for text in column]
    for column in zip(*(pair.split(SEPARATOR) for pair in train_filtered))
)

# Vocabulary Creation
def build_vocab(sentences):
    """Map every whitespace-separated token in *sentences* to an integer id.

    Ids 0, 1 and 2 are reserved for ``<sos>``, ``<eos>`` and ``<unk>``; all
    other tokens receive the next free id in order of first appearance.
    """
    vocab = {"<sos>": 0, "<eos>": 1, "<unk>": 2}
    for sentence in sentences:
        for token in sentence.split():
            # setdefault only inserts unseen tokens; len(vocab) is evaluated
            # before the insertion, so it is the next free id.
            vocab.setdefault(token, len(vocab))
    return vocab

# One vocabulary per side of the corpus.
src_vocab, tgt_vocab = build_vocab(train_input), build_vocab(train_target)
# Inverse target mapping (id -> token), used to turn predicted ids back into text.
rev_tgt_vocab = dict(zip(tgt_vocab.values(), tgt_vocab.keys()))

# Dataset Class
class TranslationDataset(Dataset):
    """Index-aligned pairs of source and target sentences.

    Items are returned as raw ``(source, target)`` values; nothing is
    tokenized here. The vocabularies and ``max_len`` are only stored on the
    instance and are not read by any method of this class.
    """

    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab, max_len=50):
        self.src_sentences, self.tgt_sentences = src_sentences, tgt_sentences
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.max_len = max_len

    def __len__(self):
        # The dataset size is defined by the source side.
        return len(self.src_sentences)

    def __getitem__(self, idx):
        return self.src_sentences[idx], self.tgt_sentences[idx]

# Collate function to pad sequences
def collate_fn(batch):
    """Turn a list of (source, target) sentence pairs into two padded id tensors.

    Each sentence is split on whitespace, mapped through the module-level
    ``src_vocab`` / ``tgt_vocab`` (tokens missing from the vocabulary become
    ``<unk>``) and wrapped in ``<sos>`` ... ``<eos>``. Sequences are then
    padded batch-first to the longest one in the batch.

    NOTE(review): the padding value is the ``<unk>`` id, so padding and
    unknown tokens share one id.
    """
    def encode(sentence, vocab):
        ids = [vocab["<sos>"]]
        ids.extend(vocab.get(token, vocab["<unk>"]) for token in sentence.split())
        ids.append(vocab["<eos>"])
        return torch.tensor(ids)

    src_sentences, tgt_sentences = zip(*batch)

    src_batch = pad_sequence(
        [encode(sentence, src_vocab) for sentence in src_sentences],
        batch_first=True,
        padding_value=src_vocab["<unk>"],
    )
    tgt_batch = pad_sequence(
        [encode(sentence, tgt_vocab) for sentence in tgt_sentences],
        batch_first=True,
        padding_value=tgt_vocab["<unk>"],
    )
    return src_batch, tgt_batch

# Create dataset and dataloader
dataset = TranslationDataset(
    src_sentences=train_input,
    tgt_sentences=train_target,
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
)
# Batches of 16 sentence pairs, reshuffled every epoch and padded by collate_fn.
dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, batch_size=16)

# Translation Model
class TranslationModel(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table; also
            the size of the output logits.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        ff_dim: Hidden width of each layer's feed-forward sub-block.
        src_pad_idx: Source id treated as padding. Defaults to 2, the
            ``<unk>`` id assigned by ``build_vocab``, which this notebook
            also uses as its padding value.
        tgt_pad_idx: Target id treated as padding (same default).
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, num_heads, num_layers, ff_dim,
                 src_pad_idx=2, tgt_pad_idx=2):
        super().__init__()
        self.embed_dim = embed_dim
        # Keep the padding ids on the module instead of reading global
        # vocabularies inside forward().
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx

        self.encoder_embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embed_dim)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim), num_layers
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(embed_dim, num_heads, ff_dim), num_layers
        )

        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def _positional_encoding(self, length, device):
        """Return the (length, embed_dim) sinusoidal position table."""
        positions = torch.arange(length, dtype=torch.float32, device=device).unsqueeze(1)
        even_dims = torch.arange(0, self.embed_dim, 2, dtype=torch.float32, device=device)
        angles = positions / torch.pow(
            torch.tensor(10000.0, device=device), even_dims / self.embed_dim
        )
        encoding = torch.zeros(length, self.embed_dim, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles[:, : self.embed_dim // 2])
        return encoding

    def _embed(self, embedding, tokens):
        """Look up token embeddings and add position information.

        Attention by itself is order-agnostic, so without this term the
        encoder would treat the source sentence as an unordered set of tokens.
        """
        emb = embedding(tokens)
        return emb + self._positional_encoding(tokens.size(1), emb.device).to(emb.dtype)

    def forward(self, src, tgt):
        """Compute next-token logits.

        Args:
            src: Batch-first LongTensor of source ids, (batch, src_len).
            tgt: Batch-first LongTensor of decoder input ids, (batch, tgt_len).

        Returns:
            Tensor of shape (batch, tgt_len, tgt_vocab_size).
        """
        src_key_padding_mask = src == self.src_pad_idx  # True where padded
        tgt_key_padding_mask = tgt == self.tgt_pad_idx  # True where padded

        # Causal mask: -inf above the diagonal so position i cannot attend
        # to positions > i. Built directly rather than through a throw-away
        # nn.Transformer() instance, which would allocate a whole default
        # model on every call.
        tgt_seq_len = tgt.size(1)
        tgt_mask = torch.triu(
            torch.full((tgt_seq_len, tgt_seq_len), float("-inf"), device=tgt.device),
            diagonal=1,
        )

        src_emb = self._embed(self.encoder_embedding, src)
        tgt_emb = self._embed(self.decoder_embedding, tgt)

        # The layers are sequence-first (batch_first is left at its default),
        # hence the transposes around the encoder/decoder calls.
        memory = self.encoder(
            src_emb.transpose(0, 1),
            src_key_padding_mask=src_key_padding_mask
        )
        output = self.decoder(
            tgt_emb.transpose(0, 1),
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            # Stop cross-attention from reading padded source positions.
            memory_key_padding_mask=src_key_padding_mask
        )
        return self.fc(output.transpose(0, 1))

# Model Parameters
src_vocab_size, tgt_vocab_size = len(src_vocab), len(tgt_vocab)
embed_dim, num_heads, num_layers, ff_dim = 512, 8, 4, 4096

model = TranslationModel(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    ff_dim=ff_dim,
)

# Training Loop
# Positions holding the <unk> id (also used as padding by collate_fn) are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<unk>"])
optimizer = Adam(model.parameters(), lr=1e-4)
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        # Teacher forcing: the decoder is fed every token except the last
        # and is scored against the same sequence shifted left by one.
        tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_input).reshape(-1, tgt_vocab_size)
        loss = criterion(output, tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(dataloader)}")

# Translation Function
def translate(sentence, model, src_vocab, tgt_vocab, rev_tgt_vocab, max_len=50):
    """Greedily decode a translation of *sentence*.

    Args:
        sentence: Raw source text; it is normalized here with
            ``preprocess_sentence`` before tokenization.
        model: Callable as ``model(src, tgt)`` returning logits of shape
            (batch, tgt_len, vocab); switched to eval mode by this function.
        src_vocab: Source token -> id mapping containing ``<sos>``,
            ``<eos>`` and ``<unk>``.
        tgt_vocab: Target token -> id mapping containing ``<sos>`` and ``<eos>``.
        rev_tgt_vocab: Target id -> token mapping.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted target tokens joined by single spaces, without ``<eos>``.
    """
    model.eval()
    # The training sentences were run through preprocess_sentence (accents
    # stripped, lowercased, punctuation split off) before the vocabulary was
    # built, so raw input must be normalized the same way or cased/accented
    # words fall through to <unk>.
    sentence = preprocess_sentence(sentence)
    tokens = [src_vocab.get(t, src_vocab["<unk>"]) for t in sentence.split()]
    src = torch.tensor([src_vocab["<sos>"]] + tokens + [src_vocab["<eos>"]]).unsqueeze(0)
    tgt = torch.tensor([tgt_vocab["<sos>"]]).unsqueeze(0)
    eos_id = tgt_vocab["<eos>"]

    outputs = []
    with torch.no_grad():
        for _ in range(max_len):
            # Highest-scoring token at the last decoded position.
            next_token = model(src, tgt).argmax(-1)[:, -1].item()
            if next_token == eos_id:
                # Stop without recording <eos>. Trimming the final element
                # after the loop would also discard a real token whenever
                # decoding ends by reaching max_len instead.
                break
            outputs.append(next_token)
            tgt = torch.cat([tgt, torch.tensor([[next_token]])], dim=1)

    return " ".join(rev_tgt_vocab[t] for t in outputs)




--2024-11-16 14:08:51--  https://raw.githubusercontent.com/divar167/data/refs/heads/main/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74015 (72K) [text/plain]
Saving to: ‘train.txt’


2024-11-16 14:08:52 (712 KB/s) - ‘train.txt’ saved [74015/74015]





Epoch 1, Loss: 4.7976017618179325
Epoch 2, Loss: 3.479969654083252
Epoch 3, Loss: 2.7090363693237305
Epoch 4, Loss: 2.1963003063201905
Epoch 5, Loss: 1.7973324251174927
Epoch 6, Loss: 1.4727373778820039
Epoch 7, Loss: 1.2140832459926605
Epoch 8, Loss: 0.9812359869480133
Epoch 9, Loss: 0.8077787256240845


In [None]:
from prettytable import PrettyTable

# Test the Model
# Source phrases to translate and, index-aligned, their reference translations.
test_sentence = [
    "Adam Dârayavauš xšâyathiya vazraka",
    "aivam parûvnâm xšâyathiyam ",
    "hya šiyâtim adâ martiyahyâ ",
    "hya martiyam adâ",
    "hauv xšâyathiya abava",
    "hya avam asmânam adâ",
    "šiyâtim hauv agarbâyatâ vavam",
    "Yadiy Pârsa pâta ahatiy",
    "Ariya Ariya ciça",
]
data_translation = [
    "I am Darius the great king",
    "one king for many",
    "who created happiness for man ",
    "who created man ",
    "he became king",
    "who created yonder sky ",
    "he seized the happiness of my family",
    "If the Persian people are protected",
    "an Aryan of Aryan descent  ",
]

# Comparison table: source phrase, reference translation, model output.
table = PrettyTable()
table.field_names = ["Sentence", "Data Translation", "Model Translation"]

# One row per phrase; surrounding whitespace is trimmed for display only.
for source, reference in zip(test_sentence, data_translation):
    hypothesis = translate(source, model, src_vocab, tgt_vocab, rev_tgt_vocab)
    table.add_row([text.strip() for text in (source, reference, hypothesis)])

# Print the table
print(table)


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
import unicodedata
import re
from sklearn.model_selection import train_test_split

# Dataset Parsing
SEPARATOR = '<sep>'
!wget https://raw.githubusercontent.com/divar167/data/refs/heads/main/train.txt -O train.txt
with open('train.txt') as file:
    train = [line.rstrip() for line in file]

# Unicode normalization
def normalize_unicode(s):
    """Drop combining marks from *s*.

    NFD decomposition separates accents from their base letters; characters
    in Unicode category 'Mn' are then filtered out.
    """
    base_chars = filter(
        lambda ch: unicodedata.category(ch) != 'Mn',
        unicodedata.normalize('NFD', s),
    )
    return ''.join(base_chars)

def preprocess_sentence(s):
    """Clean a raw sentence so that ``str.split`` yields its tokens.

    Steps, in order: strip combining marks, surround each of ``? . ! , ¿``
    with spaces, collapse runs of spaces and double-quote characters into
    one space, lowercase, and trim both ends.
    """
    text = normalize_unicode(s)
    # Isolate punctuation as separate tokens.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # The character class holds a space and a double quote, so quotes are
    # replaced by a space as well.
    text = re.sub(r'[" "]+', " ", text)
    return text.lower().strip()

# Filter and preprocess the dataset
# Only lines that contain the separator are usable as (source, target) pairs.
train_filtered = [pair for pair in train if pair.strip() != '' and SEPARATOR in pair]
# Transpose the split pairs into one column per side, then strip and
# preprocess every sentence of each column.
train_input, train_target = (
    [preprocess_sentence(text.strip()) for text in column]
    for column in zip(*(pair.split(SEPARATOR) for pair in train_filtered))
)

# Vocabulary Creation
def build_vocab(sentences):
    """Assign an integer id to every distinct whitespace-separated token.

    ``<sos>``, ``<eos>`` and ``<unk>`` always take ids 0, 1 and 2; remaining
    tokens are numbered consecutively in order of first appearance.
    """
    vocab = {"<sos>": 0, "<eos>": 1, "<unk>": 2}
    all_words = (word for line in sentences for word in line.split())
    for word in all_words:
        if word in vocab:
            continue
        # The current size is the next unused id.
        vocab[word] = len(vocab)
    return vocab

# Vocabularies are built from the full corpus, before the train/validation split below.
src_vocab, tgt_vocab = (build_vocab(side) for side in (train_input, train_target))
# Inverse target mapping (id -> token), used to turn predicted ids back into text.
rev_tgt_vocab = dict(zip(tgt_vocab.values(), tgt_vocab.keys()))

# Dataset Class
class TranslationDataset(Dataset):
    """Parallel corpus exposed as ``(source, target)`` items.

    Sentences are handed back exactly as stored; the vocabularies and
    ``max_len`` are kept as attributes but no method of this class reads them.
    """

    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab, max_len=50):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.max_len = max_len

    def __len__(self):
        # Length is taken from the source side only.
        return len(self.src_sentences)

    def __getitem__(self, idx):
        pair = (self.src_sentences[idx], self.tgt_sentences[idx])
        return pair

# Collate function to pad sequences
def collate_fn(batch):
    """Encode and pad a batch of (source, target) sentence pairs.

    Sentences are split on whitespace and mapped through the module-level
    ``src_vocab`` / ``tgt_vocab`` (out-of-vocabulary tokens become
    ``<unk>``), with ``<sos>`` prepended and ``<eos>`` appended. Both sides
    are padded batch-first to their longest sequence.

    NOTE(review): the padding value is the ``<unk>`` id, so padding and
    unknown tokens share one id.
    """
    def to_ids(sentence, vocab):
        unk = vocab["<unk>"]
        body = [vocab.get(token, unk) for token in sentence.split()]
        return torch.tensor([vocab["<sos>"], *body, vocab["<eos>"]])

    sources, targets = zip(*batch)
    src_ids = [to_ids(sentence, src_vocab) for sentence in sources]
    tgt_ids = [to_ids(sentence, tgt_vocab) for sentence in targets]
    return (
        pad_sequence(src_ids, batch_first=True, padding_value=src_vocab["<unk>"]),
        pad_sequence(tgt_ids, batch_first=True, padding_value=tgt_vocab["<unk>"]),
    )

# Split the data into training and validation sets
# 10% of the sentence pairs are held out; the fixed seed keeps the split repeatable.
train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, random_state=42, test_size=0.1
)

# Create datasets and dataloaders for training and validation
train_dataset = TranslationDataset(
    src_sentences=train_input, tgt_sentences=train_target,
    src_vocab=src_vocab, tgt_vocab=tgt_vocab,
)
val_dataset = TranslationDataset(
    src_sentences=val_input, tgt_sentences=val_target,
    src_vocab=src_vocab, tgt_vocab=tgt_vocab,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, shuffle=False, batch_size=32)

# Translation Model
class TranslationModel(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table; also
            the size of the output logits.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        ff_dim: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability passed to every encoder/decoder layer.
        src_pad_idx: Source id treated as padding. Defaults to 2, the
            ``<unk>`` id assigned by ``build_vocab``, which this notebook
            also uses as its padding value.
        tgt_pad_idx: Target id treated as padding (same default).
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, num_heads, num_layers, ff_dim, dropout=0.2,
                 src_pad_idx=2, tgt_pad_idx=2):
        super().__init__()
        self.embed_dim = embed_dim
        # Keep the padding ids on the module instead of reading global
        # vocabularies inside forward().
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx

        self.encoder_embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embed_dim)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout=dropout), num_layers
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(embed_dim, num_heads, ff_dim, dropout=dropout), num_layers
        )

        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def _positional_encoding(self, length, device):
        """Return the (length, embed_dim) sinusoidal position table."""
        positions = torch.arange(length, dtype=torch.float32, device=device).unsqueeze(1)
        even_dims = torch.arange(0, self.embed_dim, 2, dtype=torch.float32, device=device)
        angles = positions / torch.pow(
            torch.tensor(10000.0, device=device), even_dims / self.embed_dim
        )
        encoding = torch.zeros(length, self.embed_dim, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles[:, : self.embed_dim // 2])
        return encoding

    def _embed(self, embedding, tokens):
        """Look up token embeddings and add position information.

        Attention by itself is order-agnostic, so without this term the
        encoder would treat the source sentence as an unordered set of tokens.
        """
        emb = embedding(tokens)
        return emb + self._positional_encoding(tokens.size(1), emb.device).to(emb.dtype)

    def forward(self, src, tgt):
        """Compute next-token logits.

        Args:
            src: Batch-first LongTensor of source ids, (batch, src_len).
            tgt: Batch-first LongTensor of decoder input ids, (batch, tgt_len).

        Returns:
            Tensor of shape (batch, tgt_len, tgt_vocab_size).
        """
        src_key_padding_mask = src == self.src_pad_idx  # True where padded
        tgt_key_padding_mask = tgt == self.tgt_pad_idx  # True where padded

        # Causal mask: -inf above the diagonal so position i cannot attend
        # to positions > i. Built directly rather than through a throw-away
        # nn.Transformer() instance, which would allocate a whole default
        # model on every call.
        tgt_seq_len = tgt.size(1)
        tgt_mask = torch.triu(
            torch.full((tgt_seq_len, tgt_seq_len), float("-inf"), device=tgt.device),
            diagonal=1,
        )

        src_emb = self._embed(self.encoder_embedding, src)
        tgt_emb = self._embed(self.decoder_embedding, tgt)

        # The layers are sequence-first (batch_first is left at its default),
        # hence the transposes around the encoder/decoder calls.
        memory = self.encoder(
            src_emb.transpose(0, 1),
            src_key_padding_mask=src_key_padding_mask
        )
        output = self.decoder(
            tgt_emb.transpose(0, 1),
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            # Stop cross-attention from reading padded source positions.
            memory_key_padding_mask=src_key_padding_mask
        )
        return self.fc(output.transpose(0, 1))

# Model Parameters
src_vocab_size, tgt_vocab_size = len(src_vocab), len(tgt_vocab)
embed_dim, num_heads, num_layers = 1024, 16, 8
ff_dim = 4096
dropout = 0.2

model = TranslationModel(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    ff_dim=ff_dim,
    dropout=dropout,
)

# Optimizer and Loss Function
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=1e-6)
# Halve the learning rate once the validation loss has stopped improving
# for more than `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
# Positions holding the <unk> id (also used as padding by collate_fn) are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<unk>"])

# Training Loop
num_epochs = 20

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for src, tgt in train_dataloader:
        # Teacher forcing: feed all tokens but the last, predict the
        # sequence shifted left by one.
        tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_input).reshape(-1, tgt_vocab_size)
        loss = criterion(output, tgt_output.reshape(-1))
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()
        train_loss += loss.item()

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in val_dataloader:
            tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]
            output = model(src, tgt_input).reshape(-1, tgt_vocab_size)
            loss = criterion(output, tgt_output.reshape(-1))
            val_loss += loss.item()

    # The scheduler tracks the mean validation loss per batch.
    scheduler.step(val_loss / len(val_dataloader))
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss / len(train_dataloader):.4f}, Val Loss: {val_loss / len(val_dataloader):.4f}")

# Translation Function
def translate2(sentence, model, src_vocab, tgt_vocab, rev_tgt_vocab, max_len=50):
    """Greedily decode a translation of *sentence*.

    Args:
        sentence: Raw source text; it is normalized here with
            ``preprocess_sentence`` before tokenization.
        model: Callable as ``model(src, tgt)`` returning logits of shape
            (batch, tgt_len, vocab); switched to eval mode by this function.
        src_vocab: Source token -> id mapping containing ``<sos>``,
            ``<eos>`` and ``<unk>``.
        tgt_vocab: Target token -> id mapping containing ``<sos>`` and ``<eos>``.
        rev_tgt_vocab: Target id -> token mapping.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted target tokens joined by single spaces, without ``<eos>``.
    """
    model.eval()
    # The training sentences were run through preprocess_sentence (accents
    # stripped, lowercased, punctuation split off) before the vocabulary was
    # built, so raw input must be normalized the same way or cased/accented
    # words fall through to <unk>.
    sentence = preprocess_sentence(sentence)
    tokens = [src_vocab.get(t, src_vocab["<unk>"]) for t in sentence.split()]
    src = torch.tensor([src_vocab["<sos>"]] + tokens + [src_vocab["<eos>"]]).unsqueeze(0)
    tgt = torch.tensor([tgt_vocab["<sos>"]]).unsqueeze(0)
    eos_id = tgt_vocab["<eos>"]

    outputs = []
    with torch.no_grad():
        for _ in range(max_len):
            # Highest-scoring token at the last decoded position.
            next_token = model(src, tgt).argmax(-1)[:, -1].item()
            if next_token == eos_id:
                # Stop without recording <eos>. Trimming the final element
                # after the loop would also discard a real token whenever
                # decoding ends by reaching max_len instead.
                break
            outputs.append(next_token)
            tgt = torch.cat([tgt, torch.tensor([[next_token]])], dim=1)

    return " ".join(rev_tgt_vocab[t] for t in outputs)


--2024-11-19 04:35:16--  https://raw.githubusercontent.com/divar167/data/refs/heads/main/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72818 (71K) [text/plain]
Saving to: ‘train.txt’


2024-11-19 04:35:16 (4.56 MB/s) - ‘train.txt’ saved [72818/72818]





Epoch 1, Train Loss: 5.0974, Val Loss: 4.6235
Epoch 2, Train Loss: 4.2667, Val Loss: 3.8892


In [None]:
from prettytable import PrettyTable

# Test the Model
# Source phrases to translate and, index-aligned, their reference translations.
test_sentence = [
    "Adam Dârayavauš xšâyathiya vazraka",
    "aivam parûvnâm xšâyathiyam ",
    "hya šiyâtim adâ martiyahyâ ",
    "hya martiyam adâ",
    "hauv xšâyathiya abava",
    "hya avam asmânam adâ",
    "šiyâtim hauv agarbâyatâ vavam",
    "Yadiy Pârsa pâta ahatiy",
    "Ariya Ariya ciça",
]
data_translation = [
    "I am Darius the great king",
    "one king for many",
    "who created happiness for man ",
    "who created man ",
    "he became king",
    "who created yonder sky ",
    "he seized the happiness of my family",
    "If the Persian people are protected",
    "an Aryan of Aryan descent  ",
]

# Comparison table: source phrase, reference translation, model output.
table = PrettyTable()
table.field_names = ["Sentence", "Data Translation", "Model Translation"]

# Each row is added to the table and also kept in output_rows as a plain list.
output_rows = []
for source, reference in zip(test_sentence, data_translation):
    hypothesis = translate2(source, model, src_vocab, tgt_vocab, rev_tgt_vocab)
    row = [text.strip() for text in (source, reference, hypothesis)]
    output_rows.append(row)
    table.add_row(row)

# Print the table in the notebook
print(table)

# Optional export, left disabled: write the table to a text file and
# download it from Colab.
# with open("translations.txt", "w") as file:
#     file.write(str(table))

# from google.colab import files
# files.download("translations.txt")


+------------------------------------+--------------------------------------+--------------------------------+
|              Sentence              |           Data Translation           |       Model Translation        |
+------------------------------------+--------------------------------------+--------------------------------+
| Adam Dârayavauš xšâyathiya vazraka |      I am Darius the great king      |   I am Darius the great king   |
|     aivam parûvnâm xšâyathiyam     |          one king for many           |       one ruler of many        |
|     hya šiyâtim adâ martiyahyâ     |    who created happiness for man     |          who created           |
|          hya martiyam adâ          |           who created man            |        who created man         |
|       hauv xšâyathiya abava        |            he became king            |         He became king         |
|        hya avam asmânam adâ        |        who created yonder sky        |     who created yonder sky     |
|