In [None]:
import os
import re
import math
import warnings
import evaluate

import numpy as np
import matplotlib.pyplot as plt
from ftfy import fix_text
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, Subset, DataLoader

from transformers import MarianMTModel, MarianTokenizer

In [None]:
DATA_DIR = "data"
DATA_DIR2 = DATA_DIR

# DATA_DIR = "/kaggle/input/es-tw-dataset/"
# DATA_DIR2 = "/kaggle/working"

if not os.path.exists(DATA_DIR):
    !mkdir {DATA_DIR}

    # Data sourced from here: https://opus.nlpl.eu/results/en&tw/corpus-result-table
    url = "https://object.pouta.csc.fi/OPUS-NLLB/v1/moses/en-tw.txt.zip"

    !wget {url} -O {DATA_DIR}/en-tw.txt.zip
    !unzip -j {DATA_DIR}/en-tw.txt.zip -d {DATA_DIR}/ '*.tw' '*.en'
    !rm {DATA_DIR}/en-tw.txt.zip
    !cat {DATA_DIR}/NLLB.en-tw.en >> {DATA_DIR}/eng.txt && cat {DATA_DIR}/NLLB.en-tw.tw >> {DATA_DIR}/twi.txt
    !rm {DATA_DIR}/*en {DATA_DIR}/*tw

if not os.path.exists(DATA_DIR2):
    !mkdir {DATA_DIR2}

if not os.path.exists("models"):
    !mkdir models

In [None]:
def translate_data(lines, model_name="Helsinki-NLP/opus-mt-en-es", device="cuda"):
    """Translate a batch of sentences with a pretrained MarianMT model.

    Args:
        lines: list of source sentences (one string per sentence).
        model_name: Hugging Face model id passed to ``from_pretrained``.
        device: torch device the model and the tokenized batch are moved to.

    Returns:
        List of translated strings, one per input sentence, after passing
        each through ``ftfy.fix_text``.
    """
    # Nothing to translate: avoid loading the model for an empty batch.
    if isinstance(lines, (list, tuple)) and not lines:
        return []

    # NOTE(review): the tokenizer and model are re-created on every call; when
    # this is invoked once per batch that load is repeated for each batch.
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    inputs = tokenizer(lines, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    outputs = model.generate(**inputs)
    translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Return the ftfy-repaired text (previously computed but then discarded).
    return [fix_text(text) for text in translated]

def translate_save_data(input_file, output_file, batch_size=100):
    """Translate ``input_file`` line by line and write the result to ``output_file``.

    Does nothing when ``output_file`` already exists. Otherwise the input is
    read fully, translated in batches of ``batch_size`` lines via
    ``translate_data`` and written one translation per line.

    Args:
        input_file: path of the UTF-8 text file to translate.
        output_file: path of the UTF-8 text file to create.
        batch_size: number of lines sent to ``translate_data`` per call.
    """
    if os.path.exists(output_file):
        return

    # Write to a sibling file first and only promote it once every batch has
    # been written; an interrupted run then cannot leave a truncated
    # ``output_file`` that a later call would mistake for a finished one.
    partial_file = output_file + ".partial"

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        with open(input_file, 'r', encoding='utf-8') as in_file:
            lines = in_file.readlines()
        total_lines = len(lines)

        with open(partial_file, 'w', encoding='utf-8') as out_file:
            for i in tqdm(range(0, total_lines, batch_size), desc="Translating", unit="batch"):
                batch = [line.strip() for line in lines[i:i+batch_size]]
                translations = translate_data(batch)
                for translation in translations:
                    out_file.write(translation + '\n')

    os.replace(partial_file, output_file)

In [None]:
def clean_data(input_file_lang1, input_file_lang2, output_file_lang1, output_file_lang2):
    """Normalise and de-duplicate a line-aligned parallel corpus.

    Both input files are read in lockstep. Every line is stripped, has all
    characters outside the allowed set removed, is lower-cased, and loses any
    whitespace-separated word that contains a digit. A pair is dropped when
    either cleaned side is shorter than 5 characters or when the exact pair
    has already been written. Surviving pairs are written to the two output
    files, one sentence per line.
    """
    disallowed = re.compile(r'[^\wáéíóúüÁÉÍÓÚÜ\s?!.¿¡\'\:]')

    def _normalise(raw):
        # Strip -> drop disallowed characters -> strip again -> lower-case.
        text = disallowed.sub('', raw.strip()).strip().lower()
        # Discard every word that carries at least one digit.
        kept = [token for token in text.split() if not any(ch.isdigit() for ch in token)]
        return ' '.join(kept)

    emitted_pairs = set()

    with open(input_file_lang1, 'r', encoding='utf-8') as src1, \
         open(input_file_lang2, 'r', encoding='utf-8') as src2, \
         open(output_file_lang1, 'w', encoding='utf-8') as dst1, \
         open(output_file_lang2, 'w', encoding='utf-8') as dst2:

        for raw1, raw2 in zip(src1, src2):
            pair = (_normalise(raw1), _normalise(raw2))

            too_short = min(len(pair[0]), len(pair[1])) < 5
            if too_short or pair in emitted_pairs:
                continue

            emitted_pairs.add(pair)
            dst1.write(pair[0] + '\n')
            dst2.write(pair[1] + '\n')


In [None]:
# translate_save_data(f'{DATA_DIR}/eng.txt', f'{DATA_DIR}/esp.txt')  # 2,5 hours for 100,000 sentences
# clean_data(f'{DATA_DIR}/esp.txt', f'{DATA_DIR}/twi.txt', f'{DATA_DIR2}/esp2.txt', f'{DATA_DIR2}/twi2.txt')

In [None]:
# Special symbols shared by both character vocabularies; they occupy
# indices 0-3 in the order listed here.
START_TOKEN, PADDING_TOKEN, END_TOKEN, UNK_TOKEN = special_tokens = ['<sos>', '<pad>', '<eos>', '<unk>']

# Character-level vocabularies: special tokens first, then one entry per character.
esp_vocab = [*special_tokens, *"aábcdeéfghiíjklmnñoópqrstuúüvwxyz.,¿?¡!': "]
twi_vocab = [*special_tokens, *"abcdeɛfghijklmnoɔpqrstuvwxyz.,?!': "]

# Lookup tables in both directions (index -> symbol and symbol -> index).
index_to_esp = dict(enumerate(esp_vocab))
esp_to_index = {symbol: idx for idx, symbol in index_to_esp.items()}
index_to_twi = dict(enumerate(twi_vocab))
twi_to_index = {symbol: idx for idx, symbol in index_to_twi.items()}

def encode(line, lang, max_len):
    """Turn a sentence into a fixed-length list of character indices.

    The result is ``<sos>``, one index per character of ``line`` (characters
    missing from ``lang`` map to ``<unk>``), ``<eos>``, then ``<pad>`` up to
    ``max_len``. Sentences that do not fit are cut so that the final element
    is still ``<eos>``.

    Args:
        line: the sentence to encode, iterated character by character.
        lang: symbol -> index mapping that contains the special tokens.
        max_len: length of the returned list (for ``max_len >= 1``).

    Returns:
        List of ``max_len`` integer indices.
    """
    result = [lang[START_TOKEN]]
    for char in line:
        # Explicit membership test instead of a bare ``except:`` so that only
        # genuinely unknown characters become <unk> and real errors surface.
        result.append(lang[char] if char in lang else lang[UNK_TOKEN])

    if len(result) >= max_len:
        # Too long: keep max_len - 1 entries and close the sequence with <eos>.
        result = result[:max_len - 1]
        result.append(lang[END_TOKEN])
    else:
        result.append(lang[END_TOKEN])
        result += [lang[PADDING_TOKEN]] * (max_len - len(result))

    return result

def decode(line, lang, skip_special_tokens=False):
    """Map a sequence of indices back to text.

    Args:
        line: iterable of indices.
        lang: index -> symbol mapping.
        skip_special_tokens: when true, symbols listed in ``special_tokens``
            are left out of the result.

    Returns:
        The looked-up symbols concatenated into a single string.
    """
    symbols = (lang[index] for index in line)
    if skip_special_tokens:
        symbols = (symbol for symbol in symbols if symbol not in special_tokens)
    return ''.join(symbols)

# Display both index -> symbol tables, one per line.
print(index_to_esp, index_to_twi, sep='\n')

In [None]:
# Upper bound on how many sentence pairs are loaded from the cleaned files.
n_instances = 85326

with open(f'{DATA_DIR2}/esp2.txt', 'r', encoding='utf-8') as file_esp, open(f'{DATA_DIR2}/twi2.txt', 'r', encoding='utf-8') as file_twi:
    x = [line.rstrip('\n') for line in file_esp.readlines()[:n_instances]]
    y = [line.rstrip('\n') for line in file_twi.readlines()[:n_instances]]

# The two files are line-aligned translations of each other; a length mismatch
# or an empty corpus would invalidate everything computed below.
assert len(x) == len(y), f"parallel corpus is misaligned: {len(x)} Spanish vs {len(y)} Twi lines"
assert len(x) > 0, "no sentence pairs were loaded"

len_esp = [len(line) for line in x]
len_twi = [len(line) for line in y]

# Report the number of lines actually loaded, which can be smaller than the
# requested cap when the files are shorter.
print(f"Dataset size: {len(x)} lines")
print(f"Spanish vocabulary size: {len(esp_vocab)} characters")
print(f"Twi vocabulary size: {len(twi_vocab)} characters\n")

percentile = 99

print(f"Average length of Spanish sentences: {int(np.mean(len_esp))} characters")
print(f"Average length of Twi sentences: {int(np.mean(len_twi))} characters\n")

print(f"{percentile}th percentile of Spanish sentence lengths:", int(np.percentile(len_esp, percentile)))
print(f"{percentile}th percentile of Twi sentence lengths:", int(np.percentile(len_twi, percentile)), '\n')

# How many of the longest sentence lengths to list per language.
len_lengths_list = 40

top_esp = sorted(len_esp, reverse=True)[:len_lengths_list]
top_twi = sorted(len_twi, reverse=True)[:len_lengths_list]

print(f"Top {len_lengths_list} longest line lengths for Spanish:")
print(top_esp)

print(f"\nTop {len_lengths_list} longest line lengths for Twi:")
print(top_twi)

In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence model: token embeddings plus sinusoidal positions
    feeding ``nn.Transformer`` (batch-first), followed by a linear projection
    onto the target vocabulary.

    Args:
        embedding_size: width of the embeddings and of the transformer.
        src_vocab_size: number of source symbols.
        tgt_vocab_size: number of target symbols (size of the output logits).
        src_pad_idx: index of the padding symbol in the source vocabulary.
        tgt_pad_idx: index of the padding symbol in the target vocabulary.
        num_heads: attention heads passed to ``nn.Transformer``.
        num_encoder_layers: encoder depth.
        num_decoder_layers: decoder depth.
        forward_expansion: passed to ``nn.Transformer`` as ``dim_feedforward``.
        dropout: dropout probability (embeddings and transformer).
        max_len: stored for reference only; positions are generated on the
            fly for whatever sequence length ``forward`` receives.
        device: device on which masks and positional encodings are created.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        tgt_vocab_size,
        src_pad_idx,
        tgt_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device
    ):
        super(Transformer, self).__init__()
        # Kept on the instance so forward() does not depend on a module-level
        # variable that happens to share the name.
        self.embedding_size = embedding_size
        self.max_len = max_len

        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.tgt_word_embedding = nn.Embedding(tgt_vocab_size, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(embedding_size, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx

    def make_src_mask(self, src):
        """Boolean mask, True where ``src`` holds the source padding index."""
        src_mask = (src == self.src_pad_idx)
        return src_mask.to(self.device).bool()

    def make_tgt_mask(self, tgt):
        """Boolean mask, True where ``tgt`` holds the target padding index."""
        tgt_mask = (tgt == self.tgt_pad_idx)
        return tgt_mask.to(self.device).bool()

    def positional_encoding(self, seq_length, embedding_size):
        """Sinusoidal position table of shape ``(1, seq_length, embedding_size)``.

        Even columns hold sines and odd columns cosines of the position scaled
        by geometrically spaced frequencies.
        """
        pe = torch.zeros(seq_length, embedding_size, device=self.device)
        position = torch.arange(0, seq_length, dtype=torch.float, device=self.device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_size, 2, dtype=torch.float, device=self.device) * (-math.log(10000.0) / embedding_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, src, tgt):
        """Compute target-vocabulary logits.

        Args:
            src: ``(batch, src_len)`` tensor of source indices.
            tgt: ``(batch, tgt_len)`` tensor of target indices (decoder input).

        Returns:
            ``(batch, tgt_len, tgt_vocab_size)`` tensor of logits.
        """
        batch_size, src_seq_length = src.shape
        batch_size, tgt_seq_length = tgt.shape

        embed_src = self.src_word_embedding(src) + self.positional_encoding(src_seq_length, self.embedding_size)
        embed_tgt = self.tgt_word_embedding(tgt) + self.positional_encoding(tgt_seq_length, self.embedding_size)

        embed_src = self.dropout(embed_src)
        embed_tgt = self.dropout(embed_tgt)

        src_padding_mask = self.make_src_mask(src)
        tgt_padding_mask = self.make_tgt_mask(tgt)
        # Causal mask: True above the diagonal blocks attention to later positions.
        tgt_mask = torch.triu(torch.ones(tgt_seq_length, tgt_seq_length, dtype=torch.bool, device=self.device), diagonal=1)

        out = self.transformer(
            embed_src,
            embed_tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            # Also hide source padding from the decoder's cross-attention;
            # without this the decoder attends to encoder outputs at <pad>.
            memory_key_padding_mask=src_padding_mask,
        )

        out = self.fc_out(out)

        return out


In [None]:
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# --- Vocabulary-derived sizes and padding indices ---
src_vocab_size, tgt_vocab_size = len(esp_vocab), len(twi_vocab)
src_pad_idx = esp_to_index[PADDING_TOKEN]
tgt_pad_idx = twi_to_index[PADDING_TOKEN]

# --- Architecture hyperparameters ---
max_len = 250              # fixed length every encoded sentence is padded/cut to
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
forward_expansion = 2048   # feed-forward width inside each transformer layer
dropout = 0.1

model = Transformer(
    embedding_size=embedding_size,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    src_pad_idx=src_pad_idx,
    tgt_pad_idx=tgt_pad_idx,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_len=max_len,
    device=device,
).to(device)

# --- Optimisation settings ---
num_epochs = 55
learning_rate = 1e-4

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases —
# confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, verbose=True)

In [None]:
class EspTwiDataset(Dataset):
    """Parallel corpus stored as pre-encoded index lists.

    Every source and target sentence is run through ``encode`` once at
    construction time; items are returned as a pair of tensors.
    """

    def __init__(self, src_lines, tgt_lines, src_lang, tgt_lang, max_len):
        self.src_lines = [encode(sentence, src_lang, max_len) for sentence in src_lines]
        self.tgt_lines = [encode(sentence, tgt_lang, max_len) for sentence in tgt_lines]

    def __len__(self):
        # Number of encoded source sentences.
        return len(self.src_lines)

    def __getitem__(self, idx):
        src_ids = torch.tensor(self.src_lines[idx])
        tgt_ids = torch.tensor(self.tgt_lines[idx])
        return src_ids, tgt_ids

In [None]:
# Encode the whole corpus once: Spanish is the source side, Twi the target side.
dataset = EspTwiDataset(
    src_lines=x,
    tgt_lines=y,
    src_lang=esp_to_index,
    tgt_lang=twi_to_index,
    max_len=max_len,
)

In [None]:
batch_size = 16

# Fixed seed so the train/dev/test partition is identical on every run; without
# it, a checkpoint reloaded in a fresh session would be evaluated on a "test"
# set that may overlap the data it was trained on.
SPLIT_SEED = 42

# train_test_split shuffles by itself, so no separate shuffle of the indices is needed.
indices = np.arange(len(dataset))
train_indices, tmp_indices = train_test_split(indices, train_size=0.9, random_state=SPLIT_SEED)
# The held-out 10% is divided 98% / 2% into dev and test.
dev_indices, test_indices = train_test_split(tmp_indices, train_size=0.98, random_state=SPLIT_SEED)

train_set = Subset(dataset, train_indices)
dev_set = Subset(dataset, dev_indices)
test_set = Subset(dataset, test_indices)

# Only the training loader needs a new order each epoch; evaluation loaders
# keep a fixed order so reported examples and metrics are repeatable.
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True)
dev_dl = DataLoader(dev_set, batch_size=batch_size, shuffle=False)
test_dl = DataLoader(test_set, batch_size=batch_size, shuffle=False)

print(f"train set size: {len(train_set)}")
print(f"dev set size: {len(dev_set)}")
print(f"test set size: {len(test_set)}")

In [None]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean batch loss.

    The decoder is fed the target without its last position and is trained
    to predict the target shifted left by one (teacher forcing).

    Args:
        model: module called as ``model(src, tgt_input)`` returning logits.
        dataloader: iterable yielding ``(src, tgt)`` index tensors.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking ``(logits_2d, targets_1d)``.
        device: device the batches are moved to.

    Returns:
        Average of the per-batch losses (0.0 for an empty dataloader).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    # Iterate the loader itself: every batch is visited exactly once per epoch.
    # (Calling next(iter(dataloader)) per step would rebuild the iterator and
    # only ever draw the first batch of a fresh shuffle.)
    progress = tqdm(dataloader, desc="training", unit="batch")
    for src, tgt in progress:
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        output = output.view(-1, output.size(-1))
        tgt = tgt[:, 1:].contiguous().view(-1)

        loss = criterion(output, tgt)
        total_loss += loss.item()
        num_batches += 1

        loss.backward()
        optimizer.step()

        progress.set_description(f"training loss: {loss.item():.10f}")

    return total_loss / max(num_batches, 1)

def validate_model(model, dev_loader, criterion, device):
    """Compute the mean batch loss on ``dev_loader`` without updating weights.

    Args:
        model: module called as ``model(src, tgt_input)`` returning logits.
        dev_loader: iterable yielding ``(src, tgt)`` index tensors.
        criterion: loss taking ``(logits_2d, targets_1d)``.
        device: device the batches are moved to.

    Returns:
        Average of the per-batch losses (0.0 for an empty loader).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        # Wrapping the loader lets the bar advance with every batch.
        progress = tqdm(dev_loader, desc="validation", unit="batch")
        for src, tgt in progress:
            src, tgt = src.to(device), tgt.to(device)

            output = model(src, tgt[:, :-1])
            output = output.view(-1, output.size(-1))
            tgt = tgt[:, 1:].contiguous().view(-1)

            loss = criterion(output, tgt)
            total_loss += loss.item()
            num_batches += 1

            progress.set_description(f"validation loss: {loss.item():.10f}")

    return total_loss / max(num_batches, 1)


def _report_epoch_loss(label, losses, epoch, trailer=""):
    """Print the latest average loss and, from the second epoch on, its change."""
    current = losses[-1]
    message = f"average {label} loss for epoch {epoch + 1}: {current:.10f}"
    if len(losses) > 1:
        delta = current - losses[-2]
        sign = "+" if delta > 0 else ""
        message += f" ({sign}{delta:.10f})"
    print(message + trailer)


def train_loop(model, train_loader, dev_loader, optimizer, criterion, device, num_epochs, scheduler=None):
    """Train for ``num_epochs`` epochs, validating and checkpointing after each.

    After every epoch the dev loss is passed to the LR scheduler (if any) and
    the current weights are saved to ``models/e{num_epochs}_b{batch_size}.pt``,
    overwriting the previous epoch's file. When training ends, the train and
    validation loss curves are plotted.

    Args:
        model: the network to train.
        train_loader: loader passed to ``train_model``.
        dev_loader: loader passed to ``validate_model``.
        optimizer: optimizer used for training.
        criterion: loss function.
        device: device for the model, criterion and batches.
        num_epochs: number of epochs to run.
        scheduler: optional scheduler stepped with the dev loss. When omitted,
            a module-level ``scheduler`` is used if one exists.

    Returns:
        The trained model (weights of the last epoch).
    """
    if scheduler is None:
        # Backward compatibility: earlier callers relied on a global scheduler.
        scheduler = globals().get("scheduler")

    model.to(device)
    criterion.to(device)

    # Read the reported settings from the objects actually used for training
    # rather than from notebook-level variables of the same name.
    batch_size = getattr(train_loader, "batch_size", None)
    learning_rate = optimizer.param_groups[0]["lr"]
    dropout = getattr(getattr(model, "dropout", None), "p", None)

    train_losses = []
    dev_losses = []
    train_info = f"NUM_EPOCHS={num_epochs}   BATCH_SIZE={batch_size}   LEARNING_RATE={learning_rate}   DROPOUT={dropout}"

    print(train_info)

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("models", exist_ok=True)
    checkpoint_path = f'models/e{num_epochs}_b{batch_size}.pt'

    for epoch in range(num_epochs):
        print(f"\nEPOCH {epoch + 1}/{num_epochs}")
        train_losses.append(train_model(model, train_loader, optimizer, criterion, device))
        _report_epoch_loss("train", train_losses, epoch)

        dev_losses.append(validate_model(model, dev_loader, criterion, device))
        _report_epoch_loss("val", dev_losses, epoch, trailer="\n")

        if scheduler is not None:
            scheduler.step(dev_losses[-1])
        torch.save(model.state_dict(), checkpoint_path)


    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

    ax1.plot(train_losses, label='Train Loss', color='blue')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.set_title('Training Loss')
    ax1.legend()

    ax2.plot(dev_losses, label='Validation Loss', color='green')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Loss')
    ax2.set_title('Validation Loss')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    return model

In [None]:
# Train the model; the returned object is the same model after the last epoch.
model = train_loop(
    model=model,
    train_loader=train_dl,
    dev_loader=dev_dl,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=num_epochs,
)

In [None]:
# model.load_state_dict(torch.load(f'models/e{num_epochs}_b{batch_size}.pt'))
# model = model.to(device)

In [None]:

def translate(src, model, skip_special_tokens=False):
    """Greedily translate one Spanish sentence into Twi.

    The source is encoded once; the target starts as ``<sos>`` and grows by
    the arg-max symbol of the last position until ``<eos>`` is produced or
    the sequence reaches ``max_len``.

    Args:
        src: source sentence as a string.
        model: module called as ``model(src_ids, tgt_ids)`` returning logits.
        skip_special_tokens: forwarded to ``decode`` for the returned text.

    Returns:
        Tuple ``(result, predicted_words)``: the ``(1, length)`` tensor of
        generated indices (including ``<sos>``) and its decoded string.
    """
    src_ids = torch.tensor([encode(src, esp_to_index, max_len)]).to(device)
    eos_id = twi_to_index[END_TOKEN]

    # Decode in eval mode so dropout is off (a freshly constructed or reloaded
    # module starts in training mode), then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            tgt_ids = torch.tensor([twi_to_index[START_TOKEN]]).unsqueeze(1).to(device)

            while True:
                out = model(src_ids, tgt_ids)
                predicted_index = out.argmax(dim=-1)[:, -1].unsqueeze(1)
                tgt_ids = torch.cat((tgt_ids, predicted_index), dim=1)

                if predicted_index.item() == eos_id or tgt_ids.size(1) >= max_len:
                    break
    finally:
        model.train(was_training)

    result = tgt_ids
    predicted_words = decode(result[0].tolist(), index_to_twi, skip_special_tokens=skip_special_tokens)

    return result, predicted_words


def test_model(model, test_loader, num_phrases_to_print):
    """Translate every sentence in ``test_loader`` and score the output with BLEU.

    Each source row is decoded back to text (special tokens removed),
    translated with ``translate`` and compared with the decoded reference.
    The first ``num_phrases_to_print`` triples are printed.

    Args:
        model: trained translation model.
        test_loader: iterable yielding ``(src, tgt)`` index tensors.
        num_phrases_to_print: how many source/prediction/reference examples to show.

    Returns:
        The dictionary produced by the ``evaluate`` BLEU metric (``max_order=2``).
    """
    model.to(device)

    # Evaluate with dropout disabled; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()

    sources = []
    predictions = []
    references = []

    try:
        with torch.no_grad():
            for src, tgt in tqdm(test_loader, desc="Testing", unit="batch"):
                # Rows are only converted to Python lists here, so the batch
                # does not need to be moved to the device.
                for i in range(src.shape[0]):
                    src_sentence = decode(src[i].tolist(), index_to_esp, skip_special_tokens=True)
                    _, prediction = translate(src_sentence, model, skip_special_tokens=True)

                    try:
                        reference = decode(tgt[i].tolist(), index_to_twi, skip_special_tokens=True)
                    except KeyError as e:
                        print(f"KeyError: {e} in reference")
                        reference = ""

                    sources.append(src_sentence)
                    predictions.append(prediction)
                    references.append(reference)
    finally:
        model.train(was_training)

    for i in range(min(num_phrases_to_print, len(predictions))):
        print("\n---")
        print(f"Source: {sources[i]}")
        print(f"Prediction: {predictions[i]}")
        print(f"Reference: {references[i]}")
        print("---")

    bleu = evaluate.load("bleu")
    bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references], max_order=2)

    return bleu_score

In [None]:
# One hand-picked Spanish sentence to sanity-check the trained model.
source = "si de lo que hay en la tierra os digo y no creéis ¿cómo podríais creerme si os dijera de lo que hay en el cielo?"
# target: "sɛ meka asase yi so nsɛm kyerɛ mo na munnye nni a ɛbɛyɛ dɛn na sɛ meka ɔsoro nsɛm kyerɛ mo a mubegye adi?"

result, prediction = translate(src=source, model=model, skip_special_tokens=True)
# The printed Twi can be checked by pasting it into Google Translate:
# https://translate.google.es/?hl=es&sl=ak&tl=es&op=translate
print(prediction)

In [None]:
# Score the held-out test set and show three example translations.
bleu_score = test_model(model=model, test_loader=test_dl, num_phrases_to_print=3)

# One "name: value" line per entry of the BLEU result.
for key, value in bleu_score.items():
    print(key, value, sep=": ")