In [1]:
from architecture.transformer_module import Transformer
from tokenization.tokenizer_module import language_set, spec_tokens, translation_tokenizer
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
import torch
import os
import gc
import copy
from timeit import default_timer as timer

seed = 1996
torch.manual_seed(seed)
torch.mps.manual_seed(seed)
random.seed(seed)

### Set data

In [2]:
def get_data(file_path):
    txt = None
    with open(file_path) as f:
        txt = f.read().split("\n")
    txt = [t.strip() for t in txt]
    temp = []
    for t in txt:
        if len(t) > 0:
            temp.append(t)
    txt = temp
    return np.array(txt)


def split_train_test_val(it_data, fr_data, perc_test = 0.1, n_val_cases = 100):
    assert len(it_data) > n_val_cases
    assert len(it_data) - n_val_cases > 0
    n_train_test, n = len(it_data) - n_val_cases, len(it_data)
    n_test = int(n_train_test * (1-perc_test))
    indices = random.sample(range(len(it_data)), len(it_data))
    train_indices, test_indices, val_indices  = indices[:n_test], indices[n_test:-n_val_cases], indices[n-n_val_cases:]
    return (language_set(source=it_data[train_indices], target=fr_data[train_indices]), 
            language_set(source=it_data[test_indices], target=fr_data[test_indices]),
            language_set(source=it_data[val_indices], target=fr_data[val_indices]) )


root = "data/it_fr/"

it_data = get_data(os.path.join(root, "Tatoeba.fr-it.it"))
fr_data = get_data(os.path.join(root, "Tatoeba.fr-it.fr"))

train_data, test_data, val_data = split_train_test_val(it_data, fr_data)

In [3]:
class translation_dataset(Dataset):
    def __init__(self, data:language_set):
        self.data = data
        
    def __len__(self):
        return len(self.data.source)
    
    def __getitem__(self, idx):
        return self.data.source[idx], self.data.target[idx]

train_set = translation_dataset(train_data)
test_set =  translation_dataset(test_data)
val_set =   translation_dataset(val_data)

### Tokenizer

In [4]:
VOCAB_SIZE = 10_000
MAX_SEQUENCE_LEN = 30

tokenizer = translation_tokenizer(VOCAB_SIZE, MAX_SEQUENCE_LEN)
tokenizer.set_tokenizers(language_set(source=[*train_data.source, *test_data.source], target=[*train_data.target, *test_data.target]))









In [5]:
tokenizer.save_tokenizer("models")

In [6]:
print(tokenizer.src_wrap.vocab)

{'signore': 3822, 'vedervi': 3327, 'aci': 4760, 'batteva': 8983, 'fluentemente': 4148, 'incro': 4350, 'mali': 6446, 'espl': 5696, 'protegg': 4714, 'laurie': 9071, 'scuro': 9409, 'sottovalutato': 8223, 'accel': 1476, 'ubriaca': 4233, 'vada': 2469, 'assicurata': 9886, 'filoso': 6535, 'riuscirete': 8814, '##mele': 9370, '##erato': 1924, 'gennaio': 5394, 'siate': 3230, 'asciuga': 1512, 'disegna': 4571, 'sottovalu': 5677, 'capire': 1991, 'disposizione': 8654, 'manda': 9360, 'incendio': 5123, 'canto': 4809, '##mene': 9189, '##gono': 1501, 'add': 1494, 'capite': 5903, '##tone': 2961, 'trucca': 3584, 'luog': 2389, 'opin': 1633, 'ignorate': 8122, 'impo': 7888, 'sedici': 6767, 'fiori': 2226, 'danzato': 4519, '##trato': 8447, 'scimm': 5968, 'esager': 7481, 'dimentichiamo': 8221, '##mono': 9187, 'esporta': 6940, 'fresco': 6208, 'cieca': 8057, '\u200e': 80, 'inviare': 6593, '##mp': 231, '##1': 115, '##bra': 5327, '##sita': 1698, 'meto': 3967, '##cche': 2285, 'progetto': 1816, 'legi': 8383, 'pensiam

In [7]:
import random

for k in range(0,5):
    i=random.randint(0, len(train_data.source))
    print(tokenizer.src_wrap(train_data.source[i]).tokens(), 
          tokenizer.trg_wrap(train_data.target[i]).tokens())
    
    print(tokenizer.src_wrap.encode(train_data.source[i]), 
          tokenizer.trg_wrap.encode(train_data.target[i]))
    
    print(tokenizer.src_wrap.decode(tokenizer.src_wrap.encode(train_data.source[i])), 
          tokenizer.trg_wrap.decode(tokenizer.trg_wrap.encode(train_data.target[i])))
    print()

['[SOS]', 'linda', 'seguira', 'il', 'vostro', 'consiglio', '.', '[EOS]'] ['[SOS]', 'linda', 'va', 'suivre', 'votre', 'conseil', '.', '[EOS]']
[1, 1086, 5688, 168, 757, 1504, 15, 2] [1, 1049, 262, 1873, 340, 1423, 16, 2]
[SOS] linda seguira il vostro consiglio. [EOS] [SOS] linda va suivre votre conseil. [EOS]

['[SOS]', 'non', 'conosce', 'mio', 'fratello', '.', '[EOS]'] ['[SOS]', 'vous', 'ne', 'connaissez', 'pas', 'mon', 'frere', '.', '[EOS]']
[1, 151, 1065, 331, 1003, 15, 2] [1, 164, 171, 2046, 158, 257, 926, 16, 2]
[SOS] non conosce mio fratello. [EOS] [SOS] vous ne connaissez pas mon frere. [EOS]

['[SOS]', 'tom', 'ha', 'accumulato', 'una', 'grande', 'fortuna', '.', '[EOS]'] ['[SOS]', 'tom', 'a', 'amasse', 'une', 'grande', 'fortune', '.', '[EOS]']
[1, 210, 166, 2145, 209, 658, 1531, 15, 2] [1, 208, 38, 3609, 200, 878, 2325, 16, 2]
[SOS] tom ha accumulato una grande fortuna. [EOS] [SOS] tom a amasse une grande fortune. [EOS]

['[SOS]', 'noi', 'siamo', 'arrivate', 'in', 'tempo', 'in', 

### Define model

In [8]:
DEVICE = "mps"
BATCH_SIZE = 128
SRC_VOCAB_SIZE = len(tokenizer.src_wrap)
TRG_VOCAB_SIZE = len(tokenizer.trg_wrap)
EMBEDDING_DIM = 256
N_LAYERS = 2
N_HEADS = 4
DROPOUT = 0.3

model = Transformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, N_LAYERS, N_HEADS, EMBEDDING_DIM, dropout=DROPOUT).to(DEVICE)

In [9]:
print(model)

Transformer(
  (src_embedding): Sequential(
    (0): Embedding(10001, 256)
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (trg_embedding): Sequential(
    (0): Embedding(10001, 256)
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (multi_head_attn): MultiHeadAttention(
          (heads): ModuleList(
            (0-3): 4 x AttentionHead(
              (K): Linear(in_features=256, out_features=64, bias=False)
              (Q): Linear(in_features=256, out_features=64, bias=False)
              (V): Linear(in_features=256, out_features=64, bias=False)
            )
          )
        )
        (norm_1): NormLayer()
        (ff): FeedForward(
          (linear): Sequential(
            (0): Linear(in_features=256, out_features=256, bias=True)
            (1): ReLU()
          )
        )
        (norm_2): NormLayer()
 

In [10]:
def collate_func(batch):
    src, trg = [], []
    for b in batch:
        src.append(b[0])
        trg.append(b[1])
    src_tokens, trg_tokens = tokenizer(src, trg)

    src_batch, target_batch = torch.tensor( src_tokens , dtype=torch.long ) , torch.tensor( trg_tokens, dtype=torch.long  )
        
    return src_batch, target_batch[:,:-1], target_batch[:, 1:]


In [11]:
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_func)
test_loader =  DataLoader(test_set, batch_size=BATCH_SIZE,  collate_fn=collate_func)

In [12]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id(), label_smoothing=0.2)
#torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id(), label_smoothing=1) #torch.nn.NLLLoss(ignore_index=tokenizer.pad_id())
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)

### Training functions

In [17]:
def train_fn(model:Transformer, data_loader, optimizer, device, clip=1):
    model.train()
    epoch_loss = 0
    optimizer.zero_grad()   
    for src, trg_in, trg_out in data_loader:
        src, trg_in, trg_out = src.to(device), trg_in.to(device), trg_out.to(device)
        # get masks
        e_mask, d_mask = model.get_masks(src, trg_in, tokenizer.pad_id())
        # compute logits
        out_model = model(src, trg_in, e_mask, d_mask)
        logits = model.get_logits(out_model) 
        # zero the gradients
        optimizer.zero_grad()
        # compute loss
        loss = loss_fn(logits.contiguous().view(-1, TRG_VOCAB_SIZE), trg_out.contiguous().view(-1))
        # update gradients
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

def evaluate_fn(model:Transformer, data_loader, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg_in, trg_out in data_loader:
            src, trg_in, trg_out = src.to(device), trg_in.to(device), trg_out.to(device)
            # get masks
            e_mask, d_mask = model.get_masks(src, trg_in, tokenizer.pad_id())
            # compute logits
            out_model = model(src, trg_in, e_mask, d_mask)
            logits = model.get_logits(out_model) 
            # get loss
            loss = loss_fn(logits.contiguous().view(-1, TRG_VOCAB_SIZE), trg_out.contiguous().view(-1))
            epoch_loss += loss.item()           
    return epoch_loss / len(data_loader)


def translate_sentence( sentence:str, model:Transformer, max_len = 25):
    model.eval()
    with torch.no_grad():
        in_tokens = tokenizer.src_wrap(sentence)["input_ids"]
        in_tensor = torch.tensor( in_tokens ).unsqueeze(0).to(DEVICE)
        e_mask = model.make_src_mask(in_tensor, tokenizer.pad_id())
        memory = model.encode(in_tensor, e_mask)
        out_tokens = torch.ones(1).fill_(tokenizer.sos_id()).type(in_tensor.dtype).to(DEVICE)

        for i in range(max_len):
            d_mask = model.subsequent_mask(out_tokens.size(0)).type_as(in_tensor.data)
            decoder_output = model.decode(memory, out_tokens.unsqueeze(0), e_mask, d_mask) # (1, L, d_model)
            probabilities = model.get_probabilities(decoder_output) # (1, L, trg_vocab_size)
            max_probabs = torch.argmax(probabilities, dim=-1).squeeze(0) # (L)
            predicted_token = max_probabs[-1].item()
            out_tokens = torch.cat( [out_tokens, torch.empty(1).type_as(in_tensor).fill_(predicted_token)], dim=0 )
            if i >= MAX_SEQUENCE_LEN-1:
                break   
        return "".join( tokenizer.decode( out_tokens.cpu().tolist()) )

In [18]:
for i in range(1):
   print(translate_sentence(test_data.source[i], model))

diale dialeolleolleolleolleolle contact contact contact contact ecrir ecrir ecrirununun ecrir ecrir ecrir contact contact contact contact competi


In [19]:
NUM_EPOCHS=20

gc.collect()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

train_loss_list, valid_loss_list = [], []
torch.mps.empty_cache()  

best_valid_loss = float("inf")
best_model = copy.deepcopy(model)
sentence = test_data.source[0]
for epoch in range(NUM_EPOCHS):
    start_time = timer()
    train_loss = train_fn( model, train_loader, optimizer, DEVICE)
    valid_loss = evaluate_fn( model, test_loader, DEVICE )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_model = copy.deepcopy(model)
        torch.save(model.state_dict(), os.path.join("models", "checkpoint.pt"))
    end_time = timer()  
    print(f"Epoch: {epoch+1:02}\t time = {(end_time - start_time):.3f}s")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
    print(f"Original text: {sentence}")
    print(f"Translated text: {translate_sentence(sentence, best_model)}")
    print()

Epoch: 01	 time = 37.677s
	Train Loss:   5.702 | Train PPL: 299.594
	Valid Loss:   5.048 | Valid PPL: 155.658
Original text: Voi non accenderete delle candele nella vostra camera.
Translated text: vous n'avez pas les enfants a l'ecole.......

Epoch: 02	 time = 38.150s
	Train Loss:   4.775 | Train PPL: 118.469
	Valid Loss:   4.461 | Valid PPL:  86.575
Original text: Voi non accenderete delle candele nella vostra camera.
Translated text: vous ne ferez pas de bougies dans sa chambre........

Epoch: 03	 time = 40.227s
	Train Loss:   4.324 | Train PPL:  75.499
	Valid Loss:   4.068 | Valid PPL:  58.432
Original text: Voi non accenderete delle candele nella vostra camera.
Translated text: vous ne faisiez pas de bougies dans sa chambre........

Epoch: 04	 time = 41.048s
	Train Loss:   3.996 | Train PPL:  54.395
	Valid Loss:   3.824 | Valid PPL:  45.770
Original text: Voi non accenderete delle candele nella vostra camera.
Translated text: vous n'allumera pas de bougies dans votre chambre.......

In [20]:
torch.save(best_model, 'models/model.pth')

In [26]:
n_val = 4
sentence = val_data.source[n_val]
translation = translate_sentence(sentence, best_model)

print(f"Original text:\t\t{sentence}")
print(f"Translated text:\t{translation}")
print(f"Real:\t\t\t{val_data.target[n_val]}")

Original text:		Corregga quegli errori.
Translated text:	corrigez ces fautes...
Real:			Corrigez ces fautes.
