In [None]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchinfo import summary
from my_transformer import Transformer
from my_train import train
import warnings
warnings.filterwarnings("ignore")

In [None]:
def aux(line):
  """Return `line` with the punctuation marks '.', ',', '!' and '?' removed.

  Only these four ASCII characters are stripped; every other character
  (including the Spanish opening marks) is left untouched.
  """
  for mark in ('.', ',', '!', '?'):
    line = line.replace(mark, '')
  return line

In [None]:
# Parallel corpus: English_sens[i] is the source sentence for Spanish_sens[i].
English_sens = []
Spanish_sens = []
# Explicit UTF-8: the default encoding of open() is platform/locale dependent,
# which would corrupt the accented Spanish text on non-UTF-8 systems.
with open('/content/spa.txt', 'r', encoding='utf-8') as f:
  for line in f:
    # Rows are tab-separated: English, Spanish, then presumably an attribution
    # column (ignored). Taking the first two fields avoids cutting a sentence
    # that itself contains the letters "CC".
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 2:
      continue  # blank or malformed row: nothing to pair up
    en, sp = fields[:2]
    # NOTE(review): English is lower-cased but Spanish keeps its original case.
    en = aux(en.lower())
    # Wrap the target with start/end markers used by the decoder.
    sp = '<st> ' + aux(sp) + ' <end>'
    English_sens.append(en)
    Spanish_sens.append(sp)

In [None]:
# For every sentence pair, keep the word count of the longer side
# (the Spanish side already includes the '<st>' and '<end>' markers).
sen_length = [
  max(len(en_sen.split()), len(sp_sen.split()))
  for en_sen, sp_sen in zip(English_sens, Spanish_sens)
]

In [None]:
# Every sequence is padded to the word count of the longest sentence in the corpus.
MAX_LEN = max(sen_length)
batch_size = 256
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def create_tokens(source_dataset): # <pad>:0, <unk>:the last token
  """Build a word -> id vocabulary from an iterable of sentences.

  Sentences are split on whitespace. '<pad>' always gets id 0, the remaining
  words get ids 1..n in order of decreasing frequency (ties keep first-seen
  order), and '<unk>' always gets the last id.

  Args:
    source_dataset: iterable of sentence strings.

  Returns:
    dict mapping each word to a unique integer id.
  """
  reserved = ('<pad>', '<unk>')

  counts = {}
  for sen in source_dataset:
    for word in sen.split():
      counts[word] = counts.get(word, 0) + 1

  token_dic = {'<pad>': 0}

  # sorted() is stable, so equally frequent words stay in first-seen order.
  ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

  for word, _ in ranked:
    # A corpus containing the literal text '<pad>' or '<unk>' must not steal or
    # duplicate the reserved ids, so those words are skipped here.
    if word in reserved:
      continue
    token_dic[word] = len(token_dic)

  token_dic['<unk>'] = len(token_dic)

  return token_dic
# One vocabulary per language, plus the inverse (id -> word) lookup for each.
SRC_Tokens, TRG_Tokens = create_tokens(English_sens), create_tokens(Spanish_sens)
idx_to_tok_src = {index: word for word, index in SRC_Tokens.items()}
idx_to_tok_trg = {index: word for word, index in TRG_Tokens.items()}

In [None]:
def tokenizer(sentence, tokens, max_len):
  """Convert a whitespace-separated sentence into a tensor of token ids.

  Words missing from `tokens` are mapped to tokens['<unk>'], and the result is
  right-padded with tokens['<pad>'] up to `max_len` entries. Longer sentences
  are not truncated, so the returned tensor can exceed `max_len`.
  """
  ids = [tokens[word] if word in tokens else tokens['<unk>']
         for word in sentence.split()]
  missing = max_len - len(ids)
  if missing > 0:
    ids.extend([tokens['<pad>']] * missing)
  return torch.tensor(ids)

# Quick sanity check: encode one sentence per language with its own vocabulary.
for demo_sentence, demo_vocab in (('i am fine bro', SRC_Tokens),
                                  ('<st> yo estoy bien <end>', TRG_Tokens)):
  print(tokenizer(demo_sentence, demo_vocab, MAX_LEN))

tensor([    1,   106,   694, 10903,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0])
tensor([ 1, 98, 80, 57,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0])


In [None]:
class MyDataset(Dataset):
  """Sentence-pair dataset that tokenizes on access.

  Stores the raw source/target sentences. Indexing returns the pair as padded
  id tensors (built with `tokenizer`, the global vocabularies and MAX_LEN),
  already moved to `device`.
  """

  def __init__(self, SRC, TRG, device):
    self.SRC, self.TRG = SRC, TRG
    self.device = device

  def __len__(self):
    # The size is taken from the source side only.
    return len(self.SRC)

  def __getitem__(self, idx):
    src_ids = tokenizer(self.SRC[idx], SRC_Tokens, MAX_LEN)
    trg_ids = tokenizer(self.TRG[idx], TRG_Tokens, MAX_LEN)
    return src_ids.to(self.device), trg_ids.to(self.device)
# shuffle=True re-draws the batch composition every epoch; with the DataLoader
# default (shuffle=False) each epoch would replay identical batches in file order.
dataloader = DataLoader(MyDataset(English_sens, Spanish_sens, device),
                        batch_size=batch_size, shuffle=True)

In [None]:
# Vocabulary sizes (each includes '<pad>' and '<unk>').
src_vocab, trg_vocab = len(SRC_Tokens), len(TRG_Tokens)

# Hyper-parameters, passed positionally to Transformer below.
d_model, N, heads = 40, 1, 2

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, MAX_LEN)
model = model.to(device)
# Per-layer summary for two (batch_size, MAX_LEN) inputs of dtype long.
summary(model, [(batch_size, MAX_LEN)] * 2, dtypes=[torch.long] * 2)

Layer (type:depth-idx)                        Output Shape              Param #
Transformer                                   [256, 21, 24189]          --
├─Encoder: 1-1                                [256, 21, 40]             --
│    └─IO_Embedding: 2-1                      [256, 21, 40]             --
│    │    └─Embedding: 3-1                    [256, 21, 40]             436,160
│    └─PositionalEncoding: 2-2                [256, 21, 40]             --
│    └─ModuleList: 2-3                        --                        --
│    │    └─SingleEncoderLayer: 3-2           [256, 21, 40]             172,648
├─Decoder: 1-2                                [256, 21, 40]             --
│    └─IO_Embedding: 2-4                      [256, 21, 40]             --
│    │    └─Embedding: 3-3                    [256, 21, 40]             967,560
│    └─PositionalEncoding: 2-5                [256, 21, 40]             --
│    └─ModuleList: 2-6                        --                        --
│    

In [None]:
@torch.no_grad()
def translate(sentence, device):
  """Greedily translate `sentence` with the global `model`.

  Starting from '<st>', the current target prefix is re-encoded on every step,
  the model is run on it, and the arg-max word at the last filled position is
  appended. Generation stops when '<end>' appears or the prefix reaches
  MAX_LEN words.

  Args:
    sentence: source sentence. No cleaning is done here; callers pass
      lower-cased, punctuation-free text.
    device: torch device the input tensors are moved to.

  Returns:
    The generated target string, starting with '<st>'. It ends with '<end>'
    unless generation was cut off at MAX_LEN words.
  """
  sen_SRC = tokenizer(sentence, SRC_Tokens, MAX_LEN).unsqueeze(0).to(device)
  sen_TRG = '<st>'

  # Run inference in eval mode so train-only layers (e.g. dropout), if the model
  # has any, are disabled. The previous mode is restored afterwards because this
  # function is handed to train() and invoked between epochs.
  was_training = model.training
  model.eval()
  try:
    while '<end>' not in sen_TRG:

      length = len(sen_TRG.split())
      # trg_input below has MAX_LEN-1 columns, so position length-1 only exists
      # while length < MAX_LEN. Stop instead of indexing out of range when the
      # model never emits '<end>'.
      if length >= MAX_LEN:
        break
      # Same shift as in training (see my_train: trg_input = trg[:, :-1]).
      trg_input = tokenizer(sen_TRG, TRG_Tokens, MAX_LEN).unsqueeze(0)[:, :-1].to(device)
      preds = model(sen_SRC, trg_input, src_mask=None, trg_mask=None).squeeze()
      # The prediction for the next word sits at the last non-pad position.
      next_word_idx = torch.argmax(preds, dim=-1)[length-1]
      sen_TRG += (' ' + idx_to_tok_trg[next_word_idx.item()])
  finally:
    model.train(was_training)

  return sen_TRG

In [None]:
# Training stage 1: 200 epochs with Adam at lr=1e-3.
# print_step sets the logging interval (every 10th epoch in the output below).
epochs, print_step, lr = 200, 10, 1e-3
test_sen = "we spent the night in a cheap hotel"
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss = train(model, optimizer, dataloader, epochs, translate, test_sen, device, print_step)

Epoch: 1 -> Loss:  6.33125949
we spent the night in a cheap hotel -> <st> El niño <end>
Epoch: 11 -> Loss:  2.73496600
we spent the night in a cheap hotel -> <st> Tenemos un hotel <end>
Epoch: 21 -> Loss:  1.93771589
we spent the night in a cheap hotel -> <st> Pasamos agua <end>
Epoch: 31 -> Loss:  1.57507215
we spent the night in a cheap hotel -> <st> Pasamos que un hotel en un hotel <end>
Epoch: 41 -> Loss:  1.33963099
we spent the night in a cheap hotel -> <st> Pasamos bien en la noche en la noche en un hotel económico <end>
Epoch: 51 -> Loss:  1.18330918
we spent the night in a cheap hotel -> <st> Pasamos general muy cómoda por una noche <end>
Epoch: 61 -> Loss:  1.06112733
we spent the night in a cheap hotel -> <st> Pasamos el hotel económico <end>
Epoch: 71 -> Loss:  0.96489214
we spent the night in a cheap hotel -> <st> Pasamos estas pinturas cama <end>
Epoch: 81 -> Loss:  0.89934241
we spent the night in a cheap hotel -> <st> Pasamos el hotel económico <end>
Epoch: 91 -> Loss: 

In [None]:
# Further training of the existing `model`: 100 epochs at lr=5e-4.
# A new Adam instance is created, so the optimizer state starts fresh.
epochs, print_step, lr = 100, 10, 5e-4
test_sen = "we spent the night in a cheap hotel"
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss = train(model, optimizer, dataloader, epochs, translate, test_sen, device, print_step)

Epoch: 1 -> Loss:  0.49058775
we spent the night in a cheap hotel -> <st> Pasamos el hotel económico <end>
Epoch: 11 -> Loss:  0.44193941
we spent the night in a cheap hotel -> <st> Pasamos los ojos muy tristes <end>
Epoch: 21 -> Loss:  0.42937651
we spent the night in a cheap hotel -> <st> Pasamos harto un hotel menos un hotel peligroso <end>
Epoch: 31 -> Loss:  0.41925283
we spent the night in a cheap hotel -> <st> Pasamos la noche entrar a un hotel <end>
Epoch: 41 -> Loss:  0.40831821
we spent the night in a cheap hotel -> <st> Pasamos la noche <end>
Epoch: 51 -> Loss:  0.40286723
we spent the night in a cheap hotel -> <st> Pasamos todas roer que noche en un hotel barato de noche bajo en el hotel económico <end>
Epoch: 61 -> Loss:  0.39488486
we spent the night in a cheap hotel -> <st> Pasamos otro hotel económico <end>
Epoch: 71 -> Loss:  0.38778390
we spent the night in a cheap hotel -> <st> Pasamos el hotel menos un hotel barato en un hotel económico <end>
Epoch: 81 -> Loss:  0.3

In [None]:
# Further training of the existing `model`: 100 epochs at lr=1e-4.
# A new Adam instance is created, so the optimizer state starts fresh.
epochs, print_step, lr = 100, 10, 1e-4
test_sen = "we spent the night in a cheap hotel"
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss = train(model, optimizer, dataloader, epochs, translate, test_sen, device, print_step)

Epoch: 1 -> Loss:  0.35998609
we spent the night in a cheap hotel -> <st> Pasamos el hotel barato entrar un hotel <end>
Epoch: 11 -> Loss:  0.33275364
we spent the night in a cheap hotel -> <st> Pasamos el príncipe Casi Haré el hotel barato <end>
Epoch: 21 -> Loss:  0.32907799
we spent the night in a cheap hotel -> <st> Pasamos el hotel barato de noche <end>
Epoch: 31 -> Loss:  0.32483271
we spent the night in a cheap hotel -> <st> Pasamos la cama blanda debajo de noche <end>
Epoch: 41 -> Loss:  0.32026825
we spent the night in a cheap hotel -> <st> Hubo silencio es el hotel barato <end>
Epoch: 51 -> Loss:  0.31809756
we spent the night in a cheap hotel -> <st> Pasamos ninguna noche ¿Acaso Regresa carne <end>
Epoch: 61 -> Loss:  0.31795131
we spent the night in a cheap hotel -> <st> Pasamos peligro la multa dulce <end>
Epoch: 71 -> Loss:  0.31438766
we spent the night in a cheap hotel -> <st> Pasamos ninguna noche pescando <end>
Epoch: 81 -> Loss:  0.31334435
we spent the night in a ch

In [None]:
# Further training of the existing `model`: 50 epochs at lr=5e-5.
# A new Adam instance is created, so the optimizer state starts fresh.
epochs, print_step, lr = 50, 10, 5e-5
test_sen = "we spent the night in a cheap hotel"
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss = train(model, optimizer, dataloader, epochs, translate, test_sen, device, print_step)

Epoch: 1 -> Loss:  0.30984435
we spent the night in a cheap hotel -> <st> Pasamos el hotel económico <end>
Epoch: 11 -> Loss:  0.30576440
we spent the night in a cheap hotel -> <st> El lunes por un hotel económico <end>
Epoch: 21 -> Loss:  0.30694344
we spent the night in a cheap hotel -> <st> Pasamos el hotel económico <end>
Epoch: 31 -> Loss:  0.30241099
we spent the night in a cheap hotel -> <st> Pasamos el hotel barato de noche <end>
Epoch: 41 -> Loss:  0.30056255
we spent the night in a cheap hotel -> <st> Pasamos el hotel menos un hotel económico <end>


In [None]:
# Raw English demo sentences; punctuation and case are normalised at translation time.
new_en_sens = [
  'hello!',
  'what is my name?',
  'how are you?',
  'she is pretty.',
  'we spent the night in a cheap hotel',
]

In [None]:
# Translate each demo sentence after stripping punctuation and lower-casing it,
# then print the result without the marker tokens.
for new_en_sen in new_en_sens:
  spanish = translate(aux(new_en_sen).lower(), device)
  # Filter the markers token-wise so an empty translation ('<st> <end>') prints
  # as an empty string rather than a stray '<end>'.
  spanish = ' '.join(word for word in spanish.split() if word not in ('<st>', '<end>'))
  print(new_en_sen, '->', spanish)

hello! -> Hola
what is my name? -> ¿Cómo es mi nombre
how are you? -> ¿Cómo haces
she is pretty. -> Está muy hermosa
we spent the night in a cheap hotel -> Pasamos el hotel barato de noche


In [None]:
import pickle

# Persist both vocabularies and the target id -> word lookup as one pickled list.
vocab_payload = [SRC_Tokens, TRG_Tokens, idx_to_tok_trg]
with open('params.pkl', 'wb') as f:
  pickle.dump(vocab_payload, f)