In [57]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [58]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2024-04-12 08:26:58--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.117.207, 142.250.99.207, 74.125.20.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.117.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip.1’


2024-04-12 08:26:58 (309 MB/s) - ‘spa-eng.zip.1’ saved [2638744/2638744]

replace spa-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace spa-eng/spa.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


In [59]:
text_file = "spa-eng/spa.txt"
with open(text_file) as f:
  lines = f.read().split("\n")[:-1]

data = []
source_data = []
target_data = []
for line in lines:
  source, target = line.split('\t')
  source_data.append(source)
  target_data.append(target)
  data.append((source, target))

In [60]:
data[-1]

('If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.',
 'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.')

In [61]:
import random

random.shuffle(data)
num_val_samples = int(0.15 * len(data))
num_train_samples = len(data) - 2 * num_val_samples

train_pairs = data[:num_train_samples]
val_pairs = data[num_train_samples:num_train_samples + num_val_samples]
test_pairs = data[num_train_samples + num_val_samples:]

In [62]:
from collections import Counter
import string
from tqdm import tqdm

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


class TextVectorizer:

  def __init__(self, sequence_length, vocab_size, target=False):
    self.target = target
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.vocab_counter = Counter()
    self.stoi = {"[pad]": 0, "[start]": 1, "[end]": 2, "[UNK]": 3}
    self.itos = {0: "[pad]", 1: "[start]", 2: "[end]", 3: "[UNK]"}

  def standardize(self, text):
    text = text.lower()
    return "".join(char for char in text
                  if char not in strip_chars)

  def tokenize(self, text):
    text = self.standardize(text)
    return text.split()

  def adapt(self, dataset):

    for text in tqdm(dataset):
      tokens = self.tokenize(text)
      for token in tokens:
        self.vocab_counter[token] += 1

    for token, _ in self.vocab_counter.most_common(self.vocab_size):
      indx = len(self.stoi)
      self.stoi[token] = indx
      self.itos[indx] = token

  def encode(self, text):
    text = self.standardize(text)
    tokens = self.tokenize(text)
    if self.target:
      result = ([self.stoi["[start]"]] + [self.stoi.get(token, 3) for token in tokens]
            + [self.stoi["[end]"]])
    else:
      result = [self.stoi.get(token, 3) for token in tokens]

    if len(result) <= self.sequence_length:
        pad_size = self.sequence_length - len(result)
        result += [self.stoi.get("[pad]")] * (pad_size)
    else:
      #truncate!
      result = result[:self.sequence_length]

    return result

  def decode(self, int_sequence):
    return " ".join(self.itos.get(i, "[UNK]") for i in int_sequence)

In [63]:
vocab_size = 15000
sequence_length = 20

source_vectorizer = TextVectorizer(sequence_length, vocab_size)
target_vectorizer = TextVectorizer(sequence_length + 1, vocab_size, target=True)

In [64]:
source_vectorizer.adapt(source_data)
target_vectorizer.adapt(target_data)

100%|██████████| 118964/118964 [00:00<00:00, 172544.06it/s]
100%|██████████| 118964/118964 [00:00<00:00, 150749.81it/s]


In [65]:
encoded_ = source_vectorizer.encode('If you want to sound')
source_vectorizer.decode(encoded_)

'if you want to sound [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]'

In [66]:
eng, spa = train_pairs[1]
source_vectorizer.encode(eng)

[11, 10, 249, 5843, 4, 3776, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [67]:
class EngSpaDataset(Dataset):
  def __init__(self, data, source_vectorizer, target_vectorizer):
    self.data = data
    self.source_vectorizer = source_vectorizer
    self.target_vectorizer = target_vectorizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    eng, spa = self.data[idx]
    eng = self.source_vectorizer.encode(eng)
    spa = self.target_vectorizer.encode(spa)
    return ({
          "english": torch.tensor(eng).long(),
          "spanish": torch.tensor(spa[:-1]).long(),
          }, torch.tensor(spa[1:]).long())

In [68]:
train_ds = EngSpaDataset(train_pairs, source_vectorizer, target_vectorizer)
val_ds = EngSpaDataset(val_pairs, source_vectorizer, target_vectorizer)
test_ds = EngSpaDataset(test_pairs, source_vectorizer, target_vectorizer)

In [69]:
def permute_batch_seq_collate(data: torch.Tensor):
  batch_size = len(data)
  source_input = torch.zeros(batch_size, data[0][0]["english"].size(0))
  target_input = torch.zeros(batch_size, data[0][0]["spanish"].size(0))
  target_output = torch.zeros(batch_size, data[0][1].size(0))
  for idx, (inputs, output) in enumerate(data):
    source_input[idx] = inputs["english"]
    target_input[idx] = inputs["spanish"]
    target_output[idx] = output

  return (source_input.permute(1, 0).long(), target_input.permute(1, 0).long(),
          target_output.permute(1, 0).long())

In [70]:
batch_size = 64

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                      collate_fn=permute_batch_seq_collate)
val_dl = DataLoader(val_ds, batch_size=batch_size,
                    collate_fn=permute_batch_seq_collate)
test_dl = DataLoader(test_ds, batch_size=batch_size,
                     collate_fn=permute_batch_seq_collate)

In [71]:
for source, target_inp, target_out in train_dl:
  print(source)
  print("source tensor size: ", source.size())
  print("target input tensor size: ", target_inp.size())
  print("target tensor size: ", target_out.size())
  break

tensor([[   5,    8,   23,  ...,    8,   26,    8],
        [ 305,  207,   61,  ...,  203, 4677,   10],
        [   5,   27,  468,  ...,    6,   75,  841],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]])
source tensor size:  torch.Size([20, 64])
target input tensor size:  torch.Size([20, 64])
target tensor size:  torch.Size([20, 64])


In [72]:
class Encoder(nn.Module):
  def __init__(self, source_dim : int, embedding_dim : int, hidden_dim : int,
               padding_idx : int=0, num_rnn_layers: int=1, dropout: int = 0.2):
    super().__init__()
    self.source_dim = source_dim
    self.embedding_dim =  embedding_dim
    self.hidden_dim =  hidden_dim

    self.dropout = nn.Dropout(dropout)

    self.embedding_layer = nn.Embedding(self.source_dim, self.embedding_dim,
                                        padding_idx=padding_idx)
    self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                        num_layers=num_rnn_layers, )

  def forward(self, x : torch.Tensor):
    x = self.embedding_layer(x)
    x = self.dropout(x)
    output, (hidden_state, cell_state) = self.lstm(x)
    return hidden_state, cell_state

# vocab_size = len(source_vectorizer.stoi)
# print("vocab_size", vocab_size)
# x = torch.randint(0, vocab_size, size = (20, 64))
# encoder = Encoder(vocab_size, 256, 300)
# encoder(x)[0].size()

In [73]:
class Decoder(nn.Module):
  def __init__(self, target_dim : int, embedding_dim : int, hidden_dim : int,
               padding_idx : int=0, num_rnn_layers: int=1, dropout: int = 0.2):
    super().__init__()
    self.target_dim = target_dim
    self.embedding_dim =  embedding_dim
    self.hidden_dim =  hidden_dim

    self.dropout = nn.Dropout(dropout)

    self.embedding_layer = nn.Embedding(self.target_dim, self.embedding_dim,
                                        padding_idx=padding_idx)
    self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                        num_layers=num_rnn_layers)
    self.classifier = nn.Linear(hidden_dim, target_dim)

  def forward(self, x, hidden_state, cell_state):
    x = self.embedding_layer(x)
    x = self.dropout(x)
    outputs, (hidden_state, cell_state) = self.lstm(x, (hidden_state, cell_state))
    predictions = self.classifier(outputs)

    return predictions

# vocab_size = len(target_vectorizer.stoi)
# print("vocab_size", vocab_size)
# x = torch.randint(0, vocab_size, size = (20, 64))
# h, c = torch.randn(1, 64, 300), torch.randn(1, 64, 300)
# decoder = Decoder(vocab_size, 256, 300)
# decoder(x, h, c).size()

In [74]:
class NMTNet(nn.Module):
  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target):
    encoder_output = self.encoder(source)
    output = self.decoder(target, *encoder_output)
    return output

source_vocab_size = len(source_vectorizer.stoi)
target_vocab_size = len(target_vectorizer.stoi)
print("source_vocab_size", source_vocab_size)
print("target_vocab_size", target_vocab_size)
source = torch.randint(0, source_vocab_size, size = (20, 64))
target = torch.randint(0, target_vocab_size, size = (20, 64))
encoder = Encoder(source_vocab_size, 256, 300)
decoder = Decoder(target_vocab_size, 256, 300)
model = NMTNet(encoder, decoder)
model(source, target).size()

source_vocab_size 13636
target_vocab_size 15004


torch.Size([20, 64, 15004])

In [None]:
nn.LSTM(12, 10)

In [75]:
# model hyperparameters
source_vocab_size = len(source_vectorizer.stoi)
target_vocab_size = len(target_vectorizer.stoi)
device = "cuda" if torch.cuda.is_available() else "cpu"
hidden_dim = 512
encoder_embedding_dim = 128
decoder_embedding_dim = 128
padding_index = target_vectorizer.stoi["[pad]"]

learning_rate = 0.001
num_epochs = 20


encoder = Encoder(
    source_vocab_size,
    encoder_embedding_dim,
    hidden_dim
    ).to(device)

decoder = Decoder(
    target_vocab_size,
    decoder_embedding_dim,
    hidden_dim
    ).to(device)

model = NMTNet(
    encoder,
    decoder
    ).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=padding_index)
optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)


In [76]:
x = torch.randn(12, 3, 32)
x.reshape(-1, 32).size()

torch.Size([36, 32])

In [77]:
if torch.tensor(2) == torch.tensor(2):
  print("SALAM")

SALAM


In [78]:
def cal_accuracy(source, target):
  predictions = source.argmax(1)
  correct = 0
  for idx, token in enumerate(predictions):
    text = target_vectorizer.itos[token.item()]
    if target[idx] == token:
      correct += 1
    if text == "[end]":
      break
  return correct / (idx+1)

In [79]:
from tqdm import tqdm

for epoch in range(num_epochs):

  model.train()
  train_loss = 0.0
  val_loss = 0.0
  train_acc = 0.0
  val_acc = 0.0

  for idx, (source, target_inp, target_out) in enumerate(tqdm(train_dl)):
    source = source.to(device)
    target_inp = target_inp.to(device)
    target_out = target_out.to(device)

    predictions = model(source, target_inp)
    # seq_length * batch
    loss = criterion(predictions.reshape(-1, predictions.shape[2]), target_out.reshape(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    with torch.no_grad():
      train_acc += cal_accuracy(predictions.reshape(-1, predictions.shape[2]), target_out.reshape(-1))

    # if idx % 500 == 0:
    #   print(f"Epoch{epoch+1}/{num_epochs} step {idx+1} | \
    #               train_loss: {train_loss / (idx+1)} | train_acc {train_acc / (idx+1)}")

  model.eval()
  with torch.no_grad():
    for idx, (source, target_inp, target_out) in enumerate((val_dl)):
      source = source.to(device)
      target_inp = target_inp.to(device)
      target_out = target_out.to(device)

      predictions = model(source, target_inp)
      # seq_length * batch
      loss = criterion(predictions.reshape(-1, predictions.shape[2]), target_out.reshape(-1))

      val_loss += loss.item()
      val_acc += cal_accuracy(predictions.reshape(-1, predictions.shape[2]), target_out.reshape(-1))

  print(f"\n Epoch{epoch+1}/{num_epochs} | \
          train_loss: {train_loss / len(train_dl)}| train_acc {train_acc / len(train_dl)} \
           | val_loss: {val_loss / len(val_dl)} | val_acc {val_acc / len(val_dl)}")

100%|██████████| 1302/1302 [00:48<00:00, 26.76it/s]



 Epoch1/20 |           train_loss: 4.475375846783686| train_acc 0.3254478127483873            | val_loss: 3.4453619346823743 | val_acc 0.48056486591941605


100%|██████████| 1302/1302 [00:49<00:00, 26.42it/s]



 Epoch2/20 |           train_loss: 2.9149965362797867| train_acc 0.5155839568795977            | val_loss: 2.6993112068449725 | val_acc 0.5585776918759865


100%|██████████| 1302/1302 [00:49<00:00, 26.09it/s]



 Epoch3/20 |           train_loss: 2.206947722834193| train_acc 0.577698804786036            | val_loss: 2.3888655366863403 | val_acc 0.5929203062515481


100%|██████████| 1302/1302 [00:50<00:00, 25.84it/s]



 Epoch4/20 |           train_loss: 1.7647765539758216| train_acc 0.6226053515431009            | val_loss: 2.246700680811345 | val_acc 0.6090100019450841


100%|██████████| 1302/1302 [00:50<00:00, 25.80it/s]



 Epoch5/20 |           train_loss: 1.4707486049920184| train_acc 0.6607976704068343            | val_loss: 2.1809156898102025 | val_acc 0.6221130918366956


100%|██████████| 1302/1302 [00:50<00:00, 25.62it/s]



 Epoch6/20 |           train_loss: 1.2603264001016425| train_acc 0.6904105985316764            | val_loss: 2.1631913706393227 | val_acc 0.6230686125064631


100%|██████████| 1302/1302 [00:50<00:00, 25.67it/s]



 Epoch7/20 |           train_loss: 1.1115955782284568| train_acc 0.7116710874841048            | val_loss: 2.1467980428408553 | val_acc 0.6320457105486332


100%|██████████| 1302/1302 [00:51<00:00, 25.40it/s]



 Epoch8/20 |           train_loss: 0.9954095465887893| train_acc 0.7309588313772689            | val_loss: 2.1536935033764038 | val_acc 0.6347677640556496


100%|██████████| 1302/1302 [00:51<00:00, 25.37it/s]



 Epoch9/20 |           train_loss: 0.9069521975132727| train_acc 0.7461852787089038            | val_loss: 2.1772274983826505 | val_acc 0.6392055948895343


100%|██████████| 1302/1302 [00:51<00:00, 25.40it/s]



 Epoch10/20 |           train_loss: 0.8337412425266799| train_acc 0.7591472104877967            | val_loss: 2.1978165723089678 | val_acc 0.6368526761286892


100%|██████████| 1302/1302 [00:51<00:00, 25.37it/s]



 Epoch11/20 |           train_loss: 0.7730866520452426| train_acc 0.7719352944383923            | val_loss: 2.218692331331178 | val_acc 0.6426024190698066


100%|██████████| 1302/1302 [00:51<00:00, 25.49it/s]



 Epoch12/20 |           train_loss: 0.7220242944089682| train_acc 0.7814706935142135            | val_loss: 2.2507016060600145 | val_acc 0.6403277436949546


100%|██████████| 1302/1302 [00:50<00:00, 25.58it/s]



 Epoch13/20 |           train_loss: 0.6793933025519786| train_acc 0.7903385124214117            | val_loss: 2.28068047185098 | val_acc 0.6406670713061889


100%|██████████| 1302/1302 [00:51<00:00, 25.47it/s]



 Epoch14/20 |           train_loss: 0.6411904086074155| train_acc 0.7979402966519744            | val_loss: 2.3143176442833355 | val_acc 0.6403923977869014


100%|██████████| 1302/1302 [00:50<00:00, 25.82it/s]



 Epoch15/20 |           train_loss: 0.6105503520474822| train_acc 0.8049781496163247            | val_loss: 2.3355849596761886 | val_acc 0.6421134442736047


100%|██████████| 1302/1302 [00:50<00:00, 25.71it/s]



 Epoch16/20 |           train_loss: 0.5825207333914512| train_acc 0.8106307146824847            | val_loss: 2.3653518974140124 | val_acc 0.6396632323385345


100%|██████████| 1302/1302 [00:50<00:00, 25.74it/s]



 Epoch17/20 |           train_loss: 0.5543347009544914| train_acc 0.8172604529117572            | val_loss: 2.401506536750383 | val_acc 0.6420390049933009


100%|██████████| 1302/1302 [00:50<00:00, 25.60it/s]



 Epoch18/20 |           train_loss: 0.530749029850447| train_acc 0.8236021049772841            | val_loss: 2.431323788072046 | val_acc 0.6407041564606353


100%|██████████| 1302/1302 [00:50<00:00, 25.54it/s]



 Epoch19/20 |           train_loss: 0.5108629057095164| train_acc 0.8269811250653376            | val_loss: 2.4456300346654802 | val_acc 0.6412336706066967


100%|██████████| 1302/1302 [00:50<00:00, 25.71it/s]



 Epoch20/20 |           train_loss: 0.49367910655595926| train_acc 0.832519700144898            | val_loss: 2.481045982743677 | val_acc 0.6401899285437315


In [80]:
source, target_inp, target_out = next(iter(val_dl))
prediction = model(source.to(device), target_inp.to(device))
# target_vectorizer
prediction[:, 0].argmax(1)

tensor([ 18,  14, 885,  40,  40,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2], device='cuda:0')

In [81]:
translate = ""
for i in prediction[:, 18].argmax(1):
 text = target_vectorizer.itos[i.item()]
 translate += " " + text
 if text == "[end]":
  break
translate

' el lugar está completamente desierto [end]'

In [82]:
translate = ""
for i in target_out[:, 18]:
 text = target_vectorizer.itos[i.item()]
 translate += " " + text
 if text == "[end]":
  break
translate

' el lugar está completamente desierto [end]'

In [None]:
from collections import Counter




: 