# Lab6 Q1: English–Vietnamese translation with a simple RNN encoder–decoder



In [None]:
import requests
import torch
import torch.nn.functional as F 
import torchtext

# Base location of the IWSLT'15 English-Vietnamese corpus files.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

def fetch_tokenized(name, base_url=url, timeout=60):
  """Download one corpus file and return it as a list of token lists.

  Args:
    name: file name relative to ``base_url`` (e.g. ``"train.en"``).
    base_url: URL prefix the file name is appended to.
    timeout: seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook forever.

  Returns:
    One list of whitespace-separated tokens per line of the file
    (an empty line becomes an empty list).

  Raises:
    requests.HTTPError: if the server answers with an error status; without
      this check an error page would be tokenized as if it were corpus text.
  """
  response = requests.get(base_url + name, timeout=timeout)
  response.raise_for_status()
  return [line.split() for line in response.text.splitlines()]

train_en = fetch_tokenized("train.en")
train_vi = fetch_tokenized("train.vi")

test_en = fetch_tokenized("tst2013.en")
test_vi = fetch_tokenized("tst2013.vi")

# Sentences are paired by position later on (zip), which silently truncates on a
# length mismatch, so fail loudly here if the two sides do not line up.
assert len(train_en) == len(train_vi), "train.en / train.vi line counts differ"
assert len(test_en) == len(test_vi), "tst2013.en / tst2013.vi line counts differ"

In [None]:
# Sanity check: number of training sentences and the first tokenized Vietnamese sentence.
print(len(train_en))
train_vi[0]

133317


['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']

In [None]:
# Run configuration read by the training and evaluation cells.
MODELNAME = "iwslt15-en-vi-rnn.model"  # file the trained weights are saved to / loaded from
EPOCH = 10                              # passes over the training batches
BATCHSIZE = 128                         # sentence pairs per batch
LR = 1e-4                               # Adam learning rate

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
  DEVICE = "cuda"

In [None]:
def make_vocab(train_data, min_freq):
  """Build a vocabulary from tokenized sentences.

  Args:
    train_data: iterable of token lists.
    min_freq: a token is kept only if it occurs at least this many times.

  Returns:
    (vocablist, vocabidx): ``vocablist`` is a list of ``(token, frequency)``
    tuples whose position is the token id; ``vocabidx`` maps token -> id.
    Ids 0-3 are reserved for '<unk>', '<pad>', '<cls>' and '<eos>'.
  """
  # Count occurrences; keys appear in first-seen order.
  counts = {}
  for sentence in train_data:
    for word in sentence:
      counts[word] = counts.get(word, 0) + 1

  specials = ('<unk>', '<pad>', '<cls>', '<eos>')
  vocablist = [(special, 0) for special in specials]
  vocabidx = {}
  for word, count in counts.items():
    if count < min_freq:
      continue
    vocabidx[word] = len(vocablist)
    vocablist.append((word, count))

  # Registered last so the reserved ids always win for the special tokens.
  for position, special in enumerate(specials):
    vocabidx[special] = position
  return vocablist, vocabidx

# Build the English and Vietnamese vocabularies from the training data,
# keeping only tokens that occur at least 3 times.
vocablist_en, vocabidx_en = make_vocab(train_en, 3)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 3)

# Sizes include the four reserved special tokens.
print("vocab size en:", len(vocablist_en))
print("vocab size vi:", len(vocablist_vi))


vocab size en: 24420
vocab size vi: 10666


In [None]:
def preprocess(data, vocabidx):
  """Wrap each sentence in '<cls>' ... '<eos>' and mask unknown tokens.

  Args:
    data: iterable of token lists.
    vocabidx: mapping whose keys are the known tokens.

  Returns:
    A new list with one new token list per input sentence; tokens missing
    from ``vocabidx`` are replaced by '<unk>'. The input is not modified.
  """
  def known_or_unk(word):
    return word if word in vocabidx else '<unk>'

  return [['<cls>'] + [known_or_unk(word) for word in sentence] + ['<eos>']
          for sentence in data]

# Add <cls>/<eos> markers and replace out-of-vocabulary tokens with <unk>.
# Only the English side of the test set is preprocessed in this cell.
train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)

# Show the first few preprocessed sentences.
for i in range(5):
  print(train_en_prep[i])
  print(train_vi_prep[i])
  print(test_en_prep[i])

['<cls>', 'Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
['<cls>', 'Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']
['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>']
['<cls>', 'In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', 'Pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.', '<eos>']
['<cls>', 'Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu

In [None]:
# Pair each preprocessed English sentence with its Vietnamese counterpart by position.
train_data = list(zip(train_en_prep, train_vi_prep))
# Order pairs by (source length, target length) so neighbouring pairs have similar lengths.
train_data.sort(key = lambda x: (len(x[0]), len(x[1])))
# Test triples: (preprocessed English, raw English tokens, raw Vietnamese tokens).
test_data = list(zip(test_en_prep, test_en, test_vi))

for i in range(5):
  print(train_data[i])

for i in range(5):   
  print(test_data[i])

(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>'], ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', 'Envy', '.', '&quot;'], ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot;', 'Chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot;'])
(['<cls>', 'And', 'I', 'was', 'very', 'proud', '.', '<eos>

In [None]:
def make_batch(data, batchsize):
  """Group consecutive (en, vi) pairs into batches.

  Args:
    data: iterable of ``(en, vi)`` pairs.
    batchsize: a batch is closed as soon as it holds this many pairs.

  Returns:
    A list of ``(en_list, vi_list)`` tuples. Order is preserved and the last
    batch may be smaller than ``batchsize``.
  """
  batches = []
  cur_en, cur_vi = [], []
  for pair in data:
    source, target = pair
    cur_en.append(source)
    cur_vi.append(target)
    if len(cur_en) < batchsize:
      continue
    # Batch is full: store it and start a fresh one.
    batches.append((cur_en, cur_vi))
    cur_en, cur_vi = [], []
  if cur_en:
    batches.append((cur_en, cur_vi))
  return batches

# Replace the flat list of sentence pairs with a list of (en_batch, vi_batch) tuples.
# NOTE: this rebinds train_data, so re-running the cell on its own output would
# batch the batches; re-run the pairing cell first.
train_data = make_batch(train_data, BATCHSIZE)

for i in range(5):
  print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [None]:
def padding_batch(b):
  """Right-pad every token list in ``b`` with '<pad>' to the longest length.

  The lists are extended in place; nothing is returned.
  """
  target_len = max(map(len, b))
  for tokens in b:
    missing = target_len - len(tokens)
    tokens.extend(['<pad>'] * missing)

def padding(bb):
  """Pad both sides of every batch in ``bb`` in place.

  Each element of ``bb`` is an ``(en_batch, vi_batch)`` pair; the two sides are
  padded independently, each to its own longest sentence.
  """
  for batch in bb:
    en_side, vi_side = batch
    for side in (en_side, vi_side):
      padding_batch(side)

# Pad every batch in place so all sentences on one side of a batch have equal length.
padding(train_data)

for i in range(3):
  print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [None]:
# Replace every token string with its integer id from the matching vocabulary.
# train_data becomes a list of (en_id_batch, vi_id_batch) tuples.
# NOTE: both names are rebound, so this cell cannot be re-run on its own output
# (the vocab dicts are keyed by token strings, not ids).
train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben],
               [[vocabidx_vi [token] for token in tokenlist] for tokenlist in bvi]) for ben, bvi in train_data]
# For the test set only the preprocessed English side is converted; the raw
# English and Vietnamese token lists are carried along unchanged.
test_data = [([vocabidx_en[token] for token in enprep], en, vi) for enprep, en, vi in test_data]

for i in range (3): 
  print(train_data[i]) 
for i in range(3): 
  print(test_data[i])

([[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]

In [None]:
class RNNEncDec(torch.nn.Module):
  """Encoder-decoder translation model built from a hand-written ReLU recurrence.

  Both encoder and decoder update their state as
  ``h = relu(embedding(token) + W h)``; the decoder starts from the final
  encoder state and a linear layer maps its state to target-vocabulary logits.

  Args:
    vocablist_x: source vocabulary list (its length sizes the source embedding).
    vocabidx_x: source token -> id mapping; must contain '<pad>'.
    vocablist_y: target vocabulary list (its length sizes the target embedding
      and the output layer).
    vocabidx_y: target token -> id mapping; must contain '<pad>'.
    dim: embedding / hidden size (default 300, the original hard-coded value).
  """
  def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y, dim=300):
    super(RNNEncDec, self).__init__()
    # Plain attributes (not parameters/buffers), so the state_dict layout is unchanged.
    self.pad_x = vocabidx_x['<pad>']
    self.pad_y = vocabidx_y['<pad>']
    self.encemb = torch.nn.Embedding(len(vocablist_x), dim, padding_idx=self.pad_x)
    self.encrnn = torch.nn.Linear(dim, dim)
    self.decemb = torch.nn.Embedding(len(vocablist_y), dim, padding_idx=self.pad_y)
    self.decrnn = torch.nn.Linear(dim, dim)
    self.decout = torch.nn.Linear(dim, len(vocablist_y))

  def _encode(self, x):
    """Run the encoder recurrence over ``x`` (time-major ids) and return the final state."""
    e_x = self.encemb(x)
    # Created on the input's device/dtype instead of relying on a global DEVICE.
    h = e_x.new_zeros(e_x.shape[1:])
    for i in range(e_x.size(0)):
      h_next = F.relu(e_x[i] + self.encrnn(h))
      # Freeze the state on '<pad>' positions so a padded sentence ends up with
      # the same encoding it would get unpadded (as it is at evaluation time).
      is_token = (x[i] != self.pad_x).unsqueeze(-1)
      h = torch.where(is_token, h_next, h)
    return h

  def forward(self, x):
    """Return the teacher-forced training loss for one batch.

    Args:
      x: pair ``(src, tgt)`` of integer id tensors, time-major
        (``(src_len, batch)`` and ``(tgt_len, batch)``).

    Returns:
      Scalar tensor: the sum over decoder steps of the cross-entropy averaged
      over the non-'<pad>' targets of that step. Padding targets contribute
      nothing to the loss.
    """
    x, y = x[0], x[1]
    h = self._encode(x)
    e_y = self.decemb(y)
    loss = e_y.new_zeros(())
    for i in range(e_y.size(0) - 1):
      h = F.relu(e_y[i] + self.decrnn(h))
      target = y[i + 1]
      step_loss = F.cross_entropy(self.decout(h), target,
                                  ignore_index=self.pad_y, reduction='sum')
      # clamp avoids 0/0 on a step where every target is padding.
      n_real = (target != self.pad_y).sum().clamp(min=1)
      loss = loss + step_loss / n_real
    return loss

  def evaluate(self, x, vocablist_y, vocabidx_y, max_len=30):
    """Greedily decode a translation for a single source sentence.

    Args:
      x: source ids, time-major with a batch dimension of 1.
      vocablist_y: target vocabulary list used to map ids back to tokens.
      vocabidx_y: target token -> id mapping (needs '<cls>' and '<eos>').
      max_len: maximum number of tokens to generate (default 30).

    Returns:
      List of predicted target tokens, without '<cls>' / '<eos>'.
    """
    eos_id = vocabidx_y['<eos>']
    pred = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      h = self._encode(x)
      y = torch.tensor([vocabidx_y['<cls>']], device=h.device)
      for _ in range(max_len):
        h = F.relu(self.decemb(y) + self.decrnn(h))
        pred_id = self.decout(h).squeeze().argmax().item()
        if pred_id == eos_id:
          break
        pred.append(vocablist_y[pred_id][0])
        # Feed the prediction back in as the next decoder input.
        y[0] = pred_id
    return pred

In [None]:
def train():
  """Train an RNNEncDec on ``train_data`` and save its weights to ``MODELNAME``.

  Runs ``EPOCH`` passes over the batches with Adam (learning rate ``LR``),
  printing the batch loss every 100 steps and the summed loss per epoch.
  """
  model = RNNEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
  optimizer = torch.optim.Adam(model.parameters(), lr=LR)

  def as_time_major(batch):
    # (batch, time) id lists -> (time, batch) int64 tensor on the run device.
    return torch.tensor(batch, dtype=torch.int64).transpose(0,1).to(DEVICE)

  for epoch in range(EPOCH):
    loss = 0
    for step, (ben, bvi) in enumerate(train_data):
      source = as_time_major(ben)
      target = as_time_major(bvi)
      optimizer.zero_grad()
      batchloss = model((source, target))
      batchloss.backward()
      optimizer.step()
      batch_value = batchloss.item()
      loss += batch_value
      if step % 100 == 0:
        print("step:", step, "batch loss:", batch_value)
    print("epoch", epoch, ": loss", loss)
  torch.save(model.state_dict(), MODELNAME)

In [None]:
def test():
  """Load the saved RNNEncDec weights, translate ``test_data`` and report BLEU.

  For every test sentence the raw source, the reference and the model output
  are printed; the corpus BLEU score is computed with torchtext against the
  raw Vietnamese reference tokens.
  """
  model = RNNEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
  model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
  model.eval()
  ref = []
  pred = []
  # Decoding needs no gradients; skipping autograd saves memory and time.
  with torch.no_grad():
    for enprep, en, vi in test_data:
      # Shape (src_len, 1): a time-major batch holding one sentence.
      source = torch.tensor([enprep], dtype=torch.int64).transpose(0, 1).to(DEVICE)
      p = model.evaluate(source, vocablist_vi, vocabidx_vi)
      print("INPUT", en)
      print("REF", vi)
      print("MT", p)
      # bleu_score expects a list of reference translations per candidate.
      ref.append([vi])
      pred.append(p)
  bleu = torchtext.data.metrics.bleu_score(pred, ref)
  print("total:", len(test_data))
  print("bleu:", bleu)

In [None]:
# Train the model (this saves the weights to MODELNAME), then evaluate the saved model.
train()
test()

step: 0 batch loss: 28.806880950927734
step: 100 batch loss: 58.58261489868164
step: 200 batch loss: 75.17407989501953
step: 300 batch loss: 92.27848052978516
step: 400 batch loss: 121.72640228271484
step: 500 batch loss: 107.84327697753906
step: 600 batch loss: 127.04374694824219
step: 700 batch loss: 180.84402465820312
step: 800 batch loss: 192.70179748535156
step: 900 batch loss: 250.98251342773438
step: 1000 batch loss: 308.3873291015625
epoch 0 : loss 162578.50700759888
step: 0 batch loss: 18.294334411621094
step: 100 batch loss: 47.40180969238281
step: 200 batch loss: 62.996856689453125
step: 300 batch loss: 79.18585968017578
step: 400 batch loss: 106.6675033569336
step: 500 batch loss: 96.40930938720703
step: 600 batch loss: 112.40162658691406
step: 700 batch loss: 163.86366271972656
step: 800 batch loss: 175.44813537597656
step: 900 batch loss: 223.525390625
step: 1000 batch loss: 287.82672119140625
epoch 1 : loss 137898.19924926758
step: 0 batch loss: 20.24225616455078
step: 1

# Lab6 Q2: English–Vietnamese translation with an LSTM encoder–decoder



In [None]:
import requests
import torch
import torch.nn.functional as F 
import torchtext

# Base location of the IWSLT'15 English-Vietnamese corpus files.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

def fetch_tokenized(name, base_url=url, timeout=60):
  """Download one corpus file and return it as a list of token lists.

  Args:
    name: file name relative to ``base_url`` (e.g. ``"train.en"``).
    base_url: URL prefix the file name is appended to.
    timeout: seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook forever.

  Returns:
    One list of whitespace-separated tokens per line of the file
    (an empty line becomes an empty list).

  Raises:
    requests.HTTPError: if the server answers with an error status; without
      this check an error page would be tokenized as if it were corpus text.
  """
  response = requests.get(base_url + name, timeout=timeout)
  response.raise_for_status()
  return [line.split() for line in response.text.splitlines()]

train_en = fetch_tokenized("train.en")
train_vi = fetch_tokenized("train.vi")

test_en = fetch_tokenized("tst2013.en")
test_vi = fetch_tokenized("tst2013.vi")

# Sentences are paired by position later on (zip), which silently truncates on a
# length mismatch, so fail loudly here if the two sides do not line up.
assert len(train_en) == len(train_vi), "train.en / train.vi line counts differ"
assert len(test_en) == len(test_vi), "tst2013.en / tst2013.vi line counts differ"

In [None]:
# Sanity check: number of training sentences and the first tokenized Vietnamese sentence.
print(len(train_en))
train_vi[0]

133317


['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']

In [None]:
# Run configuration read by the training and evaluation cells.
# The checkpoint gets its own file name: the Q1 part of this file saves to
# "iwslt15-en-vi-rnn.model", and sharing that name would make the LSTM run
# overwrite the RNN weights (the two models' parameters are not interchangeable).
MODELNAME = "iwslt15-en-vi-lstm.model"
EPOCH = 10        # passes over the training batches
BATCHSIZE = 128   # sentence pairs per batch
LR = 0.0001       # Adam learning rate
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def make_vocab(train_data, min_freq):
  """Build a vocabulary from tokenized sentences.

  Args:
    train_data: iterable of token lists.
    min_freq: a token is kept only if it occurs at least this many times.

  Returns:
    (vocablist, vocabidx): ``vocablist`` is a list of ``(token, frequency)``
    tuples whose position is the token id; ``vocabidx`` maps token -> id.
    Ids 0-3 are reserved for '<unk>', '<pad>', '<cls>' and '<eos>'.
  """
  # Count occurrences; keys appear in first-seen order.
  counts = {}
  for sentence in train_data:
    for word in sentence:
      counts[word] = counts.get(word, 0) + 1

  specials = ('<unk>', '<pad>', '<cls>', '<eos>')
  vocablist = [(special, 0) for special in specials]
  vocabidx = {}
  for word, count in counts.items():
    if count < min_freq:
      continue
    vocabidx[word] = len(vocablist)
    vocablist.append((word, count))

  # Registered last so the reserved ids always win for the special tokens.
  for position, special in enumerate(specials):
    vocabidx[special] = position
  return vocablist, vocabidx

# Build the English and Vietnamese vocabularies from the training data,
# keeping only tokens that occur at least 3 times.
vocablist_en, vocabidx_en = make_vocab(train_en, 3)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 3)

# Sizes include the four reserved special tokens.
print("vocab size en:", len(vocablist_en))
print("vocab size vi:", len(vocablist_vi))


vocab size en: 24420
vocab size vi: 10666


In [None]:
def preprocess(data, vocabidx):
  """Wrap each sentence in '<cls>' ... '<eos>' and mask unknown tokens.

  Args:
    data: iterable of token lists.
    vocabidx: mapping whose keys are the known tokens.

  Returns:
    A new list with one new token list per input sentence; tokens missing
    from ``vocabidx`` are replaced by '<unk>'. The input is not modified.
  """
  def known_or_unk(word):
    return word if word in vocabidx else '<unk>'

  return [['<cls>'] + [known_or_unk(word) for word in sentence] + ['<eos>']
          for sentence in data]

# Add <cls>/<eos> markers and replace out-of-vocabulary tokens with <unk>.
# Only the English side of the test set is preprocessed in this cell.
train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)

# Show the first few preprocessed sentences.
for i in range(5):
  print(train_en_prep[i])
  print(train_vi_prep[i])
  print(test_en_prep[i])

['<cls>', 'Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
['<cls>', 'Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']
['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>']
['<cls>', 'In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', 'Pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.', '<eos>']
['<cls>', 'Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu

In [None]:
# Pair each preprocessed English sentence with its Vietnamese counterpart by position.
train_data = list(zip(train_en_prep, train_vi_prep))
# Order pairs by (source length, target length) so neighbouring pairs have similar lengths.
train_data.sort(key = lambda x: (len(x[0]), len(x[1])))
# Test triples: (preprocessed English, raw English tokens, raw Vietnamese tokens).
test_data = list(zip(test_en_prep, test_en, test_vi))

for i in range(5):
  print(train_data[i])

for i in range(5):   
  print(test_data[i])

(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>'], ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', 'Envy', '.', '&quot;'], ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot;', 'Chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot;'])
(['<cls>', 'And', 'I', 'was', 'very', 'proud', '.', '<eos>

In [None]:
def make_batch(data, batchsize):
  """Group consecutive (en, vi) pairs into batches.

  Args:
    data: iterable of ``(en, vi)`` pairs.
    batchsize: a batch is closed as soon as it holds this many pairs.

  Returns:
    A list of ``(en_list, vi_list)`` tuples. Order is preserved and the last
    batch may be smaller than ``batchsize``.
  """
  batches = []
  cur_en, cur_vi = [], []
  for pair in data:
    source, target = pair
    cur_en.append(source)
    cur_vi.append(target)
    if len(cur_en) < batchsize:
      continue
    # Batch is full: store it and start a fresh one.
    batches.append((cur_en, cur_vi))
    cur_en, cur_vi = [], []
  if cur_en:
    batches.append((cur_en, cur_vi))
  return batches

# Replace the flat list of sentence pairs with a list of (en_batch, vi_batch) tuples.
# NOTE: this rebinds train_data, so re-running the cell on its own output would
# batch the batches; re-run the pairing cell first.
train_data = make_batch(train_data, BATCHSIZE)

for i in range(5):
  print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [None]:
def padding_batch(b):
  """Right-pad every token list in ``b`` with '<pad>' to the longest length.

  The lists are extended in place; nothing is returned.
  """
  target_len = max(map(len, b))
  for tokens in b:
    missing = target_len - len(tokens)
    tokens.extend(['<pad>'] * missing)

def padding(bb):
  """Pad both sides of every batch in ``bb`` in place.

  Each element of ``bb`` is an ``(en_batch, vi_batch)`` pair; the two sides are
  padded independently, each to its own longest sentence.
  """
  for batch in bb:
    en_side, vi_side = batch
    for side in (en_side, vi_side):
      padding_batch(side)

# Pad every batch in place so all sentences on one side of a batch have equal length.
padding(train_data)

for i in range(3):
  print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [None]:
# Replace every token string with its integer id from the matching vocabulary.
# train_data becomes a list of (en_id_batch, vi_id_batch) tuples.
# NOTE: both names are rebound, so this cell cannot be re-run on its own output
# (the vocab dicts are keyed by token strings, not ids).
train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben],
               [[vocabidx_vi [token] for token in tokenlist] for tokenlist in bvi]) for ben, bvi in train_data]
# For the test set only the preprocessed English side is converted; the raw
# English and Vietnamese token lists are carried along unchanged.
test_data = [([vocabidx_en[token] for token in enprep], en, vi) for enprep, en, vi in test_data]

for i in range (3): 
  print(train_data[i]) 
for i in range(3): 
  print(test_data[i])

([[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]

In [None]:
class LSTM(torch.nn.Module):
  """LSTM encoder-decoder translation model.

  A multi-layer LSTM encodes the source sentence; its final (h, c) state
  initialises a second LSTM that is run one target token at a time, and a
  linear layer maps each decoder output to target-vocabulary logits.

  Args:
    vocablist_x: source vocabulary list (its length sizes the source embedding).
    vocabidx_x: source token -> id mapping; must contain '<pad>'.
    vocablist_y: target vocabulary list (its length sizes the target embedding
      and the output layer).
    vocabidx_y: target token -> id mapping; must contain '<pad>'.
    emb_dim: embedding size (default 256).
    hidden_dim: LSTM hidden size (default 516, the original value).
    num_layers: number of stacked LSTM layers (default 2).
    dropout: dropout probability for embeddings and between LSTM layers.

  Note:
    The target embedding is sized with the *target* vocabulary. Earlier
    versions sized it with the source vocabulary, so checkpoints saved by them
    have a different ``decemb.weight`` shape and must be retrained.
  """
  def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y,
               emb_dim=256, hidden_dim=516, num_layers=2, dropout=0.5):
    super(LSTM, self).__init__()
    self.pad_x = vocabidx_x['<pad>']
    self.pad_y = vocabidx_y['<pad>']

    self.encemb = torch.nn.Embedding(len(vocablist_x), emb_dim, padding_idx=self.pad_x)
    self.dropout = torch.nn.Dropout(dropout)
    self.enclstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout)

    # Token ids are positions in vocablist_y, so that list fixes both sizes.
    self.decemb = torch.nn.Embedding(len(vocablist_y), emb_dim, padding_idx=self.pad_y)
    self.declstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout)
    self.decout = torch.nn.Linear(hidden_dim, len(vocablist_y))

  def _encode(self, x):
    """Encode time-major source ids ``x`` and return the final ``(h, c)`` state."""
    e_x = self.dropout(self.encemb(x))
    # Pack so trailing '<pad>' positions are skipped and each sentence's state
    # is taken at its real last token. Lengths must live on the CPU.
    lengths = (x != self.pad_x).sum(dim=0).clamp(min=1).cpu()
    packed = torch.nn.utils.rnn.pack_padded_sequence(e_x, lengths, enforce_sorted=False)
    _, (h, c) = self.enclstm(packed)
    return h, c

  def forward(self, x):
    """Return the teacher-forced training loss for one batch.

    Args:
      x: pair ``(src, tgt)`` of integer id tensors, time-major
        (``(src_len, batch)`` and ``(tgt_len, batch)``); any batch size works.

    Returns:
      Scalar tensor: the sum over decoder steps of the cross-entropy averaged
      over the non-'<pad>' targets of that step. Padding targets contribute
      nothing to the loss.
    """
    x, y = x[0], x[1]
    h, c = self._encode(x)
    loss = h.new_zeros(())
    for i in range(y.shape[0] - 1):
      # One decoder step on the gold token at position i (teacher forcing).
      step_in = self.dropout(self.decemb(y[i].unsqueeze(0)))
      outdec, (h, c) = self.declstm(step_in, (h, c))
      logits = self.decout(outdec.squeeze(0))
      target = y[i + 1]
      step_loss = F.cross_entropy(logits, target,
                                  ignore_index=self.pad_y, reduction='sum')
      # clamp avoids 0/0 on a step where every target is padding.
      n_real = (target != self.pad_y).sum().clamp(min=1)
      loss = loss + step_loss / n_real
    return loss

  def evaluate(self, x, vocablist_y, vocabidx_y, max_len=30):
    """Greedily decode a translation for a single source sentence.

    Call ``model.eval()`` first; otherwise dropout stays active while decoding.

    Args:
      x: source ids, time-major with a batch dimension of 1.
      vocablist_y: target vocabulary list used to map ids back to tokens.
      vocabidx_y: target token -> id mapping (needs '<cls>' and '<eos>').
      max_len: maximum number of tokens to generate (default 30).

    Returns:
      List of predicted target tokens, without '<cls>' / '<eos>'.
    """
    eos_id = vocabidx_y['<eos>']
    pred = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      h, c = self._encode(x)
      y = torch.tensor([vocabidx_y['<cls>']], device=x.device)
      for _ in range(max_len):
        step_in = self.dropout(self.decemb(y.unsqueeze(0)))
        outdec, (h, c) = self.declstm(step_in, (h, c))
        pred_id = self.decout(outdec.squeeze(0)).squeeze().argmax().item()
        if pred_id == eos_id:
          break
        pred.append(vocablist_y[pred_id][0])
        # Feed the prediction back in as the next decoder input.
        y[0] = pred_id
    return pred

In [None]:
def train():
  """Train an LSTM model on ``train_data`` and save its weights to ``MODELNAME``.

  Runs ``EPOCH`` passes over the batches with Adam (learning rate ``LR``),
  printing the batch loss every 100 steps and the summed loss per epoch.
  """
  model = LSTM(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
  optimizer = torch.optim.Adam(model.parameters(), lr=LR)

  def as_time_major(batch):
    # (batch, time) id lists -> (time, batch) int64 tensor on the run device.
    return torch.tensor(batch, dtype=torch.int64).transpose(0,1).to(DEVICE)

  for epoch in range(EPOCH):
    loss = 0
    for step, (ben, bvi) in enumerate(train_data):
      source = as_time_major(ben)
      target = as_time_major(bvi)
      optimizer.zero_grad()
      batchloss = model((source, target))
      batchloss.backward()
      optimizer.step()
      batch_value = batchloss.item()
      loss += batch_value
      if step % 100 == 0:
        print("step:", step, "batch loss:", batch_value)
    print("epoch", epoch, ": loss", loss)
  torch.save(model.state_dict(), MODELNAME)

In [None]:
def test():
  """Load the saved LSTM weights, translate ``test_data`` and report BLEU.

  For every test sentence the raw source, the reference and the model output
  are printed; the corpus BLEU score is computed with torchtext against the
  raw Vietnamese reference tokens.
  """
  model = LSTM(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
  model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
  # eval() switches dropout off so decoding is deterministic.
  model.eval()
  ref = []
  pred = []
  # Decoding needs no gradients; skipping autograd saves memory and time.
  with torch.no_grad():
    for enprep, en, vi in test_data:
      # Shape (src_len, 1): a time-major batch holding one sentence.
      source = torch.tensor([enprep], dtype=torch.int64).transpose(0, 1).to(DEVICE)
      p = model.evaluate(source, vocablist_vi, vocabidx_vi)
      print("INPUT", en)
      print("REF", vi)
      print("MT", p)
      # bleu_score expects a list of reference translations per candidate.
      ref.append([vi])
      pred.append(p)
  bleu = torchtext.data.metrics.bleu_score(pred, ref)
  print("total:", len(test_data))
  print("bleu:", bleu)

In [None]:
# Train the model (this saves the weights to MODELNAME), then evaluate the saved model.
train()
test()

step: 0 batch loss: 27.882171630859375
step: 100 batch loss: 58.98465347290039
step: 200 batch loss: 77.55376434326172
step: 300 batch loss: 95.47391510009766
step: 400 batch loss: 127.6159439086914
step: 500 batch loss: 112.33297729492188
step: 600 batch loss: 130.58009338378906
step: 700 batch loss: 200.29100036621094
step: 800 batch loss: 206.80206298828125
step: 900 batch loss: 267.996337890625
step: 1000 batch loss: 331.8670349121094
epoch 0 : loss 165314.624874115
step: 0 batch loss: 7.972996711730957
step: 100 batch loss: 51.40834426879883
step: 200 batch loss: 70.37197875976562
step: 300 batch loss: 88.37894439697266
step: 400 batch loss: 119.24732208251953
step: 500 batch loss: 105.18037414550781
step: 600 batch loss: 123.00938415527344
step: 700 batch loss: 187.1456298828125
step: 800 batch loss: 194.61997985839844
step: 900 batch loss: 251.82408142089844
step: 1000 batch loss: 315.46173095703125
epoch 1 : loss 152602.3923997879
step: 0 batch loss: 6.268912315368652
step: 100