In [2]:
import numpy as np
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
import math
import matplotlib.pyplot as plt
import transformer
import glob
import pickle
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# load the dataset
poems = []

for path in glob.iglob("poemas_machado/*.txt"):
    with open(path, 'r') as f:
        x = f.read()
        poems.append(x.lower().split(' '))
        
print(poems[0])

['proverbios', 'y', 'cantares', '-', 'ix', '\n', 'el', 'hombre', ',', 'a', 'quien', 'el', 'hambre', 'de', 'la', 'rapiña', 'acucia', ',', '\n', 'de', 'ingénita', 'malicia', 'y', 'natural', 'astucia', ',', '\n', 'formó', 'la', 'inteligencia', 'y', 'acaparó', 'la', 'tierra', '.', '\n', '¡', 'y', 'aún', 'la', 'verdad', 'proclama', '!', '¡', 'supremo', 'ardid', 'de', 'guerra', '!', '\n', '<end>']


In [4]:
with open('word_idx.pkl', 'rb') as file:
    word_idx = pickle.load(file)
    
with open('idx_word.pkl', 'rb') as file:
    idx_word = pickle.load(file)

vocabulary_size = len(word_idx) + 1
print(vocabulary_size)

6299


In [5]:
tokenized = []
for poem in poems:
    tokenized.append([word_idx[word] for word in poem])

print(poems[0])
print(tokenized[0])
print("---")
print(poems[0][7], "->", word_idx[poems[0][7]])

['proverbios', 'y', 'cantares', '-', 'ix', '\n', 'el', 'hombre', ',', 'a', 'quien', 'el', 'hambre', 'de', 'la', 'rapiña', 'acucia', ',', '\n', 'de', 'ingénita', 'malicia', 'y', 'natural', 'astucia', ',', '\n', 'formó', 'la', 'inteligencia', 'y', 'acaparó', 'la', 'tierra', '.', '\n', '¡', 'y', 'aún', 'la', 'verdad', 'proclama', '!', '¡', 'supremo', 'ardid', 'de', 'guerra', '!', '\n', '<end>']
[50, 5, 49, 73, 1321, 1, 7, 134, 2, 12, 103, 7, 2389, 4, 6, 5334, 5335, 2, 1, 4, 5336, 5337, 5, 2293, 5338, 2, 1, 5339, 6, 5340, 5, 5341, 6, 34, 3, 1, 15, 5, 306, 6, 86, 5342, 14, 15, 5343, 5344, 4, 275, 14, 1, 24]
---
hombre -> 134


In [6]:
lengths = [len(sequence) for sequence in tokenized]
print(max(lengths))
print(min(lengths))

4758
13


In [7]:
max_seq_length = 400 #4758

padded = []
for sequence in tokenized:
    trimmed = sequence[-max_seq_length:]
    padding = [0] * (max_seq_length - len(trimmed))
    padded.append(padding + trimmed)
    
padded = np.array(padded)
print(padded.shape, padded[:,:-1].shape, padded[:,1:].shape)

(445, 400) (445, 399) (445, 399)


In [8]:
# create data loader
batch_size = 16

dataset = TensorDataset(torch.from_numpy(padded[:,:-1]), torch.from_numpy(padded[:,1:]))
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [9]:
x, y = next(iter(dataloader))

print(x.shape, y.shape)

print(x[0, -10:])
print(y[0, -10:])

torch.Size([16, 399]) torch.Size([16, 399])
tensor([ 353,    9,   74,    2,    1,  198,    2, 2016,    3,    1])
tensor([   9,   74,    2,    1,  198,    2, 2016,    3,    1,   24])


In [32]:
model = transformer.Transformer(vocabulary_size, 64, max_seq_length-1, blocks=1, heads=1)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
# model

In [33]:
# training
epochs = 10

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) #, betas=(0.9, 0.98), eps=1e-9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10, verbose=True)

model.train()
for e in range(1, epochs+1):
    total_loss = 0
    total_accuracy = 0
    batch = 0
    for x, y in dataloader:
        batch += 1
        optimizer.zero_grad()
    
        mask = transformer.gen_target_mask(x, 0)
    
        preds = model.encoder(x, mask)
        preds = model.out(preds)
        
        loss = F.cross_entropy(preds.view(-1, preds.size(-1)), y.view(-1), ignore_index=0)
        total_loss += loss.item()
    
        loss.backward()
        optimizer.step()
        
        equals = torch.argmax(preds, dim=-1).view(-1) == y.view(-1)
        total_accuracy += torch.mean(equals.type(torch.FloatTensor))
    
        print(f"EPOCH {e} ({batch}/{len(dataloader)}) - loss {total_loss/batch:.4f} - acc {total_accuracy/batch:.4f}", end='\r') 

    scheduler.step(total_loss)
    print(f"EPOCH {e} - loss {total_loss/len(dataloader):.4f} - acc {total_accuracy/len(dataloader):.4f} ---------------------- ")

EPOCH 1 - loss 8.5953 - acc 0.0235 ---------------------- 
EPOCH 2 - loss 8.3616 - acc 0.0509 ---------------------- 
EPOCH 3 - loss 8.1585 - acc 0.0511 ---------------------- 
EPOCH 4 - loss 7.9536 - acc 0.0511 ---------------------- 
EPOCH 5 - loss 7.7504 - acc 0.0511 ---------------------- 
EPOCH 6 - loss 7.5528 - acc 0.0510 ---------------------- 
EPOCH 7 - loss 7.3617 - acc 0.0511 ---------------------- 
EPOCH 8 - loss 7.1764 - acc 0.0511 ---------------------- 
EPOCH 9 - loss 6.9969 - acc 0.0510 ---------------------- 
EPOCH 10 - loss 6.8310 - acc 0.0510 ---------------------- 


In [34]:
# inference
seed = 'el amor de una mujer \n'
seed = [word_idx[word] for word in seed.split(' ')]

model.eval()
with torch.no_grad():
    for i in range(10):
        x = torch.from_numpy(np.array(seed)).unsqueeze(0)
        mask = transformer.gen_target_mask(x, 0)
        encoded = model.encoder(x, mask)
        out = model.out(encoded)
        out = F.softmax(out, dim=-1)
        out = torch.argmax(out, dim=-1)
        
        idx = out[:,-1].item()
        seed.append(idx)
        word = idx_word[idx]
        if word == '<end>':
            break

print(' '.join([idx_word[idx] for idx in seed]))

el amor de una mujer 
 
 
 
 
 
 
 
 
 
 

