https://github.com/bheinzerling/bpemb

In [14]:
from bpemb import BPEmb

EMB_DIM = 300
EMB_VS = 3000

bpemb = BPEmb(lang="es", dim=EMB_DIM, vs=EMB_VS)

Se guardan los embeddings en un formato visualizable en el [visualizador de embbedings de TF](http://projector.tensorflow.org/?config=http://nlp.h-its.org/bpemb/data/en/projector.config.json).

Código extraido de https://gist.github.com/BrikerMan/7bd4e4bd0a00ac9076986148afc06507.

In [15]:
import io

def export_emb(w2v):
    out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
    out_m = io.open('meta.tsv', 'w', encoding='utf-8')

    # Write meta file and vector file
    for index in range(len(w2v.index2word)):
        word = w2v.index2word[index]
        vec = w2v.vectors[index]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_v.close()
    out_m.close()

In [17]:
print(bpemb.encode("freestyle"))

print(bpemb.most_similar("beat"))

export_emb(bpemb.emb)

['▁fre', 'est', 'y', 'le']


KeyError: "word 'beat' not in vocabulary"

Get tokens from text

In [16]:
import os,io
import unicodedata

def get_text(corpus):
    symbols = ['\n','?','¿',',','.','"',':',"'",'(',')']
    with io.open(corpus, encoding='utf-8') as f:
        text = f.read().lower().replace('\xa0', ' ').replace('-','').replace('\ufeff','')
        for s in symbols:
            text = text.replace(s,' '+s+' ')

    text = unicodedata.normalize('NFC',text)
    return text

text = get_text("./merge.txt")

tokenized_text = bpemb.encode(text)

text_tokens_ids = bpemb.encode_ids(text)

print(text[:50],'\n')
print(tokenized_text[:50],'\n')
print(text_tokens_ids[:50],'\n')

 
  
 en la improvisación 
 lamentablemente yo mue 

['▁', '\n', '▁', '\n', '▁en', '▁la', '▁im', 'pro', 'vis', 'ación', '▁', '\n', '▁la', 'm', 'enta', 'ble', 'mente', '▁yo', '▁muer', 'do', '▁como', '▁león', '▁', '\n', '▁yo', '▁te', '▁pr', 'endo', '▁fue', 'go', '▁,', '▁el', '▁concep', 'to', '▁es', '▁juego', '▁', '\n', '▁y', '▁contra', '▁m', 'í', '▁es', '▁como', '▁un', '▁parque', '▁de', '▁divers', 'ión', '▁'] 

[2905, 0, 2905, 0, 22, 20, 280, 531, 2010, 229, 2905, 0, 20, 2918, 332, 526, 181, 1592, 975, 54, 121, 2315, 2905, 0, 1592, 178, 258, 492, 146, 147, 1577, 30, 2238, 56, 58, 1716, 2905, 0, 33, 575, 24, 2931, 58, 121, 40, 2246, 5, 1281, 168, 2905] 



Filter words and get x,y

In [18]:
SEQ_LEN = 5

def shape_text(sequence_length, text, step=1):
    # cut the text in semi-redundant sequences of SEQUENCE_LEN words
    sentences = []
    next_words = []
    for i in range(0, len(text) - sequence_length, step):
        sentences.append(text[i + 1: i + sequence_length + 1])
        next_words.append(text[i])
    
    print('Total sequences:', len(sentences))
    return sentences, next_words

#def split_set(sentences, next_words)

words = set(text_tokens_ids)

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

sentences, next_words = shape_text(SEQ_LEN, text_tokens_ids)
#next_words_oh = [[1 if i == word_indices[word] else 0 for i in range(len(words))] for word in next_words]

Total sequences: 132372


Model definition

In [19]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Bidirectional, LSTM, Dropout, Activation


# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size): 
    while True:
        for i in range(int(len(sentence_list)/batch_size)):
            x = sentence_list[i*batch_size:(i+1)*batch_size]
            x = np.array([np.array(xi) for xi in x])
            y = next_word_list[i*batch_size:(i+1)*batch_size]
            y = [[1 if i == word_indices[word] else 0 for i in range(len(words))] for word in y]
            y = np.array([np.array(yi) for yi in y])
            yield x, y
            #yield sentence_list[i*batch_size:(i+1)*batch_size], next_word_list[i*batch_size:(i+1)*batch_size]

LSTM_SIZE = 256
EMB_TRAIN = False

model = Sequential()

# define the model
model = Sequential()
model.add(Embedding(EMB_VS, EMB_DIM, weights=[bpemb.vectors], input_length=SEQ_LEN, trainable=EMB_TRAIN))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.2))
model.add(Dense(len(words)))
model.add(Activation('softmax'))

# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 5, 300)            900000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               1140736   
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2231)              1144503   
_________________________________________________________________
activation_3 (Activation)    (None, 2231)              0         
Total params: 3,185,239
Trainable params: 2,285,239
Non-trainable params: 900,000
_________________________________________________________________
None


Model training

In [20]:
import tensorflow as tf

tf.config.experimental.list_physical_devices('GPU')

weights_file = 'weights_embDim{}_embVS{}_embTrain{}_seqLen{}.h5_lstmSize{}'.format(EMB_DIM, EMB_VS, EMB_TRAIN, SEQ_LEN, LSTM_SIZE)

BATCH_SIZE = 256
EPOCHS = 250

if weights_file in os.listdir("./"):
        print("Weights loaded")
        model.load_weights("./" + weights_file)


model.fit(
    generator(sentences, next_words, BATCH_SIZE),
    epochs=EPOCHS,
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1
)

model.save_weights(weights_file)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250


Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250


Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250


Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


Text generation

In [None]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

seed = "pokemon"
text = np.flip(np.array(bpemb.encode_ids(seed)))

for i in range(100):
    inp = text[-SEQ_LEN:]
    
    out = model.predict(inp)[0]

    next_id = sample(out,1.0)
    #next_id = np.argmax(out)
    text = np.append(text,next_id)
    
text = np.flip(text)

text = list(map(lambda x: indices_word[x] if (x in indices_word.keys()) else int(x), text))

text = bpemb.decode_ids(text)

print(text)




Pytorch test

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.tensor as tensor
import torch.optim as optim

BATCH_SIZE = 32
EPOCHS = 1

# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size): 
    for i in range(int(len(sentence_list)/batch_size)):
        yield tensor(sentence_list[i*batch_size:(i+1)*batch_size]), tensor(next_word_list[i*batch_size:(i+1)*batch_size])


class Model(nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(tensor(bpemb.vectors))
        self.lstm = nn.LSTM(EMB_DIM, 128, bidirectional=True, batch_first=True)
        self.drop = nn.Dropout()
        self.linear = nn.Linear(256, len(words))
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        
        # La LSTM devuelve una secuencia de 4, me quedo solo con el ultimo
        x = tensor([[i for i in b[-1]] for b in x])
        
        x = self.drop(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(device)
    
net = Model().to(device)
print(net,'\n')

net.zero_grad()

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(net.parameters())


inp_ex = tensor(sentences[0:BATCH_SIZE]).to(device)
print('Example input size:', inp_ex.size())

out_ex = net(inp_ex)
print('Example output size:', out_ex.size())

target_ex = tensor(next_words_idx[0:BATCH_SIZE]).to(device)
print('Target output size:', target_ex.size())

loss = criterion(out_ex, target_ex)
print(loss)
'''


for epoch in range(EPOCHS):
    running_loss = 0.0
    for i, data in enumerate(generator(sentences, next_words_idx, BATCH_SIZE), 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        net.zero_grad()
        optimizer.zero_grad()
        
        outputs = net(inputs)
        print(inputs.size(), labels.size(), outputs.size())
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0


print('Finished Training')
'''

cpu
Model(
  (embedding): Embedding(100000, 100)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=256, out_features=7921, bias=True)
  (softmax): Softmax(dim=1)
) 

Example input size: torch.Size([32, 4])
Example output size: torch.Size([32, 7921])


NameError: name 'next_words_idx' is not defined