# <b> LSTM Decoder with word vector representation </b>

### keras tokenization

In [88]:
import codecs

input_file = "../datasets/chapter01.txt"
with codecs.open(input_file, "r", encoding=None) as f:
    text = f.read()

In [89]:
from keras.preprocessing.text import text_to_word_sequence

In [90]:
keras_wordlist = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n’', lower=True, split=' ')

In [91]:
type(keras_wordlist)

list

In [92]:
len(keras_wordlist)

2707

#### Word set extraction from specific corpus and word vector transformation

In [93]:
import os
import pickle

def get_wordset(saves_folder, vocab_name):
    vocab_filename = os.path.join(saves_folder, vocab_name + ".pkl")
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    return vocab['words_set']

In [94]:
#params
saves_folder = '../saves'
vocab = 'vocab'

In [95]:
wordset = get_wordset(saves_folder, vocab)
wordlist = list(wordset)

In [96]:
#import word vectors
from gensim.models.fasttext import FastText as ftext
from gensim.models.keyedvectors import KeyedVectors

wv_model = ftext.load_fasttext_format("../embeddings/fasttext/it/it")
fastvec = KeyedVectors.load_word2vec_format("../embeddings/fasttext/it/it.vec")

In [97]:
# test similar word
testword = wordlist[0]
print(testword)
wv_model.wv.most_similar(testword)

dove


[('ove', 0.77056884765625),
 ('presso', 0.5957440733909607),
 ('qui', 0.5518729090690613),
 ('mentre', 0.5315486788749695),
 ('cui', 0.5283954739570618),
 ('quale', 0.5273314714431763),
 ('pressi', 0.5228528380393982),
 ('vicino', 0.5206019878387451),
 ('trasferitosi', 0.517595648765564),
 ('stabilendosi', 0.5164061784744263)]

In [98]:
# retrieve vector representation of test word
testword_vec = wv_model.wv.get_vector(testword)

In [99]:
testword_vec.shape

(300,)

In [100]:
def dict_to_vecs(dict_wordset, model):
    vecs = []
    for word in dict_wordset:
        vec = model.wv.get_vector(word.lower)
        vecs.append(vec)
    return vecs

In [101]:
dict_vecs = dict_to_vecs(wordset, wv_model)

In [102]:
len(dict_vecs) == len(wordset)

True

### Play with Embedding Layer and pre-trained word vectors

In [103]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# define documents
docs = ['Ben fatto!',
    'Ottimo lavoro',
    'Un grande sforzo',
    'un buon lavoro',
    'Eccellente!',
    'Debole',
    'Poco sforzo!',
    'non bene',
    'un lavoro povero',
    'Si poteva fare di meglio.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
print("vocab_size", vocab_size)
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

vocab_size 20
[[4, 5], [6, 1], [2, 7, 3], [2, 8, 1], [9], [10], [11, 3], [12, 13], [2, 1, 14], [15, 16, 17, 18, 19]]
[[ 4  5  0  0]
 [ 6  1  0  0]
 [ 2  7  3  0]
 [ 2  8  1  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [11  3  0  0]
 [12 13  0  0]
 [ 2  1 14  0]
 [16 17 18 19]]


In [104]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../embeddings/fasttext/it/it.vec')
line_num = 0
for line in f:
    line_num += 1
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    if line_num == 100:
        break
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 100 word vectors.


In [105]:
embeddings_index['a'].shape

(300,)

In [106]:
wordvec_size = 300

In [109]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, wordvec_size))
for word, i in t.word_index.items():
    #embedding_vector = embeddings_index.get(word)
    embedding_vector = wv_model.wv.get_vector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [110]:
embedding_matrix.shape == (vocab_size, wordvec_size)

True

In [111]:
embedding_matrix.shape

(20, 300)

In [112]:
# define model
model = Sequential()
e = Embedding(vocab_size, wordvec_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [113]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 300)            6000      
_________________________________________________________________
flatten_2 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1201      
Total params: 7,201
Trainable params: 1,201
Non-trainable params: 6,000
_________________________________________________________________
None
Accuracy: 100.000000


### LSTM with embedding layer

In [122]:
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Activation
from keras.optimizers import RMSprop

model = Sequential()
model.add(e)
model.add(
    LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

In [123]:
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001))

In [None]:
model.fit(x_tensor, y_tensor, batch_size=128, epochs=10)