# <b> LSTM Decoder with word vector representation </b>

### load gensim italian lang wordvectors

In [1]:
#import word vectors
from gensim.models.fasttext import FastText as ftext
from gensim.models.keyedvectors import KeyedVectors

wv_model = ftext.load_fasttext_format("../embeddings/fasttext/it/it")
fastvec = KeyedVectors.load_word2vec_format("../embeddings/fasttext/it/it.vec")

#### attempt - word set extraction from specific corpus and word vector transformation

In [None]:
import os
import pickle

def get_wordset(saves_folder, vocab_name):
    vocab_filename = os.path.join(saves_folder, vocab_name + ".pkl")
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    return vocab['words_set']

In [None]:
#params
saves_folder = '../saves'
vocab = 'vocab'

In [None]:
wordset = get_wordset(saves_folder, vocab)
wordlist = list(wordset)

In [None]:
# test similar word
testword = wordlist[0]
print(testword)
wv_model.wv.most_similar(testword)

In [None]:
# retrieve vector representation of test word
testword_vec = wv_model.wv.word_vec(testword)

In [None]:
testword_vec.shape

##### transform words in dictionary into word vectors

In [None]:
def dict_to_vecs(dict_wordset, model):
    vecs = []
    for word in dict_wordset:
        vec = model.wv.word_vec(str(word.lower))
        vecs.append(vec)
    return vecs

In [None]:
dict_vecs = dict_to_vecs(wordset, wv_model)

In [None]:
len(dict_vecs) == len(wordset)

In [None]:
print(len(dict_vecs))

### load text

In [2]:
import codecs

input_file = "../datasets/chapter01.txt"
with codecs.open(input_file, "r", encoding=None) as f:
    text = f.read()

#### attempt 1 - keras tokenization

In [None]:
from keras.preprocessing.text import text_to_word_sequence

In [None]:
keras_wordlist = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n’', lower=True, split=' ')

In [None]:
type(keras_wordlist)

In [None]:
len(keras_wordlist)

In [None]:
import numpy as np
unique_wordset = np.unique(keras_wordlist)
unique_wordset.shape

#### attempt 2 - derive text properties with a little help from spaCy

In [None]:
import spacy
nlp = spacy.load("it_core_news_sm")
doc = nlp(text)

In [None]:
maxlen_in_doc = len(max(doc.sents, key=len))

In [None]:
sentences = []
for sentence in doc.sents:
    sentences.append(sentence)
print(sentences[201])

In [None]:
# sentence of max num of words
sentence_maxlen = len(max(doc.sents, key=len))
print(sentence_maxlen)

sentence_minlen = len(min(doc.sents, key=len))
print(sentence_minlen)

In [None]:
len(text.split())

In [None]:
numbers = []
for token in doc:
    if token.like_num:
        numbers.append(token)

In [None]:
print(numbers)

#### if numbers are found, convert into words thru this map

In [3]:
number_to_words = {
  '1974':'millenovecentosettantaquattro',
  '1976':'millenovecentosettantasei',
  '1978':'millenovecentosettantotto',
  '1979':'millenovecentosettantanove',
  '1980':'millenovecentottanta',
  '1984':'millenovecentottantaquattro',
  '1992':'millenovecentonovantadue',
  '14':'quattordici',
  '7':'sette',
  '13':'tredici',
  '1981':'millenovecentottantuno',
  '1982':'millenovecentottandue',
  '47':'quarantasette',
  '9.30':'nove e trenta',
  '7':'sette',
  '22':'ventidue',
  '7.10':'sette e dieci',
  '9':'nove',
  '10':'dieci' 
}

### Prepare X, y with "brute" sampling : fixed length sentence chunk as X_i, single next word as y_i

In [4]:
# replace unwanted chars
processed = text.replace('«', '')
processed = processed.replace('»', '')
processed = processed.replace(' - ', ' ')


for key in number_to_words.keys():
    processed = processed.replace(key, number_to_words[key])
    
processed = processed.replace("où", '')
processed = processed.replace("\x1a", '')

In [5]:
max_length = 30

def sample_sentences(text, sample_len, sample_step):

    print("Sampling sentences with len (words):", sample_len, "with sampling step window:", sample_step)
    sampled_sentences = []
    sampled_next_words = []

    list_words = text.split()

    for pos in range(0, len(list_words) - sample_len, sample_step):
        temp = ' '.join(list_words[pos: pos + sample_len])
        sampled_sentences.append(temp)
        sampled_next_words.append((list_words[pos + sample_len]))
    print('nb sequences(length of sentences):', len(sampled_sentences))
    print("length of next_word", len(sampled_next_words))

    return sampled_sentences, sampled_next_words

X_sentences, y_next_words = sample_sentences(processed, max_length, 3)

#         print('Vectorizing...')
#         num_sentences = len(sentences)
#         words_in_sentence = sampling_maxlen

#         X = np.zeros((num_sentences, words_in_sentence, dict_len), dtype=np.bool)
#         y = np.zeros((num_sentences, dict_len), dtype=np.bool)
#         for i, sentence in enumerate(sentences):
#             for t, word in enumerate(sentence.split()):
#                 # print(i, t, word)
#                 X[i, t, word_indices[word]] = 1
#                 y[i, word_indices[next_words[i]]] = 1
                
                

Sampling sentences with len (words): 30 with sampling step window: 3
nb sequences(length of sentences): 872
length of next_word 872


In [6]:
print(X_sentences[100])
print(y_next_words[100])

che ricorda ogni cosa, e ci chiama con l’odore del sale. Lì, finalmente, Seurac si fermava, adagiandosi sullo stesso sasso, poche ore dopo un misero pranzo; che ci fosse pioggia
o


### Encode X, y

In [7]:
from keras.preprocessing.text import Tokenizer
tt = Tokenizer()
tt.fit_on_texts(X_sentences)
X_sentences_encoded = tt.texts_to_sequences(X_sentences)
print(X_sentences[0])
print(X_sentences_encoded[0])
print(len(tt.word_index) + 1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Quella specie di orologio da taschino, pesante e piuttosto ammaccato, serviva a dargli un’aria importante, dicevano. Se lo portava di qua, di là, arrancando, su e giù per le strade
[54, 317, 3, 88, 13, 1039, 1037, 1, 314, 1035, 1036, 5, 312, 1033, 313, 311, 15, 44, 120, 3, 1029, 3, 1026, 1027, 86, 1, 176, 12, 9, 179]
1040


In [8]:
print(max_length)

30


In [9]:
# if needed: in this case all sentences are of equal size of max_length
from keras.preprocessing.sequence import pad_sequences
X_sentences_encoded_padded = pad_sequences(X_sentences_encoded, maxlen=max_length, padding='post')

print(X_sentences_encoded[12])
print(X_sentences_encoded_padded[12])

[321, 12, 123, 180, 13, 37, 90, 10, 56, 322, 6, 323, 181, 1, 21, 182, 5, 91, 71, 2, 27, 13, 91, 324, 38, 55, 22, 8, 325, 92]
[321  12 123 180  13  37  90  10  56 322   6 323 181   1  21 182   5  91
  71   2  27  13  91 324  38  55  22   8 325  92]


In [10]:
print(X_sentences_encoded_padded)

[[  54  317    3 ...   12    9  179]
 [  88   13 1039 ...    3   89   14]
 [1037    1  314 ...  318  319  320]
 ...
 [  29 1022    1 ...    8  315 1038]
 [1023   85   68 ...    2   17  316]
 [  24 1024    9 ...  122   55   26]]


In [11]:
y_next_words_encoded = tt.texts_to_sequences(y_next_words)
y_next_words_encoded_padded = pad_sequences(y_next_words_encoded, maxlen=1, padding='post')

print(y_next_words[:5])
print(y_next_words_encoded[:5])

['di', 'pratica', 'inseparabili.', 'viveva', 'parti']
[[3], [318], [321], [180], [90]]


In [12]:
print(y_next_words_encoded_padded[:5])

[[  3]
 [318]
 [321]
 [180]
 [ 90]]


In [13]:
print(X_sentences_encoded_padded.shape)
print(y_next_words_encoded_padded.shape)

(872, 30)
(872, 1)


In [15]:
# try to flatten y
import numpy as np
y_next_words_encoded_flattened = np.array(y_next_words_encoded).flatten()
print(y_next_words_encoded_flattened.shape)

(872,)


In [16]:
# try with a word vec representation of y
y_next_words_vectorized = []
for next_word in y_next_words:
    y_next_words_vectorized.append(wv_model.wv.word_vec(next_word.lower()))

In [17]:
np.array(y_next_words_vectorized).shape

(872, 300)

### another way for data prep

In [18]:
if 'e' not in wv_model.wv.vocab:
    simword = wv_model.wv.most_similar('e')[0][0]
    print(word2idx(simword))

In [19]:
import string
import numpy as np
from keras.utils.data_utils import get_file

def word2idx(word):
    if word not in wv_model.wv.vocab:
        simword = wv_model.wv.most_similar(word)[0][0]
        return wv_model.wv.vocab[simword].index
    return wv_model.wv.vocab[word].index

def idx2word(idx):
    return wv_model.wv.index2word[idx]


max_sentence_len = 30

path = "../datasets/chapter01.txt"
with open(path) as file_:
    docs = file_.readlines()
sentences = [[word for word in doc.lower().translate(string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))

train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        word = word.replace('"', '')
        train_x[i, t] = word2idx(word)
    train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

Num sentences: 114
train_x shape: (114, 30)
train_y shape: (114,)


In [20]:
train_y

array([ 1406, 20725,   288, 12556,    98,    12,  5083, 21884,  2012,
          17,    30,   945, 31715,  1571,     5, 44335, 40119,   642,
         130,   999,  7251,  3018, 31969,  9358, 25384,  2045, 38801,
       36701, 40363,   622, 39005, 11171, 17873, 10817,  4590, 18928,
       19629, 36914, 22194, 11880, 47097, 39252, 33699,  5380, 27236,
         102,  6081,  6081,   567, 19369, 13280, 25760, 35723,  4759,
        6537, 32268,  2194,  3810,  4588,  2133,  9970,  1266, 18210,
         944, 13167,  5806,  4562,  4006, 35723, 39702,  3209, 30275,
        5788, 18928, 37228, 12950,  4759,  5477,  1319, 40642, 18492,
       20134, 14759, 21884,   931, 30770, 44335,  5058, 38589, 12261,
       40012, 44335, 15105, 44335,    97, 47469,   249,   377,   863,
           7,  6408, 29664,   239, 38406, 27748, 20195, 13008,   188,
        1359, 10760, 45096, 39835, 20134, 11759], dtype=int32)

### Build Embedding Matrix

In [21]:
# use same tokenizer
#t = Tokenizer()
#t.fit_on_texts(X_sentences)
vocab_size = len(tt.word_index) + 1
wordvec_size = 300
print("vocab_size", vocab_size)
print("wordvec_size", wordvec_size)

vocab_size 1040
wordvec_size 300


In [22]:
from numpy import zeros
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, wordvec_size))
for word, i in tt.word_index.items():
    #embedding_vector = embeddings_index.get(word)
    embedding_vector = wv_model.wv.word_vec(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
embedding_matrix.shape == (vocab_size, wordvec_size)

True

In [24]:
embedding_matrix.shape

(1040, 300)

In [25]:
# test if indexed word in text is properly embedded with the correct word vector from gensim
test_word = 'pratica'
word_index = tt.word_index[test_word]
print(word_index)
np.array_equal(wv_model.wv.get_vector(test_word), embedding_matrix[word_index])

318


True

In [26]:
wv_model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(50032, 300)

### LSTM with embedding layer

In [27]:
import tensorflow as tf
import keras
print(tf.__version__) #1.8.0
print(keras.__version__) #2.2.0

1.8.0
2.2.0


In [40]:
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

model = Sequential()
e = Embedding(vocab_size, wordvec_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocab_size))
#model.add(Dense(1))
model.add(Activation('softmax'))

In [41]:
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001))

In [42]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 300)           312000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 30, 512)           1665024   
_________________________________________________________________
dropout_6 (Dropout)          (None, 30, 512)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1040)              533520    
_________________________________________________________________
activation_4 (Activation)    (None, 1040)              0         
Total para

In [39]:
#model.fit(X_sentences_encoded_padded, y_next_words_encoded_padded, batch_size=128, epochs=10)
model.fit(train_x, train_y, batch_size=128, epochs=10)

ValueError: Error when checking target: expected activation_3 to have shape (1040,) but got array with shape (1,)

### Play with Embedding Layer and pre-trained word vectors

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# define documents
docs = ['Ben fatto!',
    'Ottimo lavoro',
    'Un grande sforzo',
    'un buon lavoro',
    'Eccellente!',
    'Debole',
    'Poco sforzo!',
    'non bene',
    'un lavoro povero',
    'Si poteva fare di meglio.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
print("vocab_size", vocab_size)
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

In [None]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../embeddings/fasttext/it/it.vec')
line_num = 0
for line in f:
    line_num += 1
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    if line_num == 100:
        break
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
embeddings_index['a'].shape

In [None]:
wordvec_size = 300

In [None]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, wordvec_size))
for word, i in t.word_index.items():
    #embedding_vector = embeddings_index.get(word)
    embedding_vector = wv_model.wv.get_vector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix.shape == (vocab_size, wordvec_size)

In [None]:
embedding_matrix.shape

In [None]:
# define model
model = Sequential()
e = Embedding(vocab_size, wordvec_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [None]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))