# <b> LSTM Decoder with word vector representation </b>

### keras tokenization

In [104]:
import codecs

input_file = "../datasets/raminghi.txt"
with codecs.open(input_file, "r", encoding=None) as f:
    text = f.read()

In [105]:
from keras.preprocessing.text import text_to_word_sequence

In [106]:
keras_wordlist = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n’', lower=True, split=' ')

In [107]:
type(keras_wordlist)

list

In [108]:
len(keras_wordlist)

48248

#### Word set extraction from specific corpus and word vector transformation

In [6]:
import os
import pickle

def get_wordset(saves_folder, vocab_name):
    vocab_filename = os.path.join(saves_folder, vocab_name + ".pkl")
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    return vocab['words_set']

In [7]:
#params
saves_folder = '../saves'
vocab = 'vocab'

In [8]:
wordset = get_wordset(saves_folder, vocab)
wordlist = list(wordset)

In [16]:
#import word vectors
from gensim.models.fasttext import FastText as ftext
from gensim.models.keyedvectors import KeyedVectors

wv_model = ftext.load_fasttext_format("../embeddings/fasttext/it/it")
fastvec = KeyedVectors.load_word2vec_format("../embeddings/fasttext/it/it.vec")

In [124]:
# test similar word
testword = wordlist[0]
print(testword)
wv_model.wv.most_similar(testword)

pietosi


[('pietà', 0.6834704279899597),
 ('ammannati', 0.5470445156097412),
 ('baccio', 0.541467547416687),
 ('robbia', 0.5390898585319519),
 ('gioventù', 0.5321142673492432),
 ("sant'alberto", 0.5263468027114868),
 ('salviati', 0.5239526629447937),
 ('procaccini', 0.5227013826370239),
 ('fiorenti', 0.5213931798934937),
 ('guerrazzi', 0.5211422443389893)]

In [24]:
# retrieve vector representation of test word
testword_vec = wv_model.wv.word_vec(testword)

In [25]:
testword_vec.shape

(300,)

In [36]:
def dict_to_vecs(dict_wordset, model):
    vecs = []
    for word in dict_wordset:
        vec = model.wv.word_vec(str(word.lower))
        vecs.append(vec)
    return vecs

In [37]:
dict_vecs = dict_to_vecs(wordset, wv_model)

In [38]:
len(dict_vecs) == len(wordset)

True

### Tokenize with a little help from spaCy --> Next

In [109]:
import spacy
nlp = spacy.load("it_core_news_sm")
doc = nlp(text)

In [110]:
maxlen_in_doc = len(max(doc.sents, key=len))

In [113]:
sentences = []
for sentence in doc.sents:
    sentences.append(sentence)
print(sentences[201])

«Proprio del suo, signore» disse l'uomo, e gli porse la mano.  


In [114]:
# sentence of max num of words
sentence_maxlen = len(max(doc.sents, key=len))
print(sentence_maxlen)

sentence_minlen = len(min(doc.sents, key=len))
print(sentence_minlen)

135
1


In [115]:
len(text.split())

47925

In [116]:
numbers = []
for token in doc:
    if token.like_num:
        numbers.append(token)

In [154]:
number_to_words = {
  '1974':'millenovecentosettantaquattro',
  '1976':'millenovecentosettantasei',
  '1978':'millenovecentosettantotto',
  '1979':'millenovecentosettantanove',
  '1980':'millenovecentottanta',
  '1984':'millenovecentottantaquattro',
  '1992':'millenovecentonovantadue',
  '14':'quattordici',
  '7':'sette',
  '13':'tredici',
  '1981':'millenovecentottantuno',
  '1982':'millenovecentottandue',
  '47':'quarantasette',
  '9.30':'nove e trenta',
  '7':'sette',
  '22':'ventidue',
  '7.10':'sette e dieci',
  '9':'nove',
  '10':'dieci' 
}

### Prepare X, y with "brute" sampling : fixed length sentence chunk as X_i, single next word as y_i

In [200]:
# replace unwanted chars
processed = text.replace('«', '')
processed = processed.replace('»', '')

for key in number_to_words.keys():
    processed = processed.replace(key, number_to_words[key])
    
processed = processed.replace("où", '')
processed = processed.replace("\x1a", '')

In [201]:
max_length = 30

def sample_sentences(text, sample_len, sample_step):

    print("Sampling sentences with len (words):", sample_len, "with sampling step window:", sample_step)
    sampled_sentences = []
    sampled_next_words = []

    list_words = text.split()

    for pos in range(0, len(list_words) - sample_len, sample_step):
        temp = ' '.join(list_words[pos: pos + sample_len])
        sampled_sentences.append(temp)
        sampled_next_words.append((list_words[pos + sample_len]))
    print('nb sequences(length of sentences):', len(sampled_sentences))
    print("length of next_word", len(sampled_next_words))

    return sampled_sentences, sampled_next_words

X_sentences, y_next_words = sample_sentences(processed, max_length, 3)

#         print('Vectorizing...')
#         num_sentences = len(sentences)
#         words_in_sentence = sampling_maxlen

#         X = np.zeros((num_sentences, words_in_sentence, dict_len), dtype=np.bool)
#         y = np.zeros((num_sentences, dict_len), dtype=np.bool)
#         for i, sentence in enumerate(sentences):
#             for t, word in enumerate(sentence.split()):
#                 # print(i, t, word)
#                 X[i, t, word_indices[word]] = 1
#                 y[i, word_indices[next_words[i]]] = 1
                
                

Sampling sentences with len (words): 30 with sampling step window: 3
nb sequences(length of sentences): 15965
length of next_word 15965


In [202]:
print(X_sentences[100])
print(y_next_words[100])

"L'autunno del patriarca", che è stato il mio lavoro più arduo e arrischiato, e non sapevo come proseguire. Per circa due anni presi appunti sugli argomenti che mi passavano per
la


### Vectorize X, y using word vectors

In [203]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(X_sentences)
vocab_size = len(t.word_index) + 1
wordvec_size = 300
print("vocab_size", vocab_size)
print("wordvec_size", wordvec_size)

vocab_size 8788
wordvec_size 300


In [204]:
from numpy import zeros
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, wordvec_size))
for word, i in t.word_index.items():
    #embedding_vector = embeddings_index.get(word)
    embedding_vector = wv_model.wv.word_vec(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [205]:
embedding_matrix.shape == (vocab_size, wordvec_size)

True

In [206]:
embedding_matrix.shape

(8788, 300)

In [207]:
# try to vectorize directly the y
y_next_words_vectorized = []
for next_word in y_next_words:
    y_next_words_vectorized.append(wv_model.wv.word_vec(next_word.lower()))

### LSTM with embedding layer

In [208]:
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

model = Sequential()
e = Embedding(vocab_size, wordvec_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(
    LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

In [209]:
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001))

In [213]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           2636400   
_________________________________________________________________
lstm_5 (LSTM)                (None, 30, 512)           1665024   
_________________________________________________________________
dropout_5 (Dropout)          (None, 30, 512)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 8788)              4508244   
_________________________________________________________________
activation_2 (Activation)    (None, 8788)              0         
Total para

In [216]:
len(y_next_words)

15965

In [210]:
model.fit(X_sentences, y_next_words, batch_size=128, epochs=10)

AttributeError: 'str' object has no attribute 'ndim'

### Play with Embedding Layer and pre-trained word vectors

In [103]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# define documents
docs = ['Ben fatto!',
    'Ottimo lavoro',
    'Un grande sforzo',
    'un buon lavoro',
    'Eccellente!',
    'Debole',
    'Poco sforzo!',
    'non bene',
    'un lavoro povero',
    'Si poteva fare di meglio.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
print("vocab_size", vocab_size)
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

vocab_size 20
[[4, 5], [6, 1], [2, 7, 3], [2, 8, 1], [9], [10], [11, 3], [12, 13], [2, 1, 14], [15, 16, 17, 18, 19]]
[[ 4  5  0  0]
 [ 6  1  0  0]
 [ 2  7  3  0]
 [ 2  8  1  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [11  3  0  0]
 [12 13  0  0]
 [ 2  1 14  0]
 [16 17 18 19]]


In [104]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../embeddings/fasttext/it/it.vec')
line_num = 0
for line in f:
    line_num += 1
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    if line_num == 100:
        break
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 100 word vectors.


In [105]:
embeddings_index['a'].shape

(300,)

In [106]:
wordvec_size = 300

In [109]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, wordvec_size))
for word, i in t.word_index.items():
    #embedding_vector = embeddings_index.get(word)
    embedding_vector = wv_model.wv.get_vector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [110]:
embedding_matrix.shape == (vocab_size, wordvec_size)

True

In [111]:
embedding_matrix.shape

(20, 300)

In [112]:
# define model
model = Sequential()
e = Embedding(vocab_size, wordvec_size, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [113]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 300)            6000      
_________________________________________________________________
flatten_2 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1201      
Total params: 7,201
Trainable params: 1,201
Non-trainable params: 6,000
_________________________________________________________________
None
Accuracy: 100.000000
