In [None]:
import numpy
from numpy import array
import spacy
from spacy.vocab import Vocab
import keras as k
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import LSTM, Embedding, Dense,Bidirectional
import pickle
import nltk
#nltk.download('stopwords')

Load Data

In [None]:
# Read only the first 100,000 characters of the corpus. The context manager
# closes the file handle (the bare open(...).read() left it open), and
# read(100000) avoids loading the whole file just to slice it afterwards.
with open('data.csv') as corpus_file:
    data = corpus_file.read(100000)

Prepare sequences for training

In [None]:
#https://keras.io/api/keras_nlp/tokenizers/
#https://docs.python.org/3/library/pickle.html
#https://keras.io/api/preprocessing/
#https://numpy.org/doc/stable/reference/generated/numpy.array.html
#https://keras.io/api/layers/recurrent_layers/bidirectional/

def data_sequencing(data):
    """Turn raw text into padded next-word training arrays (forward and reversed).

    The text is split into sentences on '.', each sentence is encoded with a
    Keras ``Tokenizer`` fitted on the whole text, and every prefix of length
    >= 2 becomes one training sequence. The same is done on the reversed
    sentence so that a second model can predict a word from the words that
    follow it.

    Side effects: writes the fitted tokenizer to 'tokenizer.pkl' and the
    maximum sequence length to 'max_length.pkl' in the working directory, and
    prints the maximum sequence length.

    Args:
        data: The training corpus as a single string.

    Returns:
        A tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)`` where ``X`` /
        ``rev_X`` hold all but the last column of the padded sequences,
        ``y`` / ``rev_y`` hold the last column (the word to predict),
        ``max_length`` is the longest sequence length and ``vocab_size`` is
        ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If no sentence in ``data`` contains at least two known
            tokens, so no training sequence can be built.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    with open('tokenizer.pkl', 'wb') as f:
        pickle.dump(tokenizer, f)

    # word_index starts at 1, so one extra slot is needed for index 0, which
    # pad_sequences uses as the padding value.
    vocab_size = len(tokenizer.word_index) + 1

    sequences = []
    rev_sequences = []
    for line in data.split('.'):
        encoded = tokenizer.texts_to_sequences([line])[0]
        rev_encoded = encoded[::-1]
        # Every prefix with at least two tokens: context word(s) + target word.
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])

    if not sequences:
        raise ValueError(
            'data_sequencing: no sentence with at least two tokens was found in data'
        )

    max_length = max(len(seq) for seq in sequences)
    with open('max_length.pkl', 'wb') as f:
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad_sequences already returns a NumPy array, so no extra array() wrap is needed.
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    X, y = sequences[:, :-1], sequences[:, -1]

    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    rev_X, rev_y = rev_sequences[:, :-1], rev_sequences[:, -1]

    return X, y, rev_X, rev_y, max_length, vocab_size

In [None]:
# Build the forward (X, y) and reversed (rev_X, rev_y) training arrays.
# The call also writes tokenizer.pkl and max_length.pkl as a side effect.
X,y,rev_X,rev_y,max_length,vocab_size = data_sequencing(data)

Define Model

In [None]:
#https://keras.io/guides/sequential_model/
#https://keras.io/api/layers/recurrent_layers/
#https://keras.io/api/layers/core_layers/embedding/

# Forward next-word model: embedding -> bidirectional LSTM -> softmax over the vocabulary.
# The input length is max_length - 1 because the last token of each sequence is the target.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length - 1))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()

In [None]:
# Reverse-direction next-word model (trained on reversed sentences); same
# architecture as the forward model: embedding -> bidirectional LSTM -> softmax.
rev_model_lstm = Sequential()
rev_model_lstm.add(Embedding(vocab_size, 100, input_length=max_length - 1))
rev_model_lstm.add(Bidirectional(LSTM(100)))
rev_model_lstm.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
rev_model_lstm.summary()

Train Model

In [None]:
#https://keras.io/api/metrics/
#https://keras.io/api/models/

# Train the forward model on (X, y), then save it to disk so the inference
# cells can reload it with load_model('model_lstm.h5').
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=X, y=y, epochs=50, batch_size=100, verbose=2)
model.save('model_lstm.h5')

In [None]:
# Train the reverse-direction model on (rev_X, rev_y), then save it to disk so
# the inference cells can reload it with load_model('rev_model_lstm.h5').
rev_model_lstm.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
rev_model_lstm.fit(x=rev_X, y=rev_y, epochs=20, batch_size=100, verbose=2)
rev_model_lstm.save('rev_model_lstm.h5')

Generate sequences using model

In [None]:
def generate_seq(model, tokenizer, max_length, seed_text, n_preds=10, stop_words=None):
    """Return the model's most probable next words for ``seed_text``.

    The seed text is encoded with ``tokenizer``, left-padded to ``max_length``
    and fed to ``model``. The ``n_preds`` highest-probability vocabulary
    indices are mapped back to words, stop words are dropped, and the
    remaining words are returned in descending probability order.

    Args:
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the encoded seed is padded/truncated to.
        seed_text: Context text; an empty string short-circuits to ``""``.
        n_preds: How many top-probability candidates to consider (default 10).
        stop_words: Words to exclude. When ``None`` the module-level
            ``stopwords`` collection is used, so that global must be defined
            before the first call with a non-empty seed.

    Returns:
        A string in which every kept word is preceded by a single space
        (e.g. ``" mar onda"``), or ``""`` when nothing is kept.
    """
    if seed_text == "":
        return ""

    if stop_words is None:
        stop_words = stopwords  # module-level list, loaded in a later cell
    blocked = set(stop_words)

    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    proba = model.predict(encoded, verbose=0).flatten()

    # Slicing may yield fewer than n_preds indices when the vocabulary is
    # small; iterating the slice directly avoids indexing past its end.
    top_indices = numpy.argsort(-proba)[:n_preds]

    # Invert the vocabulary once instead of scanning it for every candidate.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    pred_words = ""
    for index in top_indices:
        # Index 0 (padding) has no word and is skipped via the None check.
        word = index_to_word.get(int(index))
        if word is not None and word not in blocked:
            pred_words += ' ' + word

    return pred_words

Load the trained models, tokenizer, and spaCy model (GloVe)

In [None]:
# Reload the trained forward and reverse models saved by the training cells.
model = load_model('model_lstm.h5')
rev_model = load_model('rev_model_lstm.h5')

# NOTE(security): pickle.load can execute arbitrary code. Only load the
# tokenizer.pkl / max_length.pkl files produced by this notebook.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('max_length.pkl', 'rb') as f:
    max_length = pickle.load(f)

# The stopwords corpus is not bundled with nltk; on a fresh environment the
# lookup raises LookupError, so fetch the corpus once and retry.
try:
    stopwords = nltk.corpus.stopwords.words('portuguese')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('portuguese')

#https://spacy.io/models/pt
nlp = spacy.load('pt_core_news_md')

Function to set embeddings for out-of-vocabulary (OOV) words

In [None]:
def set_embedding_for_oov(doc):
    """Assign a context-derived vector to every out-of-vocabulary token in ``doc``.

    For each token flagged ``is_oov``, the text before it is fed to the
    forward model and the reversed text after it to the reverse model (both
    through ``generate_seq``). The spaCy vectors of the predicted words are
    summed with a weight that starts at the character length of the
    corresponding context text and halves after every word, and the result is
    stored in ``nlp.vocab`` under the token's text.

    Relies on the module-level ``model``, ``rev_model``, ``tokenizer``,
    ``max_length`` and ``nlp`` objects. Prints the predicted words and the
    final vector; returns ``None``.

    Args:
        doc: A spaCy ``Doc`` produced by ``nlp``.
    """
    for token in doc:
        if not token.is_oov:
            continue

        before_text = doc[:token.i].text
        # Tokens after the OOV word, last token first, joined with spaces.
        # This replaces parsing str(numpy.array(doc)), whose output NumPy
        # line-wraps and elides for long documents.
        following = [t.text for t in doc[token.i + 1:]]
        after_text = ' '.join(following[::-1])

        pred_before = generate_seq(model, tokenizer, max_length - 1, before_text).split()
        pred_after = generate_seq(rev_model, tokenizer, max_length - 1, after_text).split()

        # Use the vocab's own vector width instead of a hard-coded 300.
        embedding = numpy.zeros((nlp.vocab.vectors_length,))

        contributions = (
            ('Words predicted from forward sequence model:', pred_before, before_text),
            ('Words predicted from reverse sequence model:', pred_after, after_text),
        )
        for header, words, context in contributions:
            # NOTE(review): the initial weight is the context length in
            # characters, not tokens -- kept as in the original; confirm intent.
            weight = len(context)
            print(header)
            for word in words:
                print(word)
                embedding += weight * nlp.vocab.get_vector(word)
                weight = weight * .5

        nlp.vocab.set_vector(token.text, embedding)
        print(token.text, nlp.vocab.get_vector(token.text))

Detect OOV

In [None]:
# Show the vector currently stored in the spaCy vocab for 'banzeiro' as the cell output.
# NOTE(review): presumably all zeros while the word is still OOV -- confirm.
nlp.vocab.get_vector('banzeiro')

Verify OOV using function

In [None]:
# Parse a sentence containing 'banzeiro', then let set_embedding_for_oov store
# a context-derived vector in nlp.vocab for each token flagged as OOV.
doc1 = nlp('O Barco atravessou o banzeiro sem sofrer danos')
set_embedding_for_oov(doc1)

Verify similarity

In [None]:
# BANZEIRO - sea wave ("onda do mar" in Portuguese).
# Reference sentences about sea waves, parsed for comparison against doc1.
doc2, doc3, doc4 = (
    nlp(sentence)
    for sentence in (
        'Senhor, a perseverança das ondas do mar, que fazem de cada recuo um ponto de partida para um novo avanço.',
        'As vezes é preciso ser como as ondas do mar, recuar para ganhar força.',
        'Como ondas do mar',
    )
)

In [None]:
# Print the similarity between doc1 and each reference sentence.
for other_doc in (doc2, doc3, doc4):
    print(doc1, "<->", other_doc, "=", doc1.similarity(other_doc))

Verify POS tagging

In [None]:
# One line per token of doc1: text, lemma, coarse POS, fine-grained tag,
# dependency label and orthographic shape.
for tok in doc1:
    fields = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.shape_)
    print(*fields)