In [10]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

# Fixed sequence length fed to the network and width of the word embedding.
MAX_LEN = 75
EMBEDDING = 50

# Read every law text stored under `leis_dir`.
leis_dir = 'leis_vetadas/'
leis = []        # raw text of each file
leis_split = []  # the same text split on the '**VETO**' marker
words = []       # tokens of all files, concatenated
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# three lists above in a stable order from one run to the next.
for fname in sorted(os.listdir(leis_dir)):
    path = os.path.join(leis_dir, fname)
    if not os.path.isfile(path):
        # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing in open().
        continue
    with open(path, "r", encoding="utf-8") as f:
        r = f.read()
    leis.append(r)
    leis_split.append(r.split('**VETO**'))
    # NOTE(review): a commented-out `split_words(r)` alternative used to sit here;
    # confirm which tokenizer produced the vocabulary the saved weights expect.
    file_words = text_to_word_sequence(r)
    words += file_words

# Distinct tokens seen in the corpus.
# NOTE(review): the iteration order of a set of strings can differ between
# interpreter runs, so the indices assigned below may not match the ones in use
# when the weights loaded later in this cell were trained — confirm the
# vocabulary is persisted or rebuilt identically.
vocab = list(set(words))
n_words = len(vocab)

tags = ["ok",'b-ok','e-ok',"veto",'b-veto','e-veto']
n_tags = len(tags)

# word -> token_index; real words start at 2 because 0 and 1 are reserved.
word2idx = dict(zip(vocab, range(2, n_words + 2)))
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

# token_index -> word
idx2word = {index: word for word, index in word2idx.items()}

# tag -> tag_index; index 0 is reserved for PAD, real tags start at 1.
tag2idx = dict(zip(tags, range(1, n_tags + 1)))
tag2idx["PAD"] = 0

# tag_index -> tag
idx2tag = {index: tag for tag, index in tag2idx.items()}

# Model definition: Embedding -> bidirectional LSTM -> per-timestep Dense -> CRF.
# NOTE: the name `input` shadows the Python builtin; it is kept as-is here.
input = Input(shape=(MAX_LEN,))

# n_words + 2 rows: the vocabulary plus the PAD and UNK entries.
# mask_zero=True makes index 0 (PAD) a masked timestep.
embedded = Embedding(
    input_dim=n_words+2,
    output_dim=EMBEDDING,
    input_length=MAX_LEN,
    mask_zero=True,
)(input)

# BiLSTM with recurrent dropout, returning one vector per timestep.
encoded = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.15)
)(embedded)

# Dense projection applied independently at every timestep.
projected = TimeDistributed(Dense(50, activation="relu"))(encoded)

# CRF output layer over n_tags + 1 labels (the extra one is PAD).
crf = CRF(n_tags+1)
out = crf(projected)

model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

# Restore previously trained parameters into the freshly built graph.
model.load_weights("model/alesp-2019-05-24.hdf5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 75, 50)            228000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 75, 100)           40400     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_3 (CRF)                  (None, 75, 7)             420       
Total params: 273,870
Trainable params: 273,870
Non-trainable params: 0
_________________________________________________________________


In [11]:
from ipywidgets import interact_manual
from ipywidgets import widgets

import sys
import math
import re
import string

# Custom Tokenizer
# re.escape is required: string.punctuation contains the sequence `\]`, which an
# unescaped character class reads as "escaped ]", silently dropping the backslash
# itself from the set of separators.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split `s` on whitespace, emitting every punctuation character as its own token.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in reading order; an empty list for empty or blank input.
    """
    # Surround each punctuation mark with spaces, then split on whitespace.
    return re_tok.sub(r' \1 ', s).split()
    
def get_prediction(sentence):
    """Tag each token of `sentence` with the model and print it, veto tokens in red.

    The text is tokenized with `tokenize`, cut into consecutive windows of
    `MAX_LEN` tokens, encoded through `word2idx`, padded to `MAX_LEN` and sent
    to `model.predict` as one batch. Tokens whose predicted tag is 'b-veto',
    'veto' or 'e-veto' are wrapped in ANSI red escape codes; all others are
    printed unchanged. Tokens are separated by a single space.

    Parameters
    ----------
    sentence : str
        Raw text to analyse.

    Returns
    -------
    None
        The result is written to `sys.stdout`. Nothing is written (and the
        model is not called) when the text contains no tokens.
    """
    test_sentence = tokenize(sentence) # Tokenization
    s_len = len(test_sentence)
    if s_len == 0:
        # An empty batch cannot be fed to the model; there is nothing to tag.
        return

    pad_id = word2idx["PAD"]
    # Out-of-vocabulary tokens must use the UNK index. Encoding them as PAD (0)
    # would turn them into masked timesteps in the middle of the sequence.
    unk_id = word2idx["UNK"]
    # NOTE(review): `tokenize` keeps the original casing; confirm the vocabulary
    # in `word2idx` was built with matching casing, otherwise capitalised words
    # will always fall back to UNK.

    # Preprocessing: one padded row of token ids per window of MAX_LEN tokens.
    t_sentences = []
    for start in range(0, s_len, MAX_LEN):
        window = test_sentence[start:start + MAX_LEN]
        token_ids = [word2idx.get(w, unk_id) for w in window]
        t_sentences.append(pad_sequences(sequences=[token_ids], padding="post",
                                         value=pad_id, maxlen=MAX_LEN)[0])

    p = model.predict(np.array(t_sentences))
    # Most likely tag index for every timestep of every window.
    p = np.argmax(p, axis=-1)

    # Visualization. Every window except the last is completely filled, so the
    # flattened predictions line up one-to-one with the token list; zip() drops
    # the predictions that belong to the trailing padding.
    veto_tags = ('b-veto', 'veto', 'e-veto')
    for w, pred in zip(test_sentence, p.ravel()):
        if idx2tag[pred] in veto_tags:
            sys.stdout.write("\x1b[31m"+w+"\x1b[0m ")
        else:
            sys.stdout.write(w+' ')

# Build a small UI for get_prediction: a text area bound to its `sentence`
# argument; interact_manual runs the function only when its button is pressed.
interact_manual(get_prediction, sentence=widgets.Textarea(placeholder='Type your sentence here'));

interactive(children=(Textarea(value='', description='sentence', placeholder='Type your sentence here'), Butto…