In [1]:
import numpy as np
from minuet import loader, encoder, datastream, minuet
from minuet import preprocessing as p

Using TensorFlow backend.


In [2]:
# --- Input files (paths are relative to the notebook's working directory) ---
TRAIN_PATH = './data/ner/train.txt'            # training split, read by loader.load_dataset
DEVEL_PATH = './data/ner/dev.txt'              # development split, read by loader.load_dataset
GLOVE_PATH = './embeddings/wglove.6B.300d.bin' # pre-trained embeddings, read by loader.load_embeddings

# --- Sequence-length limits handed to the encoder helpers ---
MAX_WORD_LEN = 10
MAX_SENT_LEN = 10

# --- Sentence-level LSTM hyper-parameters (passed as lstm_size / lstm_drop to the model) ---
LSTM_SENT_DROP = 0.5
SENT_LSTM = 32

In [4]:
# Load the training and development splits; each call yields an (X, Y) pair.
# NOTE(review): presumably X holds tokenised sentences and Y the per-token tags — confirm in loader.
X_t, Y_t = loader.load_dataset(TRAIN_PATH)
X_v, Y_v = loader.load_dataset(DEVEL_PATH)

# Pre-processing pipelines built with p.assemble:
#   * words      -> p.lower and p.replace_numbers
#   * characters -> p.replace_numbers only (no lower-casing step is included)
pre_word = p.assemble(p.lower, p.replace_numbers)
pre_char = p.assemble(p.replace_numbers)

# Vocabulary and character mapping are derived from the training split only.
# TODO: rename get_characters_vocabulary(*) to get_characters_index(*)
# NOTE(review): the TODO names get_characters_vocabulary, but this cell calls
# get_characters_mapping — confirm whether the TODO is stale.
Vw = loader.get_vocabulary(X_t, pre_word)
char2index = loader.get_characters_mapping(X_t, pre_char)

# Load the embeddings for the vocabulary; returns the word->index mapping and E,
# which is later passed to the model as `embedding`.
word2index, E = loader.load_embeddings(GLOVE_PATH, Vw)

In [5]:
def _encode_split(sentences):
    """Encode one data split at word level and at character level.

    Returns a ``(word_indices, char_indices)`` pair produced by
    ``encoder.sentence_to_index`` and ``encoder.sentence_to_characters``.
    """
    word_indices = encoder.sentence_to_index(
        sentences, word2index, pre_word, MAX_SENT_LEN
    )
    char_indices = encoder.sentence_to_characters(
        sentences, char2index, MAX_WORD_LEN, MAX_SENT_LEN, pre_char
    )
    return word_indices, char_indices


# Model inputs for the training and validation splits.
E_sent_t, E_word_t = _encode_split(X_t)
E_sent_v, E_word_v = _encode_split(X_v)

# labels indices — the encoder returned for the training labels is reused for
# the validation labels so both splits share one label mapping.
train_label_ids, label_encoder = encoder.encode_labels(Y_t, MAX_SENT_LEN)
valid_label_ids, _ = encoder.encode_labels(Y_v, MAX_SENT_LEN, label_encoder)

# Append a trailing axis of size 1 to the label arrays.
# NOTE(review): presumably required by the model's output layer/loss — confirm.
Y_train = np.expand_dims(train_label_ids, axis=-1)
Y_valid = np.expand_dims(valid_label_ids, axis=-1)

In [7]:
from minuet import minuet as mm

# Character-embedding configuration. The first argument is the size of the
# character mapping; the remaining positional values are interpreted by
# mm.CharEmbeddingConfigs.
# NOTE(review): 32 / 16 / 0.5 look like embedding size, char-LSTM size and
# dropout — confirm against CharEmbeddingConfigs and consider naming them.
char_vocab_size = len(char2index)
char_configs = mm.CharEmbeddingConfigs(char_vocab_size, 32, 16, 0.5)

# Keyword options for the tagger, collected in one place so they are easy to scan.
model_options = dict(
    embedding=E,
    lstm_size=SENT_LSTM,
    lstm_drop=LSTM_SENT_DROP,
    bidirectional=True,
    crf=True,
    char_embeddings_conf=char_configs,
)
model = mm.Minuet(**model_options)

# Each input is a [word-level, character-level] pair, for train and validation.
train_inputs = [E_sent_t, E_word_t]
valid_inputs = [E_sent_v, E_word_v]
model.fit(train_inputs, Y_train, valid_inputs, Y_valid)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, None)   0                                            
__________________________________________________________________________________________________
sent_input (InputLayer)         (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (Embedding)      (None, None, None, 3 2464        char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, None, 300)    4793100     sent_input[0][0]                 
__________________________________________________________________________________________________
char_BiLST

In [13]:
# Hand-written sentences used as a quick qualitative check of the tagger.
test_sentences = [
    'Uruguay president Mr Fulano leaves Mexico to NATO meeting'.split(),
    'He works at the New York Times in New York'.split(),
    'They went to the city for the meeting'.split(),
    'Apple Corporation now profits from selling apples'.split(),
    'EU rejects German call to boycott British lamb'.split(),
]

# The character mapping built by loader.get_characters_mapping is `char2index`;
# it is passed here so this cell runs on a fresh kernel (Restart & Run All)
# without relying on a name left over from an earlier session.
# NOTE(review): the trailing 10 matches both MAX_SENT_LEN and MAX_WORD_LEN —
# confirm which one model.predict expects and use that constant instead.
predicted = model.predict(test_sentences, word2index, char2index, pre_word, pre_char, 10)
labels = label_encoder.inverse_transform(predicted)

# Print one line per sentence as tab-separated token/label pairs.
# A distinct loop name keeps `labels` bound to the full prediction list.
# zip() pairs each token with its label and stops at the shorter sequence, so
# tokens beyond the predicted length are not printed.
for sentence, sentence_labels in zip(test_sentences, labels):
    print(''.join(
        '{}/{}\t'.format(token, tag)
        for token, tag in zip(sentence, sentence_labels)
    ))

Uruguay/B-LOC	president/O	Mr/B-PER	Fulano/I-PER	leaves/O	Mexico/B-LOC	to/O	NATO/B-ORG	meeting/O	
He/O	works/O	at/O	the/O	New/B-ORG	York/I-ORG	Times/I-ORG	in/O	New/B-LOC	York/I-LOC	
They/O	went/O	to/O	the/O	city/O	for/O	the/O	meeting/O	
Apple/B-ORG	Corporation/I-ORG	now/O	profits/O	from/O	selling/O	apples/O	
EU/B-ORG	rejects/O	German/B-MISC	call/O	to/O	boycott/O	British/B-MISC	lamb/O	
