In [1]:
import numpy as np
from minuet import preprocessing as p
from minuet import loader, encoder, datastream, minuet
from minuet import Minuet, CharEmbeddingConfigs

Using TensorFlow backend.


### Setting hyperparameters

In [2]:
# --- Input files ------------------------------------------------------------
# The two corpora are read with loader.load_dataset; the embedding file is
# read with loader.load_embeddings.
GLOVE_PATH = './embeddings/wglove.6B.300d.bin'
TRAIN_PATH = './data/ner/train.txt'
DEVEL_PATH = './data/ner/dev.txt'

# --- Sequence lengths -------------------------------------------------------
# MAX_SENT_LEN is handed to encoder.encode_labels and model.prepare_samples;
# MAX_WORD_LEN is handed to CharEmbeddingConfigs.
MAX_SENT_LEN, MAX_WORD_LEN = 10, 10

# --- Character-level settings (all passed to CharEmbeddingConfigs) ----------
CHAR_EMBD = 16
CHAR_LSTM = 32
CHAR_DROP = 0.5

# --- Sentence-level settings (Minuet's lstm_size / lstm_drop) ---------------
SENT_LSTM = 32
LSTM_SENT_DROP = 0.5

### Loading the data and setting up the preprocessing pipelines

In [3]:
# Load the training split first, then the development split. Each call to
# loader.load_dataset yields two values, unpacked here as (X, Y).
(X_t, Y_t), (X_v, Y_v) = (
    loader.load_dataset(path) for path in (TRAIN_PATH, DEVEL_PATH)
)

# Word-level and character-level preprocessing currently apply the same two
# steps in the same order; they stay separate objects because they are
# passed to different consumers (Minuet vs. CharEmbeddingConfigs).
pre_word = p.assemble(p.lower, p.replace_numbers)
pre_char = p.assemble(p.lower, p.replace_numbers)

# Vocabulary and character mapping are derived from the training split only.
Vw = loader.get_vocabulary(X_t, pre_word)
char2index = loader.get_characters_mapping(X_t, pre_char)

# Word index and embedding matrix for the vocabulary above.
# NOTE(review): the effect of retain=False is defined by the library and is
# not visible in this notebook — confirm against loader.load_embeddings.
word2index, E = loader.load_embeddings(GLOVE_PATH, Vw, retain=False)

In [4]:
# Encode the labels as indices. The training call also returns the label
# encoder, which is passed back in when encoding the validation labels so
# both splits go through the same encoder object.
train_label_ids, label_encoder = encoder.encode_labels(Y_t, MAX_SENT_LEN)
valid_label_ids, _ = encoder.encode_labels(Y_v, MAX_SENT_LEN, label_encoder)

# Append a trailing axis of length 1 to each encoded label array.
# NOTE(review): presumably the target shape expected by model.fit — confirm.
Y_train = np.expand_dims(train_label_ids, axis=-1)
Y_valid = np.expand_dims(valid_label_ids, axis=-1)

### Training the model

In [5]:
# Character-level configuration. Arguments are positional, in the order:
# character mapping, character preprocessing pipeline, max word length,
# then the three CHAR_* hyperparameters.
char_configs = CharEmbeddingConfigs(
    char2index, pre_char, MAX_WORD_LEN, CHAR_EMBD, CHAR_LSTM, CHAR_DROP,
)

# Build the tagger: word-level inputs first, then the character-level
# configuration, then the sentence LSTM settings and the output options.
model = Minuet(
    word2index=word2index,
    word_embedding=E,
    pre_word=pre_word,
    char_embeddings_conf=char_configs,
    lstm_size=SENT_LSTM,
    lstm_drop=LSTM_SENT_DROP,
    bidirectional=True,
    crf=True,
)

# Register the checkpoint directory and the label encoder on the model
# before training.
model.set_checkpoint_path('./models/ner/')
model.set_label_encoder(label_encoder)

# Convert the raw training and validation sentences into model inputs
# (training split first, then validation).
in_train, in_valid = (
    model.prepare_samples(split, MAX_SENT_LEN) for split in (X_t, X_v)
)

# Train on the training split, passing the validation split alongside.
model.fit(in_train, Y_train, in_valid, Y_valid)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, None)   0                                            
__________________________________________________________________________________________________
sent_input (InputLayer)         (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (Embedding)      (None, None, None, 1 816         char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, None, 300)    4793100     sent_input[0][0]                 
__________________________________________________________________________________________________
char_BiLST

### Testing

In [8]:
# Hand-written sentences (pre-split into tokens) used as a quick sanity check
# of the trained tagger.
test_sentences = [
    'Uruguay president mr mujica leaves the country to a meeting'.split(),
    'He works at the New York Times in new york'.split(),
    'They went to the city for the meeting'.split(),
    'apple now profits from selling apples in Brasil'.split(),
    'EU rejects german call to boycott British lamb'.split()
]

# Keep raw predictions and decoded labels under separate names so that
# `labels` always means "one decoded label sequence per test sentence".
predictions = model.predict(test_sentences)
labels = model.decode_predictions(predictions)

# Print every sentence as tab-separated word/label pairs, one sentence per
# line. The loop targets deliberately do NOT reuse the name `labels`:
# rebinding it here would leave only the last sentence's labels behind and
# make this loop impossible to re-run without re-predicting.
for sentence, sentence_labels in zip(test_sentences, labels):
    # zip pairs each word with the label at the same position and stops at
    # the shorter of the two sequences.
    for word, label in zip(sentence, sentence_labels):
        print('{}/{}'.format(word, label), end='\t')
    print()
Uruguay/B-LOC	president/O	mr/B-PER	mujica/I-PER	leaves/O	the/O	country/O	to/O	a/O	meeting/O	
He/O	works/O	at/O	the/O	New/B-LOC	York/I-LOC	Times/O	in/O	new/B-LOC	york/I-LOC	
They/O	went/O	to/O	the/O	city/O	for/O	the/O	meeting/O	
apple/B-ORG	now/O	profits/O	from/O	selling/O	apples/O	in/O	Brasil/O	
EU/B-ORG	rejects/O	german/B-MISC	call/O	to/O	boycott/O	British/B-MISC	lamb/O	


### Loading the saved model

In [9]:
# Discard the in-memory model so the predictions below can only come from
# what is restored from disk.
del model

# Restore a model from 'models/ner/' — the same directory that the training
# cell registered via set_checkpoint_path (there written as './models/ner/').
model = Minuet.load('models/ner/')
labels = model.decode_predictions(model.predict(test_sentences))

# Print every sentence as tab-separated word/label pairs, one sentence per
# line. The loop targets deliberately do NOT reuse the name `labels`:
# rebinding it here would leave only the last sentence's labels behind and
# make this loop impossible to re-run without re-predicting.
for sentence, sentence_labels in zip(test_sentences, labels):
    # zip pairs each word with the label at the same position and stops at
    # the shorter of the two sequences.
    for word, label in zip(sentence, sentence_labels):
        print('{}/{}'.format(word, label), end='\t')
    print()

Uruguay/B-LOC	president/O	mr/B-PER	mujica/I-PER	leaves/O	the/O	country/O	to/O	a/O	meeting/O	
He/O	works/O	at/O	the/O	New/B-LOC	York/I-LOC	Times/O	in/O	new/B-LOC	york/I-LOC	
They/O	went/O	to/O	the/O	city/O	for/O	the/O	meeting/O	
apple/B-ORG	now/O	profits/O	from/O	selling/O	apples/O	in/O	Brasil/O	
EU/B-ORG	rejects/O	german/B-MISC	call/O	to/O	boycott/O	British/B-MISC	lamb/O	
