In [1]:
# Data Import and Pre-Processing
import numpy as np
import pandas as pd

In [51]:
# Toy BIO-tagged corpus: each entry pairs a whitespace-tokenised sentence with
# one tag per token (B = begin entity, I = inside entity, O = outside).
training_data = [
    (sentence.split(), tags.split())
    for sentence, tags in (
        ("the wall street journal reported today that apple corporation made money",
         "B I I I O O O B I O O"),
        ("georgia tech is a university in georgia",
         "B I O O O O B"),
    )
]
training_data

[(['the',
   'wall',
   'street',
   'journal',
   'reported',
   'today',
   'that',
   'apple',
   'corporation',
   'made',
   'money'],
  ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'B', 'I', 'O', 'O']),
 (['georgia', 'tech', 'is', 'a', 'university', 'in', 'georgia'],
  ['B', 'I', 'O', 'O', 'O', 'O', 'B'])]

In [61]:
def generate_input_data(filename):
    """Read a tab-separated, blank-line-delimited tagging file.

    Every non-blank line holds ``word<TAB>tag``; a blank (or whitespace-only)
    line ends the current sentence.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one ``(words, tags)`` tuple per sentence, where ``words``
        and ``tags`` are equally long lists of stripped strings.

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    input_data = []
    sentence_word_list = []
    sentence_word_tag_list = []

    with open(filename) as f:
        for line in f:
            if not line.strip():
                # Sentence boundary. Skip it when nothing has been collected,
                # so repeated or leading blank lines do not emit empty
                # sentences.
                if sentence_word_list:
                    input_data.append((sentence_word_list, sentence_word_tag_list))
                    sentence_word_list = []
                    sentence_word_tag_list = []
                continue

            splitted_line = line.split("\t")
            sentence_word_list.append(splitted_line[0].strip())
            sentence_word_tag_list.append(splitted_line[1].strip())

    # The file may end without a trailing blank line; keep the last sentence.
    if sentence_word_list:
        input_data.append((sentence_word_list, sentence_word_tag_list))

    return input_data

In [65]:
from six import iteritems

def prepare_sequence(seq, to_ix):
    """Translate every item of ``seq`` into its integer id.

    Args:
        seq: iterable of hashable items (e.g. words, tags or characters).
        to_ix: mapping from item to integer id.

    Returns:
        A NumPy array holding the ids in the same order as ``seq``.

    Raises:
        KeyError: if an item of ``seq`` is missing from ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return np.array(indices)

# Load the tagged training sentences from disk as (words, tags) pairs.
training_data = generate_input_data("data/train")

# Word vocabulary: each new word gets the next free id, in order of first
# appearance in the training data.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))
ix_to_word = {index: word for word, index in iteritems(word_to_ix)}

# Character vocabulary, built from the distinct words collected above.
char_to_ix = {}
for word in word_to_ix:
    for char in word:
        char_to_ix.setdefault(char, len(char_to_ix))
ix_to_char = {index: char for char, index in iteritems(char_to_ix)}

# Fixed tag inventory: B-/I- prefixed entity types plus the 'O' tag.
tag_list = [
    'B-company', 'B-facility', 'B-geo-loc', 'B-movie', 'B-musicartist',
    'B-other', 'B-person', 'B-product', 'B-sportsteam', 'B-tvshow',
    'I-company', 'I-facility', 'I-geo-loc', 'I-movie', 'I-musicartist',
    'I-other', 'I-person', 'I-product', 'I-sportsteam', 'I-tvshow',
    'O',
]

# A tag's id is its position in tag_list (the list holds no duplicates).
tag_to_ix = {}
for tag in tag_list:
    tag_to_ix[tag] = len(tag_to_ix)
count = len(tag_to_ix)  # total number of tags
print(tag_to_ix)
ix_to_tag = {index: tag for tag, index in iteritems(tag_to_ix)}

def prepare_input(sentence, tags):
    """Convert one tagged sentence into the index arrays used by the model.

    Args:
        sentence: list of words; every word must be a key of ``word_to_ix``
            and every character a key of ``char_to_ix``.
        tags: list of tags, one per word; every tag must be a key of
            ``tag_to_ix``.

    Returns:
        A 4-tuple ``(sent_seq, tags_seq, word_seq, word_len_seq)``:
          * ``sent_seq``: 1-D array of word ids.
          * ``tags_seq``: 1-D array of tag ids.
          * ``word_seq``: 2-D array with one row of character ids per word,
            right-padded with -1 to the length of the longest word.
          * ``word_len_seq``: 1-D array with the character count of each word.
            It stays 1-D even for a one-word sentence.

    Raises:
        KeyError: if a word, character or tag is not in its vocabulary.
    """
    sent_seq = prepare_sequence(sentence, word_to_ix)
    tags_seq = prepare_sequence(tags, tag_to_ix)

    # Character ids per word, padded at the end with -1 so rows share a length.
    word_seq = tf.keras.preprocessing.sequence.pad_sequences(
        [prepare_sequence(word, char_to_ix) for word in sentence],
        padding='post',
        value=-1)

    # Vocabulary ids are >= 0 and padding is -1, so the number of non-negative
    # entries in a row is that word's length. No squeeze is applied, because
    # squeezing would collapse a one-word sentence's lengths to a 0-d array.
    word_len_seq = np.sum(word_seq >= 0, axis=1)

    return sent_seq, tags_seq, word_seq, word_len_seq

{'B-company': 0, 'B-facility': 1, 'B-geo-loc': 2, 'B-movie': 3, 'B-musicartist': 4, 'B-other': 5, 'B-person': 6, 'B-product': 7, 'B-sportsteam': 8, 'B-tvshow': 9, 'I-company': 10, 'I-facility': 11, 'I-geo-loc': 12, 'I-movie': 13, 'I-musicartist': 14, 'I-other': 15, 'I-person': 16, 'I-product': 17, 'I-sportsteam': 18, 'I-tvshow': 19, 'O': 20}


### Model Start

In [44]:
import tensorflow as tf
# Start from an empty default graph so that re-running the model cells below
# does not pile duplicate ops/variables onto a graph left over in the kernel.
tf.reset_default_graph()

In [45]:
# Model Params

learning_rate = 0.01   # step size handed to the Adam optimizer
train_epoch = 10000    # number of passes of the outer training loop
input_size = 10        # size of the last (feature) axis of the `inputs` placeholder
batch_size = 100       # batch size passed to the batch() helper during training
num_units = 512        # hidden units of each LSTM cell (forward and backward)
# Width of the final projection, i.e. the number of labels the CRF scores.
# NOTE(review): the tag vocabulary built earlier in this notebook has 21 tags;
# confirm that 5 is the right value for the labels actually fed to this model.
num_classes = 5

In [127]:
# Placeholders

# Float features with a fixed last axis of `input_size`; the two leading axes
# are left dynamic (presumably batch and time step, in that order -- confirm
# against the batch() helper).
inputs = tf.placeholder(tf.float32, [None, None, input_size], name='inputs')
# Integer label ids, one per position of `inputs`' two leading axes.
labels = tf.placeholder(tf.int32, [None, None], name='labels')
# Padded sequence length of the current batch; used below as the middle
# dimension when reshaping the projected scores.
batch_seq_len = tf.placeholder(tf.int32)
# One unpadded length per sequence; passed to the RNN and the CRF as the
# sequence lengths.
org_seq_len = tf.placeholder(tf.int32, [None])

In [128]:
# Bi-LSTM Cell
# Builds the graph: bidirectional LSTM -> linear projection -> linear-chain CRF.
with tf.name_scope("BiLSTM"):
    # One LSTM cell per direction, each with `num_units` hidden units.
    # NOTE(review): only the cell objects are constructed inside these variable
    # scopes; confirm the scopes have the intended effect on variable names.
    with tf.variable_scope('forward'):
        lstm_fw = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    with tf.variable_scope('backward'):
        lstm_bw = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    
    # Run both cells over `inputs`; `org_seq_len` tells the RNN how many steps
    # of each sequence are real (not padding).
    (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw,
                                                                     cell_bw=lstm_bw,
                                                                     inputs=inputs,
                                                                     sequence_length=org_seq_len,
                                                                     dtype=tf.float32,
                                                                     scope="BiLSTM")



# Join forward and backward outputs on the last axis -> 2 * num_units features.
outputs = tf.concat([output_fw, output_bw], axis=2)

# FC
# Linear layer mapping every time step's 2 * num_units features to num_classes scores.
W = tf.get_variable("W", [2 * num_units, num_classes], dtype=tf.float32)
b = tf.get_variable("b", [num_classes], dtype=tf.float32, initializer=tf.zeros_initializer())

# Flatten the leading axes so one matmul covers all positions, then restore
# the [-1, batch_seq_len, num_classes] layout expected by the CRF ops.
outputs_flat = tf.reshape(outputs, [-1, 2 * num_units])
pred = tf.matmul(outputs_flat, W) + b
scores = tf.reshape(pred, [-1, batch_seq_len, num_classes])

# CRF

# `log_loss` is the per-sequence log-likelihood returned by the CRF op (not yet
# a loss); the training loss is the mean of its negation.
log_loss, trans_params = tf.contrib.crf.crf_log_likelihood(scores, labels, org_seq_len)
loss = tf.reduce_mean(-log_loss)

# viterbi Seq, score
# Decode the best tag sequence (and its score) with the learned transitions.
viterbi_seq, viterbi_score = tf.contrib.crf.crf_decode(scores, trans_params, org_seq_len)

# Train Ops
train_opt = tf.train.AdamOptimizer(learning_rate)
train_op = train_opt.minimize(loss)

# Saver
# Created after the optimizer, so the variables that exist at this point are
# the ones the saver will checkpoint.
saver = tf.train.Saver()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Train into Session
#
# Runs `train_epoch` passes over the batches produced by batch(...), applying
# one optimizer step per batch, and saves the trained variables to
# './model_crf' at the end. On every 30th epoch the masked per-batch token
# accuracy is printed.
# NOTE(review): batch, x_, y_ and seq_len_train are not defined in the cells
# shown above; they must exist in the kernel before this cell runs.

# Take From Batch
batch_inputs = []
batch_labels = []
batch_seq_lengths = 0

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(train_epoch):
        for batch_inputs, batch_labels, batch_len, batch_seq_lengths in batch(x_, y_, seq_len_train, batch_size, input_size):
            tf_viterbi_seq, _ = sess.run([viterbi_seq, train_op],
                                         feed_dict={inputs: batch_inputs,
                                                    labels: batch_labels,
                                                    batch_seq_len: batch_len,
                                                    org_seq_len: batch_seq_lengths})

            if i % 30 == 0:
                # mask[r, t] is True where step t is a real token of sequence r
                # (t < its unpadded length), so padding never counts as correct.
                # np.arange (not "arrange") builds the step indices.
                mask = (np.expand_dims(np.arange(batch_len), axis=0) < np.expand_dims(batch_seq_lengths, axis=1))
                total_labels = np.sum(batch_seq_lengths)
                correct_labels = np.sum((batch_labels == tf_viterbi_seq) * mask)
                accuracy = 100.0 * correct_labels / float(total_labels)
                print("Epoch ", i, " Accuracy: ", accuracy)

    saver.save(sess, './model_crf')

In [None]:
# Test Accuracy and Pred
#
# Restores the variables saved by the training cell and makes a single pass
# over the test batches, printing the masked token accuracy of each batch
# together with the first label/prediction pair.
# NOTE(review): batch, x_t, y_t, seq_len_test and batch_test_size are not
# defined in the cells shown above; they must exist in the kernel before this
# cell runs.

# Take From Batch
batch_test = []
batch_test_labels = []
batch_seq_t_lengths = 0

with tf.Session() as sess:
    # Load the trained weights. Re-running the variable initializer here would
    # evaluate a freshly randomised model instead of the trained one.
    saver.restore(sess, './model_crf')

    # One pass over the test data is enough: no training happens in this cell,
    # so repeating the pass would not change the predictions.
    for batch_inputs, batch_labels, batch_len, batch_seq_t_lengths in batch(x_t, y_t, seq_len_test, batch_test_size, input_size):
        tf_viterbi_seq = sess.run(viterbi_seq,
                                  feed_dict={inputs: batch_inputs,
                                             labels: batch_labels,
                                             batch_seq_len: batch_len,
                                             org_seq_len: batch_seq_t_lengths})

        # mask[r, t] is True where step t is a real token of test sequence r.
        # It is built from this batch's own lengths (batch_seq_t_lengths), not
        # from the lengths variable left over from training.
        mask = (np.expand_dims(np.arange(batch_len), axis=0) < np.expand_dims(batch_seq_t_lengths, axis=1))
        total_labels = np.sum(batch_seq_t_lengths)
        correct_labels = np.sum((batch_labels == tf_viterbi_seq) * mask)
        accuracy = 100.0 * correct_labels / float(total_labels)
        print("Test Accuracy: ", accuracy)
        print("Label: ", batch_labels[0].astype(int))
        print("Pred: ", tf_viterbi_seq[0])