In [2]:
from collections import Counter
from functools import reduce
from typing import Dict, List, Tuple

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import tensorflow as tf

In [3]:
# Relative paths of the three dataset splits; each one is parsed with read_conll_file.
train_path_name = 'data/wnut17train.conll'
valid_path_name = 'data/emerging.dev.conll'
# NOTE(review): "paht" is a typo for "path"; the name is kept because a later cell reads it.
test_paht_name = 'data/emerging.test.annotated'

In [4]:
def read_conll_file(path_name: str) -> Tuple[List[List[str]], List[List[str]]]:
    """Read a two-column, whitespace-separated CoNLL file into per-tweet lists.

    A line that splits into exactly two fields is taken as a ``token tag`` pair
    belonging to the current tweet. Any other line (typically a blank one) ends
    the current tweet.

    Args:
        path_name: Path of the file to read; it is decoded as UTF-8.

    Returns:
        ``(lst_tweet_tokens, lst_tweet_tags)``: two parallel lists with one entry
        per tweet, holding that tweet's tokens and tags respectively. Tweets
        without any token are never emitted.
    """
    lst_tweet_tokens = []
    lst_tweet_tags = []
    lst_tokens = []
    lst_tags = []

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path_name, encoding='utf-8') as file:
        for line in file:
            token_tag_pair = line.split()
            if len(token_tag_pair) == 2:
                lst_tokens.append(token_tag_pair[0])
                lst_tags.append(token_tag_pair[1])
            elif lst_tokens:
                # Separator line: close the tweet, but only if it has content, so
                # that repeated separators do not produce empty tweets.
                lst_tweet_tokens.append(lst_tokens)
                lst_tweet_tags.append(lst_tags)
                lst_tokens = []
                lst_tags = []

    # A file that does not end with a separator still has a pending tweet.
    if lst_tokens:
        lst_tweet_tokens.append(lst_tokens)
        lst_tweet_tags.append(lst_tags)

    return lst_tweet_tokens, lst_tweet_tags

def display(lst_tokens: List[str], lst_tags: List[str]) -> str:
    """Render a tagged tweet as one string of `` token<tag>`` pieces.

    Tokens and tags are paired positionally; pairing stops at the shorter list.
    Every piece starts with a space, so a non-empty result begins with a space.
    """
    # NOTE(review): in a notebook this name presumably shadows IPython's own
    # display() helper - confirm that is intended.
    return ''.join(
        ' ' + token + '<' + tag + '>'
        for token, tag in zip(lst_tokens, lst_tags)
    )

In [5]:
# Parse every split into two parallel per-tweet lists: tokens and tags.
train_tweet_tokens, train_tweet_tags = read_conll_file(train_path_name)
valid_tweet_tokens, valid_tweet_tags = read_conll_file(valid_path_name)
test_tweet_tokens, test_tweet_tags = read_conll_file(test_paht_name)

In [6]:
# Show the first training tweet with each token followed by its tag in angle brackets.
display(train_tweet_tokens[0], train_tweet_tags[0])

" @paulwalk<O> It<O> 's<O> the<O> view<O> from<O> where<O> I<O> 'm<O> living<O> for<O> two<O> weeks<O> .<O> Empire<B-location> State<I-location> Building<I-location> =<O> ESB<B-location> .<O> Pretty<O> bad<O> storm<O> here<O> last<O> evening<O> .<O>"

In [7]:
# Report how many tweets were read from each split.
print('# of training examples: {}'.format(len(train_tweet_tokens)))
print('# of validating examples: {}'.format(len(valid_tweet_tokens)))
print('# of testing examples: {}'.format(len(test_tweet_tokens)))

# of training examples: 3394
# of validating examples: 1009
# of testing examples: 1287


In [8]:
# List the distinct tags of every split. A set comprehension walks each tag once,
# whereas repeated list concatenation copies the growing list for every tweet and
# fails outright on a split without tweets.
print('Unique train tags: {}'.format(sorted({tag for tags in train_tweet_tags for tag in tags})))
print()
print('Unique valid tags: {}'.format(sorted({tag for tags in valid_tweet_tags for tag in tags})))
print()
print('Unique test tags: {}'.format(sorted({tag for tags in test_tweet_tags for tag in tags})))

Unique train tags: ['B-corporation', 'B-creative-work', 'B-group', 'B-location', 'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group', 'I-location', 'I-person', 'I-product', 'O']

Unique valid tags: ['B-corporation', 'B-creative-work', 'B-group', 'B-location', 'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group', 'I-location', 'I-person', 'I-product', 'O']

Unique test tags: ['B-corporation', 'B-creative-work', 'B-group', 'B-location', 'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group', 'I-location', 'I-person', 'I-product', 'O']


In [9]:
# Reserved vocabulary entries: '<unk>' is the fallback used by lst2ids for items
# missing from a mapping, '<pad>' fills the positions after the end of a tweet.
special_tokens = ['<unk>', '<pad>'] 
# 'O' is passed as a special tag so that it is numbered before every other tag;
# it is also the tag used to fill padded positions.
special_tags = ['O']

def create_mappings(tweet_words: List[List[str]], 
                    special_words: List[str],
                    normalize=True) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build the "word to id" and "id to word" lookup tables of a corpus.

    Ids are dense and start at 0. The special words come first, in the order
    given, followed by the remaining distinct words in sorted order, so the same
    input always yields the same mapping.

    Args:
        tweet_words: One list of words (tokens or tags) per tweet; may be empty.
        special_words: Words that receive the lowest ids; may be empty.
        normalize: When true, the special words and the corpus words are
            lower-cased, and every corpus word starting with '@' is replaced by
            '<user>', before the vocabulary is built.

    Returns:
        ``(word2id, id2word)``: two dictionaries that are inverses of each other.
    """
    # Flatten with a comprehension: linear time, and fine with an empty corpus.
    words = [word for tweet in tweet_words for word in tweet]

    if normalize:
        special_words = [word.lower() for word in special_words]
        # startswith() also copes with an empty string, unlike word[0].
        words = ['<user>' if word.startswith('@') else word.lower() for word in words]

    unique_words = set(words).difference(special_words)

    word2id = {}
    id2word = {}

    # Sorting removes the dependence on set iteration order, which changes
    # between interpreter runs for strings.
    for word in list(special_words) + sorted(unique_words):
        if word in word2id:
            # A special word listed twice keeps its first id; no id is skipped.
            continue
        index = len(word2id)
        word2id[word] = index
        id2word[index] = word

    return word2id, id2word

def lst2ids(lst, word2id):
    """Translate every item of ``lst`` into its id from ``word2id``.

    An item missing from ``word2id`` receives the id stored under '<unk>'. That
    key is only looked up when such an item is met, so a mapping without '<unk>'
    works as long as every item is known.
    """
    return [
        word2id[item] if item in word2id else word2id['<unk>']
        for item in lst
    ]

In [10]:
# Both vocabularies come from the training split only. Tokens are normalised
# while the vocabulary is built; tags are kept exactly as they appear.
word2id, id2word = create_mappings(train_tweet_tokens, special_tokens, normalize=True)
tag2id, id2tag = create_mappings(train_tweet_tags, special_tags, normalize=False)

In [11]:
# Sizes of the token vocabulary and of the tag set.
len(word2id), len(tag2id)

(11112, 13)

In [12]:
def crate_datasets(
    lst_tweet_tokens,
    lst_tweet_tags, 
    word2id,
    id2word,
    tag2id,
    id2tag,
    normalize_tokens=True
):
    """Encode tweets as padded integer id matrices.

    NOTE(review): the name is a typo for "create_datasets"; it is kept so that
    existing callers continue to work.

    Args:
        lst_tweet_tokens: One list of tokens per tweet.
        lst_tweet_tags: One list of tags per tweet, parallel to the tokens.
        word2id: Token vocabulary; must contain '<pad>', and '<unk>' whenever a
            token may be missing from it.
        id2word: Unused; accepted for call compatibility.
        tag2id: Tag vocabulary; must contain 'O'.
        id2tag: Unused; accepted for call compatibility.
        normalize_tokens: When true (default), each token is lower-cased and a
            token starting with '@' becomes '<user>' before it is looked up.
            This mirrors ``create_mappings(..., normalize=True)``; without it
            capitalised tokens and mentions all fall back to '<unk>'. Pass False
            for a vocabulary that was built without normalisation.

    Returns:
        ``(x, y, lengths)``: ``x`` and ``y`` are int32 arrays of shape
        ``(n_tweets, longest_tweet)`` holding token and tag ids, padded with the
        '<pad>' id and the 'O' id respectively; ``lengths`` is an int32 vector
        with the real length of every tweet.
    """
    size = len(lst_tweet_tokens)
    # default=0 keeps an empty dataset from raising inside max().
    max_len = max((len(tokens) for tokens in lst_tweet_tokens), default=0)

    # Ids and lengths are integers, so the arrays are integer-typed rather than
    # the float64 produced by np.ones(...) * id.
    x = np.full((size, max_len), word2id['<pad>'], dtype=np.int32)
    y = np.full((size, max_len), tag2id['O'], dtype=np.int32)
    lengths = np.zeros(size, dtype=np.int32)

    for i, (tweet_tokens, tweet_tags) in enumerate(zip(lst_tweet_tokens, lst_tweet_tags)):
        if normalize_tokens:
            tweet_tokens = [
                '<user>' if token.startswith('@') else token.lower()
                for token in tweet_tokens
            ]
        tweet_ids = lst2ids(tweet_tokens, word2id)
        tweet_tags_ids = lst2ids(tweet_tags, tag2id)
        assert len(tweet_ids) == len(tweet_tags_ids)
        x[i, :len(tweet_ids)] = tweet_ids
        y[i, :len(tweet_ids)] = tweet_tags_ids
        lengths[i] = len(tweet_ids)

    return x, y, lengths

In [13]:
# Encode the training split as padded id matrices plus the length of every tweet.
X_train, y_train, len_train = crate_datasets(train_tweet_tokens, train_tweet_tags, 
                                              word2id, id2word,
                                              tag2id, id2tag)

In [14]:
# Encode the validation split with the vocabularies built from the training split.
X_valid, y_valid, len_valid = crate_datasets(valid_tweet_tokens, valid_tweet_tags, 
                                              word2id, id2word,
                                              tag2id, id2tag)

In [15]:
# Sanity check: the token and tag matrices share one shape, with one length per row.
X_train.shape, y_train.shape, len_train.shape

((3394, 41), (3394, 41), (3394,))

In [16]:
class BiLSTM:
    """Bidirectional recurrent sequence tagger on the TensorFlow 1.x graph API.

    Token ids are embedded and fed to a forward and a backward recurrent cell;
    the two output sequences are concatenated and a dense layer turns every
    step into tag logits. Despite the class name, the cell type is selectable
    ('basic', 'lstm' or 'gru').
    """

    def __init__(
        self,
        n_hidden_units,
        tokens_size,
        tags_size,
        embedding_dim,
        rnn_cell='basic',
        dropout_ratio=1.0,
        learning_rate=5e-03
    ):
        """Store the hyper-parameters, build the graph and initialise its variables.

        Args:
            n_hidden_units: Number of units of each of the two recurrent cells.
            tokens_size: Number of rows of the embedding matrix.
            tags_size: Number of output units of the dense layer.
            embedding_dim: Width of an embedding vector.
            rnn_cell: 'basic', 'lstm' or 'gru'.
            dropout_ratio: Value fed to the dropout placeholder while training.
                It reaches DropoutWrapper as a *keep* probability, so 1.0 means
                that nothing is dropped.
            learning_rate: Learning rate fed to the optimizer while training.

        Raises:
            ValueError: If ``rnn_cell`` is not one of the supported names.
        """
        self.rnn_cell = rnn_cell
        self.n_hidden_units = n_hidden_units
        self.tokens_size = tokens_size
        self.tags_size = tags_size
        self.embedding_dim = embedding_dim
        self.dropout_ratio = dropout_ratio
        self.learning_rate = learning_rate

        self.session = tf.Session(graph=self.create_graph())
        self.session.run(self.init)

    def define_placeholders(self):
        """Create the graph inputs: token ids, tag ids, lengths, dropout and learning rate."""
        # Two-dimensional int32 inputs; the second axis is the one the sequence
        # mask in create_graph is sized from.
        self.input_token_ids = tf.placeholder(shape=[None, None],
                                              dtype=tf.int32,
                                              name='input_token_ids')

        self.input_tag_ids = tf.placeholder(shape=[None, None],
                                            dtype=tf.int32,
                                            name='input_tag_ids')

        self.input_lengths = tf.placeholder(shape=[None],
                                            dtype=tf.int32,
                                            name='input_lengths')

        # Defaults to 1.0 so that runs which do not feed it (evaluation) keep
        # every unit.
        self.dropout = tf.placeholder_with_default(
            tf.constant(1.0, dtype=tf.float32),
            shape=[],
            name='dropout')

        self.lr = tf.placeholder_with_default(
            tf.constant(1e-03, dtype=tf.float32),
            shape=[],
            name='learning_rate')

    def define_embeddings(self):
        """Create the trainable embedding matrix and look up the input token ids in it."""
        self.embeddings = tf.get_variable(
            name='embeddings',
            shape=[self.tokens_size, self.embedding_dim],
            dtype=tf.float32,
            initializer=tf.initializers.variance_scaling()
        )
        self.embedded_input = tf.nn.embedding_lookup(
            self.embeddings,
            self.input_token_ids
        )

    def define_cells(self):
        """Create the forward and backward cells and wrap both in dropout.

        Raises:
            ValueError: If ``self.rnn_cell`` is not 'basic', 'lstm' or 'gru'.
        """
        if self.rnn_cell == 'basic':
            self.forward_cell = tf.nn.rnn_cell.BasicRNNCell(
                num_units=self.n_hidden_units)
            self.backward_cell = tf.nn.rnn_cell.BasicRNNCell(
                num_units=self.n_hidden_units)

        elif self.rnn_cell == 'lstm':
            self.forward_cell = tf.nn.rnn_cell.BasicLSTMCell(
                num_units=self.n_hidden_units)
            self.backward_cell = tf.nn.rnn_cell.BasicLSTMCell(
                num_units=self.n_hidden_units)

        elif self.rnn_cell == 'gru':
            self.forward_cell = tf.nn.rnn_cell.GRUCell(
                num_units=self.n_hidden_units)
            self.backward_cell = tf.nn.rnn_cell.GRUCell(
                num_units=self.n_hidden_units)
        else:
            raise ValueError('There is no {} for rnn_cell argument'.format(self.rnn_cell))

        # The same keep probability is applied to inputs, outputs and state.
        self.forward_cell = tf.nn.rnn_cell.DropoutWrapper(
            self.forward_cell,
            input_keep_prob=self.dropout,
            output_keep_prob=self.dropout,
            state_keep_prob=self.dropout
        )

        self.backward_cell = tf.nn.rnn_cell.DropoutWrapper(
            self.backward_cell,
            input_keep_prob=self.dropout,
            output_keep_prob=self.dropout,
            state_keep_prob=self.dropout
        )

    def create_graph(self):
        """Build the whole model in a fresh graph and return that graph.

        Besides the layers this defines ``predictions``, ``loss``, ``train_op``
        and the variable initialiser ``init`` as attributes.
        """
        with tf.Graph().as_default() as graph:
            self.define_placeholders()
            self.define_embeddings()
            self.define_cells()

            self.max_len = tf.shape(self.input_token_ids)[1]

            (self.fw_outputs, self.bw_outputs), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=self.forward_cell,
                cell_bw=self.backward_cell,
                inputs=self.embedded_input,
                sequence_length=self.input_lengths,
                dtype=tf.float32
            )

            self.rnn_output = tf.concat([
                self.fw_outputs,
                self.bw_outputs
            ], axis=2)

            self.logits = tf.layers.dense(
                self.rnn_output,
                self.tags_size,
                activation=None,
            )

            self.predictions = tf.argmax(self.logits, axis=2)

            # 1.0 for real steps and 0.0 for padded ones, so padding adds
            # nothing to the loss.
            self.weights = tf.cast(tf.sequence_mask(self.input_lengths,
                                                    maxlen=self.max_len),
                                   dtype=tf.float32)

            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.input_tag_ids,
                weights=self.weights
            )

            self.optimizer = tf.train.AdamOptimizer(self.lr)
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            clip_norm = tf.cast(1.0, tf.float32)

            # Every gradient tensor is clipped on its own (not by global norm).
            # Variables without a gradient are reported as None and skipped,
            # since clip_by_norm cannot take None.
            self.grads_and_vars = [
                (tf.clip_by_norm(g, clip_norm), v)
                for g, v in self.grads_and_vars
                if g is not None
            ]
            self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

            self.init = tf.global_variables_initializer()
            return graph

    @staticmethod
    def _drop_padding(tag_ids, predictions, lengths):
        """Return the flattened true and predicted tags of the real (non-padded) steps."""
        tag_ids = np.asarray(tag_ids)
        predictions = np.asarray(predictions)
        steps = np.arange(tag_ids.shape[1])
        # keep[i, t] is True while step t lies inside row i.
        keep = steps[np.newaxis, :] < np.asarray(lengths)[:, np.newaxis]
        return tag_ids[keep], predictions[keep]

    def fit(self, X, y, lengths, val_data, batch_size=64, n_epochs=10, shuffle=True):
        """Train the model and print training/validation metrics after every epoch.

        Args:
            X: Two-dimensional array of token ids, one row per example.
            y: Array of tag ids with the same shape as ``X``.
            lengths: Real (unpadded) length of every row of ``X``.
            val_data: ``(X_val, y_val, lengths_val)`` used for validation.
            batch_size: Number of examples per training step.
            n_epochs: Number of passes over the training data.
            shuffle: Whether to reshuffle the examples before every epoch.

        Returns:
            None. Loss, macro precision, recall and F1 are printed.
        """
        with self.session.as_default() as sess:

            n = X.shape[0]
            for i in range(n_epochs):
                if shuffle:
                    order = np.random.permutation(np.arange(n))
                    X = X[order]
                    y = y[order]
                    lengths = lengths[order]

                # Stepping with range() also visits the last, possibly smaller
                # batch, which int(n / batch_size) left untrained.
                for start_index in range(0, n, batch_size):
                    end_index = start_index + batch_size

                    sess.run(self.train_op, feed_dict={
                        self.input_token_ids: X[start_index:end_index],
                        self.input_tag_ids: y[start_index:end_index],
                        self.input_lengths: lengths[start_index:end_index],
                        self.dropout: self.dropout_ratio,
                        self.lr: self.learning_rate
                    })

                # Evaluation does not feed the dropout placeholder, so its
                # default of 1.0 applies.
                validation_loss, val_preds = sess.run([self.loss, self.predictions], feed_dict={
                    self.input_token_ids: val_data[0],
                    self.input_tag_ids: val_data[1],
                    self.input_lengths: val_data[2]
                })

                training_loss, train_preds = sess.run([self.loss, self.predictions], feed_dict={
                    self.input_token_ids: X,
                    self.input_tag_ids: y,
                    self.input_lengths: lengths
                })

                # The loss ignores padded steps, so the metrics are restricted
                # to the real steps as well.
                train_true, train_pred = self._drop_padding(y, train_preds, lengths)
                training_metric = precision_recall_fscore_support(
                    train_true,
                    train_pred,
                    average='macro')

                val_true, val_pred = self._drop_padding(val_data[1], val_preds, val_data[2])
                validation_metric = precision_recall_fscore_support(
                    val_true,
                    val_pred,
                    average='macro')

                print('Epoch: {}, training loss: {}'.format(i + 1, training_loss))
                print('Epoch: {}, precision: {}'.format(i + 1, training_metric[0]))
                print('Epoch: {}, recall: {}'.format(i + 1, training_metric[1]))
                print('Epoch: {} f1-score: {}'.format(i + 1, training_metric[2]))
                print('----------------')
                print('*** validation loss: {} ***'.format(validation_loss))
                print('*** precision: {} ***'.format(validation_metric[0]))
                print('*** recall: {} ***'.format(validation_metric[1]))
                print('*** f1-score: {} ***'.format(validation_metric[2]))
                print()

    def predict(self):
        """Placeholder: inference is not implemented yet, so this returns None."""
        pass