In [1]:
from __future__ import absolute_import, division, print_function

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see and show it.
device_name = tf.test.gpu_device_name()
print(device_name)

/device:GPU:0


In [0]:
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import pandas as pd
import time
import random
import math

In [3]:
# Colab-only helper used to expose Google Drive inside the runtime.
from google.colab import drive

# Mount Drive under /content/drive/; force_remount=True is passed so the call
# re-mounts even when a mount already exists. The data paths defined in a
# later cell live under this mount point.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
# Default number of sentences per batch for train_gen() / dev_gen().
batch_size = 64
# Embedding width passed to BiLSTM_CRF as embed_size.
emb_size = 300
# Total BiLSTM output width; each direction gets hidden_size // 2 units.
hidden_size = 512
# Learning-rate setting -- TODO confirm it is actually passed to the optimizer.
lr = 1e-3

In [0]:
# Input files live on the mounted Google Drive.
# File format: one sentence per line, written as "word1|tag1 word2|tag2 ...".
train_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/tag/train.txt' # train set
dev_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/tag/dev.txt' # dev set

In [0]:
def read_data(fname, shuffle=False):
    """
    Read a tagged corpus and return parallel lists of sentences and tag strings.

    Each line of the file is expected to look like ``word1|tag1 word2|tag2 ...``.
    Lines are lower-cased before being split, so both words and tags come back
    in lower case. A blank line yields an empty sentence paired with
    ``"<start> <end>"``.

    Args:
        fname: path of the tagged text file.
        shuffle: when True, shuffle the sentences (each sentence stays paired
            with its tags) using the module-level ``random`` generator.

    Returns:
        ``(word_set, tag_set)``: two lists of equal length. ``word_set[k]`` is
        the space-joined words of sentence ``k``; ``tag_set[k]`` is the
        space-joined tags of the same sentence wrapped in ``<start>`` and
        ``<end>``. Lists are returned whether or not ``shuffle`` is set.

    Raises:
        ValueError: if a token contains no ``|`` separator.
    """
    word_set = []
    tag_set = []
    with open(fname, "r") as f:
        for line in f:
            words, tags = [], ['<start>']
            for wt in line.lower().strip().split():
                # Split on the LAST '|' only: the tag never contains a pipe,
                # but the word may (a plain split() would then fail to unpack).
                w, t = wt.rsplit('|', 1)
                words.append(w)
                tags.append(t)
            tags.append('<end>')
            word_set.append(' '.join(words))
            tag_set.append(' '.join(tags))

    # zip(*pairs) cannot be unpacked into two names when there are no pairs,
    # so an empty corpus skips the shuffle entirely.
    if shuffle and word_set:
        pairs = list(zip(word_set, tag_set))
        random.shuffle(pairs)
        # Convert back to lists so the return type does not depend on `shuffle`.
        word_set, tag_set = (list(column) for column in zip(*pairs))
    return word_set, tag_set

In [7]:
# Load both splits; only the training split is shuffled.
train_word, train_tag = read_data(train_path, shuffle=True)
dev_word, dev_tag = read_data(dev_path)

# Sanity check: the tag string of a sentence carries two extra tokens
# (<start> and <end>) compared with its word string.
print(len(train_word[0].split()))
print(train_tag[0], len(train_tag[0].split()))
print(*(len(split) for split in (train_word, train_tag, dev_word, dev_tag)))

17
<start> o i-org i-org o o o o o o i-misc i-misc i-misc o o o o o <end> 19
10000 10000 1696 1696


In [0]:
# Word vocabulary: fitted on the training sentences only, with no character
# filtering; words unseen at fit time map to the '<unk>' token.
tokenizer_word = tf.keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<unk>',
)
tokenizer_word.fit_on_texts(train_word)
train_seq_word, dev_seq_word = (
    tokenizer_word.texts_to_sequences(texts) for texts in (train_word, dev_word)
)

In [0]:
# Tag vocabulary: fitted on the training tag strings only (these include the
# <start>/<end> markers); tags unseen at fit time map to the '<unk>' token.
tokenizer_tag = tf.keras.preprocessing.text.Tokenizer(
    filters='',
    oov_token='<unk>',
)
tokenizer_tag.fit_on_texts(train_tag)
train_seq_tag, dev_seq_tag = (
    tokenizer_tag.texts_to_sequences(texts) for texts in (train_tag, dev_tag)
)

In [10]:
# token -> id lookups produced by the two tokenizers, and their sizes
w2i, t2i = tokenizer_word.word_index, tokenizer_tag.word_index
nwords, ntags = len(w2i), len(t2i)
print(f'dictionary for words and tags built. {nwords} words and {ntags} tags')

dictionary for words and tags built. 23694 words and 12 tags


In [11]:
# Show the tag -> id mapping built by the tag tokenizer.
t2i

{'<end>': 5,
 '<start>': 4,
 '<unk>': 1,
 'b-loc': 10,
 'b-misc': 9,
 'b-org': 12,
 'b-per': 11,
 'i-loc': 6,
 'i-misc': 7,
 'i-org': 8,
 'i-per': 3,
 'o': 2}

In [12]:
# Inverse word vocabulary (id -> word), used to turn id sequences back into text.
i2w = {idx: word for word, idx in w2i.items()}
print(i2w[4])

.


In [13]:
# Inverse tag vocabulary (id -> tag), used to turn predicted ids back into labels.
i2t = {idx: label for label, idx in t2i.items()}
print(i2t[5])

<end>


In [0]:
def train_gen(batch_size=batch_size):
    """
    Yield the training set, in stored order, as ``(batch_word, batch_tag)`` pairs.

    Both members are id arrays produced by ``pad_sequences`` with
    ``padding='post'``; words and tags are padded independently, each to the
    longest sequence in its own batch. The final batch holds whatever is left
    and may be shorter than ``batch_size``.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    offset = 0
    while offset < len(train_seq_word):
        # A slice that runs past the end of a list is clipped automatically,
        # so the last (possibly partial) batch needs no special case.
        stop = offset + batch_size
        batch_word = pad(train_seq_word[offset:stop], padding='post')
        batch_tag = pad(train_seq_tag[offset:stop], padding='post')
        offset = stop
        yield batch_word, batch_tag

In [0]:
def dev_gen(batch_size=batch_size):
    """
    Yield the dev set, in stored order, as ``(batch_word, batch_tag)`` pairs.

    Both members are id arrays produced by ``pad_sequences`` with
    ``padding='post'``; words and tags are padded independently, each to the
    longest sequence in its own batch. The final batch holds whatever is left
    and may be shorter than ``batch_size``.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    offset = 0
    while offset < len(dev_seq_word):
        # A slice that runs past the end of a list is clipped automatically,
        # so the last (possibly partial) batch needs no special case.
        stop = offset + batch_size
        batch_word = pad(dev_seq_word[offset:stop], padding='post')
        batch_tag = pad(dev_seq_tag[offset:stop], padding='post')
        offset = stop
        yield batch_word, batch_tag

# Build model

In [0]:
class BiLSTM_CRF(tf.keras.Model):
    """
    BiLSTM encoder topped with a linear-chain CRF for sequence tagging.

    Conventions used throughout the class:
      * Id 0 is treated as padding; tag ids from ``t2i`` run from 1 to
        ``ntags``, so every tag axis has ``ntags+1`` entries.
      * ``feats`` has one time step per word position, ``[batch, T, ntags+1]``.
        ``tags`` and ``mask`` are expected to carry two extra columns
        (``<start>`` in column 0 and a trailing ``<end>`` slot), i.e.
        ``[batch, T+2]``. Step ``s`` of ``feats`` pairs with column ``s+1``
        of ``tags`` / ``mask``.
      * ``self.transitions[i, j]`` is the score of moving TO tag ``i`` FROM
        tag ``j``.
      * CRF recursions run in float64 (the dtype of ``self.transitions``);
        emission scores are cast where the two are mixed.
    """

    def __init__(self, embed_size, hidden_size):
        """
        Args:
            embed_size: width of the word embeddings.
            hidden_size: total BiLSTM output width (``hidden_size // 2`` units
                per direction, concatenated).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding = tf.keras.layers.Embedding(nwords+1, embed_size, trainable=True)
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_size//2,
                                 return_sequences=True,
                                 recurrent_initializer='glorot_uniform'),
            merge_mode='concat')
        # emission score
        self.fc = tf.keras.layers.Dense(ntags+1)

        # Matrix of transition scores. Entry i,j is the score of transitioning to i from j.
        # The -10000 entries are only initial values: the variable is trainable.
        transitions = np.random.randn(ntags+1, ntags+1)
        transitions[t2i['<start>'], :] = -10000. # to start forbidden
        transitions[:, t2i['<end>']] = -10000. # from end forbidden
        transitions[0, :] = -10000. # to pad forbidden
        transitions[0, t2i['<end>']] = 0. # except from end to pad
        transitions[:, 0] = -10000. # from pad forbidden
        self.transitions = tf.Variable(transitions, trainable=True) #[ntags+1(to), ntags+1(from)]

    # ------------------------------------------------------------------ helpers
    def _emission_scores(self, sent):
        """Run embedding -> BiLSTM -> dense. sent [batch, t] -> [batch, t, ntags+1]."""
        x = self.embedding(sent) #[batch, t, emb]
        x = self.bilstm(x) #[batch, t, hidden]
        return self.fc(x) #[batch, t, ntags+1]

    def _initial_scores(self, batch):
        """Start-of-sequence scores [batch, ntags+1]: 0 for <start>, -10000 elsewhere (float64)."""
        init = np.full((ntags+1,), -10000.)
        init[t2i['<start>']] = 0.
        # Built from tensor ops only, so no tf.Variable is created per call.
        return tf.tile(tf.expand_dims(tf.constant(init), axis=0), [batch, 1])

    def _pairwise_scores(self, forward, feat):
        """
        Score of every (to, from) move for one time step.

        forward [batch, ntags+1] is indexed by the previous ("from") tag and
        feat [batch, ntags+1] by the current ("to") tag.
        Returns [batch, ntags+1(to), ntags+1(from)] in float64.
        """
        prev = tf.expand_dims(forward, axis=1) #[batch, 1, ntags+1(from)]
        emit = tf.cast(tf.expand_dims(feat, axis=-1), tf.float64) #[batch, ntags+1(to), 1]
        trans = tf.expand_dims(self.transitions, axis=0) #[1, ntags+1(to), ntags+1(from)]
        return prev + emit + trans

    def _select_rows(self, mask_t, updated, previous):
        """
        Per-sentence select: row b of the result is updated[b] where mask_t[b]
        is True and previous[b] otherwise.

        The [batch] mask is expanded to the full [batch, ntags+1] shape before
        calling tf.where. A condition with the same shape as its operands is
        valid both under the legacy tf.where (which also accepted a rank-1
        condition) and under the broadcasting tf.where of later TF 2.x
        releases (which would mis-align a rank-1 condition with the tag axis).
        """
        keep = tf.tile(tf.expand_dims(mask_t, axis=-1), [1, ntags+1]) #[batch, ntags+1]
        return tf.where(keep, updated, previous)

    # --------------------------------------------------------------- CRF pieces
    # normalizer. sum of all possible paths
    def _forward_alg(self, feats, mask): # feats [batch, T, ntags+1], mask [batch, T+2]
        """
        Forward algorithm: log-sum-exp of the scores of all tag paths.

        Returns:
            alpha [batch], float64.
        """
        batch = tf.shape(feats)[0]
        steps = tf.shape(feats)[1]
        forward = self._initial_scores(batch) #[batch, ntags+1]

        for step in range(steps):
            feat = feats[:, step, :] #[batch, ntags+1]
            mask_t = mask[:, step+1] #[batch]
            next_score = self._pairwise_scores(forward, feat) #[batch, ntags+1(to), ntags+1(from)]
            next_score = tf.math.reduce_logsumexp(next_score, axis=-1) #[batch, ntags+1(to)]
            # A sentence that is not finished moves on to the next step; a
            # finished one keeps its scores unchanged. Controlled by the mask.
            forward = self._select_rows(mask_t, next_score, forward) #[batch, ntags+1]

        # last step: only the transition into <end>
        last = forward + self.transitions[t2i['<end>'], :] #[batch, ntags+1]
        alpha = tf.math.reduce_logsumexp(last, axis=-1) #[batch]
        return alpha

    # score of the gold path
    def _score_sent(self, feats, tags, mask): # feats [batch, T, ntags+1], tags [batch, T+2], mask [batch, T+2]
        """
        Score of the gold tag path: the sum of its transition and emission
        scores, plus the final transition into <end>.

        Returns:
            score [batch], float32.
        """
        batch = tf.shape(feats)[0]
        m = tf.cast(mask, tf.int32)
        # Number of unmasked positions per sentence (this includes the start
        # tag), so column length-1 holds the last real tag before <end>.
        length = tf.reduce_sum(m, axis=-1) #[batch]
        score = tf.zeros([batch])
        rows = tf.range(batch) #[batch] loop-invariant batch indices
        steps = tf.shape(feats)[1]
        for step in range(steps):
            feat = feats[:, step, :] #[batch, ntags+1]
            from_tag = tags[:, step] #[batch]
            to_tag = tags[:, step+1] #[batch]
            mask_t = mask[:, step+1] #[batch]
            # get transition score
            i = tf.stack([to_tag, from_tag], axis=-1) #[batch, 2]
            trans = tf.gather_nd(self.transitions, i) #[batch]
            trans = tf.cast(trans, tf.float32)
            # get emit score
            j = tf.stack([rows, to_tag], axis=-1) #[batch, 2]
            emit = tf.gather_nd(feat, j) #[batch]

            # A sentence that is not finished takes the updated score; a
            # finished one keeps its score unchanged. Controlled by the mask.
            next_score = score + trans + emit #[batch]
            score = tf.where(mask_t, next_score, score) #[batch]

        # last step only transition score
        k = tf.stack([rows, tf.math.add(length, -1)], axis=-1) #[batch, 2]
        from_tag = tf.gather_nd(tags, k) #[batch]
        to_tag = tf.fill(tf.shape(from_tag), t2i['<end>']) #[batch]
        i = tf.stack([to_tag, from_tag], axis=-1) #[batch, 2]
        trans = tf.gather_nd(self.transitions, i) #[batch]
        trans = tf.cast(trans, tf.float32)
        score = score + trans #[batch]
        return score

    # negative log likelihood loss function
    def nll_cost(self, sent, tags): # sent [batch, T], tags [batch, T+2]
        """
        Per-sentence negative log-likelihood of the gold tags.

        Returns:
            [batch] float32: log-partition (forward algorithm) minus gold path score.
        """
        x = self._emission_scores(sent) #[batch, T, ntags+1]
        # padding and end are masked
        mask = tf.math.logical_not(tf.math.logical_or(tf.math.equal(tags, 0), tf.math.equal(tags, t2i['<end>'])))
        alpha = self._forward_alg(x, mask)
        alpha = tf.cast(alpha, tf.float32)
        gold = self._score_sent(x, tags, mask)
        return alpha - gold #[batch]

    # viterbi algorithm for decoding
    def _viterbi(self, feats, mask): # feats [batch, T, ntags+1], mask [batch, T+2]
        """
        Viterbi decoding of the best-scoring tag path.

        Returns:
            best_path_score [batch], float64.
            best_path [batch, T+2], int64: decoded tag ids from <start> onward;
                every position where ``mask`` is False (padding and the end
                slot) is set to 0.
        """
        batch = tf.shape(feats)[0]
        steps = tf.shape(feats)[1]
        forward = self._initial_scores(batch) #[batch, ntags+1]
        backtrack = []

        for step in range(steps):
            feat = feats[:, step, :] #[batch, ntags+1]
            mask_t = mask[:, step+1] #[batch]
            next_score = self._pairwise_scores(forward, feat) #[batch, ntags+1(to), ntags+1(from)]
            best_tag = tf.argmax(next_score, axis=-1) #[batch, ntags+1(to)]
            backtrack.append(best_tag)
            best_score = tf.reduce_max(next_score, axis=-1) #[batch, ntags+1(to)]
            # A sentence that is not finished moves on to the next step; a
            # finished one keeps its scores unchanged. Controlled by the mask.
            forward = self._select_rows(mask_t, best_score, forward) #[batch, ntags+1]

        # last step to end token. only transition score
        last = forward + self.transitions[t2i['<end>'], :] #[batch, ntags+1]
        best_last_tag = tf.argmax(last, axis=-1) #[batch]
        best_path_score = tf.reduce_max(last, axis=-1) #[batch]

        # back track to decode the best path. skip the masked tags
        rows = tf.cast(tf.range(batch), tf.int64) #[batch] loop-invariant batch indices
        t = -2 # mask column used by the last forward step (column -1 is the end slot)
        best_path = [best_last_tag]
        for back in reversed(backtrack):
            mask_t = mask[:, t] #[batch]
            j = tf.stack([rows, best_last_tag], axis=-1) #[batch, 2]
            prev_tag = tf.gather_nd(back, j) #[batch]
            # A sentence shorter than this position keeps its last tag before
            # <end>; otherwise step back to the previous tag. Controlled by the mask.
            best_last_tag = tf.where(mask_t, prev_tag, best_last_tag) #[batch]
            best_path.append(best_last_tag)
            t -= 1
        best_path.reverse() # forward
        path_end = tf.fill([batch], t2i['<end>'])
        path_end = tf.cast(path_end, tf.int64)
        best_path.append(path_end) # add end to match the shape of mask
        best_path = tf.stack(best_path, axis=-1) #[batch, T+2] from start to end
        # zero out tags beyond sentence length. end is also masked
        best_path = tf.where(mask, best_path, tf.zeros_like(best_path)) #[batch, T+2]
        return best_path_score, best_path

    # sequence tagging
    def call(self, x): # x: [batch, T]
        """
        Tag a batch of padded word-id sequences.

        Returns:
            (score, path) exactly as produced by ``_viterbi``.
        """
        mask = tf.math.logical_not(tf.math.equal(x, 0)) #[batch, T]
        batch = tf.shape(x)[0]
        start_mask = tf.fill([batch, 1], True) #[batch, 1]
        end_mask = tf.fill([batch, 1], False) #[batch, 1]
        mask = tf.concat([start_mask, mask, end_mask], axis=-1) #[batch, T+2] padding and end are masked

        feats = self._emission_scores(x) #[batch, T, ntags+1]

        score, path = self._viterbi(feats, mask)
        return score, path

In [17]:
# Build the tagger with the embedding / hidden sizes from the config cell.
model = BiLSTM_CRF(emb_size, hidden_size)

W0514 02:17:12.412975 140013846337408 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f56edeba400>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0514 02:17:12.422863 140013846337408 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f56edebacc0>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [0]:
def train_step(sent, tag):
    """
    Run one optimisation step on a batch.

    Computes the mean negative log-likelihood of the batch under the global
    ``model``, back-propagates it and applies the update with the global
    ``optimizer``.

    Returns:
        The mean batch loss (scalar tensor).
    """
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(model.nll_cost(sent, tag))

    # The variable list is read only AFTER the forward pass, as before; layers
    # may not have created their weights until they have been called once.
    params = model.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

    return loss

In [0]:
def eval_step(sent, tag):
    """Return the mean negative log-likelihood of one batch; no parameters are updated."""
    return tf.reduce_mean(model.nll_cost(sent, tag))

In [0]:
# Use the learning rate from the config cell. Previously `lr` was defined but
# never passed on, so editing it had no effect on training.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [21]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    print ('Epoch {} start ...... '.format(epoch + 1))

    # ---- training pass: one optimisation step per batch
    total_loss = 0
    batch = 0
    for batch, (sent, tag) in enumerate(train_gen(), start=1):
        batch_loss = train_step(sent, tag)
        total_loss += batch_loss

        if batch % 20 == 0:
            print('  ---- Batch {} Loss {:.4f}'.format(batch, batch_loss))

    # ---- evaluation pass on the dev set (loss only, no updates)
    dev_loss = 0
    batch_eval = 0
    for batch_eval, (sent, tag) in enumerate(dev_gen(), start=1):
        batch_loss = eval_step(sent, tag)
        dev_loss += batch_loss

    # Both reported losses are averages over the number of batches seen.
    print('Epoch {} finished in {} seconds. Training loss {:.4f}. Evaluation loss {:.4f}.'.format(
        epoch + 1, time.time() - start, total_loss / batch, dev_loss / batch_eval))

Epoch 1 start ...... 
  ---- Batch 20 Loss 15.4017
  ---- Batch 40 Loss 13.1362
  ---- Batch 60 Loss 9.6502
  ---- Batch 80 Loss 10.4112
  ---- Batch 100 Loss 6.1642
  ---- Batch 120 Loss 6.6460
  ---- Batch 140 Loss 6.2544
Epoch 1 finished in 81.28769636154175 seconds. Training loss 11.3478. Evaluation loss 9.8316.
Epoch 2 start ...... 
  ---- Batch 20 Loss 5.1833
  ---- Batch 40 Loss 4.0594
  ---- Batch 60 Loss 4.0787
  ---- Batch 80 Loss 4.9632
  ---- Batch 100 Loss 2.8861
  ---- Batch 120 Loss 3.3668
  ---- Batch 140 Loss 3.0349
Epoch 2 finished in 77.57497715950012 seconds. Training loss 4.1359. Evaluation loss 9.3675.
Epoch 3 start ...... 
  ---- Batch 20 Loss 2.8783
  ---- Batch 40 Loss 2.1395
  ---- Batch 60 Loss 2.4061
  ---- Batch 80 Loss 3.2640
  ---- Batch 100 Loss 1.9142
  ---- Batch 120 Loss 2.0513
  ---- Batch 140 Loss 1.8487
Epoch 3 finished in 77.2682056427002 seconds. Training loss 2.3775. Evaluation loss 9.8468.
Epoch 4 start ...... 
  ---- Batch 20 Loss 1.8800
  ---

In [0]:
def decode_sent(sent):
    """
    Turn a padded sequence of word ids back into a space-joined sentence.

    Decoding stops at the first id equal to 0 (padding); ids before it are
    looked up in the global ``i2w``.
    """
    ids = list(sent)
    stop = ids.index(0) if 0 in ids else len(ids)
    return ' '.join(i2w[word_id] for word_id in ids[:stop])

In [0]:
def decode_tag(tag):
    """
    Turn a sequence of tag ids back into a space-joined tag string.

    ``<start>`` ids are skipped; decoding stops at the first padding id (0)
    or ``<end>`` id. Remaining ids are looked up in the global ``i2t``.
    """
    start_id, end_id = t2i['<start>'], t2i['<end>']
    labels = []
    for tag_id in tag:
        if tag_id == start_id:
            continue
        if tag_id in (0, end_id):
            break
        labels.append(i2t[tag_id])
    return ' '.join(labels)

In [0]:
def ner_tag():
    """
    Tag every dev batch with the global ``model`` and print, per sentence, the
    decoded words, the gold tags, the predicted tags and the path score.
    """
    for sent, tag in dev_gen():
        score, path = model(sent)
        for word_ids, gold_ids, pred_ids, s in zip(sent, tag, path, score):
            print ('====================================================')
            print (f'sentence is: {decode_sent(word_ids)}')
            print (f'gold tag is: {decode_tag(gold_ids)}')
            print (f'prediction : {decode_tag(pred_ids.numpy())}')
            print (f'score is: {s}')

In [25]:
# Print predictions next to the gold tags for the whole dev set.
ner_tag()

sentence is: <unk> is the tenth album from japanese punk techno band the mad capsule markets .
gold tag is: i-misc o o o o o i-misc o o o i-org i-org i-org i-org o
prediction : o o o o o o i-misc o o o o i-misc i-misc o o
score is: 127.84922551770306
sentence is: this album proved to be more commercial and more <unk> than <unk> , with heavily synthesized songs like introduction <unk> and come .
gold tag is: o o o o o o o o o o o i-misc o o o o o o i-misc i-misc o i-misc o
prediction : o o o o o o o o o o o o o o o o o o o o o o o
score is: 222.82548467039985
sentence is: founding member <unk> minoru played <unk> on good day , and <unk> cover of a song by uk post punk industrial band killing joke .
gold tag is: o o i-per i-per o o o i-misc i-misc o o i-misc o o o o o i-loc o o o o i-org i-org o
prediction : o o o i-per o i-per o i-org i-org o o o o o o o o i-misc i-misc i-misc o o o o o
score is: 194.27585582304653
sentence is: <unk> can of this had a different meaning , and most people