In [1]:
from __future__ import absolute_import, division, print_function

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see and print it, so the
# notebook output records which accelerator this run used.
device_name = tf.test.gpu_device_name()
print (device_name)

/device:GPU:0


In [0]:
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import pandas as pd
import time
import random
import math
from collections import defaultdict

In [3]:
from google.colab import drive

# Mount Google Drive under /content/drive/; the train/dev paths used by this
# notebook point below this mount point. force_remount=True is passed
# (presumably so re-running the cell refreshes an existing mount).
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
# Training-objective switches read by calc_loss, structured_prediction_loss,
# the sampler construction and the training loop.
#
# local normalization: teacher forcing, with/without scheduled sampling
#   teacher_forcing    -> the training loop feeds the reference tags to the model
#   scheduled_sampling -> selects a decaying Sampler instead of an always-true one
teacher_forcing = False
scheduled_sampling = False
# global normalization: structured perceptron or hamming cost augmented hinge loss
# NOTE: calc_loss tests these two flags first, so when either is True the
# structured loss is used regardless of teacher_forcing. If both are True,
# structured_prediction_loss ends up using the cost-augmented variant.
structured_perceptron = True
cost_augmented_hinge = False

In [0]:
# Model / optimisation hyper-parameters.
emb_size = 200      # width of the word embedding table
tag_emb_size = 64   # width of the tag embedding table (previous-tag input)
hidden_size = 300   # combined recurrent width; each GRU direction uses hidden_size//2
lr = 1e-3           # learning rate; NOTE(review): confirm the optimizer is actually constructed with this value

In [0]:
# Locations of the tagged corpora on the mounted Google Drive.
# format of files: each line is "word1|tag1 word2|tag2 ..."
train_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/tag/train.txt' # train set
dev_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/tag/dev.txt' # dev set

In [0]:
# Word vocabulary. The default factory returns the current size of the dict, so
# every key that is looked up for the first time receives the next free id.
w2i = defaultdict(lambda: len(w2i))
unk_word = w2i["<unk>"]  # first lookup on the empty dict -> id 0

# Tag vocabulary, built the same way. The lookup order below fixes the ids:
# "<unk>" becomes 0 and "<start>" becomes 1.
t2i = defaultdict(lambda: len(t2i))
unk_tag = t2i["<unk>"]
start_tag = t2i["<start>"]  # tag id fed to the tagger as the "previous tag" of the first word

In [0]:
def read(fname, word_index=None, tag_index=None):
    """
    Read a tagged file and convert every sentence to integer ids.

    Each line is expected to look like "word1|tag1 word2|tag2 ...". The whole
    line is lower-cased (tags included) and stripped before it is split on
    whitespace.

    Args:
        fname: path of the file to read.
        word_index: mapping word -> id used for the lookup. Defaults to the
            module-level `w2i`.
        tag_index: mapping tag -> id used for the lookup. Defaults to the
            module-level `t2i`.

    Returns:
        A list with one `(word_ids, tag_ids)` tuple of lists per line of the
        file, in file order.

    Raises:
        ValueError: if a token contains no '|' separator.
    """
    # Resolve the defaults at call time (not at definition time) so that a
    # rebinding of the globals between two calls is picked up.
    if word_index is None:
        word_index = w2i
    if tag_index is None:
        tag_index = t2i

    data = []
    with open(fname, "r") as f:
        for line in f:
            words, tags = [], []
            for wt in line.lower().strip().split():
                # Split on the LAST '|' only: the tag is the final field, and a
                # word may itself contain '|' (a plain split would then yield
                # more than two fields and fail to unpack).
                w, t = wt.rsplit('|', 1)
                words.append(word_index[w])
                tags.append(tag_index[t])
            data.append((words, tags))
    return data

In [9]:
# Read the training set first: while the vocabularies are still in their
# "growing" form, every new word / tag seen here is assigned a fresh id.
train = read(train_path)
print (f'read in {len(train)} sentences in training set')
# Freeze the vocabularies: re-wrap the collected entries in defaultdicts whose
# default is the <unk> id, so later lookups of unseen keys map to <unk>
# instead of receiving a new id.
w2i = defaultdict(lambda: unk_word, w2i)
t2i = defaultdict(lambda: unk_tag, t2i)
# Vocabulary sizes are captured here; the model's embedding tables and output
# layer are sized from them.
nwords = len(w2i)
ntags = len(t2i)
print (f'dictionary for words and tags built. {nwords} words and {ntags} tags')
# The dev set is read after the freeze, so its unseen words / tags become <unk>.
dev = read(dev_path)
print (f'read in {len(dev)} sentences in dev set')

read in 10000 sentences in training set
dictionary for words and tags built. 23694 words and 11 tags
read in 1696 sentences in dev set


In [10]:
# Sanity check: show the id stored for one word. w2i is a defaultdict with an
# <unk> default at this point, so a word missing from the training data would
# return unk_word (and be inserted as a key) rather than raise KeyError.
w2i['companion']

3

In [11]:
# Sanity check: unpack one training example (word ids, tag ids) and display
# how many tags it has.
w, t = train[10]
len(t)

19

In [0]:
class Sampler:
    '''
    Scheduled sampler whose rate shrinks by a fixed amount on every decay step.

    `sample_true()` answers True with probability `rate`. Each `decay()` call
    subtracts `decay_rate` from `rate` until it would drop below `min_rate`;
    it is then pinned to `min_rate` and never changes again. When `all_true`
    is set the sampler always answers True and `decay()` leaves the rate alone.
    '''
    def __init__(self, init_rate=1.0, min_rate=0.2, decay_rate=0.1, all_true=False):
        self.rate = init_rate
        self.min_rate = min_rate
        self.decay_rate = decay_rate
        self.all_true = all_true
        self.reach_min = False  # flips to True once the rate has been pinned to min_rate

    def decay(self):
        '''Apply one decay step (if still allowed) and print the current rate.'''
        frozen = self.all_true or self.reach_min
        if not frozen:
            self.rate -= self.decay_rate
            if self.rate < self.min_rate:
                # Clamp and remember that no further decay is needed.
                self.rate, self.reach_min = self.min_rate, True
        print (f'sampling rate now is: {self.rate}')

    def sample_true(self):
        '''Return True always in all_true mode, otherwise with probability `rate`.'''
        return True if self.all_true else random.random() < self.rate

In [0]:
# With scheduled sampling enabled use a decaying sampler (default settings);
# otherwise use one that always answers True.
sampler = Sampler() if scheduled_sampling else Sampler(all_true=True)

In [0]:
class Forward(tf.keras.layers.Layer):
    """Forward-direction GRU that the tagger advances one time step per call.

    The GRU has hidden_size//2 units and returns both its output sequence and
    its final state.
    """

    def __init__(self):
        super().__init__()
        gru_options = dict(
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.forward = tf.keras.layers.GRU(hidden_size // 2, **gru_options)

    def call(self, x, prev, hidden):
        """Run the GRU on one step.

        Shapes below are the original author's annotations for batch size 1.

        Args:
            x: word embedding of the current step, [batch=1, 1, emb].
            prev: embedding of the previous tag, [batch=1, 1, tag_emb].
            hidden: state handed to the GRU as its initial state.

        Returns:
            (output, state): the GRU output with the time axis folded away
            ([batch=1, hidden/2]) and the new GRU state.
        """
        # Word and previous-tag embeddings are joined on the feature axis.
        step_input = tf.concat([x, prev], axis=-1)  # [1, 1, emb+tag_emb]
        step_output, state = self.forward(step_input, initial_state=hidden)
        feature_width = step_output.shape[2]
        flattened = tf.reshape(step_output, (-1, feature_width))
        return flattened, state

In [0]:
class Backward(tf.keras.layers.Layer):
    """Backward-direction GRU that is run over a whole sentence in one call.

    The GRU has hidden_size//2 units and is created with go_backwards=True.
    """

    def __init__(self):
        super().__init__()
        self.backward = tf.keras.layers.GRU(
            units=hidden_size // 2,
            go_backwards=True,
            return_state=True,
            return_sequences=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x):
        """Return the per-step outputs of the backward GRU.

        Args:
            x: embedded sentence, [batch=1, t, emb] per the original annotation.

        Returns:
            The GRU output sequence flipped along the time axis
            ([batch=1, t, hidden/2]); the final state is discarded.
        """
        # All time steps go through the GRU at once.
        sequence_out, _final_state = self.backward(x)
        # Flip along axis 1 (time), presumably so that index t of the result
        # lines up with input position t again.
        return tf.reverse(sequence_out, axis=[1])

In [0]:
class Tagger(tf.keras.Model):
    """Sequence tagger that conditions each decision on the previous tag.

    A backward GRU reads the whole embedded sentence once. A forward GRU is
    then advanced one step at a time; its input is the current word embedding
    concatenated with the embedding of the previous tag. Forward and backward
    features are concatenated and projected to `ntags` scores by a dense layer.

    The step loop relies on eager execution (Python `range` over a tensor
    length and `.numpy()` on the prediction). Shape comments are the original
    author's annotations and assume batch size 1.
    """

    def __init__(self):
        super(Tagger, self).__init__()
        self.embedding = tf.keras.layers.Embedding(nwords, emb_size, trainable=True)
        self.tag_embedding = tf.keras.layers.Embedding(ntags, tag_emb_size, trainable=True)

        # forward gru
        self.forward = Forward()

        # backward gru
        self.back = Backward()

        self.fc = tf.keras.layers.Dense(ntags)

    def call(self, x, ref_tags=None): # x, ref_tags:[batch=1, t]
        """Score every tag at every position of one sentence.

        Args:
            x: word ids, [batch=1, t].
            ref_tags: optional reference tag ids, [batch=1, t]. When given,
                the module-level `sampler` decides at every step whether the
                reference tag or the model's own prediction is fed to the
                next step. When omitted, the prediction is always used.

        Returns:
            Tensor of unnormalised tag scores, [t, ntags] for batch size 1.
        """
        batch = tf.shape(x)[0]
        steps = tf.shape(x)[1]
        x = self.embedding(x) #[batch=1, t, emb]

        # backward gru all steps
        outputs_b = self.back(x) # outputs of all time steps in backward gru [batch=1, t, hidden/2]

        # forward gru one time step
        scores = []
        prev = tf.fill([batch, 1], start_tag)
        # Start from an all-zero state. A randomly drawn initial state made the
        # scores differ from call to call for the same input, which also made
        # evaluation results non-reproducible.
        hidden = tf.zeros(shape=[batch, hidden_size//2])
        for t in range(steps):
            inp = tf.expand_dims(x[:, t, :], axis=1)

            prev_emb = self.tag_embedding(prev) #[batch=1, t=1, tag_emb]

            output, hidden = self.forward(inp, prev_emb, hidden)

            comb = tf.concat([output, outputs_b[:, t, :]], axis=-1) # [1, hidden]
            score = self.fc(comb) # [1, ntags]
            scores.append(score)
            prediction = tf.math.argmax(score, axis=-1).numpy()
            prediction = tf.expand_dims(prediction, axis=1) # [1,1]

            # `is not None` instead of `!= None`: comparing a tensor with an
            # operator can be dispatched to the tensor's overloaded comparison.
            # The sampler is only consulted when reference tags are available.
            if ref_tags is not None and sampler.sample_true():
                prev = tf.expand_dims(ref_tags[:, t], axis=1) #[1, 1]
            else:
                prev = prediction

        scores = tf.concat(scores, axis=0) # [t, ntags]
        return scores

In [0]:
# Build the tagger; the training / evaluation loop calls this single instance.
model = Tagger()

In [0]:
def mle(scores, tags):
    """Cross-entropy of the reference tags under the (unnormalised) tag scores.

    Args:
        scores: per-position tag logits.
        tags: reference tag ids, one per position.

    Returns:
        The mean of the sparse categorical cross-entropy loss.
    """
    # from_logits=True: the scores are raw logits, not probabilities.
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return tf.reduce_mean(cross_entropy(tags, scores))

In [0]:
def hamming_cost(predictions, refs):
    """Hamming cost: number of positions where prediction and reference differ.

    Positions are paired with zip, so the comparison stops at the end of the
    shorter sequence.
    """
    mismatches = 0
    for predicted, gold in zip(predictions, refs):
        mismatches += (predicted != gold)
    return mismatches

In [0]:
def seq_score(scores, tags):
    """Score of a tag sequence: the sum over positions of the score of the chosen tag.

    Args:
        scores: one row of tag scores per position.
        tags: the tag index selected at each position.
    """
    total = 0
    for row, tag in zip(scores, tags):
        total += row[tag]
    return total

In [0]:
def cost_augmented_decode(scores, refs):
    """Pick, per position, the tag with the highest cost-augmented score.

    Every tag other than the reference gets a bonus of 1 added to its score
    (the reference tag gets 0) before the argmax is taken.

    Args:
        scores: one row of tag scores per position.
        refs: the reference tag index for each position.

    Returns:
        A list with the selected tag index for each position.
    """
    def _augmented_argmax(row, gold):
        # Cost vector: 1 everywhere except 0 at the reference tag.
        cost = np.ones(row.shape)
        cost[gold] = 0
        return np.argmax(np.add(row, cost))

    return [_augmented_argmax(row, gold) for row, gold in zip(scores, refs)]

In [0]:
def structured_prediction_loss(scores, refs):
    """Sequence-level loss selected by the global objective flags.

    * structured perceptron: score(argmax sequence) - score(reference)
    * cost-augmented hinge:  score(cost-augmented argmax) - score(reference)
      + hamming cost of that sequence

    When both flags are set the cost-augmented hinge variant is the one used.

    Args:
        scores: per-position tag scores.
        refs: reference tag ids.
    """
    # The cost-augmented branch is tested first because it takes precedence
    # whenever both flags are on.
    if cost_augmented_hinge:
        predictions = cost_augmented_decode(scores, refs)
    elif structured_perceptron:
        predictions = tf.argmax(scores, axis=1)

    score_ref = seq_score(scores, refs)
    score_pred = seq_score(scores, predictions)
    margin = score_pred - score_ref

    if cost_augmented_hinge:
        return margin + hamming_cost(predictions, refs)
    return margin

In [0]:
def calc_loss(scores, tags):
    """Dispatch to the loss selected by the global objective flags.

    The structured (globally normalised) objectives are checked first, so they
    take precedence over teacher forcing when several flags are enabled.

    Args:
        scores: per-position tag scores produced by the model.
        tags: reference tag ids.

    Returns:
        The loss value of the selected objective.

    Raises:
        ValueError: if none of the objective flags is enabled. Previously the
            function fell through and returned None, which only surfaced later
            as an obscure failure when gradients were requested.
    """
    if structured_perceptron or cost_augmented_hinge:
        return structured_prediction_loss(scores, tags)
    if teacher_forcing:
        return mle(scores, tags)
    raise ValueError(
        'no training objective enabled: set one of structured_perceptron, '
        'cost_augmented_hinge or teacher_forcing to True')

In [0]:
def calc_correct(scores, tags):
    """Count the positions whose highest-scoring tag equals the reference tag.

    Args:
        scores: one row of tag scores per position.
        tags: reference tag ids.
    """
    hits = 0
    for row, gold in zip(scores, tags):
        hits += (np.argmax(row) == gold)
    return hits

In [0]:
# Adam optimizer driven by the `lr` hyper-parameter defined with the other
# settings. Previously Adam() was built without arguments, so changing `lr`
# had no effect on training.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [26]:
# Training / evaluation driver.
# Each epoch visits the training sentences one at a time (batch size 1, fixed
# file order), takes one optimizer step per sentence, then scores the dev set
# and finally advances the scheduled-sampling schedule.
EPOCHS = 5

for epoch in range(EPOCHS):
    start = time.time()
    print ('=========================================== Epoch {} start ...................................... '.format(epoch + 1))
    # Running totals for this epoch (reset every epoch).
    total_loss = 0
    this_correct = this_words = 0
    for i in range(len(train)):
        words, tags = train[i]
        loss = 0
        # Record the forward pass so the loss can be differentiated below.
        with tf.GradientTape() as tape:
            # A leading batch axis of size 1 is added to the id lists.
            if teacher_forcing:
                # Reference tags are handed to the model only in teacher-forcing mode.
                scores = model(tf.expand_dims(words, axis=0), tf.expand_dims(tags, axis=0))
            else:
                # Otherwise the model feeds back its own predictions.
                scores = model(tf.expand_dims(words, axis=0))
            loss = calc_loss(scores, tags)
            #print (loss)
            this_correct += calc_correct(scores, tags)
            this_words += len(words)
            total_loss += loss
            
        variables = model.trainable_variables
        
        # One gradient step per sentence.
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Progress report every 200 sentences: loss is averaged per sentence,
        # accuracy per word, both over the epoch so far.
        if (i+1) % 200 == 0:
            print('  ---- {} seconds to train {} sentences, loss {:.4f}, accuracy {:.4f}'.format(
                time.time()-start, i+1, total_loss/(i+1), this_correct/this_words))
    
    # Evaluate on dev set
    # No gradient tape and no reference tags here: the model always feeds back
    # its own predictions. The same calc_loss objective is reported.
    dev_loss = 0
    dev_correct = dev_words = 0
    for i in range(len(dev)):
        words, tags = dev[i]
        scores = model(np.expand_dims(words, axis=0))
        loss = calc_loss(scores, tags)
        dev_correct += calc_correct(scores, tags)
        dev_words += len(words)
        dev_loss += loss

    # NOTE: the elapsed time printed here includes the dev evaluation above.
    print('Epoch {} finished in {} seconds'.format(epoch + 1, time.time() - start))
    print('Training loss {:.4f}, accuracy {:.4f}'.format(total_loss/len(train), this_correct/this_words))
    print('Evaluation loss {:.4f}, accuracy {:.4f}'.format(dev_loss/len(dev), dev_correct/dev_words))
    
    # Advance the sampling schedule once per epoch (also prints the current rate).
    sampler.decay()

  ---- 68.08922362327576 seconds to train 200 sentences, loss 0.4457, accuracy 0.7911
  ---- 132.5501344203949 seconds to train 400 sentences, loss 0.3210, accuracy 0.8332
  ---- 199.58185577392578 seconds to train 600 sentences, loss 0.2730, accuracy 0.8450
  ---- 263.1359016895294 seconds to train 800 sentences, loss 0.2392, accuracy 0.8455
  ---- 328.5036132335663 seconds to train 1000 sentences, loss 0.2107, accuracy 0.8533
  ---- 385.99389362335205 seconds to train 1200 sentences, loss 0.1841, accuracy 0.8590
  ---- 448.3567440509796 seconds to train 1400 sentences, loss 0.1677, accuracy 0.8609
  ---- 514.2185034751892 seconds to train 1600 sentences, loss 0.1537, accuracy 0.8638
  ---- 581.732659816742 seconds to train 1800 sentences, loss 0.1419, accuracy 0.8672
  ---- 644.6695346832275 seconds to train 2000 sentences, loss 0.1317, accuracy 0.8665
  ---- 704.8125746250153 seconds to train 2200 sentences, loss 0.1230, accuracy 0.8674
  ---- 769.8353662490845 seconds to train 2400

KeyboardInterrupt: ignored