In [0]:
from __future__ import absolute_import, division, print_function

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf
device_name = tf.test.gpu_device_name()
print (device_name)

/device:GPU:0


In [0]:
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import pandas as pd
import time
import random

In [0]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
batch_size = 4
emb_size = 50
ngram = 2
lr = 1e-3

In [0]:
train_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/train.txt' # train set
dev_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/valid.txt' # dev set
embed_path = '/content/drive/My Drive/Colab Notebooks/cs11747/3/embeddings.tsv' # word vector
words_path = '/content/drive/My Drive/Colab Notebooks/cs11747/3/labels.tsv' # words

In [0]:
def read_dataset(filename):
    with open(filename, "r") as f:
        data = []
        for line in f:
            words = line.lower().strip()
            data.append(words)
        return data

In [0]:
train_set = read_dataset(train_path)
random.shuffle(train_set)
dev_set = read_dataset(dev_path)

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(train_set)

In [0]:
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
nwords = len(word_index)
print (nwords)

9650


In [0]:
def train_gen(batch_size=batch_size):
    steps = len(train_set) // batch_size
    for step in range(steps):
        train_seq = tokenizer.texts_to_sequences(train_set[step:step+batch_size])
        yield train_seq

In [0]:
def dev_gen(batch_size=batch_size):
    steps = len(dev_set) // batch_size
    for step in range(steps):
        dev_seq = tokenizer.texts_to_sequences(dev_set[step:step+batch_size])
        yield dev_seq

In [0]:
class WordEmbSG(tf.keras.Model):
    def __init__(self, nwords, emb_size):
        super(WordEmbSG, self).__init__()
        self.embed = tf.keras.layers.Embedding(nwords+1, emb_size, trainable=True)
        self.dense = tf.keras.layers.Dense(nwords+1, activation='softmax')

    @tf.function
    def call(self, x):
        """Run the model."""
        result = self.embed(x)
        result = self.dense(result)

        return result

In [0]:
model = WordEmbSG(nwords, emb_size)

In [0]:
# Calculate the loss value for the whole batch of sentences
def sent_loss(sents):
    all_centers = []
    all_targets = []
    for sent in sents:
        padded = [0] * ngram + sent + [0] * ngram
        for i in range(ngram, len(sent) + ngram):
            center = padded[i]
            for j in range(1, ngram+1):
                for k in [-1, 1]:
                    all_centers.append(center)
                    context = padded[i+j*k]
                    all_targets.append(context)

    all_centers = tf.Variable(all_centers, trainable=False)
    all_targets = tf.Variable(all_targets, trainable=False)
    logits = model(all_centers)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.losses.Reduction.SUM)
    loss = loss_fn(all_targets, logits)

    return loss

In [0]:
last_dev = 1e20
best_dev = 1e20
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# Iterate over epochs.
for epoch in range(15):
    print('Start of epoch %d' % (epoch,))
    start = time.time()
    train_loss = 0
    train_words = 0
    batch_id = 0
    for sents in train_gen():

        # Open a GradientTape to record the operations run during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:

            loss = sent_loss(sents)
            train_loss += loss
            batch_words = sum(list(map(len, sents))) * 2 * ngram
            train_words += batch_words

            # Use the gradient tape to automatically retrieve the gradients of the trainable weights with respect to the loss.
            grads = tape.gradient(loss, model.trainable_variables)

            # Run one step of gradient descent by updating the value of the weights to minimize the loss.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Log every 200 batch.
            batch_id += 1
            if batch_id % 200 == 0:
                print('Training time: %0.3f seconds, training loss at sentence %d: %0.4f' % (time.time()-start, batch_id*batch_size, loss/batch_words))
                
    print ('Epoch %d: Training time=%0.3f seconds, training loss per word=%0.4f' % (epoch, time.time()-start, train_loss/train_words))
    
    # Evaluate on dev set
    dev_words, dev_loss = 0, 0
    for sents in dev_gen():
        loss = sent_loss(sents)
        dev_loss += loss
        dev_words += sum(list(map(len, sents))) * 2 * ngram
    print ('Epoch %d: evaluation loss per word=%f' % (epoch, dev_loss/dev_words))
    
    # Keep track of the development accuracy and reduce the learning rate if it got worse
    if last_dev < dev_loss/dev_words:
        lr /= 2
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        print ('learning rate decay to: ', lr)
    last_dev = dev_loss/dev_words
    
    # Save the word vectors
    if best_dev > dev_loss/dev_words:
        print ('Updating word vectors......')
        emb = model.embed.get_weights()[0]
        out_v = open(embed_path, 'w')
        out_w = open(words_path, 'w')
        for i in range(1, nwords+1):
            word = reverse_word_index[i]
            embedding = emb[i]
            out_w.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in embedding]) + "\n")
        out_v.close()
        out_w.close()
        best_dev = dev_loss/dev_words

Start of epoch 0
Training time: 35.569 seconds, training loss at sentence 800: 8.0792
Training time: 70.430 seconds, training loss at sentence 1600: 7.6319
Training time: 104.593 seconds, training loss at sentence 2400: 7.1293
Training time: 140.151 seconds, training loss at sentence 3200: 6.8551
Training time: 175.696 seconds, training loss at sentence 4000: 6.9475
Training time: 211.040 seconds, training loss at sentence 4800: 7.2279
Training time: 248.454 seconds, training loss at sentence 5600: 6.4907
Training time: 286.641 seconds, training loss at sentence 6400: 6.3459
Training time: 323.706 seconds, training loss at sentence 7200: 6.4848
Training time: 363.885 seconds, training loss at sentence 8000: 6.8595
Training time: 402.506 seconds, training loss at sentence 8800: 6.4426
Training time: 444.321 seconds, training loss at sentence 9600: 6.9596
Training time: 483.418 seconds, training loss at sentence 10400: 6.6912
Training time: 522.704 seconds, training loss at sentence 1120

ResourceExhaustedError: ignored