In [0]:
from __future__ import absolute_import, division, print_function

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf
device_name = tf.test.gpu_device_name()
print (device_name)

/device:GPU:0


In [0]:
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import pandas as pd
import time
import random

In [0]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
batch_size = 64
emb_size = 50
ngram = 2
dropout = 0.5
lr = 1e-3

In [0]:
train_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/train.txt' # train set
dev_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/valid.txt' # dev set

In [0]:
def read_dataset(filename):
    with open(filename, "r") as f:
        data = []
        for line in f:
            words = line.lower().strip()
            data.append(words)
        return data

In [0]:
train_set = read_dataset(train_path)
random.shuffle(train_set)
dev_set = read_dataset(dev_path)

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(train_set)

In [0]:
nwords = len(tokenizer.word_index)
print (nwords)

9650


In [0]:
def train_gen(batch_size=batch_size):
    steps = len(train_set) // batch_size
    for step in range(steps):
        train_seq = tokenizer.texts_to_sequences(train_set[step:step+batch_size])
        yield train_seq

In [0]:
def dev_gen(batch_size=batch_size):
    steps = len(dev_set) // batch_size
    for step in range(steps):
        dev_seq = tokenizer.texts_to_sequences(dev_set[step:step+batch_size])
        yield dev_seq

In [0]:
class NNLM(tf.keras.Model):
    def __init__(self, nwords, emb_size, ngram, dropout):
        super(NNLM, self).__init__()
        self.w = self.add_weight(shape=(nwords+1, emb_size),
                                 initializer='glorot_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(nwords+1,),
                                 initializer='zeros',
                                 trainable=True)
        self.dense = tf.keras.layers.Dense(emb_size, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.softmax = tf.keras.layers.Softmax()

    @tf.function
    def call(self, x):
        """Run the model."""
        result = tf.nn.embedding_lookup(self.w, x)
        result = tf.reshape(result, [x.shape[0], -1])
        result = self.dense(result)
        result = self.dropout(result)
        result = tf.matmul(result, tf.transpose(self.w)) + self.b
        result = self.softmax(result)
        return result

In [0]:
model = NNLM(nwords, emb_size, ngram, dropout)

In [0]:
# Calculate the loss value for the whole batch of sentences
def sent_loss(sents):
    all_histories = []
    all_targets = []
    for sent in sents:
        hist = [0] * ngram
        for next_word in sent + [0]:
            all_histories.append(list(hist))
            all_targets.append(next_word)
            hist = hist[1:] + [next_word]

    all_histories = tf.Variable(all_histories, trainable=False)
    all_targets = tf.Variable(all_targets, trainable=False)
    logits = model(all_histories)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.losses.Reduction.SUM)
    loss = loss_fn(all_targets, logits)

    return loss

In [0]:
last_dev = 1e20
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# Iterate over epochs.
for epoch in range(15):
    print('Start of epoch %d' % (epoch,))
    start = time.time()
    train_loss = 0
    train_words = 0
    batch_id = 0
    for sents in train_gen():

        # Open a GradientTape to record the operations run during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:

            loss = sent_loss(sents)
            train_loss += loss
            batch_words = sum(list(map(len, sents)))
            train_words += batch_words

            # Use the gradient tape to automatically retrieve the gradients of the trainable weights with respect to the loss.
            grads = tape.gradient(loss, model.trainable_variables)

            # Run one step of gradient descent by updating the value of the weights to minimize the loss.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Log every 20 batch.
            batch_id += 1
            if batch_id % 20 == 0:
                print('Training time: %0.3f seconds, training loss at sentence %d: %0.4f' % (time.time()-start, batch_id*batch_size, loss/batch_words))
                
    print ('Epoch %d: Training time=%0.3f seconds, training loss per word=%0.4f' % (epoch, time.time()-start, train_loss/train_words))
    
    # Evaluate on dev set
    dev_words, dev_loss = 0, 0
    for sents in dev_gen():
        loss = sent_loss(sents)
        dev_loss += loss
        dev_words += sum(list(map(len, sents)))
    print ('Epoch %d: evaluation loss per word=%f' % (epoch, dev_loss/dev_words))
    
    # Keep track of the development accuracy and reduce the learning rate if it got worse
    if last_dev < dev_loss/dev_words:
        lr /= 2
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    last_dev = dev_loss/dev_words

Start of epoch 0
Training time: 8.284 seconds, training loss at sentence 1280: 9.5427
Training time: 16.154 seconds, training loss at sentence 2560: 9.0242
Training time: 23.977 seconds, training loss at sentence 3840: 7.9139
Training time: 31.789 seconds, training loss at sentence 5120: 7.1955
Training time: 39.708 seconds, training loss at sentence 6400: 7.1883
Training time: 47.321 seconds, training loss at sentence 7680: 7.1057
Training time: 55.236 seconds, training loss at sentence 8960: 6.9779
Training time: 62.851 seconds, training loss at sentence 10240: 6.7675
Training time: 70.811 seconds, training loss at sentence 11520: 6.6592
Training time: 78.454 seconds, training loss at sentence 12800: 6.5506
Training time: 86.418 seconds, training loss at sentence 14080: 6.4672
Training time: 93.946 seconds, training loss at sentence 15360: 6.5011
Training time: 101.920 seconds, training loss at sentence 16640: 6.4323
Training time: 109.534 seconds, training loss at sentence 17920: 6.

ResourceExhaustedError: ignored