In [1]:
from __future__ import absolute_import, division, print_function

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf
device_name = tf.test.gpu_device_name()
print (device_name)

/device:GPU:0


In [0]:
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
import pandas as pd
import time
import random

In [3]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
batch_size = 32
emb_size = 100
ngram = 2
lr = 0.002

In [0]:
train_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/train.txt' # train set
dev_path = '/content/drive/My Drive/Colab Notebooks/cs11747/data/lm/valid.txt' # dev set
embed_path = '/content/drive/My Drive/Colab Notebooks/cs11747/3/embeddings.tsv' # word vector
words_path = '/content/drive/My Drive/Colab Notebooks/cs11747/3/labels.tsv' # words

In [0]:
def read_dataset(filename):
    with open(filename, "r") as f:
        data = []
        for line in f:
            words = line.lower().strip()
            data.append(words)
        return data

In [0]:
train_set = read_dataset(train_path)
random.shuffle(train_set)
dev_set = read_dataset(dev_path)

In [8]:
len(train_set)

42068

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train_set)

In [10]:
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
nwords = len(word_index)
print (nwords)
nbits = len(np.binary_repr(nwords))

9649


In [0]:
def train_gen(batch_size=batch_size):
    steps = len(train_set) // batch_size
    for step in range(steps):
        train_seq = tokenizer.texts_to_sequences(train_set[step:step+batch_size])
        yield train_seq

In [0]:
def dev_gen(batch_size=batch_size):
    steps = len(dev_set) // batch_size
    for step in range(steps):
        dev_seq = tokenizer.texts_to_sequences(dev_set[step:step+batch_size])
        yield dev_seq

In [0]:
class WordEmbCBOW(tf.keras.Model):
    def __init__(self, nwords, emb_size):
        super(WordEmbCBOW, self).__init__()
        self.embed = tf.keras.layers.Embedding(nwords+1, emb_size, trainable=True)
        self.dense = tf.keras.layers.Dense(nbits, activation='sigmoid')

    @tf.function
    def call(self, x):
        """Run the model."""
        result = self.embed(x)
        result = tf.reduce_sum(result, axis=1) #[batch, emb]
        result = self.dense(result) #[batch, nbits]
        return result

In [0]:
model = WordEmbCBOW(nwords, emb_size)

In [0]:
# Calculate the loss value for the whole batch of sentences
def sent_loss(sents):
    all_windows = []
    all_targets = []
    for sent in sents:
        padded = [0] * ngram + sent + [0] * ngram
        for i in range(ngram, len(sent) + ngram):
            window = padded[i-ngram:i] + padded[i+1:i+ngram+1]
            all_windows.append(window)
            all_targets.append(padded[i])

    all_windows = tf.Variable(all_windows, trainable=False)
    logits = model(all_windows)
    all_bc = [np.binary_repr(i).zfill(nbits) for i in all_targets]
    all_bc = [[float(s) for s in bc] for bc in all_bc]
    all_bc = tf.Variable(all_bc, trainable=False)
    loss_fn = tf.keras.losses.BinaryCrossentropy(reduction=tf.losses.Reduction.SUM)
    loss = loss_fn(all_bc, logits)

    return loss

In [0]:
last_dev = 1e20
best_dev = 1e20
optimizer = tf.keras.optimizers.Adam(lr=lr)
# Iterate over epochs.
for epoch in range(15):
    print('Start of epoch %d' % (epoch,))
    start = time.time()
    train_loss = 0
    train_words = 0
    batch_id = 0
    for sents in train_gen():

        # Open a GradientTape to record the operations run during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:

            loss = sent_loss(sents)
            train_loss += loss
            batch_words = sum(list(map(len, sents)))
            train_words += batch_words

            # Use the gradient tape to automatically retrieve the gradients of the trainable weights with respect to the loss.
            grads = tape.gradient(loss, model.trainable_variables)

            # Run one step of gradient descent by updating the value of the weights to minimize the loss.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Log every 100 batch.
            batch_id += 1
            if batch_id % 100 == 0:
                print('Training time: %0.3f seconds, training loss at sentence %d: %0.4f' % (time.time()-start, batch_id*batch_size, loss/batch_words))
                
    print ('Epoch %d: Training time=%0.3f seconds, training loss per word=%0.4f' % (epoch, time.time()-start, train_loss/train_words))
    
    # Evaluate on dev set
    dev_words, dev_loss = 0, 0
    for sents in dev_gen():
        loss = sent_loss(sents)
        dev_loss += loss
        dev_words += sum(list(map(len, sents)))
    print ('Epoch %d: evaluation loss per word=%f' % (epoch, dev_loss/dev_words))
    
    # Keep track of the development accuracy and reduce the learning rate if it got worse
    if last_dev < dev_loss/dev_words:
        lr /= 2
        optimizer = tf.keras.optimizers.Adam(lr=lr)
        print ('learning rate decay to: ', lr)
    last_dev = dev_loss/dev_words
    
    # Save the word vectors
    if best_dev > dev_loss/dev_words:
        print ('Updating word vectors......')
        emb = model.embed.get_weights()[0]
        out_v = open(embed_path, 'w')
        out_w = open(words_path, 'w')
        for i in range(1, nwords+1):
            word = reverse_word_index[i]
            embedding = emb[i]
            out_w.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in embedding]) + "\n")
        out_v.close()
        out_w.close()
        best_dev = dev_loss/dev_words

Start of epoch 0
Training time: 23.733 seconds, training loss at sentence 3200: 0.4513
Training time: 46.327 seconds, training loss at sentence 6400: 0.4315
Training time: 69.267 seconds, training loss at sentence 9600: 0.4180
Training time: 92.024 seconds, training loss at sentence 12800: 0.4341
Training time: 115.369 seconds, training loss at sentence 16000: 0.4414
Training time: 138.974 seconds, training loss at sentence 19200: 0.4368
Training time: 163.027 seconds, training loss at sentence 22400: 0.4393
Training time: 186.959 seconds, training loss at sentence 25600: 0.4392
Training time: 210.437 seconds, training loss at sentence 28800: 0.4298
Training time: 234.989 seconds, training loss at sentence 32000: 0.4373
Training time: 258.299 seconds, training loss at sentence 35200: 0.4413
Training time: 281.996 seconds, training loss at sentence 38400: 0.4104
Training time: 307.670 seconds, training loss at sentence 41600: 0.4334
Epoch 0: Training time=311.017 seconds, training loss 