In [1]:
from __future__ import division
import math
import numpy as np
import matplotlib.pyplot as plt

## word2vec Subsampling Frequent Words

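Each word $w_i$ in the training set is discarded with a probability computed by the formula $P(w_i) = 1 - \sqrt{\frac{\mathrm{threshold}}{\mathrm{freq}(w_i)}}$. When $\mathrm{freq}(w_i)$ is below the threshold the expression is negative, so it is clamped to 0 and the word is always kept.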

In [None]:
def prob_func_factory(threshold):
    """Build the discard-probability function for a given subsampling threshold.

    The returned callable maps a word frequency ``freq`` to
    ``1 - sqrt(threshold / freq)``, clamped from below at 0.
    """
    def discard_prob(freq):
        unclamped = 1 - math.sqrt(threshold / freq)
        return max(0, unclamped)

    return discard_prob

In [None]:
thresholds = [1e-5, 1e-4, 1e-3]

# The frequency grid is the same for every threshold, so build it once
# instead of recomputing it on each loop iteration.
raw_counts = range(1, 1000000, 2500)
freqs_sum = float(sum(raw_counts))
# normalise the counts so that they sum to 1
freqs = [count / freqs_sum for count in raw_counts]

for threshold in thresholds:
    prob_fn = prob_func_factory(threshold)
    probs = [prob_fn(freq) for freq in freqs]

    # one curve per threshold, x axis on a log scale
    plt.semilogx(freqs, probs, label=threshold)

plt.ylabel('Prob of word being removed')
plt.xlabel('Occurences of word')
plt.legend(loc='upper left')
plt.grid()
plt.show()

This formula was chosen because it aggressively subsamples words whose frequency is higher than the set threshold.

## Dataset Preprocessing

The code below converts the text data to a unified format with one sentence per line.

In [2]:
import os
import re
import nltk
import csv
import numpy
import string
import codecs
import pandas

#### Presidential Speeches Dataset

In [None]:
# filepaths
CORPORA_DIR = './corpora/presidential-transcripts-raw/'
CLEAN_FILE_NAME = './corpora/presidential-clean.txt'

# Walk the raw corpus and write it out lowercased, one sentence per line.
# The output is ASCII; anything that cannot be encoded is dropped on write
# (errors='ignore').
with codecs.open(CLEAN_FILE_NAME, 'w', encoding='ascii', errors='ignore') as out:
    for root_dir, dirs, files in os.walk(CORPORA_DIR):
        print('Exploring:  {}'.format(root_dir))
        for file_name in files:
            # skip files whose name starts with ".DS_Store"
            if re.match(r'\.DS_Store', file_name):
                continue

            file_path = os.path.join(root_dir, file_name)
            with open(file_path) as fd:
                # Stream the file line by line instead of buffering every line
                # (plus two transformed copies) in memory.
                for raw_line in fd:
                    # Python 2: lines are byte strings; undecodable bytes become
                    # U+FFFD here and are then dropped by the ASCII writer.
                    text = raw_line.decode('ascii', errors='replace').lower()

                    # split the line into sentences and emit each on its own line
                    for sentence in nltk.tokenize.sent_tokenize(text):
                        out.write(sentence.strip() + '\n')

#### Airbnb Reviews Dataset

In [8]:
# filepaths
CORPORA_DIR = './corpora/reviews/'
CLEAN_FILE_NAME = './corpora/reviews-clean.txt'

# Walk the CSV files and write the last column of every row lowercased, one
# sentence per line. The output is ASCII; anything that cannot be encoded is
# dropped on write (errors='ignore').
with codecs.open(CLEAN_FILE_NAME, 'w', encoding='ascii', errors='ignore') as out:
    for root_dir, dirs, files in os.walk(CORPORA_DIR):
        print('Exploring:  {}'.format(root_dir))
        for file_name in files:
            # skip files whose name starts with ".DS_Store"
            if re.match(r'\.DS_Store', file_name):
                continue

            file_path = os.path.join(root_dir, file_name)
            with open(file_path) as fd:
                csv_reader = csv.reader(fd)
                # discard the first row (presumably the CSV header -- confirm)
                next(csv_reader, None)

                for row in csv_reader:
                    if not row:
                        # blank CSV line: there is no last column to read
                        continue

                    # Python 2: csv yields byte strings; decode the last column
                    text = row[-1].decode('utf8', errors='replace')

                    # Collapse embedded newlines and runs of spaces into a single
                    # space. The original computed this but then lowercased the
                    # raw text instead, so multi-line cells leaked line breaks
                    # into the one-sentence-per-line output.
                    text = re.sub(r'[\n\r ]+', ' ', text)

                    # translate to lowercase
                    text = text.lower()

                    # split the cell into sentences and emit each on its own line
                    for sentence in nltk.tokenize.sent_tokenize(text):
                        out.write(sentence.strip() + '\n')

Exploring:  ./corpora/reviews/


### TensorFlow seq2seq using `raw_rnn`

In [15]:
import numpy as np
import tensorflow as tf

In [17]:
def toy_data_generator(vocab_size, data_size, max_seq_length, reserved_digits=3):
    """Lazily yield ``data_size`` random integer sequences.

    Each yielded item is a list whose length is drawn uniformly from
    ``[1, max_seq_length]`` and whose elements are drawn uniformly from
    ``[reserved_digits, vocab_size)``; ids below ``reserved_digits`` are
    never produced.

    Args:
        vocab_size: exclusive upper bound of the generated ids.
        data_size: number of sequences to yield.
        max_seq_length: maximum length of a generated sequence.
        reserved_digits: number of low ids to leave unused.

    Yields:
        list of int: one random sequence per iteration.
    """
    drawable_ids = vocab_size - reserved_digits
    produced = 0
    # A counter loop (rather than the Python-2-only xrange) keeps the generator
    # lazy on both Python 2 and Python 3; the order of random draws is unchanged.
    while produced < data_size:
        seq_length = np.random.randint(max_seq_length) + 1
        yield [np.random.randint(drawable_ids) + reserved_digits for _ in range(seq_length)]
        produced += 1
        
def batchify_data(data_generator, batch_size):
    """Split a dataset into batches of ``batch_size`` items.

    Args:
        data_generator: either a ``list`` or any other iterable (for example a
            generator) of samples.
        batch_size: number of samples per batch; must be at least 1.

    Yields:
        list: consecutive batches. For a ``list`` input the batches are slices
        and the final one may be shorter than ``batch_size``. For any other
        iterable only full batches are yielded and a trailing incomplete batch
        is dropped.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when iteration
            starts, since this is a generator).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    if isinstance(data_generator, list):
        for start in range(0, len(data_generator), batch_size):
            yield data_generator[start:start + batch_size]
        return

    # Iterate explicitly instead of calling next() inside `while data_generator:`.
    # A generator object is always truthy, so the old loop only stopped because
    # StopIteration escaped from next() -- which is a RuntimeError under PEP 479.
    batch = []
    for sample in data_generator:
        batch.append(sample)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Any leftover partial batch is intentionally discarded, matching what the
    # leaking StopIteration used to cause on Python 2.
            
def pad_data(data_arr, append_pre=(), append_suf=(), max_length=None, pad_value=0):
    """Right-pad a batch of sequences to a common length.

    Args:
        data_arr: iterable of sequences (one per sample).
        append_pre: items put in front of every sequence before padding.
        append_suf: items appended to every sequence before padding.
        max_length: target row length. A falsy value (``None`` or ``0``) means
            "pad to the longest row in this batch".
        pad_value: filler used for the padding positions.

    Returns:
        tuple: ``(padded, lengths)`` where ``padded`` is a ``numpy`` array with
        one row per sample and ``lengths`` is the list of row lengths after the
        prefix/suffix were added and before padding.

    Raises:
        ValueError: if ``max_length`` is shorter than the longest row. The row
            would otherwise be left unpadded and the batch would be ragged.
    """
    prefix, suffix = list(append_pre), list(append_suf)
    rows = [prefix + list(row) + suffix for row in data_arr]
    lengths = [len(row) for row in rows]

    # an empty batch has no longest row; treat it as length 0 instead of crashing
    longest = max(lengths) if lengths else 0
    max_len = max_length if max_length else longest
    if longest > max_len:
        raise ValueError(
            'max_length={} is shorter than the longest row ({})'.format(max_len, longest))

    padded = [row + [pad_value] * (max_len - length) for row, length in zip(rows, lengths)]
    return np.array(padded), lengths
    

In [20]:
# define constant parameters
VOCAB_SIZE = 10
# The graph below sizes the source and target sides separately. Both names must
# exist on a fresh kernel; here they share the single vocabulary.
VOCAB_SIZE_SRC = VOCAB_SIZE
VOCAB_SIZE_TRG = VOCAB_SIZE
BATCH_SIZE = 32
MAX_SEQ_LENGTH = 5

EMB_SIZE = 20
ENC_HIDDEN_UNITS = 20
DEC_HIDDEN_UNITS = ENC_HIDDEN_UNITS

EOS_TOKEN = 2
PAD_TOKEN = 0

# reset graph
tf.reset_default_graph()
sess = tf.InteractiveSession()

# define placeholders for data
# Token tensors are time-major ([max_time, batch]) to match time_major=True below.
# The length placeholders are 1-D; note that shape=(None) would just be None
# (unknown rank), so the trailing comma matters.
enc_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
enc_inputs_len = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_len')
dec_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
dec_targets_len = tf.placeholder(shape=(None,), dtype=tf.int32, name='decoder_targets_len')

# define embeddings and lookup
embeddings_src = tf.Variable(tf.random_uniform([VOCAB_SIZE_SRC, EMB_SIZE], -1.0, 1.0), dtype=tf.float32)
embeddings_trg = tf.Variable(tf.random_uniform([VOCAB_SIZE_TRG, EMB_SIZE], -1.0, 1.0), dtype=tf.float32)
enc_inputs_emb = tf.nn.embedding_lookup(embeddings_src, enc_inputs)

# define encoder
# sequence_length makes the encoder stop at each sample's true length, so the
# final state is not computed over PAD steps (the placeholder was previously
# fed but never used).
enc_cell = tf.contrib.rnn.GRUCell(ENC_HIDDEN_UNITS)
_, enc_final_state = tf.nn.dynamic_rnn(
    enc_cell, enc_inputs_emb, sequence_length=enc_inputs_len,
    dtype=tf.float32, time_major=True, scope='encoder_cell')

# define decoder
dec_cell = tf.contrib.rnn.GRUCell(DEC_HIDDEN_UNITS)

# output projection from decoder hidden state to target-vocabulary logits
dec_smax_W = tf.Variable(tf.random_uniform([DEC_HIDDEN_UNITS, VOCAB_SIZE_TRG], -1.0, 1.0), dtype=tf.float32)
dec_smax_b = tf.Variable(tf.zeros([VOCAB_SIZE_TRG]), dtype=tf.float32)

# Size the EOS/PAD slices from the fed batch (axis 1 of the time-major input)
# instead of the BATCH_SIZE constant, so the graph accepts any batch size.
batch_dims = tf.shape(enc_inputs)[1:2]
eos_slice = tf.fill(batch_dims, EOS_TOKEN, name='EOS')
pad_slice = tf.fill(batch_dims, PAD_TOKEN, name='PAD')

# Decoder inputs are looked up in the target table, the same table used for the
# fed-back predictions in loop_fn (the source table was used here before).
eos_slice_emb = tf.nn.embedding_lookup(embeddings_trg, eos_slice)
pad_slice_emb = tf.nn.embedding_lookup(embeddings_trg, pad_slice)

# loop transition function (defines inputs of step t given outputs of step t-1)
# (time, prev_cell_output, prev_cell_state, prev_loop_state) -> (elements_finished, input, cell_state, output, loop_state)
def loop_fn(time, prev_output, prev_state, prev_loop_state):
    """Transition function handed to tf.nn.raw_rnn for the decoder.

    On the initial call (``prev_state is None``) the decoder is fed the EOS
    embedding and starts from the encoder's final state. On later calls the
    next input is the target embedding of the argmax over the projected
    previous output, or the PAD embedding once every sample is finished.
    A sample is finished when ``time >= dec_targets_len``.
    """
    if prev_state is None:
        init_elements_finished = (0 >= dec_targets_len)
        init_input = eos_slice_emb
        init_cell_state = enc_final_state
        init_cell_output = None
        init_loop_state = None
        return (init_elements_finished, init_input, init_cell_state, init_cell_output, init_loop_state)
    else:
        def get_next_input():
            # feed back the model's own greedy prediction from the previous step
            output_logits = tf.add(tf.matmul(prev_output, dec_smax_W), dec_smax_b)
            pred = tf.argmax(output_logits, axis=1)
            next_input = tf.nn.embedding_lookup(embeddings_trg, pred)
            return next_input

        step_elements_finished = (time >= dec_targets_len)
        # scalar: True only when every sample in the batch is finished
        step_finished = tf.reduce_all(step_elements_finished)
        step_input = tf.cond(step_finished, lambda: pad_slice_emb, get_next_input)
        step_state = prev_state
        step_output = prev_output
        step_loop_state = None
        return (step_elements_finished, step_input, step_state, step_output, step_loop_state)

dec_outputs_ta, dec_final_state, _ = tf.nn.raw_rnn(
    dec_cell, loop_fn)
dec_outputs = dec_outputs_ta.stack()

# project every decoder output to vocabulary logits: flatten the leading
# (time, batch) axes, apply the projection, then restore them
dec_max_steps, dec_batch_size, dec_dim = tf.unstack(tf.shape(dec_outputs))
dec_outputs_flat = tf.reshape(dec_outputs, (-1, dec_dim))
dec_logits_flat = tf.add(tf.matmul(dec_outputs_flat, dec_smax_W), dec_smax_b)
dec_logits = tf.reshape(dec_logits_flat, (dec_max_steps, dec_batch_size, VOCAB_SIZE_TRG))
dec_preds = tf.argmax(dec_logits, 2)

# define loss function and optimizer
# NOTE(review): the mean is taken over every position of dec_targets, padded
# ones included; consider masking with dec_targets_len if that is not intended.
stepwise_cent = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(dec_targets, depth=VOCAB_SIZE_TRG, dtype=tf.float32),
    logits=dec_logits)
loss = tf.reduce_mean(stepwise_cent)
train_op = tf.train.AdamOptimizer().minimize(loss)

In [24]:
DATA_SIZE = 500000
MAX_SEQ_LENGTH = 25

sess.run(tf.global_variables_initializer())

# Train on freshly generated toy sequences; Ctrl-C / kernel interrupt stops the
# loop cleanly and leaves loss_track available.
try:
    batch_ix = 0
    loss_track = []
    batches_gen = batchify_data(toy_data_generator(VOCAB_SIZE, DATA_SIZE, MAX_SEQ_LENGTH), BATCH_SIZE)

    for data_batch in batches_gen:
        # Encoder input and decoder target are the same batch: each sequence gets
        # an EOS suffix and is padded to the longest one. (A stray
        # `enc_len = lengths` line, which referenced an undefined name, is removed.)
        enc_inc, enc_lengths = pad_data(data_batch, append_suf=[EOS_TOKEN])
        dec_tar, dec_lengths = pad_data(data_batch, append_suf=[EOS_TOKEN])

        # pad_data returns [batch, time]; the graph is time-major, hence the .T
        fd = {
            enc_inputs: enc_inc.T,
            enc_inputs_len: enc_lengths,
            dec_targets: dec_tar.T,
            dec_targets_len: dec_lengths
        }
        _, batch_loss = sess.run([train_op, loss], fd)
        loss_track.append(batch_loss)

        # every 100 batches (including the first) report the loss and a few samples
        if batch_ix % 100 == 0:
            print('batch {}'.format(batch_ix))
            print('  minibatch loss: {}'.format(sess.run(loss, fd)))
            predict_ = sess.run(dec_preds, fd)
            for i, (inp, pred) in enumerate(zip(fd[enc_inputs].T, predict_.T)):
                print('  sample {}:'.format(i + 1))
                print('    input     > {}'.format(inp))
                print('    predicted > {}'.format(pred))
                if i >= 2:
                    break
        batch_ix += 1

except KeyboardInterrupt:
    print('Training Interrupted')

batch 0
  minibatch loss: 2.35056734085
  sample 1:
    input     > [9 5 6 9 6 5 6 7 7 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [8 8 8 8 9 8 9 8 9 9 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [6 6 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [8 9 5 4 7 6 3 4 4 7 5 7 6 8 3 9 6 2 0 0 0 0 0 0 0]
    predicted > [8 8 8 9 8 9 4 5 9 5 9 5 9 5 6 6 6 6 0 0 0 0 0 0 0]
batch 100
  minibatch loss: 2.06731200218
  sample 1:
    input     > [4 5 5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [6 6 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [5 4 7 8 9 7 7 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [8 5 8 3 3 7 7 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [8 7 3 6 7 9 9 9 9 4 4 3 7 3 8 9 5 4 2 0 0 0 0 0]
    predicted > [8 8 8 8 8 5 7 5 7 9 5 7 9 5 7 9 9 4 3 0 0 0 0 0]
batch 200
  minibatch loss: 1.94717085361


batch 1700
  minibatch loss: 0.984771430492
  sample 1:
    input     > [6 9 7 6 9 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [6 6 6 6 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [7 7 7 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [7 7 7 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [6 8 7 6 4 4 3 6 4 3 8 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [6 6 6 6 6 6 6 6 6 6 6 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0]
batch 1800
  minibatch loss: 1.07512998581
  sample 1:
    input     > [4 6 9 5 4 4 3 5 9 7 4 3 3 5 7 2 0 0 0 0 0 0 0 0 0 0]
    predicted > [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [7 4 3 3 6 4 4 8 6 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [4 4 4 4 3 3 3 3 7 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [4 7 7 3 9 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [7 7 7 7 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
b

batch 3400
  minibatch loss: 1.0804681778
  sample 1:
    input     > [3 7 6 8 8 9 5 6 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [6 6 6 8 8 8 8 8 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [6 7 9 7 6 4 3 5 5 9 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [7 7 6 6 6 6 6 7 5 5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [6 6 8 4 6 7 8 7 7 5 5 8 3 7 4 5 8 4 4 6 2 0 0 0 0]
    predicted > [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 2 0 0 0 0]
batch 3500
  minibatch loss: 0.97028952837
  sample 1:
    input     > [4 9 4 9 6 6 3 8 3 7 7 6 3 5 9 4 2 0 0 0 0 0 0 0 0 0]
    predicted > [4 4 9 6 6 4 6 4 6 4 4 4 3 4 6 2 2 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [8 8 3 4 9 6 9 9 4 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [8 8 8 8 8 8 8 8 8 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [4 6 7 9 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [7 7 6 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
batch 3600
  mi

batch 5100
  minibatch loss: 0.957665324211
  sample 1:
    input     > [8 5 8 4 6 3 3 3 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [8 8 3 3 3 3 3 3 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 2:
    input     > [8 6 6 9 4 6 6 9 7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    predicted > [6 6 6 6 6 6 6 6 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  sample 3:
    input     > [5 9 3 5 7 4 7 9 7 3 7 7 9 6 4 5 5 2 0 0 0 0 0 0 0 0]
    predicted > [5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 2 2 0 0 0 0 0 0 0 0]
batch 5200
  minibatch loss: 0.975641310215
  sample 1:
    input     > [7 9 4 9 5 9 3 6 7 4 8 4 5 3 8 4 4 7 8 9 3 5 7 2 0 0]
    predicted > [7 9 9 9 9 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 4 7 2 0 0]
  sample 2:
    input     > [8 7 4 9 6 9 9 8 5 7 5 9 6 9 7 6 9 9 8 5 6 9 2 0 0 0]
    predicted > [8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 2 0 0 0]
  sample 3:
    input     > [9 9 3 9 8 6 4 5 7 3 3 8 8 4 4 6 9 2 0 0 0 0 0 0 0 0]
    predicted > [9 9 9 9 9 9 4 4 4 4 4 4 8 8 8 8 8 2 0 0 0 0 0 0 0 0]
