In [0]:
import tensorflow as tf
from collections import Counter
import numpy as np
import math

from google.colab import files

In [0]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving sentences.train to sentences.train
User uploaded file "sentences.train" with length 111668947 bytes


### **1) PREPROCESSING**

In [0]:
def _sentence_preprocessing(sentence):
    sentence = ['<bos>'] + line.split() + ['<eos>']
    if len(sentence) <= 30:
        sentence = sentence + ['<pad>']*(30-len(sentence))
        return sentence
    #we ignore sentences with more than 30 words/tokens
    return []

In [0]:
tokens = []

with open("sentences.train", "r") as file:
  for line in file:
    tokens.extend(_sentence_preprocessing(line))
        
file.close()

In [21]:
#build vocab of 20k most frequent words
vocab = Counter(tokens)
vocab_20k = list(map(lambda x: x[0], vocab.most_common(19999))) #19999 cause last, 20000th token, is reserved for <unk>
vocab_20k[:10]

['<pad>', '<bos>', '<eos>', '.', ',', 'the', 'i', 'to', 'and', '``']

In [22]:
word_list = list(set(tokens))
len(word_list)

175268

In [0]:
#build words to ids dictionary
word_to_id_dict = dict({'<bos>': 1, '<eos>': 2, '<pad>': 3, '<unk>': 4})
id_counter = 5 
for word in word_list:
  if word_to_id_dict.get(word, 0) == 0 and word in vocab_20k:
      word_to_id_dict[word] = id_counter
      id_counter += 1

In [0]:
words_ids = np.array([word_to_id_dict.get(word,4)-1 for word in tokens]).reshape(int(len(tokens)/30), 30)
#-1 so that indexing starts with 0 and not 1 (this we need later due to using embedding_lookup)

In [9]:
words_ids[0,:]

array([    0, 14886, 12089,  4356, 18925,  7616,  8296,     3,  1294,
       18139,  1337, 16789, 15393, 18925,  8073,  9840,  4477,  8296,
        3969,  5143,     1,     2,     2,     2,     2,     2,     2,
           2,     2,     2])

In [0]:
with open('train.ids', 'w') as file:
  for i in range(words_ids.shape[0]):
    file.write(' '.join(str(x) for x in list(words_ids[i,:])) + '\n')
    
file.close()

### *2) DATASET API*

In [0]:
def parse(line, vocab_size=20000):
  line_split = tf.string_split([line])
  input_seq = tf.string_to_number(line_split.values[:-1], out_type=tf.int32)
  output_seq = tf.string_to_number(line_split.values[1:], out_type=tf.int32)
  return input_seq, output_seq

In [12]:
batch_size = 64
hidden_units = 512

file_name_train = tf.placeholder(tf.string)

training_dataset = tf.data.TextLineDataset(file_name_train).map(parse).batch(batch_size)
iterator = tf.data.Iterator.from_structure(training_dataset.output_types, training_dataset.output_shapes)
input_batch, output_batch = iterator.get_next()
training_init_op = iterator.make_initializer(training_dataset)

Instructions for updating:
Colocations handled automatically by placer.


In [16]:
#print out first three batches
with tf.Session() as sess:
  sess.run(training_init_op, {file_name_train: "train.ids"})
  sess.run(tf.initialize_all_variables())
  for _ in range(3):
    a, b = sess.run([input_batch, output_batch])
    print(a)
    print(b)
    print(a.shape)
    print(b.shape)


Instructions for updating:
Use `tf.global_variables_initializer` instead.
[[    0 14886 12089 ...     2     2     2]
 [    0 14886 10106 ...     2     2     2]
 [    0     3  9022 ...     2     2     2]
 ...
 [    0  3691 18702 ...     2     2     2]
 [    0 12089 18456 ...     2     2     2]
 [    0 14886  8320 ...     2     2     2]]
[[14886 12089  4356 ...     2     2     2]
 [14886 10106 19316 ...     2     2     2]
 [    3  9022 18171 ...     2     2     2]
 ...
 [ 3691 18702 12063 ...     2     2     2]
 [12089 18456 19867 ...     2     2     2]
 [14886  8320  1337 ...     2     2     2]]
(64, 29)
(64, 29)
[[    0  5986 19766 ...     2     2     2]
 [    0 12355  4453 ...     2     2     2]
 [    0 12116     3 ...     2     2     2]
 ...
 [    0  8609 15367 ...     2     2     2]
 [    0 11158  3691 ...     2     2     2]
 [    0 14886 19354 ...     2     2     2]]
[[ 5986 19766  1337 ...     2     2     2]
 [12355  4453  3433 ...     2     2     2]
 [12116     3  3233 ...     2 

### **3) EXPERIMENT A**

In [0]:
def parse(line, vocab_size=20000):
  line_split = tf.string_split([line])
  input_seq = tf.string_to_number(line_split.values[:-1], out_type=tf.int32)
  output_seq = tf.string_to_number(line_split.values[1:], out_type=tf.int32)
  return input_seq, output_seq

In [27]:
batch_size = 64
vocab_size = 20000
embed_size = 100
hidden_units = 512
num_epochs = 2

tf.reset_default_graph()
initializer = tf.contrib.layers.xavier_initializer()

#import the data 
with tf.name_scope('inputs'):
  file_name_train = tf.placeholder(tf.string)
  training_dataset = tf.data.TextLineDataset(file_name_train).map(parse).batch(batch_size)
  iterator = tf.data.Iterator.from_structure(training_dataset.output_types, training_dataset.output_shapes)
  input_batch, output_batch = iterator.get_next()
  training_init_op = iterator.make_initializer(training_dataset) 
  
  nr_words = tf.reduce_prod(tf.shape(input_batch))


#embeddings
with tf.name_scope("embeddings"):
  input_embedding_mat = tf.get_variable('input_embedding_mat', shape=(vocab_size, embed_size), 
                                        dtype=tf.float32, initializer=initializer)
  output_embedding_mat = tf.get_variable('output_embedding_mat', shape=(vocab_size, hidden_units), 
                                         dtype=tf.float32, initializer=initializer)


  input_embedded = tf.nn.embedding_lookup(input_embedding_mat, input_batch) #(64,29,100)

  def output_embedding(current_output):
              return tf.matmul(current_output, tf.transpose(output_embedding_mat))
  
#rnn
with tf.name_scope('rnn'):
  LSTM = tf.nn.rnn_cell.LSTMCell(hidden_units, initializer=initializer, reuse=tf.AUTO_REUSE)
  state_c, state_h = LSTM.zero_state(batch_size=batch_size, dtype=tf.float32)

  preds = []
  for i in range(29):
    output, (state_c, state_h) = LSTM(tf.reshape(input_embedded[:,i,:], [batch_size,embed_size]), state=(state_c, state_h))
    preds.append(output)

  preds = tf.stack(preds, axis=1) #concatenate preds over axis=1 (2nd dimension), to obtain tensor of size (64,29,512)

  
#loss
with tf.name_scope('loss'):
  logits = tf.map_fn(output_embedding, preds) #(64,29,20000)
  logits = tf.reshape(logits, [-1, vocab_size])  
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(output_batch, [-1]), logits=logits) #(1856,)

#optimization
with tf.name_scope('optimization'):
  params = tf.trainable_variables()
  opt = tf.train.AdamOptimizer()
  gradients = tf.gradients(loss, params, colocate_gradients_with_ops=True)
  clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5)
  global_step = tf.train.get_or_create_global_step()
  updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=global_step)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    epoch = 0
    while epoch < num_epochs:
      sess.run(training_init_op, {file_name_train: "train.ids"})
      train_loss = 0.0
      train_words = 0.0
      while True:
          try:
            _loss, _words, _global_step, _ = sess.run([loss, nr_words, global_step, updates])
            train_loss += np.sum(_loss)
            train_words += _words
           
            if _global_step % 10 == 0:
              train_loss /= train_words
              train_ppl = math.exp(train_loss)
              print("Training Step: {}. Training perplexity: {}. Epoch: {}".format(_global_step, train_ppl, epoch))
              train_loss = 0.0
              train_words = 0
                        
          except tf.errors.OutOfRangeError:
              # The end of one epoch
              epoch += 1
              break

Training Step: 10. Training perplexity: 3564.209138383194. Epoch: 0
Training Step: 20. Training perplexity: 47.700302967892846. Epoch: 0
Training Step: 30. Training perplexity: 42.71117369212458. Epoch: 0
Training Step: 40. Training perplexity: 41.85922497650299. Epoch: 0
Training Step: 50. Training perplexity: 38.845850039202546. Epoch: 0
Training Step: 60. Training perplexity: 33.34318157551967. Epoch: 0
Training Step: 70. Training perplexity: 29.341679391529752. Epoch: 0
Training Step: 80. Training perplexity: 23.362750225842493. Epoch: 0
Training Step: 90. Training perplexity: 19.618937179369283. Epoch: 0
Training Step: 100. Training perplexity: 16.336344330957093. Epoch: 0
Training Step: 110. Training perplexity: 15.603616100438296. Epoch: 0
Training Step: 120. Training perplexity: 13.576652404413844. Epoch: 0
Training Step: 130. Training perplexity: 13.337870211990573. Epoch: 0
Training Step: 140. Training perplexity: 13.349407399817748. Epoch: 0
Training Step: 150. Training perp

KeyboardInterrupt: ignored

In [0]:
### WORKING VERSION

#TODO: fix embedding, use lookup table (see Šiki's link)
# def embedding(batch, input=True):
#   if not input:
#     return  tf.matmul(batch, tf.transpose(output_embedding_mat))
#   else:
#     return  tf.matmul(batch, input_embedding_mat)

# input_embedded = tf.map_fn(embedding, input_batch)

#input_seq = tf.one_hot(input_seq, vocab_size)
  #output_seq = tf.one_hot(output_seq, vocab_size)
  
  
  
#input_batch = tf.one_hot(input_batch, vocab_size)

#             a = sess.run(input_embedded)
#             print(a.shape)
#             c = sess.run(logits)
#             print(c.shape)
#             d = sess.run(loss)
#             print(d.shape)
#             a = sess.run(preds)
#             print(a[0].shape)

#input_tensor = tf.convert_to_tensor(words_ids)
#input_data = tf.data.Dataset.from_tensors(input_tensor)