In [1]:
from numpy.random import seed

seed(1)
from tensorflow import set_random_seed

set_random_seed(2)
from util import read_data,read_data2, get_vocab, vectorize_sentences, indent_sentences
import numpy as np
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.python.layers.core import Dense



In [2]:
sess = tf.Session()

In [3]:
SENTENCE_START_TOKEN = "<START>"
SENTENCE_END_TOKEN = "<EOS>"
OOV_TOKEN = "<UNK>"
PAD_TOKEN = "<PAD>"

In [4]:
df = read_data()

In [5]:
# df = df[:1000]

In [6]:
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
df_test = df[~msk]

In [7]:
print("Train size: %s"%len(df_train))
print("Test size: %s"%len(df_test))

Train size: 128719
Test size: 32232


In [8]:
eng_vocab, rev_eng_vocab = get_vocab(df["english_sentences"], addtional_tokens=[PAD_TOKEN,OOV_TOKEN], top=15000)
heb_vocab, rev_heb_vocab = get_vocab(df["hebrew_sentences"],
                                         addtional_tokens=[PAD_TOKEN,OOV_TOKEN, SENTENCE_START_TOKEN, SENTENCE_END_TOKEN],
                                         top=30000)

100%|██████████| 160951/160951 [00:16<00:00, 9675.10it/s]


Original vocab size 20550


100%|██████████| 160951/160951 [00:16<00:00, 9885.04it/s] 


Original vocab size 55736


In [9]:
vect_eng_sentences = vectorize_sentences(df_train["english_sentences"], eng_vocab,encode=True,reverse=True)
decoder_input_data = vectorize_sentences(df_train["hebrew_sentences"], heb_vocab, add_prefix_token=SENTENCE_START_TOKEN,encode=True)
decoder_target_data = np.array([np.concatenate((x[1:], [heb_vocab[SENTENCE_END_TOKEN]]), axis=0) for x in decoder_input_data])

100%|██████████| 128719/128719 [00:12<00:00, 10300.76it/s]
100%|██████████| 128719/128719 [00:11<00:00, 10803.61it/s]


In [10]:
vocab_size_english = len(eng_vocab)
vocab_size_hebrew = len(heb_vocab)
max_encoder_seq_length = max([len(txt) for txt in vect_eng_sentences])
max_decoder_seq_length = max([len(txt) for txt in decoder_input_data])
emb_dim = 100
lstm_size = 128
batch_size = 100

In [11]:
def pad_sequences(sequence,pad_id,to_length):
    need_to_add = to_length-len(sequence)
    return np.concatenate((sequence,np.array([pad_id]*need_to_add)),axis=0)

In [12]:
vect_eng_sentences = np.array([pad_sequences(sentence,eng_vocab[PAD_TOKEN],max_encoder_seq_length) for sentence in vect_eng_sentences])
decoder_input_data = np.array([pad_sequences(sentence,heb_vocab[PAD_TOKEN],max_decoder_seq_length) for sentence in decoder_input_data])
decoder_target_data = np.array([pad_sequences(sentence,heb_vocab[PAD_TOKEN],max_decoder_seq_length) for sentence in decoder_target_data])

### inputs and outputs

In [13]:
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

In [14]:
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
decoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='decoder_inputs_length')

In [15]:
learning_rate = tf.placeholder(tf.float32, shape=[])

### embeddings

In [16]:
embedding_encoder = tf.get_variable("embedding_encoder", [vocab_size_english, emb_dim])

In [17]:
encoder_emb_inp = tf.nn.embedding_lookup(embedding_encoder, encoder_inputs)

In [18]:
embedding_decoder = tf.get_variable("embedding_decoder", [vocab_size_hebrew, emb_dim])

In [19]:
decoder_emb_inp = tf.nn.embedding_lookup(embedding_decoder, decoder_inputs)

In [20]:
lstm_initializer = tf.random_uniform_initializer(-0.08,0.08)
# lstm_initializer = None

### encoder

In [21]:
encoder_cell = tf.contrib.rnn.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(lstm_size,initializer=lstm_initializer) for _ in range(2)])
# encoder_cell = tf.nn.rnn_cell.LSTMCell(lstm_size,initializer=lstm_initializer)

In [22]:
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp,sequence_length=encoder_inputs_length,dtype=tf.float32, time_major=True)

### decoder

In [23]:
decoder_cell = tf.contrib.rnn.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(lstm_size,initializer=lstm_initializer) for _ in range(2)])
# decoder_cell = tf.nn.rnn_cell.LSTMCell(lstm_size,initializer=lstm_initializer)

In [24]:
decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    decoder_cell, decoder_emb_inp,initial_state=encoder_state,sequence_length=decoder_inputs_length,
    dtype=tf.float32, time_major=True, scope="plain_decoder",
)

In [25]:
decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size_hebrew)

In [26]:
decoder_prediction = tf.argmax(decoder_logits, 2)

### loss

In [27]:
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(decoder_targets, depth=vocab_size_hebrew, dtype=tf.float32),
                                                                 logits=decoder_logits)

loss = tf.reduce_mean(stepwise_cross_entropy)
optimizer = tf.train.AdamOptimizer()
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs]
train_op = optimizer.apply_gradients(capped_gvs)

### train

In [28]:
sess.run(tf.global_variables_initializer())

In [29]:
def get_batch(batch_size):
    number_of_batches = int(len(vect_eng_sentences)/batch_size)
    counter = 0
    while True:
        from_index = (counter%number_of_batches)*batch_size
        to_index = ((counter%number_of_batches)+1)*(batch_size)
        yield {encoder_inputs:vect_eng_sentences[from_index:to_index].T,
              decoder_inputs:decoder_input_data[from_index:to_index].T,
              decoder_targets:decoder_target_data[from_index:to_index].T,
              encoder_inputs_length:np.array([len([word for word in sequence if word!=0]) for sequence in vect_eng_sentences[from_index:to_index]],dtype=np.int32),
              decoder_inputs_length:np.array([len([word for word in sequence if word!=0]) for sequence in decoder_input_data[from_index:to_index]],dtype=np.int32)}
        counter += 1

In [30]:
def decode_sequence(sequence,rev_vocab):
    return " ".join([rev_vocab[int(index)] for index in sequence if rev_vocab[int(index)]!="<PAD>" and rev_vocab[int(index)]!="<EOS>"])

In [None]:
loss_track = []

In [None]:
max_batches = 100001
batches_per_epoch = int(len(vect_eng_sentences)/batch_size)
log_every_iterations = 50
print_samples_every_iterations = 250
feed_generator = get_batch(batch_size)
current_learning_rate = 0.1
try:
    for batch in range(max_batches):
        if batch%batches_per_epoch==0:
            print("Epoch %s"%(int(batch/batches_per_epoch)))
        fd = next(feed_generator)
#         fd[learning_rate] = current_learning_rate
        _, l = sess.run([train_op, loss], fd)
#         if len(loss_track)>0 and loss_track[-1]<=l and current_learning_rate>0.002:
#             current_learning_rate = current_learning_rate/2
#             print("Learning rate dropped to %s"%current_learning_rate)
        loss_track.append(l)
        if batch == 0 or batch % log_every_iterations == 0:
#             print('batch {}'.format(batch))
            print('  minibatch loss: {}'.format(l))
            if batch % print_samples_every_iterations == 0:
                predict_ = sess.run(decoder_prediction, fd)
                for i, (inp, pred,exp) in enumerate(zip(fd[encoder_inputs].T, predict_.T,fd[decoder_targets].T)):
                    print('  sample {}:'.format(i + 1))
                    print('    input     > {}'.format(decode_sequence(inp[::-1],rev_eng_vocab)))
                    print('    predicted_raw > {}'.format(pred))
                    print('    expected_raw > {}'.format(exp))
                    print('    predicted > {}'.format(decode_sequence(pred,rev_heb_vocab)))
                    print('    expected > {}'.format(decode_sequence(exp,rev_heb_vocab)))
                    if i >= 2:
                        break
                print()
except KeyboardInterrupt:
    print('training interrupted')

Epoch 0
  minibatch loss: 10.308975219726562
  sample 1:
    input     > let is try something .
    predicted_raw > [3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    expected_raw > [   94.  1243.    73.    23.     3.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.

 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    expected_raw > [  1.20000000e+01   9.03000000e+02   1.52200000e+03   1.90480000e+04
   2.35000000e+02   5.37000000e+02   4.00000000e+00   3.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   