In [75]:
from load_embedding import load_embedding
import utils
import tf_utils

import pickle
import numpy as np
from math import floor, ceil

import tensorflow as tf

print("tf_version:\t" + tf.__version__)

logger = utils.Logger("./logs/")
timer = utils.Timer()

tf_version:	1.10.0


In [66]:
#------------------------------------------------------------------------------------------------------------------------------#
# DIRECTORIES
DATA_DIR = "./data/"
RESULTS_DIR = "./results/"
WORD_EMBEDDINGS_FILE = "wordembeddings-dim100.word2vec"
SENTENCES_TRAIN_FILE = "sentences.train"
SENTENCES_TEST_FILE = "sentences_test.txt"
SENTENCES_EVAL_FILE = "sentences.eval"
SENTENCES_CONTINUATION_FILE = "sentences.continuation"

#------------------------------------------------------------------------------------------------------------------------------#
# LANGUAGE MODEL PARAMETERS
EMBEDDING_DIM = 100
STATE_DIM = 512
VOCABULARY_SIZE = 20000
SENT_DIM = 30

# RNN PARAMETERS
BATCH_SIZE = 64
LEARNING_RATE = 0.001
MAX_GRAD_NORM = 5.0
NUM_EPOCHS = 1

#------------------------------------------------------------------------------------------------------------------------------#
# LOAD DATA
LOAD_DATA = True
LOAD_EMBEDDING = False

if LOAD_DATA:
    with open(RESULTS_DIR + "vocabulary.pkl", "rb") as f:
        vocabulary, word_to_idx, idx_to_word = pickle.load(f)
        
    X_train = np.load(RESULTS_DIR + "X_train.npy")
    X_test = np.load(RESULTS_DIR + "X_test.npy")

else:
    vocabulary, word_to_idx, idx_to_word = utils.create_vocabulary(DATA_DIR + SENTENCES_TRAIN_FILE, VOCABULARY_SIZE)
    X_train = utils.create_dataset(DATA_DIR + SENTENCES_TRAIN_FILE, word_to_idx)
    X_test = utils.create_dataset(DATA_DIR + SENTENCES_TEST_FILE, word_to_idx)
    
    with open(RESULTS_DIR + "vocabulary.pkl", "wb") as f:
        pickle.dump((vocabulary, word_to_idx, idx_to_word), f)
    
    with open(RESULTS_DIR + "X_train.ids", "w") as f:
        for i in range(X_train.shape[0]):
            f.write(" ".join([str(x) for x in X_train[i, :]]) + "\n")
     
    with open(RESULTS_DIR + "X_test.ids", "w") as f:
        for i in range(X_test.shape[0]):
            f.write(" ".join([str(x) for x in X_test[i, :]]) + "\n")
    
    np.save(RESULTS_DIR + "X_train.npy", X_train)
    np.save(RESULTS_DIR + "X_test.npy", X_test)

num_train = X_train.shape[0]
num_test = X_test.shape[0]    
    
logger.append("vocabulary:", len(vocabulary))
logger.append("X_train:", X_train.shape)
logger.append("X_test:", X_test.shape)
logger.append("<bos> idx", word_to_idx["<bos>"])
logger.append("<eos> idx", word_to_idx["<eos>"])
logger.append("<pad> idx", word_to_idx["<pad>"])
logger.append("<unk> idx", word_to_idx["<unk>"])
logger.append("DATA LOADED.")

vocabulary:                             20000          
X_train:                                (1969833, 30)  
X_test:                                 (10000, 30)    
<bos> idx                               178            
<eos> idx                               179            
<pad> idx                               180            
<unk> idx                               181            
DATA LOADED.                            


In [110]:
# Initialize
np.random.seed(12345)
tf.reset_default_graph()
initializer = tf.contrib.layers.xavier_initializer()

# Dataset
sentences_file_name = tf.placeholder(tf.string)

training_dataset = tf.data.TextLineDataset(sentences_file_name).map(tf_utils.parse_ids_file).repeat(NUM_EPOCHS).batch(BATCH_SIZE)
iterator = tf.data.Iterator.from_structure(training_dataset.output_types, training_dataset.output_shapes)
X_batch, y_batch = iterator.get_next()
training_init_op = iterator.make_initializer(training_dataset)


# Weights
output_weight = tf.get_variable("output_weight", shape=[STATE_DIM, VOCABULARY_SIZE], 
                                initializer=initializer, trainable=True) # 512x20000
if not LOAD_EMBEDDING:
    embedding_weight = tf.get_variable("embedding_weight", shape=[VOCABULARY_SIZE, EMBEDDING_DIM], 
                                       initializer=initializer, trainable=True) # 20000x100
else:
    embedding_weight = tf.Variable(np.empty((VOCABULARY_SIZE, EMBEDDING_DIM), dtype=np.float32), collections=[])  # 20000x100
    
# LSTM initialization
batch_size = tf.shape(X_batch)[0] # Adjust for last batch
LSTM = tf.nn.rnn_cell.BasicLSTMCell(num_units=STATE_DIM)
state_c, state_h = LSTM.zero_state(batch_size=batch_size, dtype=tf.float32) # 64x512

X_batch_embedded = tf.nn.embedding_lookup(embedding_weight, X_batch)  # 64x29x100

# logger.append("X_batch:", X_batch.get_shape())  # 64x29
# logger.append("y_batch:", y_batch.get_shape())  # 64x29

# logger.append("output_weight:", output_weight.get_shape())  # 512x20000
# logger.append("embedding_weight:", embedding_weight.get_shape())  # 20000x100
# logger.append("state_c:", state_c.get_shape())  # 64x512
# logger.append("state_h:", state_h.get_shape())  # 64x512
# logger.append("X_batch_embedded:", X_batch_embedded.get_shape(), X_batch_embedded)  # 64x29x100

losses = []
probabilities = []

# RNN forward pass
for t in range(0, SENT_DIM - 1):
    X_t = X_batch_embedded[:, t, :]  # 64x100
    y_t = y_batch[:, t]  # 64x1
    
    lstm_output, (state_c, state_h) = LSTM(inputs=X_t, state=(state_c, state_h))  # 64x512
    logits = tf.matmul(lstm_output, output_weight)  # 64x20000
    
    loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_t, logits=logits)  # 64x1
    losses.append(loss_t)
    
    probability_t = tf.math.exp(-loss_t)
    probabilities.append(probability_t)
    
    # Same as sparse softmax
    # probabilities_t = tf.nn.softmax(logits=logits, axis=1) # 64x20000
    # probabilities_t = tf.reduce_sum(tf.multiply(probabilities_t, tf.one_hot(y_t, depth=VOCABULARY_SIZE,dtype=tf.float32)), axis=1)
    
    
losses = tf.stack(losses)  # 29x64 
loss = tf.reduce_mean(tf.reduce_sum(losses,axis=1))  # 29x1 -> 1x1

perplexity = tf.reduce_mean(tf.exp(tf.reduce_mean(losses, axis=0))) # exp(-1/n sum_t=1...n  -log p(w_t|w_1:t-1))

optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=MAX_GRAD_NORM)
optimize_op = optimizer.minimize(loss)


with tf.Session() as session:
    # Initialize variables
    session.run(tf.global_variables_initializer())
    
    # Load data
    session.run(training_init_op, {sentences_file_name: RESULTS_DIR + "X_test.ids"})
    
    # Load embedding
    if LOAD_EMBEDDING:
        load_embedding(session, word_to_idx, embedding_weight, DATA_DIR + WORD_EMBEDDINGS_FILE, EMBEDDING_DIM, VOCABULARY_SIZE)

    epoch = 0
    batch_count = 0
    total_batch = num_test / BATCH_SIZE
    while True:

        try:
            batch_loss, _, batch_perplexity = session.run([loss, optimize_op, perplexity])
            epoch = floor(batch_count / total_batch) + 1
            
            if batch_count % 5 == 0:
                print("epoch: {}/{:<10}batch: {}/{:<10}loss = {:<5}perplexity = {}".format(epoch, NUM_EPOCHS, 
                                                                         batch_count + 1, ceil(total_batch), batch_loss, batch_perplexity))
            
            batch_count += 1
        except tf.errors.OutOfRangeError:
            break


epoch: 1/1         batch: 1/157       loss = 633.8519287109375perplexity = 20008.978515625
epoch: 1/1         batch: 6/157       loss = 554.6332397460938perplexity = 5989.4970703125
epoch: 1/1         batch: 11/157       loss = 295.4862365722656perplexity = 285.9700927734375
epoch: 1/1         batch: 16/157       loss = 312.59356689453125perplexity = 943.1192626953125
epoch: 1/1         batch: 21/157       loss = 282.461669921875perplexity = 634.7378540039062
epoch: 1/1         batch: 26/157       loss = 267.4273376464844perplexity = 294.16619873046875
epoch: 1/1         batch: 31/157       loss = 284.5143127441406perplexity = 591.5167236328125


KeyboardInterrupt: 

In [None]:
#------------------------------------------------------------------------------------------------------------------------------#
# PARAMETERS
len_sents = X_train.shape[1]
num_train = X_train.shape[0]
num_test = X_test.shape[0]

#------------------------------------------------------------------------------------------------------------------------------#
tf.reset_default_graph()
    
# Initializer
initializer = tf.contrib.layers.xavier_initializer()

# Parameters
output_weight = tf.get_variable("output_weight", shape=[STATE_DIM, VOCABULARY_SIZE], 
                                initializer=initializer, trainable=True)

if not load_embedding:
    embedding_weight = tf.get_variable("embedding_weight", shape=[VOCABULARY_SIZE, EMBEDDING_DIM], 
                                       initializer=initializer, trainable=True)

# Placeholders
X = tf.placeholder(tf.int32, (None, sent_dim))

# LSTM initialization
LSTM = tf.nn.rnn_cell.BasicLSTMCell(num_units=STATE_DIM)
state_c, state_h = LSTM.zero_state(batch_size=BATCH_SIZE, dtype=tf.float32)

losses = []

# RNN forward pass
for t in range(0, sent_dim - 1):
    X_t = X[:, t]
    y_t = X[:, t+1] # 64x1
    
    X_t = tf.one_hot(X_t, depth = VOCABULARY_SIZE)
    E_t = tf.matmul(X_t, E)

    output, (state_c, state_h) = LSTM(inputs=E_t, state=(state_c, state_h))
    logits = tf.matmul(output, W)
    
    loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_t, logits=logits)
    
    losses.append(loss_t)
    
losses = tf.reduce_sum(tf.stack(losses),axis=1)
loss = tf.reduce_mean(losses)

optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=MAX_GRAD_NORM)
optimize_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()

#------------------------------------------------------------------------------------------------------------------------------#
# SESSION
np.random.seed(12345)

batches_per_epoch = 20

with tf.Session() as session:

    session.run(init)

    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        
#         print('epoch\t%4d' % epoch + 1)
        
        for idx in range(batches_per_epoch):
            batch_loss, _ = session.run([loss, optimize_op],
                                    feed_dict={X: X_test[(idx*BATCH_SIZE):((idx+1)*BATCH_SIZE)]}
                                    )
            epoch_loss += batch_loss
            print('\tbatch %4d\t%.2f' % (idx + 1, batch_loss))
        
#         if epoch + 1 % 2 == 0:
        print('epoch\t%4d\t%.2f' % (epoch + 1, epoch_loss / batches_per_epoch))

In [None]:
# PARAMETERS

BATCH_SIZE = 64
LEARNING_RATE = 0.001
MAX_GRAD_NORM = 5.0
NUM_EPOCHS = 50

EMBEDDING_DIM = 100
STATE_DIM = 512
VOCABULARY_SIZE = 20000

sent_dim = X_train.shape[1]
num_train = X_train.shape[0]
num_test = X_test.shape[0]

batch_per_epoch = floor(num_test / BATCH_SIZE)

# Session
tf.reset_default_graph()

# Initializer
initializer = tf.contrib.layers.xavier_initializer()

# Parameters
W = tf.get_variable("W", shape=[STATE_DIM, VOCABULARY_SIZE], initializer=initializer, trainable=True)
E = tf.get_variable("E", shape=[VOCABULARY_SIZE, EMBEDDING_DIM], initializer=initializer, trainable=True)

# Placeholders
X = tf.placeholder(tf.int32, (None, sent_dim))
dataset = tf.data.Dataset.from_tensor_slices(X).batch(batch_size).repeat()

iterator = dataset.make_initializable_iterator()
X_batch = iterator.get_next()

# LSTM initialization
LSTM = tf.nn.rnn_cell.BasicLSTMCell(num_units=STATE_DIM)
state_c, state_h = LSTM.zero_state(batch_size=batch_size, dtype=tf.float32)

losses = []

# RNN forward pass
for t in range(0, 5):
    X_t = X_batch[:, t]
    y_t = X_batch[:, t+1]
    
    X_t = tf.one_hot(X_t, depth = VOCABULARY_SIZE)
    E_t = tf.matmul(X_t, E)

    output, (state_c, state_h) = LSTM(inputs=E_t, state=(state_c, state_h))
    logits = tf.matmul(output, W)
    
    loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_t, logits=logits)
    
    losses.append(loss_t)
    
losses = tf.stack(losses)
losses = tf.reduce_sum(losses,axis=1)

loss = tf.reduce_mean(losses)

optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=MAX_GRAD_NORM)
optimize_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()

In [None]:
np.random.seed(12345)

# SESSION
session = tf.Session()

session.run(init)

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    
    for _ in range(batch_per_epoch):
        batch_loss, _ = sess.run([loss, train_op, loss])
        total_loss += batch_loss
        
        
#     train_loss, _ = session.run([loss, optimize_op],
#                             feed_dict={X: X_test[0:batch_size]}
#                             )

    if epoch + 1 % 1 == 0:
        print('Epoch %04d> training loss: %.2f' % (epoch, total_loss))
    
session.close()

In [3]:
embedding_weight = tf.Variable(np.empty((VOCABULARY_SIZE, EMBEDDING_DIM), dtype=np.float32), collections=[])

with tf.Session() as session:
    load_embedding(session, word_to_idx, embedding_weight, DATA_DIR + WORD_EMBEDDINGS_FILE, EMBEDDING_DIM, VOCABULARY_SIZE)
    print(embedding_weight.shape)

Loading external embeddings from ./data/wordembeddings-dim100.word2vec
<bos> not in embedding file
<eos> not in embedding file
<pad> not in embedding file
<unk> not in embedding file
19996 words out of 20000 could be loaded
(20000, 100)
