In [1]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM.
drive.mount('/content/gdrive/')
# Use an absolute path: the former relative "./gdrive/..." only resolves while the
# working directory is still /content, so re-running this cell after the first
# chdir failed. The absolute form is equivalent on a fresh kernel and re-runnable.
os.chdir("/content/gdrive/My Drive/NLU/Projects/project 1/rok/")
# Display the project folder content as the cell output.
os.listdir()

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


['utils.py',
 'load_embedding.py',
 'tf_utils.py',
 'data',
 'docs',
 '.ipynb_checkpoints',
 '__pycache__',
 'results',
 'index.ipynb']

In [2]:
from load_embedding import load_embedding
import utils
import tf_utils

import pickle
import numpy as np
from math import floor, ceil
import warnings
warnings.simplefilter("ignore")

import tensorflow as tf
print("tf_version:\t" + tf.__version__)

!pip install tensorboardcolab
from tensorboardcolab import TensorBoardColab

# Start TensorBoard for Colab; per the recorded cell output this prints a public ngrok link.
tbc = TensorBoardColab()
# Project-local logger (utils.Logger), used below via logger.append(label, value).
# NOTE(review): presumably writes under ./logs/ - confirm in utils.py.
logger = utils.Logger("./logs/")
# Project-local timer (utils.Timer); the experiment cells call timer.__enter__() / timer.__exit__() manually.
timer = utils.Timer()

tf_version:	1.13.1


Using TensorFlow backend.


Wait for 8 seconds...
TensorBoard link:
http://2eeeff66.ngrok.io


In [0]:
#------------------------------------------------------------------------------------------------------------------------------#
# DIRECTORIES
# DATA_DIR is prefixed to the raw sentence files and to the word-embedding file.
DATA_DIR = "./data/"
# RESULTS_DIR holds the cached vocabulary / datasets and the perplexity output files.
RESULTS_DIR = "./results/"
WORD_EMBEDDINGS_FILE = "wordembeddings-dim100.word2vec"
SENTENCES_TRAIN_FILE = "sentences.train"
SENTENCES_TEST_FILE = "sentences_test.txt"
SENTENCES_EVAL_FILE = "sentences.eval"
# Not referenced by the cells visible in this part of the notebook.
SENTENCES_CONTINUATION_FILE = "sentences.continuation"

#------------------------------------------------------------------------------------------------------------------------------#
# LANGUAGE MODEL PARAMETERS
EMBEDDING_DIM = 100  # number of columns of the embedding matrix
STATE_DIM = 512  # num_units of the LSTM cell and number of rows of the output projection
VOCABULARY_SIZE = 20000  # passed to utils.create_vocabulary; also the width of the logits
SENT_DIM = 30  # the graph is unrolled for SENT_DIM - 1 time steps

#------------------------------------------------------------------------------------------------------------------------------#
# RNN PARAMETERS
BATCH_SIZE = 64  # batch size of the tf.data pipelines
LEARNING_RATE = 0.001  # learning rate of the Adam optimizer
MAX_GRAD_NORM = 5.0  # clip_norm passed to clip_gradients_by_norm
NUM_EPOCHS = 1  # number of times the training dataset is repeated
KEEP_PROBS = 0.5  # keep probability used for the input, output and state dropout of the LSTM wrapper

#------------------------------------------------------------------------------------------------------------------------------#
# LOAD DATA
# True: read the cached vocabulary.pkl and X_*.npy from RESULTS_DIR; False: rebuild them from the raw files in DATA_DIR.
LOAD_DATA = True
# False: train a randomly initialised embedding matrix; True: use a non-trainable matrix filled by load_embedding().
LOAD_EMBEDDING = False

In [4]:
def _write_ids_file(path, dataset):
    """Write `dataset` to `path` as text: one row per line, ids separated by single spaces."""
    with open(path, "w") as f:
        for row in dataset:
            f.write(" ".join(str(token_id) for token_id in row) + "\n")


if not LOAD_DATA:
    # Rebuild the vocabulary and the id-encoded datasets from the raw sentence files.
    vocabulary, word_to_idx, idx_to_word = utils.create_vocabulary(DATA_DIR + SENTENCES_TRAIN_FILE, VOCABULARY_SIZE)
    X_train = utils.create_dataset(DATA_DIR + SENTENCES_TRAIN_FILE, word_to_idx)
    X_test = utils.create_dataset(DATA_DIR + SENTENCES_TEST_FILE, word_to_idx)
    X_eval = utils.create_dataset(DATA_DIR + SENTENCES_EVAL_FILE, word_to_idx)

    # Cache everything so that later runs can take the LOAD_DATA branch.
    with open(RESULTS_DIR + "vocabulary.pkl", "wb") as f:
        pickle.dump((vocabulary, word_to_idx, idx_to_word), f)

    # Text version of the datasets; the experiment cells read X_train.ids / X_eval.ids
    # through tf.data.TextLineDataset.
    for ids_file, dataset in (("X_train.ids", X_train), ("X_test.ids", X_test), ("X_eval.ids", X_eval)):
        _write_ids_file(RESULTS_DIR + ids_file, dataset)

    # Binary version of the datasets, reloaded by the LOAD_DATA branch.
    for npy_file, dataset in (("X_train.npy", X_train), ("X_test.npy", X_test), ("X_eval.npy", X_eval)):
        np.save(RESULTS_DIR + npy_file, dataset)

else:
    # Reuse the artefacts cached by a previous run of the branch above.
    with open(RESULTS_DIR + "vocabulary.pkl", "rb") as f:
        vocabulary, word_to_idx, idx_to_word = pickle.load(f)

    X_train = np.load(RESULTS_DIR + "X_train.npy")
    X_test = np.load(RESULTS_DIR + "X_test.npy")
    X_eval = np.load(RESULTS_DIR + "X_eval.npy")

# Number of sentences per split; used later to derive the number of batches.
num_train, num_test, num_eval = (split.shape[0] for split in (X_train, X_test, X_eval))

# Summary of what has been loaded.
logger.append("vocabulary:", len(vocabulary))
for label, dataset in (("X_train:", X_train), ("X_test:", X_test), ("X_eval:", X_eval)):
    logger.append(label, dataset.shape)
for label, token in (("<bos> idx", "<bos>"), ("<eos> idx", "<eos>"), ("<pad> idx", "<pad>"), ("<unk> idx", "<unk>")):
    logger.append(label, word_to_idx[token])
logger.append("DATA LOADED.")

vocabulary:                             20000          
X_train:                                (1969833, 30)  
X_test:                                 (10000, 30)    
X_eval:                                 (9846, 30)     
<bos> idx                               178            
<eos> idx                               179            
<pad> idx                               180            
<unk> idx                               181            
DATA LOADED.                            


In [0]:
# Start timing this experiment; timer.__exit__() is called once evaluation has finished.
timer.__enter__()
# Start from an empty default graph so the cell can be re-run.
tf.reset_default_graph()

with tf.name_scope("initialization"):
    # Fix the TensorFlow graph-level seed and the NumPy seed.
    tf.set_random_seed(12345)
    np.random.seed(12345)
    # Initializer handed to the tf.get_variable calls that create the weight matrices below.
    initializer = tf.contrib.layers.xavier_initializer()

with tf.name_scope("input"):
    with tf.name_scope("train_dataset"):
        # The path of the id file is fed when training_init_op is run.
        sentences_train_file_name = tf.placeholder(tf.string)
        # One text line per sentence -> parsed by tf_utils.parse_ids_file -> repeated -> batched.
        training_dataset = (
            tf.data.TextLineDataset(sentences_train_file_name)
            .map(tf_utils.parse_ids_file)
            .repeat(NUM_EPOCHS)
            .batch(BATCH_SIZE)
        )
        iterator = tf.data.Iterator.from_structure(
            training_dataset.output_types,
            training_dataset.output_shapes,
        )
        X_batch, y_batch = iterator.get_next()
        training_init_op = iterator.make_initializer(training_dataset)

    with tf.name_scope("evaluation_dataset"):
        # Same pipeline for the evaluation sentences, read once (no repeat).
        sentences_eval_file_name = tf.placeholder(tf.string)
        eval_dataset = (
            tf.data.TextLineDataset(sentences_eval_file_name)
            .map(tf_utils.parse_ids_file)
            .batch(BATCH_SIZE)
        )
        eval_iterator = tf.data.Iterator.from_structure(
            eval_dataset.output_types,
            eval_dataset.output_shapes,
        )
        X_eval_batch, y_eval_batch = eval_iterator.get_next()
        eval_init_op = eval_iterator.make_initializer(eval_dataset)


with tf.name_scope("weights"):
    with tf.name_scope("output_weight"):
        # Maps an LSTM output onto one logit per vocabulary entry.
        output_weight = tf.get_variable(
            "output_weight",
            shape=[STATE_DIM, VOCABULARY_SIZE],
            initializer=initializer,
            trainable=True,
        )  # 512x20000
    with tf.name_scope("embedding_weight"):
        if LOAD_EMBEDDING:
            # Frozen embedding matrix. collections=[] keeps it out of the global
            # variables collection; it is handed to load_embedding() in the session.
            embedding_weight = tf.Variable(
                np.empty((VOCABULARY_SIZE, EMBEDDING_DIM), dtype=np.float32),
                collections=[],
                trainable=False,
            )  # 20000x100
        else:
            # Embedding matrix learned from scratch.
            embedding_weight = tf.get_variable(
                "embedding_weight",
                shape=[VOCABULARY_SIZE, EMBEDDING_DIM],
                initializer=initializer,
                trainable=True,
            )  # 20000x100
    
with tf.name_scope("lstm_initialization"):
    # Bare LSTM cell that owns the recurrent weights. It is kept under its own name so
    # that evaluation can run it WITHOUT dropout (previously the dropout-wrapped cell was
    # also used for evaluation, so eval perplexities were computed with random units dropped).
    LSTM_EVAL = tf.nn.rnn_cell.BasicLSTMCell(num_units=STATE_DIM)
    with tf.name_scope("dropout"):
        # Training-time view of the same cell: dropout on inputs, outputs and state.
        # The wrapper delegates to LSTM_EVAL, so both share one set of weights.
        LSTM = tf.nn.rnn_cell.DropoutWrapper(LSTM_EVAL, input_keep_prob=KEEP_PROBS, output_keep_prob=KEEP_PROBS, 
                                             state_keep_prob=KEEP_PROBS)
        
    batch_size = tf.shape(X_batch)[0] # Adjust for last batch
    state_c, state_h = LSTM.zero_state(batch_size=batch_size, dtype=tf.float32) # 64x512



with tf.name_scope("training"):
  
    with tf.name_scope("embedding_lookup"):
        X_batch_embedded = tf.nn.embedding_lookup(embedding_weight, X_batch)  # 64x29x100
  
    losses = []
    probabilities = []

    # Unroll the LSTM by hand over the SENT_DIM - 1 time steps.
    for t in range(0, SENT_DIM - 1):
        X_t = X_batch_embedded[:, t, :]  # 64x100
        y_t = y_batch[:, t]  # 64
        
        with tf.name_scope("lstm_fp"):
            lstm_output, (state_c, state_h) = LSTM(inputs=X_t, state=(state_c, state_h))  # 64x512
            logits = tf.matmul(lstm_output, output_weight)  # 64x20000

        with tf.name_scope("loss"):
            # Per-sentence -log p(w_t | w_1:t-1)
            loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_t, logits=logits)  # 64
            losses.append(loss_t)
        
        with tf.name_scope("probability"):
            # p(w_t | w_1:t-1) recovered from the cross-entropy
            probability_t = tf.math.exp(-loss_t)
            probabilities.append(probability_t)
    
    with tf.name_scope("aggregate_losses"):
        losses = tf.stack(losses)  # 29x64 
        # Sum over the batch (axis=1), then average over the time steps.
        loss = tf.reduce_mean(tf.reduce_sum(losses,axis=1))  # 29x1 -> 1x1

        # Batch mean of the per-sentence perplexity exp(-1/n * sum_{t=1..n} log p(w_t|w_1:t-1)).
        # Computed on the training graph, i.e. with dropout active.
        perplexity = tf.reduce_mean(tf.exp(tf.reduce_mean(losses, axis=0)))

with tf.name_scope("optimize"):
    # Adam with gradients clipped to a global norm of MAX_GRAD_NORM.
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=MAX_GRAD_NORM)
    optimize_op = optimizer.minimize(loss)

with tf.name_scope("evaluation"):
    batch_size = tf.shape(X_eval_batch)[0]
    # Evaluation uses the bare cell: same weights as in training, no dropout.
    state_c, state_h = LSTM_EVAL.zero_state(batch_size=batch_size, dtype=tf.float32) # 64x512
    
    with tf.name_scope("embedding_lookup"):
        X_eval_batch_embedded = tf.nn.embedding_lookup(embedding_weight, X_eval_batch)  # 64x29x100
  
    eval_losses = []
    eval_probabilities = []

    for t in range(0, SENT_DIM - 1):
        X_eval_t = X_eval_batch_embedded[:, t, :]  # 64x100
        y_eval_t = y_eval_batch[:, t]  # 64
        
        with tf.name_scope("lstm_fp"):
            eval_lstm_output, (state_c, state_h) = LSTM_EVAL(inputs=X_eval_t, state=(state_c, state_h))  # 64x512
            eval_logits = tf.matmul(eval_lstm_output, output_weight)  # 64x20000

        with tf.name_scope("loss"):
            eval_loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_eval_t, logits=eval_logits)  # 64
            eval_losses.append(eval_loss_t)
        
        with tf.name_scope("probability"):
            eval_probability_t = tf.math.exp(-eval_loss_t)
            eval_probabilities.append(eval_probability_t)
    
    with tf.name_scope("aggregate_losses"):
        eval_losses = tf.stack(eval_losses)  # 29x64 
        eval_loss = tf.reduce_mean(tf.reduce_sum(eval_losses,axis=1))  # 29x1 -> 1x1

        # One perplexity per evaluation sentence: exp(-1/n * sum_{t=1..n} log p(w_t|w_1:t-1)).
        # NOTE(review): n is always SENT_DIM - 1 here, so every position contributes to the
        # mean; if sentences are padded, confirm whether <pad> targets should be masked out.
        eval_perplexity = tf.exp(tf.reduce_mean(eval_losses, axis=0))
  
with tf.Session() as session:
    # Initialize variables
    session.run(tf.global_variables_initializer())
    # Project helper; per the recorded output it lists the trainable variables and their sizes.
    tf_utils.trainable_parameters()
    
    # Load data
    session.run(training_init_op, {sentences_train_file_name: RESULTS_DIR + "X_train.ids"})
    
    # Load embedding
    if LOAD_EMBEDDING:
        load_embedding(session, word_to_idx, embedding_weight, DATA_DIR + WORD_EMBEDDINGS_FILE, EMBEDDING_DIM, VOCABULARY_SIZE)

        
    # Training
    # Hard cap on the number of optimisation steps: training stops once batch_count exceeds
    # this value even if the dataset is not exhausted (a full epoch is ceil(total_batch) batches).
    max_train_batches = 5000
    epoch = 0
    batch_count = 0
    total_batch = num_train / BATCH_SIZE
    while True:

        try:
            batch_loss, _, batch_perplexity = session.run([loss, optimize_op, perplexity])
            epoch = floor(batch_count / total_batch) + 1
            
            # Progress line every 100 batches.
            if batch_count % 100 == 0:
                print("epoch: {}/{:<6}batch: {:>5}/{:<10}loss = {:<13.2f}perp = {:<13.2f}".format(epoch, NUM_EPOCHS, 
                                                        batch_count + 1, ceil(total_batch), batch_loss, batch_perplexity))
            
            batch_count += 1
            
            if batch_count > max_train_batches:
                break
        except tf.errors.OutOfRangeError:
            # The training iterator is exhausted (all NUM_EPOCHS repetitions consumed).
            break

    # Evaluation     
    session.run(eval_init_op, {sentences_eval_file_name: RESULTS_DIR + "X_eval.ids"})
    
    batch_count = 0
    total_batch = num_eval / BATCH_SIZE
    
    # Collect the per-batch results in a list and join them once at the end; calling
    # np.append inside the loop re-copies the whole array on every batch.
    eval_perplexity_batches = []
    while True:

        try:
            batch_perplexity = session.run(eval_perplexity)
            eval_perplexity_batches.append(np.ravel(batch_perplexity))
            batch_count += 1
               
        except tf.errors.OutOfRangeError:
            # The evaluation iterator is exhausted.
            break

    # Flat array with one perplexity per evaluation sentence, in file order.
    if eval_perplexity_batches:
        eval_perplexities = np.concatenate(eval_perplexity_batches)
    else:
        eval_perplexities = np.array([], dtype=np.float32)
    
    
timer.__exit__()

# One perplexity per line, in the order of the evaluation sentences.
with open(RESULTS_DIR + "groupXX.perplexityA", "w") as f:
    for i in range(num_eval):
        f.write(str(eval_perplexities[i]) + "\n")

output_weight:0               (512, 20000)
embedding_weight:0            (20000, 100)
basic_lstm_cell/kernel:0      (612, 2048)
basic_lstm_cell/bias:0           (2048,)
num_parameters                  13495424
epoch: 1/1     batch:     1/30779     loss = 633.83       perp = 20001.68     
epoch: 1/1     batch:   101/30779     loss = 184.66       perp = 73.69        
epoch: 1/1     batch:   201/30779     loss = 182.44       perp = 53.26        
epoch: 1/1     batch:   301/30779     loss = 163.06       perp = 40.61        
epoch: 1/1     batch:   401/30779     loss = 137.13       perp = 24.38        
epoch: 1/1     batch:   501/30779     loss = 166.35       perp = 49.18        
epoch: 1/1     batch:   601/30779     loss = 165.96       perp = 53.30        
epoch: 1/1     batch:   701/30779     loss = 140.62       perp = 27.89        
epoch: 1/1     batch:   801/30779     loss = 139.33       perp = 26.84        
epoch: 1/1     batch:   901/30779     loss = 130.82       perp = 27.19        


In [5]:
# Experiment B overrides the configuration flag: the embedding matrix is loaded from the
# word-embedding file instead of being trained. Note that this rebinds the global flag
# for every cell executed afterwards.
LOAD_EMBEDDING = True

# Start timing this experiment; timer.__exit__() is called once evaluation has finished.
timer.__enter__()
# Start from an empty default graph so the cell can be re-run.
tf.reset_default_graph()

with tf.name_scope("initialization"):
    # Fix the TensorFlow graph-level seed and the NumPy seed.
    tf.set_random_seed(12345)
    np.random.seed(12345)
    # Initializer handed to the tf.get_variable calls that create the weight matrices below.
    initializer = tf.contrib.layers.xavier_initializer()

with tf.name_scope("input"):
    with tf.name_scope("train_dataset"):
        # The path of the id file is fed when training_init_op is run.
        sentences_train_file_name = tf.placeholder(tf.string)
        # One text line per sentence -> parsed by tf_utils.parse_ids_file -> repeated -> batched.
        training_dataset = (
            tf.data.TextLineDataset(sentences_train_file_name)
            .map(tf_utils.parse_ids_file)
            .repeat(NUM_EPOCHS)
            .batch(BATCH_SIZE)
        )
        iterator = tf.data.Iterator.from_structure(
            training_dataset.output_types,
            training_dataset.output_shapes,
        )
        X_batch, y_batch = iterator.get_next()
        training_init_op = iterator.make_initializer(training_dataset)

    with tf.name_scope("evaluation_dataset"):
        # Same pipeline for the evaluation sentences, read once (no repeat).
        sentences_eval_file_name = tf.placeholder(tf.string)
        eval_dataset = (
            tf.data.TextLineDataset(sentences_eval_file_name)
            .map(tf_utils.parse_ids_file)
            .batch(BATCH_SIZE)
        )
        eval_iterator = tf.data.Iterator.from_structure(
            eval_dataset.output_types,
            eval_dataset.output_shapes,
        )
        X_eval_batch, y_eval_batch = eval_iterator.get_next()
        eval_init_op = eval_iterator.make_initializer(eval_dataset)


with tf.name_scope("weights"):
    with tf.name_scope("output_weight"):
        # Maps an LSTM output onto one logit per vocabulary entry.
        output_weight = tf.get_variable(
            "output_weight",
            shape=[STATE_DIM, VOCABULARY_SIZE],
            initializer=initializer,
            trainable=True,
        )  # 512x20000
    with tf.name_scope("embedding_weight"):
        if LOAD_EMBEDDING:
            # Frozen embedding matrix. collections=[] keeps it out of the global
            # variables collection; it is handed to load_embedding() in the session.
            embedding_weight = tf.Variable(
                np.empty((VOCABULARY_SIZE, EMBEDDING_DIM), dtype=np.float32),
                collections=[],
                trainable=False,
            )  # 20000x100
        else:
            # Embedding matrix learned from scratch.
            embedding_weight = tf.get_variable(
                "embedding_weight",
                shape=[VOCABULARY_SIZE, EMBEDDING_DIM],
                initializer=initializer,
                trainable=True,
            )  # 20000x100
    
with tf.name_scope("lstm_initialization"):
    # Bare LSTM cell that owns the recurrent weights. It is kept under its own name so
    # that evaluation can run it WITHOUT dropout (previously the dropout-wrapped cell was
    # also used for evaluation, so eval perplexities were computed with random units dropped).
    LSTM_EVAL = tf.nn.rnn_cell.BasicLSTMCell(num_units=STATE_DIM)
    with tf.name_scope("dropout"):
        # Training-time view of the same cell: dropout on inputs, outputs and state.
        # The wrapper delegates to LSTM_EVAL, so both share one set of weights.
        LSTM = tf.nn.rnn_cell.DropoutWrapper(LSTM_EVAL, input_keep_prob=KEEP_PROBS, output_keep_prob=KEEP_PROBS, 
                                             state_keep_prob=KEEP_PROBS)
        
    batch_size = tf.shape(X_batch)[0] # Adjust for last batch
    state_c, state_h = LSTM.zero_state(batch_size=batch_size, dtype=tf.float32) # 64x512



with tf.name_scope("training"):
  
    with tf.name_scope("embedding_lookup"):
        X_batch_embedded = tf.nn.embedding_lookup(embedding_weight, X_batch)  # 64x29x100
  
    losses = []
    probabilities = []

    # Unroll the LSTM by hand over the SENT_DIM - 1 time steps.
    for t in range(0, SENT_DIM - 1):
        X_t = X_batch_embedded[:, t, :]  # 64x100
        y_t = y_batch[:, t]  # 64
        
        with tf.name_scope("lstm_fp"):
            lstm_output, (state_c, state_h) = LSTM(inputs=X_t, state=(state_c, state_h))  # 64x512
            logits = tf.matmul(lstm_output, output_weight)  # 64x20000

        with tf.name_scope("loss"):
            # Per-sentence -log p(w_t | w_1:t-1)
            loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_t, logits=logits)  # 64
            losses.append(loss_t)
        
        with tf.name_scope("probability"):
            # p(w_t | w_1:t-1) recovered from the cross-entropy
            probability_t = tf.math.exp(-loss_t)
            probabilities.append(probability_t)
    
    with tf.name_scope("aggregate_losses"):
        losses = tf.stack(losses)  # 29x64 
        # Sum over the batch (axis=1), then average over the time steps.
        loss = tf.reduce_mean(tf.reduce_sum(losses,axis=1))  # 29x1 -> 1x1

        # Batch mean of the per-sentence perplexity exp(-1/n * sum_{t=1..n} log p(w_t|w_1:t-1)).
        # Computed on the training graph, i.e. with dropout active.
        perplexity = tf.reduce_mean(tf.exp(tf.reduce_mean(losses, axis=0)))

with tf.name_scope("optimize"):
    # Adam with gradients clipped to a global norm of MAX_GRAD_NORM.
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=MAX_GRAD_NORM)
    optimize_op = optimizer.minimize(loss)

with tf.name_scope("evaluation"):
    batch_size = tf.shape(X_eval_batch)[0]
    # Evaluation uses the bare cell: same weights as in training, no dropout.
    state_c, state_h = LSTM_EVAL.zero_state(batch_size=batch_size, dtype=tf.float32) # 64x512
    
    with tf.name_scope("embedding_lookup"):
        X_eval_batch_embedded = tf.nn.embedding_lookup(embedding_weight, X_eval_batch)  # 64x29x100
  
    eval_losses = []
    eval_probabilities = []

    for t in range(0, SENT_DIM - 1):
        X_eval_t = X_eval_batch_embedded[:, t, :]  # 64x100
        y_eval_t = y_eval_batch[:, t]  # 64
        
        with tf.name_scope("lstm_fp"):
            eval_lstm_output, (state_c, state_h) = LSTM_EVAL(inputs=X_eval_t, state=(state_c, state_h))  # 64x512
            eval_logits = tf.matmul(eval_lstm_output, output_weight)  # 64x20000

        with tf.name_scope("loss"):
            eval_loss_t = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_eval_t, logits=eval_logits)  # 64
            eval_losses.append(eval_loss_t)
        
        with tf.name_scope("probability"):
            eval_probability_t = tf.math.exp(-eval_loss_t)
            eval_probabilities.append(eval_probability_t)
    
    with tf.name_scope("aggregate_losses"):
        eval_losses = tf.stack(eval_losses)  # 29x64 
        eval_loss = tf.reduce_mean(tf.reduce_sum(eval_losses,axis=1))  # 29x1 -> 1x1

        # One perplexity per evaluation sentence: exp(-1/n * sum_{t=1..n} log p(w_t|w_1:t-1)).
        # NOTE(review): n is always SENT_DIM - 1 here, so every position contributes to the
        # mean; if sentences are padded, confirm whether <pad> targets should be masked out.
        eval_perplexity = tf.exp(tf.reduce_mean(eval_losses, axis=0))
  
  
with tf.Session() as session:
    # Initialize variables
    session.run(tf.global_variables_initializer())
    # Project helper; per the recorded output it lists the trainable variables and their sizes.
    tf_utils.trainable_parameters()
    
    # Load data
    session.run(training_init_op, {sentences_train_file_name: RESULTS_DIR + "X_train.ids"})
    
    # Load embedding
    if LOAD_EMBEDDING:
        load_embedding(session, word_to_idx, embedding_weight, DATA_DIR + WORD_EMBEDDINGS_FILE, EMBEDDING_DIM, VOCABULARY_SIZE)

        
    # Training
    # Hard cap on the number of optimisation steps: training stops once batch_count exceeds
    # this value even if the dataset is not exhausted (a full epoch is ceil(total_batch) batches).
    max_train_batches = 1002
    epoch = 0
    batch_count = 0
    total_batch = num_train / BATCH_SIZE
    while True:

        try:
            batch_loss, _, batch_perplexity = session.run([loss, optimize_op, perplexity])
            epoch = floor(batch_count / total_batch) + 1
            
            # Progress line every 100 batches.
            if batch_count % 100 == 0:
                print("epoch: {}/{:<6}batch: {:>5}/{:<10}loss = {:<13.2f}perp = {:<13.2f}".format(epoch, NUM_EPOCHS, 
                                                        batch_count + 1, ceil(total_batch), batch_loss, batch_perplexity))
            
            batch_count += 1
            
            if batch_count > max_train_batches:
                break
        except tf.errors.OutOfRangeError:
            # The training iterator is exhausted (all NUM_EPOCHS repetitions consumed).
            break

    # Evaluation     
    session.run(eval_init_op, {sentences_eval_file_name: RESULTS_DIR + "X_eval.ids"})
    
    batch_count = 0
    total_batch = num_eval / BATCH_SIZE
    
    # Collect the per-batch results in a list and join them once at the end; calling
    # np.append inside the loop re-copies the whole array on every batch.
    eval_perplexity_batches = []
    while True:

        try:
            batch_perplexity = session.run(eval_perplexity)
            eval_perplexity_batches.append(np.ravel(batch_perplexity))
            batch_count += 1
               
        except tf.errors.OutOfRangeError:
            # The evaluation iterator is exhausted.
            break

    # Flat array with one perplexity per evaluation sentence, in file order.
    if eval_perplexity_batches:
        eval_perplexities = np.concatenate(eval_perplexity_batches)
    else:
        eval_perplexities = np.array([], dtype=np.float32)
    
    
timer.__exit__()

# One perplexity per line, in the order of the evaluation sentences.
with open(RESULTS_DIR + "groupXX.perplexityB", "w") as f:
    for i in range(num_eval):
        f.write(str(eval_perplexities[i]) + "\n")


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
output_weight:0               (512, 20000)
basic_lstm_cell/kernel:0      (612, 2048)
basic_lstm_cell/bias:0        (2048,)   
num_parameters                11495424  
Loading external embeddings from ./data/wordembeddings-dim100.word2vec
<bos> not in embedding file
<eos> not in embedding file
<pad> not in embedding file
<unk> not in embedding file
19996 words out of 20000 could be loaded
epoch: 1/