In [1]:
import json
import math
import time

import numpy as np
import tensorflow as tf

In [None]:
import os
from google.colab import drive
# Mount Google Drive and make the project folder the working directory, so the
# relative file names used by the other cells resolve inside it.
drive.mount('/content/gdrive/')
# Absolute target under the mount point: the previous relative "./gdrive/..."
# path only resolved while the working directory was still /content, so the
# cell could not be executed a second time in the same runtime.
os.chdir("/content/gdrive/My Drive/NLU/project1/")

In [None]:
# TensorBoardColab setup, currently disabled. The 'train' branch of the session
# cell calls tbc.get_deep_writers(...), so `tbc` has to be defined (for example
# by re-enabling the two lines below) before running with MODE = 'train'.
# from tensorboardcolab import *
# tbc = TensorBoardColab()

In [None]:
# Load the two vocabulary lookups from their JSON files.
# `word_to_id_dict` is handed to load_embedding(...) in the session cell;
# `inverse_vocab` is indexed with str(id) there to turn predicted ids into words.
# Context managers close the file handles as soon as parsing is done instead of
# leaving the open() objects for the garbage collector.
with open("dictionary_0index.txt") as vocab_file:
    word_to_id_dict = json.load(vocab_file)
with open("dictionary_inverse.txt") as inverse_vocab_file:
    inverse_vocab = json.load(inverse_vocab_file)

In [3]:
# Text-file names (not referenced by the other visible cells).
FILE_TRAIN, FILE_EVAL = "sentences.train", "sentences.eval"
FILE_TEST, FILE_SENTENCES = "sentences_test.txt", "sentences.continuation"

# `.ids` file names; the session cell feeds these to the model's
# file_name_* inputs when it runs the matching *_init_op.
INPUT_TRAIN, INPUT_EVAL = "train.ids", "eval.ids"
INPUT_TEST, INPUT_SENTENCES = "test.ids", "sentences.ids"

# Passed to load_embedding(...) when the experiment is "B".
EMBEDDING_FILE = "wordembeddings-dim100.word2vec"

In [5]:
# Configuration for experiment "A". Executing this cell replaces the values set
# by any other experiment-configuration cell that was run before it.
EXPERIMENT = "A"

# Sizes handed to model.Model(...) in the session cell.
VOCAB_SIZE, EMBED_SIZE, HIDDEN_UNITS = 20000, 100, 512

# Batch size and number of passes over the training data.
BATCH_SIZE, NUM_EPOCHS = 64, 1

In [228]:
# Configuration for experiment "B". With this label the session cell also calls
# load_embedding(...) with EMBEDDING_FILE. Executing this cell replaces the
# values set by any other experiment-configuration cell that was run before it.
EXPERIMENT = "B"

# Sizes handed to model.Model(...) in the session cell.
VOCAB_SIZE, EMBED_SIZE, HIDDEN_UNITS = 20000, 100, 512

# Batch size and number of passes over the training data.
BATCH_SIZE, NUM_EPOCHS = 64, 1

In [230]:
# Configuration for experiment "C": same values as the "A" and "B" cells except
# for HIDDEN_UNITS, which is 1024 here. Executing this cell replaces the values
# set by any other experiment-configuration cell that was run before it.
EXPERIMENT = "C"

# Sizes handed to model.Model(...) in the session cell.
VOCAB_SIZE, EMBED_SIZE, HIDDEN_UNITS = 20000, 100, 1024

# Batch size and number of passes over the training data.
BATCH_SIZE, NUM_EPOCHS = 64, 1

In [None]:
# Selects the branch executed by the session cell:
#   'train'            - initialise variables, train, then save a checkpoint
#   'perplexity_test'  - restore the checkpoint and write test perplexities
#   'generation'       - restore the checkpoint and write generated sentences
# Every value other than 'train' takes the restore path.
MODE = "generation"

In [21]:
# Session cell: builds the graph for EXPERIMENT and then, depending on MODE,
# trains and checkpoints the model ('train'), writes one perplexity value per
# evaluated batch ('perplexity_test'), or writes generated sentences
# ('generation').
tf.reset_default_graph()

# NOTE(review): this rebinds `model` from the object that provides `Model` to
# the created instance, so re-running this cell in the same kernel will most
# likely fail until `model` is imported again.
model = model.Model(batch_size=BATCH_SIZE, vocab_size=VOCAB_SIZE, hidden_units=HIDDEN_UNITS,
                    embed_size=EMBED_SIZE, num_epochs=NUM_EPOCHS, experiment=EXPERIMENT)

print("Number of trainable parameters: {}".format(count_trainable_parameters()))

saver = tf.train.Saver()
# One checkpoint per experiment label: written at the end of training and
# restored by the two non-training modes.
checkpoint_path = "model" + EXPERIMENT + ".ckpt"

with tf.Session() as sess:

    if MODE == 'train':
        # `tbc` is created by the TensorBoardColab setup cell, which is
        # currently commented out; it must be enabled before training.
        writer = tbc.get_deep_writers("./")
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        if model.experiment == "B":
            # Experiment B additionally calls load_embedding with EMBEDDING_FILE
            # (presumably to fill the input embedding matrix - confirm in its definition).
            load_embedding(sess, word_to_id_dict, model.input_embedding_mat, EMBEDDING_FILE, model.embed_size, model.vocab_size)
        epoch = 0

        summary_op = tf.summary.merge_all()

        while epoch < model.num_epochs:

            # Training pass: run the training init op on INPUT_TRAIN, then keep
            # stepping until tf.errors.OutOfRangeError ends the pass.
            sess.run(model.training_init_op, {model.file_name_train: INPUT_TRAIN})
            start = time.time()
            while True:
                try:
                    _loss, _global_step, _, summary = sess.run(
                        [model.loss, model.global_step, model.updates, summary_op])

                    writer.add_summary(summary, _global_step)

                    # Progress line every 200 global steps; `start` is reset so
                    # the reported time covers only the steps since the last line.
                    if _global_step % 200 == 0:
                        duration = time.time() - start
                        print("Batch: {}. Epoch: {} Loss: {} Time: {} seconds. ".format(_global_step, epoch, np.sum(_loss), duration))
                        start = time.time()

                except tf.errors.OutOfRangeError:
                    # The end of one epoch
                    epoch += 1
                    break

            # Validation pass on INPUT_EVAL, collecting one loss and one
            # perplexity value per sess.run call.
            sess.run(model.validation_init_op, {model.file_name_validation: INPUT_EVAL})
            eval_loss = []
            eval_ppl = []
            while True:
                try:
                    _eval_loss, _eval_ppl = sess.run([model.loss_, model.perplexity_])
                    eval_loss.append(np.sum(_eval_loss))
                    eval_ppl.append(_eval_ppl)

                except tf.errors.OutOfRangeError:
                    # `epoch` has already been incremented above, so the number
                    # printed here is one higher than in this pass's training lines.
                    print("Epoch: {} Avg eval loss per batch: {}. Avg eval ppl per batch: {} ".format(
                        epoch, np.mean(eval_loss), np.mean(eval_ppl)))
                    break

        # after training is done, save the model
        save_path = saver.save(sess, checkpoint_path)
        print("Model saved in path: %s" % save_path)

        writer.flush()

    else:

        saver.restore(sess, checkpoint_path)
        if model.experiment == "B":
            # NOTE(review): this runs after the restore; if load_embedding assigns
            # the embedding matrix it would replace the restored values - confirm
            # that this is intended.
            load_embedding(sess, word_to_id_dict, model.input_embedding_mat, EMBEDDING_FILE, model.embed_size, model.vocab_size)

        if MODE == 'perplexity_test':
            sess.run(model.test_init_op, {model.file_name_test: INPUT_TEST})
            with open('results' + EXPERIMENT + '.txt', 'w') as out_file:
                while True:
                    try:
                        test_ppl = sess.run(model.perplexity_)
                        out_file.write(str(test_ppl) + '\n')
                    except tf.errors.OutOfRangeError:
                        break

        elif MODE == 'generation':
            sess.run(model.test_init_op, {model.file_name_test: INPUT_SENTENCES})
            with open('sentence_generation' + EXPERIMENT + '.txt', 'w') as out_file:
                # The loop is nested inside the `with` block: it needs the open
                # file, and an un-indented loop here made the whole cell a
                # syntax error.
                while True:
                    try:
                        sentence = sess.run(model.preds)
                        # Map ids to words; [1:] skips the <bos> tag in the output file.
                        sentence = [inverse_vocab[str(i)] for i in sentence][1:]
                        out_file.write(' '.join(sentence) + '\n')
                    except tf.errors.OutOfRangeError:
                        break

## RESULTS

In [2]:
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [3]:
@interact
def results(name=['A', 'B', 'C']):
    """Summarise the values stored in ``results/results<name>.txt``.

    The file is read without a header row and its single column is labelled
    ``perplexity``. Mean, standard deviation and median are printed, one per
    line.

    Parameters
    ----------
    name : str
        Suffix inserted into the file name. The list default is what the
        ``@interact`` decorator renders as a dropdown of 'A', 'B' and 'C'.

    Returns
    -------
    pandas.Series
        ``describe()`` of the ``perplexity`` column.
    """
    frame = pd.read_csv('results/results' + name + '.txt', header=None)
    frame.columns = ['perplexity']
    ppl = frame['perplexity']

    # (label, value) pairs, printed in this order.
    summary_lines = (
        ('Mean: ', ppl.mean()),
        ('Standard deviation ', ppl.std()),
        ('Median ', ppl.quantile(0.50)),
    )
    for label, value in summary_lines:
        print(label + str(value))

    return ppl.describe()

interactive(children=(Dropdown(description='name', options=('A', 'B', 'C'), value='A'), Output()), _dom_classe…

### DIFFERENCES

In [39]:
@interact
def results(name=['A', 'B', 'C']):
    """Describe the absolute row-wise difference between two result files.

    Reads ``results/results<name>.txt`` and
    ``../rok/results/group23.perplexity<name>`` (both without a header row),
    places them side by side as columns ``metod`` and ``siki`` and adds a
    ``diff`` column holding ``|metod - siki|``.

    This function shares its name with the summary function defined in the
    RESULTS section, so defining it rebinds ``results``.

    Parameters
    ----------
    name : str
        Suffix inserted into both file names. The list default is what the
        ``@interact`` decorator renders as a dropdown of 'A', 'B' and 'C'.

    Returns
    -------
    pandas.Series
        ``describe()`` of the ``diff`` column.
    """
    own_path = 'results/results' + name + '.txt'
    other_path = '../rok/results/group23.perplexity' + name

    own_values = pd.read_csv(own_path, header=None)
    other_values = pd.read_csv(other_path, header=None)

    # Column-wise concat aligns the two files on their row index.
    side_by_side = pd.concat([own_values, other_values], axis=1)
    side_by_side.columns = ['metod', 'siki']
    side_by_side['diff'] = (side_by_side['metod'] - side_by_side['siki']).abs()

    return side_by_side['diff'].describe()

interactive(children=(Dropdown(description='name', options=('A', 'B', 'C'), value='A'), Output()), _dom_classe…