In [1]:
import tensorflow as tf
import numpy as np
import glob
import json
import pdb
import sys

sys.path.append("../")

In [2]:
# load embeddings

import textutils as tu

# Collect the TextWorld vocabulary: one entry per line of the vocab file.
# NOTE(review): the path is machine-specific; point it at your own checkout.
textworld_vocab = set()
with open('/home/mauriciogtec/Github/TextWorld/montecarlo/vocab.txt', 'r') as fn:
    for line in fn:
        # Strip only the line terminator. Slicing with `line[:-1]` would also
        # chop the last real character off a final line that has no trailing
        # newline.
        word = line.rstrip('\n')
        textworld_vocab.add(word)

# Load embeddings through the project helper, passing it the TextWorld
# vocabulary collected above. It returns the embedding array and a vocab object.
# NOTE(review): presumably these are GloVe vectors (directory is named
# glove.6B) restricted to `textworld_vocab` -- confirm in textutils.
embeddings, vocab = tu.load_embeddings(
    embeddingsdir="/home/mauriciogtec/glove.6B/",
    embedding_dim=200,  # try 50
    vocab=textworld_vocab)
# Seed NumPy's global RNG so the random subset below, and every later
# np.random call in the notebook, is reproducible.
np.random.seed(110104)
# Keep 128 randomly chosen indices out of 0..199 along axis 0 of `embeddings`.
# NOTE(review): the literal 200 mirrors `embedding_dim` above and must change
# with it; this also assumes axis 0 is the embedding dimension rather than the
# word axis -- confirm the layout returned by load_embeddings.
index = np.random.permutation(range(200))[:128]
embeddings = embeddings[index, :]

In [3]:
# load data files
# Paths are visited in reverse lexicographic order and at most `num_files`
# of them are read.
datafiles = glob.glob("../data/*.json")
datafiles.sort(reverse=True)

num_files = 50
cmdlist_list = []
memory_list = []
counts_list = []
rewards_list = []
# Slice rather than index by position: `datafiles[i]` for i in
# range(num_files) raises IndexError when fewer than `num_files` files exist.
for path in datafiles[:num_files]:
    with open(path, 'r') as fn:
        data_array = json.load(fn)
    # The file is fully parsed at this point, so the handle can be closed
    # before the records are unpacked into the four parallel lists.
    for d in data_array:
        rewards_list.append(d['reward'])
        cmdlist_list.append(d['cmdlist'])
        counts_list.append(d['counts'])
        memory_list.append(d['memory'])

# Shuffle the four parallel lists with one shared permutation so that entry k
# of each list still belongs to the same record. Uses NumPy's global RNG.
N = len(cmdlist_list)
idx = np.random.permutation(range(N))
cmdlist_list = [cmdlist_list[i] for i in idx]
memory_list = [memory_list[i] for i in idx]
counts_list = [counts_list[i] for i in idx]
rewards_list = [rewards_list[i] for i in idx]
print("number data points")
print(len(cmdlist_list))

number data points
94633


In [4]:
import attentionnetwork as nn

# Network to train, constructed from the embedding array and vocab.
model = nn.AlphaTextWorldNet(embeddings, vocab)

# Nadam optimizer; hyper-parameters are passed by keyword.
optim = tf.optimizers.Nadam(
    learning_rate=1e-5, beta_1=0.9, beta_2=0.98, clipnorm=30.0)

def train(memory, cmdlist, value, policy):
    """Run one gradient step on a single example and return its losses.

    Uses the module-level `model` and `optim`.

    Args:
        memory: first element of the input pair fed to the model.
        cmdlist: second element of the input pair fed to the model.
        value: target compared (squared error) with the model's first output.
        policy: weights multiplied with the log of the softmax of the model's
            second output.

    Returns:
        Tuple `(value_loss, policy_loss, loss)` where `loss` is the sum of
        the first two.
    """
    with tf.GradientTape() as tape:
        value_pred, policy_logits = model((memory, cmdlist), training=True)

        # Summed squared error between the target and the predicted value.
        value_loss = tf.math.reduce_sum(tf.square(value - value_pred))

        # Cross-entropy against `policy`; the 1e-12 keeps log() away from 0.
        policy_probs = tf.math.softmax(policy_logits)
        log_probs = tf.math.log(policy_probs + 1e-12)
        policy_loss = -tf.reduce_sum(log_probs * policy)

        loss = value_loss + policy_loss

    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optim.apply_gradients(zip(grads, trainable))

    return value_loss, policy_loss, loss

# Push a tiny dummy (memory, cmdlist) pair through the model before asking
# for a summary.
# NOTE(review): presumably needed so the model creates its weights before
# summary() is called -- confirm.
dummy_memory = ["this", "is a test"]
dummy_cmdlist = ["hi there", "how are you here"]
out = model((dummy_memory, dummy_cmdlist), training=True)
model.summary()

Model: "alpha_text_world_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embeddings (Embedding)       multiple                  2572672   
_________________________________________________________________
memory_encoder (SelfAttentio multiple                  54016     
_________________________________________________________________
cmd_encoder (SelfAttentionEn multiple                  37440     
_________________________________________________________________
att_encoder (AttentionEncode multiple                  33152     
_________________________________________________________________
value_time_encode (TimeSelfA multiple                  4225      
_________________________________________________________________
value_turn_encode (TimeSelfA multiple                  6561      
_________________________________________________________________
value_head (DenseHead)       multiple         

In [None]:
print_every = 10
iteration = 0
# Exponential moving averages of the losses, started from fixed initial values.
vloss_av = 1.0
ploss_av = 1.5
loss_av = 2.5
ema_rate = 0.01           # weight of the newest sample in the moving averages
first_train_index = 2000  # samples before this index are not trained on

msg = ("epoch: {}, iter: {}, vloss: {:.2f}, ploss: {:.2f}, loss: {:.2f}, "
       "vloss (av): {:.2f}, ploss (av): {:.2f}, loss (av): {:.2f}")

with tf.device('/gpu:0'):
    for epoch in range(5):
        for i in range(first_train_index, len(cmdlist_list)):
            cmdlist = cmdlist_list[i]
            memory = memory_list[i]
            value = rewards_list[i]
            counts = np.array(counts_list[i])

            # Skip samples with fewer than two commands or an empty memory.
            # The guard runs before anything touches `memory`: the previous
            # unguarded (and unused) `max(len(m) for m in memory)` raised
            # ValueError on an empty memory.
            if len(cmdlist) < 2 or len(memory) == 0:
                continue

            # Normalise counts into the target policy. A zero total would
            # give a NaN target, so such samples are skipped as well.
            total_counts = counts.sum()
            if total_counts <= 0:
                continue
            policy = counts / total_counts

            try:
                vloss, ploss, loss = train(memory, cmdlist, value, policy)
            except Exception as e:
                # Show the offending sample, then re-raise rather than
                # blocking on pdb.set_trace(), which hangs a non-interactive
                # run. Use the %debug magic for a post-mortem session.
                print(e)
                emsg = "memory = {}\n\nmemorylen={}\n\ncmdlist={}"
                print(emsg.format(memory, len(memory), cmdlist))
                raise

            # Convert once to Python floats for averaging and printing.
            vloss_val = vloss.numpy().item()
            ploss_val = ploss.numpy().item()
            loss_val = loss.numpy().item()

            vloss_av += ema_rate * (vloss_val - vloss_av)
            ploss_av += ema_rate * (ploss_val - ploss_av)
            loss_av += ema_rate * (loss_val - loss_av)

            if iteration % print_every == 0:
                print(msg.format(epoch, iteration,
                                 vloss_val, ploss_val, loss_val,
                                 vloss_av, ploss_av, loss_av))
            iteration += 1

epoch: 0, iter: 0, vloss: 0.32, ploss: 1.28, loss: 1.60, vloss (av): 0.99, ploss (av): 1.50, loss (av): 2.49
epoch: 0, iter: 10, vloss: 0.00, ploss: 1.62, loss: 1.62, vloss (av): 0.91, ploss (av): 1.52, loss (av): 2.43
epoch: 0, iter: 20, vloss: 0.32, ploss: 1.11, loss: 1.43, vloss (av): 0.84, ploss (av): 1.52, loss (av): 2.36
epoch: 0, iter: 30, vloss: 0.56, ploss: 1.08, loss: 1.64, vloss (av): 0.81, ploss (av): 1.53, loss (av): 2.34
epoch: 0, iter: 40, vloss: 0.10, ploss: 2.24, loss: 2.33, vloss (av): 0.74, ploss (av): 1.54, loss (av): 2.28
epoch: 0, iter: 50, vloss: 0.00, ploss: 2.02, loss: 2.02, vloss (av): 0.67, ploss (av): 1.60, loss (av): 2.27
epoch: 0, iter: 60, vloss: 0.71, ploss: 1.41, loss: 2.12, vloss (av): 0.65, ploss (av): 1.61, loss (av): 2.26
epoch: 0, iter: 70, vloss: 0.45, ploss: 1.27, loss: 1.72, vloss (av): 0.62, ploss (av): 1.62, loss (av): 2.24
epoch: 0, iter: 80, vloss: 0.04, ploss: 2.67, loss: 2.71, vloss (av): 0.58, ploss (av): 1.62, loss (av): 2.20
epoch: 0, i