If not using google colab, the following can be commented out

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install textworld
!pip install gym==0.21

Collecting textworld
  Downloading textworld-1.6.1.tar.gz (708 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.6/708.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tatsu>=5.8.3 (from textworld)
  Downloading TatSu-5.8.3-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hashids>=1.2.0 (from textworld)
  Downloading hashids-1.3.1-py2.py3-none-any.whl (6.6 kB)
Collecting jericho>=3.0.3 (from textworld)
  Downloading jericho-3.1.2.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?2

In [4]:
import torch
import torch.nn as nn
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import numpy as np
import random
import pandas as pd
import json
from heapq import nlargest
import time

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
import os
import subprocess
import matplotlib.pyplot as plt
import textworld.gym
from typing import Mapping, Any

In [6]:
TRAIN_SEED = 0
TEST_SEEDS = [42]

LEVEL = 5
QUEST_LENGTH = ((LEVEL - 1) % 100 + 1)  #horizon
REWARD_SCALE = 1000
HORIZON = 10 #2*QUEST_LENGTH

NUM_EPOCHS = 30 #50

request_infos = textworld.EnvInfos(
    admissible_commands=True,  # All commands relevant to the current state.
    entities=True,              # List of all interactable entities found in the game.
    facts=True
)

COLUMNS = ['Run #', 'Training Loss']
COLUMNS.extend([f'Testing Loss {i}' for i in TEST_SEEDS])
COLUMNS.append(f'Testing Loss {TRAIN_SEED}')
COLUMNS.extend([f'Test Seed {i} Score' for i in TEST_SEEDS])
COLUMNS.append(f'Test Seed {TRAIN_SEED} Score')

COLUMNS.append('Returns')
COLUMNS.append('Take Coin QValue')


## Make Coin Collector Game

In [7]:
def make_game(seed, path):
  if not os.path.exists(path):
    command = ['tw-make', 'tw-coin_collector', '--level', str(LEVEL), '--seed', str(seed), '--output', path]
    subprocess.run(command)

  env_id = textworld.gym.register_game(path, request_infos)
  env = textworld.gym.make(env_id)

  return env_id, env

In [8]:
TRAIN_ENV = make_game(TRAIN_SEED, f'tw_games/coin_collector_level{LEVEL}_{TRAIN_SEED}_game.ulx')
TEST_ENVS = [make_game(test_seed, f'tw_games/coin_collector_level{LEVEL}_{test_seed}_game.ulx') for test_seed in TEST_SEEDS]
TEST_ENVS.append(TRAIN_ENV)

## LSTM-DQN - Count Based
Counting paper
Text Embedding: Description + permissable actions

keep a count for the states as we visit them: state --> count dictionary

In [9]:
class Network(torch.nn.Module):
  def __init__(self, emb_size, action_space, hidden_dim):
    super(Network, self).__init__()

    self.emb = torch.nn.Embedding(emb_size, hidden_dim)
    self.lstm = torch.nn.LSTM(hidden_dim, hidden_dim, num_layers=1)
    self.lstm_hidden_clean = (torch.randn(1, hidden_dim), torch.randn(1, hidden_dim))

    self.dqn = torch.nn.Sequential(
        torch.nn.Linear(emb_size, hidden_dim),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_dim, action_space)
    )

  def forward(self, input_tensor):
    embeddings = self.emb(input_tensor)
    hidden = self.lstm_hidden_clean
    output, hidden = self.lstm(embeddings, hidden)
    state_rep = torch.mean(output, dim=1) #actions that can be performed from here + observations
    q_vals = self.dqn(state_rep)
    return q_vals


In [10]:
class TextAgent_Counter_Cumulative():
  def __init__(self, vocab_size, max_num_actions, bonus, im_decay, penalty=0, gamma=0.9, episodic=False):
    self.word_indices = {}
    self.last_word_idx = 0
    self.action_indices = {}
    self.idx_to_action = {}
    self.last_action_idx = 0
    self.model = Network(emb_size=vocab_size+max_num_actions, action_space=max_num_actions, hidden_dim=128)
    self.optimizer = torch.optim.Adam(self.model.parameters(), 0.003)
    self.vocab_size = vocab_size
    self.num_actions = max_num_actions
    self.loss_fcn = torch.nn.MSELoss()

    self.state_counts = {}
    self.bonus = bonus  #1.0 in the paper
    self.im_decay = im_decay  #-1/3 in the paper
    self.penalty = penalty
    self.prev_action = None
    self.prev_state = None
    self.current_state = None

    self.test_mode = False
    self.gamma = gamma

    self.episodic = episodic

  def set_test(self):
    self.test_mode = True
    self.model.eval()

  def set_train(self):
    self.test_mode = False
    self.model.train()

  def vectorize_observation(self, tokens):
    for token in tokens:
      if token not in self.word_indices.keys():
        if not self.test_mode:
          self.word_indices[token] = self.last_word_idx
          self.last_word_idx += 1
        else:
          token = "UNK"
    word_vector = [self.word_indices[token] for token in tokens if token in self.word_indices]
    indices = torch.tensor(word_vector)

    one_hot = torch.nn.functional.one_hot(indices, num_classes=self.vocab_size)
    token_vector = torch.sum(one_hot, dim=0)

    return token_vector

  # This is where "Bootstrapped Q-Learning with Context Relevant Observation Pruning to Generalize in Text-Based Games"
  # could be applied in future work.
  def tokenize_observation(self, observation):
    return word_tokenize(observation)

  def process_observation(self, observation):
    tokens = self.tokenize_observation(observation)
    vector = self.vectorize_observation(observation)
    return vector

  def process_actions(self, actions):
    for action in actions:
      if action not in self.action_indices.keys():
          self.action_indices[action] = self.last_action_idx
          self.idx_to_action[self.last_action_idx] = action
          self.last_action_idx += 1

    action_vec = [self.action_indices[action] for action in actions if action in self.action_indices]
    indices = torch.tensor(action_vec)

    one_hot = torch.zeros(self.num_actions, dtype=int)
    one_hot[indices] = 100

    return one_hot

  def get_qvals(self, obs, infos, done=False):
    action_string = ",".join(infos['admissible_commands'])

    obs_tensor = self.process_observation(obs)
    actions_tensor = self.process_actions(infos['admissible_commands'])

    input_tensor = torch.cat((obs_tensor,actions_tensor), 0)

    q_vals = None
    if self.test_mode:
      with torch.no_grad():
        q_vals = self.model(input_tensor)
    else:
      q_vals = self.model(input_tensor)



    #INTRINSIC REWARD:
    count_key = tuple(input_tensor.detach().numpy())
    self.current_state = count_key

    if not self.test_mode:

      if not self.episodic:
        if count_key not in self.state_counts:
          print("new state")
          self.state_counts[count_key] = 1

        self.state_counts[count_key] += 1     #visited this state
        intrinsic_reward = self.state_counts[count_key]**self.im_decay #+ self.bonus

        return (q_vals, intrinsic_reward)
      else:
        if count_key not in self.state_counts:
          print("new state")
          self.state_counts[count_key] = 1
        else:
          self.state_counts[count_key] = 0

        return (q_vals, self.state_counts[count_key])



    if count_key not in self.state_counts:

        return (q_vals, 0)

    if not self.episodic:
      intrinsic_reward = self.state_counts[count_key]**self.im_decay #+ self.bonus
      return (q_vals, intrinsic_reward)
    else:
      return (q_vals, self.state_counts[count_key])

  def get_max_qval(self, q_vals, infos):

    action_qvals = {}
    for action in infos['admissible_commands']:
      if action=="look" or action=="inventory":
        continue

      idx = self.action_indices[action]
      action_qval = q_vals[idx]

      action_qvals[action] = action_qval


    highest_qval = max(action_qvals.values())
    best_actions = [key for key, value in action_qvals.items() if value == highest_qval]
    selected_action = random.choice(best_actions)


    return selected_action, highest_qval


  def act(self, obs, infos, q_vals, eps_greedy=True):
    if eps_greedy:
      random = np.random.random()
      if random < 0.5:
        idx = np.random.randint(len(infos['admissible_commands']))
        action = infos['admissible_commands'][np.random.randint(len(infos['admissible_commands']))]
        q_idx = self.action_indices[action]
        return action, q_vals[q_idx]


    action, max_val = self.get_max_qval(q_vals, infos)


    return action, max_val

  def get_qval(self, action, q_vals):
    if action in self.action_indices:
      q_idx = self.action_indices[action]
      return q_vals[q_idx].detach().item()
    else:
      return 0

In [11]:
class TextAgent_PER(TextAgent_Counter_Cumulative):
  def __init__(self, vocab_size, max_num_actions, bonus=1, im_decay=-1.0, penalty=0, per_sample_frequency=0, gamma=0.9):
    super().__init__(vocab_size=vocab_size, max_num_actions=max_num_actions, bonus=bonus, im_decay=im_decay, penalty=penalty)

    self.history = []
    self.history_priority = {}
    self.per_sample_frequency = per_sample_frequency
    self.num_samples = 5  #can do batching
    self.sample_window = 3  #priority sequence experience replay

  def replay(self):
    if len(self.history) == 0:
      return

    experiences = []

    if len(self.history) < self.num_samples:
      experiences = self.history
    else:
      experience_indices = nlargest(self.num_samples, self.history_priority, key=self.history_priority.get)
      experiences = [(self.history[i], i) for i in experience_indices]

    target_qvals = []
    selected_qvals = []

    for ((obs, infos, score, action, next_obs, next_infos, next_score), i) in experiences:
      q_vals, ireward = self.get_qvals(obs, infos)
      next_q_vals, nireward = self.get_qvals(next_obs, next_infos)

      action_idx = self.action_indices[action]

      action_qval = q_vals[action_idx]
      pred_action, pred_qval = self.get_max_qval(q_vals, infos)

      target_qvals.append(action_qval)
      selected_qvals.append(pred_qval)

      reward = score*REWARD_SCALE + nireward

      self.history_priority[i] = action_qval-pred_qval

    return (selected_qvals, target_qvals)


In [12]:
class TextAgent_PSER(TextAgent_Counter_Cumulative):
  def __init__(self, vocab_size, max_num_actions, bonus, im_decay, penalty=0, per_sample_frequency=0, gamma=0.9):
    super().__init__(vocab_size=vocab_size, max_num_actions=max_num_actions, bonus=bonus, im_decay=im_decay, penalty=penalty)


    self.prev_action = None
    self.prev_state = None
    self.current_state = None

    self.history = []   #contains entire episodes
    self.history_priority = {}  #episode --> return
    self.per_sample_frequency = per_sample_frequency
    self.num_samples = 1  #can do batching
    self.sample_window = 3  #priority sequence experience replay


  def replay(self):


    if len(self.history) == 0:
      return

    experiences = []

    if len(self.history) < self.num_samples:
      experiences = self.history
    else:
      indices = np.arange(len(self.history))

      experience_indices = np.random.choice(indices, self.num_samples)
      experiences = [self.history[i] for i in experience_indices]

    target_qvals = []
    selected_qvals = []
    total_score = 0
    td_errors = []

    for experience in experiences:

      for (obs, infos, score, action, next_obs, next_infos) in experience:

        q_vals, ireward = self.get_qvals(obs, infos)
        pred_action, pred_q_val = self.act(obs, infos, q_vals)

        action_idx = self.action_indices[action]
        action_qval = q_vals[action_idx]

        next_q_vals, nireward = self.get_qvals(next_obs, next_infos)
        next_action, next_action_qval = self.get_max_qval(next_q_vals, infos)

        reward = score*REWARD_SCALE + nireward

        total_score+=reward

        target_qvals.append(action_qval)
        selected_qvals.append(pred_q_val)
        td_error = action_qval - pred_q_val
        td_errors.append(td_error)

      self.history_priority[i] = total_score

    return (selected_qvals, target_qvals, td_errors)

### Training

In [14]:
def test(agent, max_steps, env):
  # put model into eval
  agent.set_test()

  obs, infos = env.reset()

  done = False
  obs, score, done, infos = env.step('look')
  nb_moves = 0

  actions, observations = [], [obs]
  selected_qvals, target_qvals = [], []

  running_loss = 0.0
  while not done and nb_moves < max_steps:
    q_values, _ = agent.get_qvals(obs, infos, score)
    action, value = agent.get_max_qval(q_values, infos)

    obs, score, done, infos = env.step(action)

    next_q_values, _ = agent.get_qvals(obs, infos, score)
    _, next_value = agent.get_max_qval(next_q_values, infos)

    selected_qvals.append(value)
    target_qvals.append(score*REWARD_SCALE + agent.gamma * next_value)

    nb_moves += 1
    actions.append(action)
    observations.append(obs)

  agent.set_train()

  return 0, score


def conduct_tests(agent, max_steps):
  scores = []
  test_losses = []
  for env_id, env in TEST_ENVS:
    test_loss, score = test(agent, max_steps, env)
    scores.append(score)
    test_losses.append(score*REWARD_SCALE)
    break

  test_loss, score = test(agent, max_steps, TRAIN_ENV[1])
  scores.append(score)
  test_losses.append(score*REWARD_SCALE)

  return test_losses, scores

###### Train Vanilla

In [13]:

def train(env, name, epochs=100, horizon=40, vocab_size=10000, max_num_actions=1000, gamma=0.9, episodic=False):
  agent = TextAgent_Counter_Cumulative(vocab_size=vocab_size, max_num_actions=max_num_actions, bonus=1, im_decay=-1.0, penalty=0, episodic=episodic)
  training_data = []

  for episode in range(epochs):
    print("EPISODE #:", episode)
    # env = textworld.gym.make(env_id)
    obs, infos = env.reset()

    done = False
    score = 0
    obs, score, done, infos = env.step("look")    #don't want the first "here is your task for today" line
    nb_moves = 0
    actions_taken = []
    observations = [obs]
    state_infos = [infos]
    total_score = 0
    selected_qvals = []
    target_qvals = []

    q_vals, i_reward = agent.get_qvals(obs, infos)

    while nb_moves<horizon and not done:
        infos['admissible_commands'].remove('look')
        infos['admissible_commands'].remove('inventory')
        if 'examine coin' in infos['admissible_commands']:
            infos['admissible_commands'].remove('examine coin')


        action, action_qval = agent.act(obs, infos, q_vals, True)

        obs, score, done, infos = env.step(action)

        next_qvals, next_i_reward = agent.get_qvals(obs, infos, done)
        next_action, next_action_qval = agent.get_max_qval(next_qvals, infos)

        total_score += score*REWARD_SCALE+next_i_reward    #technically this game ends when score is 1 so not required

        selected_qvals.append(action_qval)
        target_qvals.append(next_i_reward+score*REWARD_SCALE+gamma*next_action_qval)

        nb_moves += 1
        state_infos.append(infos)
        actions_taken.append(action)
        observations.append(obs)

        agent.prev_state = agent.current_state
        agent.prev_action = action

        q_vals = next_qvals

    loss = agent.loss_fcn(torch.stack(selected_qvals), torch.stack(target_qvals))
    loss.backward()
    agent.optimizer.step()
    agent.optimizer.zero_grad()

    scores, losses = conduct_tests(agent, horizon)

    row = [name, loss.detach().item()]
    row.extend(scores)
    row.extend(losses)
    row.append(total_score)
    take_coin_val = agent.get_qval('take coin', q_vals)
    row.append(take_coin_val)

    training_data.append(row)

  return agent, training_data


  # creating Textworld environment & getting additional infos: https://github.com/microsoft/TextWorld/blob/main/notebooks/Playing%20text-based%20games%20with%20TextWorld.ipynb


###### Train PER

In [15]:
def train_per(env, name, epochs=100, horizon=40, vocab_size=10000, max_num_actions=1000, gamma=0.9):
  agent = TextAgent_PER(vocab_size=vocab_size, max_num_actions=max_num_actions, bonus=1, im_decay=-1.0, penalty=0, per_sample_frequency=5, gamma=gamma)
  move_index = 0
  training_data = []

  for episode in range(epochs):
    print("EPISODE #:", episode)
    obs, infos = env.reset()

    done = False
    score = 0
    obs, score, done, infos = env.step("look")    #don't want the first "here is your task for today" line
    nb_moves = 0
    actions_taken = []
    observations = [obs]
    state_infos = [infos]
    total_score = 0
    selected_qvals = []
    target_qvals = []

    q_vals, i_reward = agent.get_qvals(obs, infos, score)

    while nb_moves<horizon and not done:
        infos['admissible_commands'].remove('look')
        infos['admissible_commands'].remove('inventory')
        if 'examine coin' in infos['admissible_commands']:
            infos['admissible_commands'].remove('examine coin')

        action, action_qval = agent.act(obs, infos, q_vals)
        next_obs, next_score, done, next_infos = env.step(action)

        next_qvals, next_i_reward = agent.get_qvals(next_obs, next_infos, done)
        next_action, next_action_qval = agent.get_max_qval(next_qvals, next_infos)

        reward = next_i_reward+score*REWARD_SCALE

        selected_qvals.append(action_qval)
        target_qvals.append(reward+gamma*next_action_qval)


        experience = (obs, infos, score, action, next_obs, next_infos, next_score)
        agent.history_priority[len(agent.history)] = reward+gamma*next_action_qval-action_qval  #zero-indexed
        agent.history.append(experience)


        move_index += 1
        total_score += next_score*REWARD_SCALE+next_i_reward    #technically this game ends when score is 1 so not required
        nb_moves += 1
        state_infos.append(infos)
        actions_taken.append(action)
        observations.append(obs)

        agent.prev_state = agent.current_state
        agent.prev_action = action

        q_vals = next_qvals
        q_vals = next_qvals
        obs = next_obs
        infos = next_infos
        score = next_score


        if move_index%agent.per_sample_frequency == 0:
           selected, target = agent.replay()
           selected_qvals.extend(selected)
           target_qvals.extend(target)


    loss = agent.loss_fcn(torch.stack(selected_qvals), torch.stack(target_qvals))
    loss.backward()
    agent.optimizer.step()
    agent.optimizer.zero_grad()

    scores, losses = conduct_tests(agent, horizon)

    row = [name, loss.detach().item()]
    row.extend(scores)
    row.extend(losses)
    row.append(total_score)
    take_coin_val = agent.get_qval('take coin', q_vals)
    row.append(take_coin_val)

    training_data.append(row)

  return agent, training_data

###### Train PSER

In [16]:
def train_pser(env, name, epochs=100, horizon=40, vocab_size=10000, max_num_actions=1000, gamma=0.9):
  agent = TextAgent_PSER(vocab_size=vocab_size, max_num_actions=max_num_actions, bonus=1, im_decay=-1.0, penalty=0, per_sample_frequency=3, gamma=gamma)
  move_index = 0
  training_data = []
  episode = 0
  while episode < epochs:
    print("EPISODE #:", episode)
    selected_qvals = []
    target_qvals = []
    total_score = 0
    obs, infos = env.reset()
    done = False

    if move_index%agent.per_sample_frequency == 0 and episode>0 and len(agent.history) > 0:
      selected, target, errors = agent.replay()

      selected_tensor = torch.stack(selected)
      target_tensor = torch.stack(target)
      errors_tensor = torch.stack(errors)

      loss = agent.loss_fcn(selected_tensor, target_tensor)

      loss.backward()
      agent.optimizer.step()
      agent.optimizer.zero_grad()
      move_index+=1
      continue
    else:
      score = 0
      obs, score, done, infos = env.step("look")    #don't want the first "here is your task for today" line
      nb_moves = 0
      actions_taken = []
      observations = [obs]
      state_infos = [infos]


      q_vals, i_reward = agent.get_qvals(obs, infos)

      episode_experience = []
      while nb_moves<horizon and not done:
          infos['admissible_commands'].remove('look')
          infos['admissible_commands'].remove('inventory')
          if 'examine coin' in infos['admissible_commands']:
              infos['admissible_commands'].remove('examine coin')

          action, action_qval = agent.act(obs, infos, q_vals)

          next_obs, next_score, done, next_infos = env.step(action)

          next_qvals, next_i_reward = agent.get_qvals(next_obs, next_infos, done)
          next_action, next_action_qval = agent.get_max_qval(next_qvals, next_infos)

          reward = next_i_reward+score*REWARD_SCALE

          selected_qvals.append(action_qval)
          target_qvals.append(reward+gamma*next_action_qval)


          experience = (obs, infos, next_score, action, next_obs, next_infos)
          episode_experience.append(experience)


          nb_moves += 1
          move_index += 1
          state_infos.append(next_infos)
          actions_taken.append(action)
          observations.append(next_obs)
          total_score += next_score*REWARD_SCALE+next_i_reward    #technically this game ends when score is 1 so not required
          agent.prev_state = agent.current_state
          agent.prev_action = action

          q_vals = next_qvals
          obs = next_obs
          infos = next_infos
          score = next_score




      # only adding successful plays
      if score>0:
        agent.history.append(episode_experience)
        agent.history_priority[len(agent.history)-1] = total_score  #zero-indexed


    loss = agent.loss_fcn(torch.stack(selected_qvals), torch.stack(target_qvals))

    loss.backward()
    agent.optimizer.step()
    agent.optimizer.zero_grad()


    scores, losses = conduct_tests(agent, horizon)

    row = [name, loss.detach().item()]
    row.extend(scores)
    row.extend(losses)
    row.append(total_score)
    take_coin_val = agent.get_qval('take coin', q_vals)
    row.append(take_coin_val)

    training_data.append(row)
    episode+=1


  return agent, training_data




#### Level 5

Paths specified for saving data below should be modified (provided paths are applicable to our directory)

###### count_cumulative
- lr = 0.003
- gamma = 0.9
- num epochs = 10
- trained 10 times (10 diff agents basically)
- seed 0 = train
- seed 42 = test
- bonus = 1
- intrinsic decay = -1.0
- horizon = 30

In [26]:
print(TRAIN_ENV[0])
data_cumulative = []
for i in range(5):
  print("ITER: ", i)
  trained_agent, training_data = train(TRAIN_ENV[1], str(i), NUM_EPOCHS, HORIZON, 10000, 1000, 0.9)
  data_cumulative.extend(training_data)

tw-v0
ITER:  0
EPISODE #: 0
new state
new state
new state
new state
new state
new state
10003.166666666666
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 0 -0.022285297513008118
EPISODE #: 1
new state
2.3345238095238092
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 1 -7.634833812713623
EPISODE #: 2
1.1912448662448665
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 2 -16.974946975708008
EPISODE #: 3
10001.662337662337
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 3 -23.345218658447266
EPISODE #: 4
1.041812251216276
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 4 -9.088287353515625
EPISODE #: 5
1.0609057609057608
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0


In [27]:
df = pd.DataFrame(data_cumulative, columns=COLUMNS)
print(df)

df.to_csv("/content/drive/MyDrive/CS 394R - 2024/Data/cumulative_5_long.csv")
#a = df.to_csv()

    Run #  Training Loss  Testing Loss 42  Testing Loss 0  Test Seed 42 Score  \
0       0   1.428744e+07                0               0                   0   
1       0   6.835796e+00                0               0                   0   
2       0   4.655633e+00                0               0                   0   
3       0   1.438705e+07                0               0                   0   
4       0   2.567449e+01                0               0                   0   
..    ...            ...              ...             ...                 ...   
145     4   1.134413e+07                0            1000                   0   
146     4   6.095252e+06                0            1000                   0   
147     4   7.651730e+06                0            1000                   0   
148     4   5.939365e+06                0            1000                   0   
149     4   8.220956e+06                0            1000                   0   

     Test Seed 0 Score     

###### count_episodic

In [24]:
print(TRAIN_ENV[0])
data = []
for i in range(5):
  print("ITER: ", i)
  trained_agent, training_data = train(TRAIN_ENV[1], str(i), NUM_EPOCHS, HORIZON, 10000, 1000, 0.9, True)
  data.extend(training_data)

tw-v0
ITER:  0
EPISODE #: 0
new state
new state
new state
new state
3
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 0 0.01448189839720726
EPISODE #: 1
0
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 1 -1.0725001096725464
EPISODE #: 2
new state
1
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 2 0.14551924169063568
EPISODE #: 3
0
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 3 -0.02266911044716835
EPISODE #: 4
0
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 1
Testing:  tw-v0
Results for:  tw-v1 0 0
[1000, 0]
[1, 0]
Episode # 4 -0.07353532314300537
EPISODE #: 5
0
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 1
Testing:  tw-v0
Results for:  tw-v1 0 0
[1000, 0]
[1, 0]
Episode # 5 0.04824645444750786
EPISODE #: 6
0
Testing:
Tes

In [25]:
df2 = pd.DataFrame(data, columns=COLUMNS)
print(df2)

df2.to_csv("/content/drive/MyDrive/CS 394R - 2024/Data/episodic_5_long.csv")
#a = df.to_csv()

    Run #  Training Loss  Testing Loss 42  Testing Loss 0  Test Seed 42 Score  \
0       0       0.303428                0               0                   0   
1       0      53.052570                0               0                   0   
2       0       0.232364                0               0                   0   
3       0       0.696406                0               0                   0   
4       0       1.236478             1000               0                   1   
..    ...            ...              ...             ...                 ...   
145     4   10988.216797                0            1000                   0   
146     4    3897.392578                0            1000                   0   
147     4   11665.466797                0               0                   0   
148     4     131.191986                0               0                   0   
149     4      51.064827                0               0                   0   

     Test Seed 0 Score  Ret

<__main__.TextAgent_Counter_Cumulative object at 0x7a1f592fc610>


###### PER:

In [20]:
print(TRAIN_ENV[0])
per_data = []
for i in range(5):
  print("ITER: ", i)
  trained_agent, training_data = train_per(TRAIN_ENV[1], str(i), NUM_EPOCHS, HORIZON, 10000, 1000, 0.9)
  per_data.extend(training_data)

tw-v0
ITER:  0
EPISODE #: 0
new state
new state
new state
new state
REPLAY:
new state
new state
new state
new state
new state
new state
1003.1666666666666
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 0 0.04983143135905266
EPISODE #: 1
REPLAY:
new state
new state
REPLAY:
new state
2.501190476190476
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 1 6.6417083740234375
EPISODE #: 2
REPLAY:
1001.3277777777778
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 2 1.526358723640442
EPISODE #: 3
REPLAY:
REPLAY:
0.9050536881419234
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 3 0.41571325063705444
EPISODE #: 4
REPLAY:
REPLAY:
1000.9241452991454
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]

In [21]:
df4 = pd.DataFrame(per_data, columns=COLUMNS)
print(df4)

df4.to_csv("/content/drive/MyDrive/CS 394R - 2024/Data/per_5_long.csv")
#a = df.to_csv()

    Run #  Training Loss  Testing Loss 42  Testing Loss 0  Test Seed 42 Score  \
0       0       0.128082                0               0                   0   
1       0      20.726231                0               0                   0   
2       0       2.463628                0               0                   0   
3       0       1.065155                0               0                   0   
4       0       5.896615                0               0                   0   
..    ...            ...              ...             ...                 ...   
145     4       0.022060                0            1000                   0   
146     4       0.028152                0            1000                   0   
147     4       0.005789                0            1000                   0   
148     4       0.002736                0            1000                   0   
149     4       0.002800                0               0                   0   

     Test Seed 0 Score     

##### PSER

In [17]:
print(TRAIN_ENV[0])
pser_data = []
for i in range(5):
  print("ITER: ", i)
  trained_agent, training_data = train_pser(TRAIN_ENV[1], str(i), NUM_EPOCHS, HORIZON, 10000, 1000, 0.9)
  pser_data.extend(training_data)

tw-v0
ITER:  0
EPISODE #: 0
new state
new state
new state
new state
new state
3.833333333333334
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 0 -0.019253617152571678
EPISODE #: 1
new state
2.019047619047619
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 1 0.746414303779602
EPISODE #: 2
1.130963480963481
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 2 -0.11228758841753006
EPISODE #: 3
1.4537087912087912
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 3 1.3352429866790771
EPISODE #: 4
0.7965741559781807
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw-v0
Results for:  tw-v1 0 0
[0, 0]
[0, 0]
Episode # 4 0.14190752804279327
EPISODE #: 5
0.6408125932038975
Testing:
Testing:  tw-v1
Results for:  tw-v1 0 0
Testing:  tw

In [19]:
df3 = pd.DataFrame(pser_data, columns=COLUMNS)
#print(df2)

df3.to_csv("/content/drive/MyDrive/CS 394R - 2024/Data/pser_5_long.csv")
#a = df.to_csv()

### Agent with Episodic Bonus Demo:

In [58]:
try:
    trained_agent.set_test()
    done = False
    obs, infos  = TRAIN_ENV[1].reset()
    TRAIN_ENV[1].render()
    obs, score, done, infos = TRAIN_ENV[1].step('look')
    num_moves = 0
    while not done:
        TRAIN_ENV[1].render()
        q_values, _ = trained_agent.get_qvals(obs, infos, score)
        action, value = trained_agent.get_max_qval(q_values, infos)
        obs, score, done, infos = TRAIN_ENV[1].step(action)
        num_moves+=1
        time.sleep(3)

    TRAIN_ENV[1].render()  # Final message.
    trained_agent.set_train()
except KeyboardInterrupt:
    pass  # Quit the game.

print("Played {} steps, scoring {} points.".format(num_moves, score))




                    ________  ________  __    __  ________
                   |        \|        \|  \  |  \|        \
                    \$$$$$$$$| $$$$$$$$| $$  | $$ \$$$$$$$$
                      | $$   | $$__     \$$\/  $$   | $$
                      | $$   | $$  \     >$$  $$    | $$
                      | $$   | $$$$$    /  $$$$\    | $$
                      | $$   | $$_____ |  $$ \$$\   | $$
                      | $$   | $$     \| $$  | $$   | $$
                       \$$    \$$$$$$$$ \$$   \$$    \$$
              __       __   ______   _______   __        _______
             |  \  _  |  \ /      \ |       \ |  \      |       \
             | $$ / \ | $$|  $$$$$$\| $$$$$$$\| $$      | $$$$$$$\
             | $$/  $\| $$| $$  | $$| $$__| $$| $$      | $$  | $$
             | $$  $$$\ $$| $$  | $$| $$    $$| $$      | $$  | $$
             | $$ $$\$$\$$| $$  | $$| $$$$$$$\| $$      | $$  | $$
             | $$$$  \$$$$| $$__/ $$| $$  | $$| $$_____ | $$__/ $$
          