In [7]:
#import sys
#sys.path.insert(1, '../RTFM')
# go to RTFM main folder and run 'pip install -e .', then rtfm, model and core should be available as packages
from rtfm import featurizer as X
from rtfm import tasks # needed to make rtfm visible as Gym env
from core import environment # env wrapper


import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import copy

In [2]:
# import scripts from muzero-general folder (not installed as a ackage -> not into the python path as a default)
import sys
sys.path.insert(1, '../muzero-general')
import muzero, models

In [3]:
import importlib

# Test muzero components 

## Dynamical model

We need to:
- init the model from the game configurations
- load the weights of the trained model from a checkpoint inside the muzero folder 
- init a new game to use as a test simulator
- write an accuracy test suite for the dynamics function

In [4]:
# Load model checkpoint
checkpoint = "2021-03-11--15-51-55/model.checkpoint"
rel_path = "../muzero-general/results/rtfm_groups_simple_stationary/"
model_check = torch.load(rel_path+checkpoint)
model_check.keys()

dict_keys(['weights', 'optimizer_state', 'total_reward', 'muzero_reward', 'opponent_reward', 'episode_length', 'mean_value', 'training_step', 'lr', 'total_loss', 'value_loss', 'reward_loss', 'policy_loss', 'num_played_games', 'num_played_steps', 'num_reanalysed_games', 'terminate'])

In [13]:
# Get Game instance and configurations for the current environment
game_name="rtfm_groups_simple_stationary"
game_module = importlib.import_module("games." + game_name)
game = game_module.Game()
config = game_module.MuZeroConfig() 

In [57]:
# Use config to init the correct model, then load model's weights and make sure it runs on cpu
model = models.MuZeroNetwork(config)
model.load_state_dict(model_check['weights']) # this works only if we don't change architecture params in the meanwhile
#model = model.cpu()
model

SimpleMuZeroNLNetwork(
  (representation_network): DataParallel(
    (module): SimpleNLRepresentationNetwork(
      (emb): Embedding(262, 8, padding_idx=0)
    )
  )
  (dynamics_network): DataParallel(
    (module): DynamicsNetwork_v2(
      (conv): Conv2d(53, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (resblocks): ModuleList(
        (0): ResidualBlock(
          (conv1): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ResidualBlock(
          (conv1): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False

In [110]:
device = next(model.parameters()).device
support_size = int((model.full_support_size-1)/2)

In [58]:
def random_policy():
    return np.random.choice(5)

In [135]:
def get_1step_true_dynamics(game, action, model):
    simulator = copy.deepcopy(game) # avoid to change internal state of the game
    observation, reward, done = simulator.step(action)
    # render it as encoded state and support reward
    observation = torch.tensor(observation).unsqueeze(0).to(device)
    reward = torch.tensor([reward]).unsqueeze(0).to(device).float()
    support_size = int((model.full_support_size-1)/2)
    with torch.no_grad():
        encoded_state = model.representation(observation) # represenation or representation_network?
        reward_support = models.scalar_to_support(reward, support_size).view(1,-1)
    return encoded_state, reward_support, done

In [136]:
def get_1step_learned_dynamics(observation, action, model):
    # wrap obs and action in tensors and add batch size dimension
    obs = torch.tensor(observation).unsqueeze(0).to(device)
    action = torch.tensor([action]).unsqueeze(0).to(device)
    
    # initial encoded state
    with torch.no_grad():
        encoded_state = model.representation(obs)

        next_state, reward = model.dynamics(encoded_state, action)
    return next_state, reward

In [137]:
# Prepare Game for tests
observation = game.reset()
action = random_policy()
print("action: ", action)

action:  4


Note: DataParallel wrapper is giving me problems to load the model in the cpu and run it successfully. 

In [138]:
true_encoded_state, true_reward_support, done = get_1step_true_dynamics(game, action, model)
pred_encoded_state, pred_support_reward = get_1step_learned_dynamics(observation, action, model)

In [139]:
assert true_encoded_state.shape == pred_encoded_state.shape, "Mismatch in encoded states shapes"
assert true_reward_support.shape == pred_support_reward.shape, "Mismatch in reward support shapes"
print("State shape: ", true_encoded_state.shape)
print("Reward shape: ", true_reward_support.shape)

State shape:  torch.Size([1, 48, 5, 5])
Reward shape:  torch.Size([1, 3])


In [140]:
true_reward = models.support_to_scalar(true_reward_support, support_size)
true_reward

tensor([[-0.2526]], device='cuda:0')

In [141]:
pred_reward = models.support_to_scalar(pred_support_reward, support_size)
pred_reward

tensor([[-0.0058]], device='cuda:0')

In [142]:
F.mse_loss(true_encoded_state,pred_encoded_state)

tensor(0.1212, device='cuda:0')

In [143]:
true_encoded_state.abs().sum()

tensor(80.6629, device='cuda:0')

In [144]:
pred_encoded_state.abs().sum()

tensor(216.5389, device='cuda:0')

In [145]:
torch.abs(true_reward-pred_reward)

tensor([[0.2468]], device='cuda:0')

In [146]:
observation = game.reset()
done = False

action_history = []
true_reward_history = []
pred_reward_history = []
reward_err_history = []
true_state_history = []
pred_state_history = []

t = 1
while not done or t==100:
    action = random_policy()
    action_history.append(action)
    # compare dynamics
    true_encoded_state, true_support_reward, done = get_1step_true_dynamics(game, action, model)
    pred_encoded_state, pred_support_reward = get_1step_learned_dynamics(observation, action, model)
    # compute scalar rewards
    true_reward = models.support_to_scalar(true_support_reward, support_size)
    pred_reward = models.support_to_scalar(pred_support_reward, support_size)
    abs_err_reward = torch.abs(true_reward-pred_reward)
    # store everything
    true_reward_history.append(true_reward)
    pred_reward_history.append(pred_reward)
    reward_err_history.append(abs_err_reward)
    true_state_history.append(true_encoded_state)
    pred_state_history.append(pred_encoded_state)
    # execute step
    observation, reward, done = game.step(action)
    t += 1