# Hindsight Optimization MCTS

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import copy
# custom imports
import utils
import train
import mcts
from rtfm import featurizer as X
import os
from torch import multiprocessing as mp
import random

Using device cuda:0
Using device cuda:0


In [2]:
# Small settings, meant only to check that the main logic of the training loop works.

# --- tree search ---
ucb_C = 1.0
discount = 0.9  # try with smaller discount
max_actions = 20
num_simulations = 10
num_trees = 3

# --- root exploration noise and action sampling ---
dir_noise = False
dirichlet_alpha = 0.5  # no real reason to choose this value, except it's < 1
exploration_fraction = 0.25
temperature = 1.0

# --- rollout / training loop (most of these are not used by the cells visible below) ---
episode_length = 32
n_episodes = 4000
memory_size = 1024
batch_size = 32
n_steps = 5  # NOTE: later experiment cells rebind the global name n_steps
tau = 0.1  # new_trg_params = (1-tau)*old_trg_params + tau*value_net_params

# --- loss options ---
full_cross_entropy = True
entropy_bonus = True
entropy_weight = 0.01

# Non-stationary environment with stochastic transitions

When the monsters are allowed to move, their movements are stochastic, so the same agent action taken in the same state can lead to many different next states.

We want to adapt the simulators used inside MCTS so that they will have a different random seed than the one used by the real environment.

In [3]:
# Build the gym environment selected by the flags object.
flags = utils.Flags(env="rtfm:groups_simple-v0")
gym_env = utils.create_env(flags)
featurizer = X.Render()
# Simulator object used by every cell below; the notebook calls reset / step / render /
# save_state_dict / load_state_dict on it.
game_simulator = mcts.FullTrueSimulator(gym_env, featurizer)
# Passed to the rollout function further down (which does not read it in the code shown here).
object_ids = utils.get_object_ids_dict(game_simulator)

In [4]:
# Replay the same action several times from one saved simulator state and render
# every step, to see whether the transitions differ between realizations.
action = 0 # stay
n_realizations = 5
n_steps = 3  # NOTE: rebinds the global n_steps defined in the configuration cell
frame, valid_actions = game_simulator.reset()
original_sim_state = game_simulator.save_state_dict()
game_simulator.render()

# Simulate different realizations 
# -> the random seed is changing sequentially even if I reload the same internal state
for r in range(n_realizations):
    print("Realization number %d"%(r+1))
    # hand the simulator a deep copy so the saved snapshot itself is never passed in
    game_simulator.load_state_dict(copy.deepcopy(original_sim_state))
    for i in range(n_steps):
        frame, valid_actions, reward, done = game_simulator.step(action)
        game_simulator.render()


██████
█   @█
█ n  █
█y   █
█  !?█
██████

Realization number 1

██████
█   @█
█ n  █
█y !?█
█    █
██████


██████
█   @█
█ n ?█
█y  !█
█    █
██████


██████
█   ?█
█ n !█
█y   █
█    █
██████

Realization number 2

██████
█   @█
█ n  █
█y  ?█
█  ! █
██████


██████
█   @█
█ n ?█
█y ! █
█    █
██████


██████
█   ?█
█ n  █
█y  !█
█    █
██████

Realization number 3

██████
█   @█
█ n  █
█y  ?█
█   !█
██████


██████
█   @█
█ n ?█
█y   █
█   !█
██████


██████
█   ?█
█ n  █
█y  !█
█    █
██████

Realization number 4

██████
█   @█
█ n  █
█y   █
█  ?!█
██████


██████
█   @█
█ n  █
█y ?!█
█    █
██████


██████
█   @█
█ n  █
█y ! █
█  ? █
██████

Realization number 5

██████
█   @█
█ n  █
█y   █
█   ?█
██████


██████
█   @█
█ n  █
█y   █
█  !?█
██████


██████
█   @█
█ n  █
█y  ?█
█   !█
██████



### Check what is the random number generator used by the state

In [8]:
# Same experiment as before, but the global `random` module is re-seeded with the same
# value at the start of every realization.
action = 0 # stay
n_realizations = 5
n_steps = 2  # NOTE: rebinds the global n_steps again
frame, valid_actions = game_simulator.reset()
original_sim_state = game_simulator.save_state_dict()
game_simulator.render()

# Simulate different realizations, each one starting from the same saved state and the
# same `random` seed (see the comment below the output for what was observed)
for r in range(n_realizations):
    random.seed(23)
    print("Realization number %d"%(r+1))
    # hand the simulator a deep copy so the saved snapshot itself is never passed in
    game_simulator.load_state_dict(copy.deepcopy(original_sim_state))
    for i in range(n_steps):
        frame, valid_actions, reward, done = game_simulator.step(action)
        game_simulator.render()


██████
█ !@ █
█  y █
█ n? █
█    █
██████

Realization number 1

██████
█  ! █
█  ? █
█ n  █
█    █
██████


██████
█    █
█  ! █
█ n  █
█    █
██████

Realization number 2

██████
█  ! █
█  ? █
█ n  █
█    █
██████


██████
█  ! █
█  y █
█ n? █
█    █
██████

Realization number 3

██████
█  ! █
█  ? █
█ n  █
█    █
██████


██████
█    █
█  ? █
█ n  █
█    █
██████

Realization number 4

██████
█  ! █
█  ? █
█ n  █
█    █
██████


██████
█  ! █
█  y █
█ n? █
█    █
██████

Realization number 5

██████
█  ! █
█  ? █
█ n  █
█    █
██████


██████
█    █
█  ! █
█ n  █
█    █
██████



Comment: fixing the random seed of the random library makes the first stochastic transition equal for all, but then the second transition is different for each realization.

### Realizing this in multiprocessing

In [23]:
# helper that repeats one action for a fixed number of steps and renders each frame
def play_realization(
    game_simulator,
    process_number,
    action=0,
    n_steps=2,
    seed=None
):
    """
    Apply 'action' to 'game_simulator' for 'n_steps' steps, rendering after each step.

    If 'seed' is not None the global `random` module is seeded with it first.
    Before stepping, the function sleeps for 'process_number' seconds, which staggers
    the printed output when several processes run it at the same time.
    Returns None; the values returned by `step` are not used.
    """
    if seed is not None:
        random.seed(seed)
    time.sleep(process_number)
    for frame_number in range(1, n_steps + 1):
        # unpack to keep the same 4-tuple expectation on step(); the values are unused
        _frame, _valid_actions, _reward, _done = game_simulator.step(action)
        print("Process number %d - frame num. %d"%(process_number, frame_number))
        game_simulator.render()

In [27]:
# trying to do the same thing with torch multiprocessing:
# start one process per realization, each stepping the simulator it receives as argument
frame, valid_actions = game_simulator.reset()
game_simulator.render()

processes = []
# explicitly request the "fork" start method for the child processes
ctx = mp.get_context("fork")

for i in range(n_realizations):
    process = ctx.Process(
        target=play_realization,
        # (game_simulator, process_number, action, n_steps); seed keeps its default (None)
        args=(
            game_simulator,
            i,
            0,
            2
        ),
    )
    process.start()
    processes.append(process)
# wait for every child to finish before the cell returns
for process in processes:
    process.join()


██████
█ y?n█
█!   █
█    █
█   @█
██████

Process number 0 - frame num. 1

██████
█ y ?█
█ !  █
█    █
█   @█
██████

Process number 0 - frame num. 2

██████
█ y n█
█   ?█
█ !  █
█   @█
██████

Process number 1 - frame num. 1

██████
█ y n█
█ !? █
█    █
█   @█
██████

Process number 1 - frame num. 2

██████
█ y n█
█  !?█
█    █
█   @█
██████

Process number 2 - frame num. 1

██████
█ y ?█
█!   █
█    █
█   @█
██████

Process number 2 - frame num. 2

██████
█ y n█
█   ?█
█!   █
█   @█
██████

Process number 3 - frame num. 1

██████
█ y n█
█  ? █
█!   █
█   @█
██████

Process number 3 - frame num. 2

██████
█ y n█
█    █
█! ? █
█   @█
██████

Process number 4 - frame num. 1

██████
█ y n█
█  ? █
█!   █
█   @█
██████

Process number 4 - frame num. 2

██████
█ y n█
█    █
█ !? █
█   @█
██████



### How to run the MCTS code in multiprocessing

- The interaction with the "real" environment happens on the main thread
- At every decision step we call an mcts_step function, which has to return the selected action and the probability distribution over all possible actions (so that we can train the policy network with it if we want to)
- Inside the mcts_step we have to create as many processes as the number of trees that we want to grow in parallel
- Each process takes as input the usual arguments of the MCTS initialization plus those of tree.run, and returns the vector of Q values and visit counts at the root. We can use pipes for that, as I did in the Batched A2C (https://github.com/nicoladainese96/RelationalDeepRL/blob/dev/Utils/batched_A2C_training.py)
- Similarly, we could also use a buffer like in IMPALA (basically a list of shared-memory tensors, indexed by process id, that is passed to all processes); this avoids explicit communication between the worker processes and the main one through pipes or queues, but requires the use of tensors because they are the only "standard" class that supports shared memory (not a big deal).

In [42]:
ACTION_SPACE = 5 

def dummy_MCTS_process_pipes(
    worker_end_pipe,
    num_simulations
):
    """
    Dummy worker: sends fake root statistics through 'worker_end_pipe'.

    Stands in for the values one would read from the root node after running MCTS:
    - Qs: ACTION_SPACE uniform random numbers in [0, 1)
    - Ns: 'num_simulations' split evenly among the actions, with the remainder
      added to the last action (so Ns.sum() == num_simulations)

    The pair (Qs, Ns) is sent with worker_end_pipe.send; nothing is returned.
    """
    # Use a generator seeded from fresh OS entropy instead of the global NumPy state:
    # forked worker processes inherit the parent's global state, so np.random.rand would
    # produce the same Qs in every worker.
    rng = np.random.default_rng()
    Qs = rng.random(ACTION_SPACE)

    Ns = np.full(ACTION_SPACE, num_simulations // ACTION_SPACE)
    Ns[-1] += num_simulations % ACTION_SPACE

    worker_end_pipe.send((Qs, Ns))

In [47]:
def dummy_master_MCTS_pipes(
    num_trees=3,
    num_simulations=10,
):
    """
    Start 'num_trees' dummy worker processes, collect the (Qs, Ns) pair each one sends
    through its pipe and return the element-wise means over the workers.

    Returns
    -------
    mean_Qs, mean_Ns : arrays obtained by averaging the workers' Qs and Ns along axis 0

    Raises
    ------
    ValueError   if num_trees < 1
    RuntimeError if a worker exits without sending its result
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1, got %r" % (num_trees,))

    # One pipe per worker: the master keeps the first end, the worker gets the second
    pipes = [mp.Pipe() for _ in range(num_trees)]
    master_ends = [master_end for master_end, _ in pipes]

    workers = []
    results = []
    try:
        for _, worker_end in pipes:
            p = mp.Process(target=dummy_MCTS_process_pipes,
                           args=(worker_end, num_simulations))
            p.start()
            # The child owns its own handle now. Dropping the parent's copy lets recv()
            # raise EOFError (instead of blocking forever) if the worker dies before sending.
            worker_end.close()
            workers.append(p)

        # Results must be received before joining the processes
        for tree_id, master_end in enumerate(master_ends):
            try:
                results.append(master_end.recv())
            except EOFError as err:
                raise RuntimeError(
                    "worker %d exited without sending its result" % tree_id
                ) from err
    finally:
        # Make sure all processes are closed before proceeding, even on failure
        for p in workers:
            p.join()
        for master_end in master_ends:
            master_end.close()

    # Separate Qs and Ns and make 2 arrays of shape (num_trees, n_actions)
    Qs_realizations = np.concatenate([Qs.reshape(1,-1) for Qs, _ in results])
    print("Qs_realizations: ", Qs_realizations.shape)

    Ns_realizations = np.concatenate([Ns.reshape(1,-1) for _, Ns in results])
    print("Ns_realizations: ", Ns_realizations)

    mean_Qs = Qs_realizations.mean(axis=0) # average among different trees / realizations
    mean_Ns = Ns_realizations.mean(axis=0)

    return mean_Qs, mean_Ns

dummy_master_MCTS_pipes()

Qs_realizations:  (3, 5)
Ns_realizations:  [[2 2 2 2 2]
 [2 2 2 2 2]
 [2 2 2 2 2]]


(array([0.33261673, 0.76911496, 0.45858311, 0.21947536, 0.8300822 ]),
 array([2., 2., 2., 2., 2.]))

### First real attempt

In [4]:
def PV_MCTS_process_pipes(
    worker_end,
    frame, 
    env, 
    valid_actions, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations,
    dir_noise, 
    dirichlet_alpha, 
    exploration_fraction,
    ucb_method="AlphaGo",
    mode="predict",
    debug_render=False
):
    """
    Worker entry point: build one PV-MCTS tree rooted at 'frame', run 'num_simulations'
    simulations on it and send the root statistics to the master process.

    The tuple sent through 'worker_end' is (Qs, Ns), where Qs are the root's Q-values
    (computed with 'discount' and converted to a numpy array) and Ns are the visit
    counts of the root's children. Both are also printed. Nothing is returned.
    """
    search_tree = mcts.PV_MCTS(
        frame,
        env,
        valid_actions,
        ucb_C,
        discount,
        max_actions,
        pv_net,
        render=debug_render,
        ucb_method=ucb_method,
    )

    run_options = dict(
        mode=mode,
        dir_noise=dir_noise,
        dirichlet_alpha=dirichlet_alpha,
        exploration_fraction=exploration_fraction,
    )
    # the second value returned by run() is not needed here
    root_node, _ = search_tree.run(num_simulations, **run_options)

    # read the statistics of the root at the end of the run and hand them to the master
    q_values = root_node.get_Q_values(discount).cpu().numpy()
    visit_counts = root_node.get_children_visit_counts()
    print("Qs: ", q_values)
    print("Ns: ", visit_counts)
    worker_end.send((q_values, visit_counts))

In [5]:
def hop_pv_mcts_step(
    frame, 
    env, 
    valid_actions, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations, 
    num_trees,
    temperature=0.0,
    dir_noise=True, 
    dirichlet_alpha=1., 
    exploration_fraction=0.25,
    ucb_method="AlphaGo",
    mode="predict",
    debug_render=False, 
):
    """
    Executes one step of Hindsight Optimization Policy&Value MCTS.
    
    Hindsight Optimization estimates Q values of many determinizations of a stochastic process through a 
    deterministic (in terms of transitions of the environment) planning algorithm in order to find an 
    approximation of the real Q values.  
    
    One worker process per tree is started with PV_MCTS_process_pipes as target; each worker sends back 
    its root Q values and visit counts through a pipe.
    
    Returns the mean Q values and visit counts, obtained by averaging those quantities along 'num_trees' 
    parallel simulations.
    
    Notes
    -----
    - 'temperature' is accepted for signature compatibility but is not used inside this function.
    - Raises ValueError if num_trees < 1 and RuntimeError if a worker exits without sending its result.
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1, got %r" % (num_trees,))
    
    pv_net.share_memory()
    
    # Init num_trees pair of pipes: the master keeps the first end, the worker gets the second
    pipes = [mp.Pipe() for _ in range(num_trees)]
    master_ends = [master_end for master_end, _ in pipes]
    
    workers = []
    results = []
    try:
        for _, worker_end in pipes:
            p = mp.Process(target=PV_MCTS_process_pipes,
                           args=(
                                worker_end,
                                frame, 
                                env, 
                                valid_actions, 
                                ucb_C, 
                                discount, 
                                max_actions, 
                                pv_net,
                                num_simulations,
                                dir_noise, 
                                dirichlet_alpha, 
                                exploration_fraction,
                                ucb_method,
                                mode,
                                debug_render,
                           )
                          )
            p.start()
            # The child owns its own handle now. Dropping the parent's copy lets recv()
            # raise EOFError (instead of blocking forever) if the worker dies before sending.
            worker_end.close()
            workers.append(p)
        
        # Now that the workers have been started we need to get results from each pipe before joining the processes
        for tree_id, master_end in enumerate(master_ends):
            try:
                results.append(master_end.recv())
            except EOFError as err:
                raise RuntimeError(
                    "MCTS worker %d exited without sending its result" % tree_id
                ) from err
    finally:
        # Make sure all processes are closed before proceeding, even on failure
        for p in workers:
            p.join()
        for master_end in master_ends:
            master_end.close()
        
    # Separate Qs and Ns and make 2 arrays of shape (num_trees, n_actions)
    Qs_realizations = np.concatenate([Qs.reshape(1,-1) for Qs, _ in results])
    print("Qs_realizations: \n", Qs_realizations)
    
    Ns_realizations = np.concatenate([Ns.reshape(1,-1) for _, Ns in results])
    print("Ns_realizations: \n", Ns_realizations)
    
    mean_Qs = Qs_realizations.mean(axis=0) # average among different trees / realizations
    mean_Ns = Ns_realizations.mean(axis=0)
    
    return mean_Qs, mean_Ns

In [6]:
def softmax_Q(Qs, T):
    """
    Samples an action from the softmax probability obtained from the Q values and a temperature parameter.
    Returns both the action and the probability mass function over the actions.

    Parameters
    ----------
    Qs : sequence, numpy array or tensor of Q values (entries equal to -inf get probability 0 when T > 0)
    T  : temperature, must be >= 0; T == 0 puts all the probability mass on the argmax of Qs

    Returns
    -------
    sampled_action : int, index sampled from the probabilities
    probs          : numpy array with the probability of each action

    Raises
    ------
    ValueError if T is negative or NaN (previously this surfaced as an UnboundLocalError).
    """
    # `not T >= 0` also rejects NaN, which would otherwise match neither branch below
    if not T >= 0:
        raise ValueError("temperature T must be >= 0, got {}".format(T))

    # as_tensor accepts lists, numpy arrays and tensors without the copy warning of torch.tensor
    Qs = torch.as_tensor(Qs)
    if T > 0:
        probs = F.softmax(Qs/T, dim=0)
    else:
        # zero temperature: one-hot distribution on the best action
        probs = torch.zeros(len(Qs)) 
        probs[torch.argmax(Qs)] = 1.

    sampled_action = torch.multinomial(probs, 1).item()
    return sampled_action, probs.cpu().numpy()

In [7]:
def play_rollout_pv_net_hop_mcts(
    episode_length,
    object_ids,
    env, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations, 
    num_trees,
    temperature=0.0,
    dir_noise=True, 
    dirichlet_alpha=1., 
    exploration_fraction=0.25,
    ucb_method="p-UCT-AlphaGo",
    mode="predict",
    render=False,
    debug_render=False,
):
    """
    Plays a rollout with a policy and value MCTS with the hindsight optimization technique 
    to deal with stochastic transitions. 
    
    If mode='simulate', leaf node's evaluation is done with MC rollout evaluations, if mode='predict', 
    the value network is used instead.
    
    At every step the next action is sampled based on the Q-values of the root node's children (averaged 
    among 'num_trees' realizations of the tree search, each with independently sampled transitions). 
    The rollout always lasts 'episode_length' steps: when an episode ends the environment is reset and 
    the rollout continues.
    
    Formula used for MCTS policy (softmax of Q-values with temperature):
    
    p(a) = exp{Q(a)/T} / \\sum_b exp{Q(b)/T}

    Note: the softmax function with T=0 is the argmax function.
    
    The arguments dir_noise, dirichlet_alpha and exploration_fraction are forwarded to the tree search, 
    where a prior sampled from a Dirichlet distribution (with parameters dirichlet_alpha for each 
    possible action) can be mixed with the prior of the root node's children, in order to increase 
    exploration at the base of the tree even in cases where the policy is almost deterministic. 
    The mixture coefficient between the prior and the categorical distribution sampled from the 
    Dirichlet distribution is the exploration_fraction, such that:
    
    p(a) = (1-exploration_fraction) Prior(a) + exploration_fraction Dir(a)
    
    Returns
    -------
    total_reward : sum of the rewards collected during the rollout
    frame_lst    : initial frame followed by the frame returned by every env.step 
    reward_lst, done_lst : reward and done flag returned by every env.step
    action_lst   : sampled actions
    probs_lst    : MCTS policies from which the actions were sampled (possible targets for the policy network)
    
    Note: 'object_ids' is accepted but not used inside this function.
    """
    
    action_dict = {
        0:"Stay",
        1:"Up",
        2:"Down",
        3:"Left",
        4:"Right"
    }
    frame, valid_actions = env.reset()
    if render:
        env.render()
    total_reward = 0
    # variables used for training of value net
    frame_lst = [frame]
    reward_lst = []
    done_lst = []
    action_lst = []
    probs_lst = []
    
    for _ in range(episode_length):
        
        Qs, Ns = hop_pv_mcts_step(
            frame, 
            env, 
            valid_actions, 
            ucb_C, 
            discount, 
            max_actions, 
            pv_net,
            num_simulations, 
            num_trees,
            temperature,
            dir_noise, 
            dirichlet_alpha, 
            exploration_fraction,
            ucb_method=ucb_method,
            mode=mode,
            debug_render=debug_render
        )

        action, probs = softmax_Q(Qs, temperature)
        action_lst.append(action)
        probs_lst.append(probs)
        
        if render:
            print("Action selected from HOP-MCTS: ", action, "({})".format(action_dict[action]))
        frame, valid_actions, reward, done = env.step(action)
        
        frame_lst.append(frame)
        reward_lst.append(reward)
        done_lst.append(done)
        
        if render:
            env.render()
        total_reward += reward
        
        if done:
            # NOTE(review): the frame obtained from this reset is used for the next search but is
            # not appended to frame_lst, so after an episode boundary frame_lst[t] is the terminal
            # frame of the previous episode and not the frame the t-th action was chosen from.
            # Confirm that the training code consuming frame_lst accounts for this.
            frame, valid_actions = env.reset()
            if render:
                print("\nNew episode begins.")
                env.render()

    return total_reward, frame_lst, reward_lst, done_lst, action_lst, probs_lst

In [8]:
# Start a new episode; `frame` and `valid_actions` are the inputs of the search step run below
frame, valid_actions = game_simulator.reset()
game_simulator.render()


██████
█   @█
█y  ?█
█   !█
█   n█
██████



In [9]:
# Network passed as `pv_net` to the MCTS calls below; it is constructed from the gym environment
pv_net = mcts.DiscreteSupportPVNet_v3(gym_env)

In [12]:
# Show the device of the network's first parameter
next(pv_net.parameters()).device

device(type='cpu')

In [22]:
# Single HOP-MCTS step from the current frame, using the settings of the configuration cell;
# the cell displays the returned (mean_Qs, mean_Ns) pair
hop_pv_mcts_step(
    frame, 
    game_simulator, 
    valid_actions, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations, 
    num_trees,
    temperature,
    dir_noise, 
    dirichlet_alpha, 
    exploration_fraction,
    ucb_method="p-UCT-AlphaGo",
    mode="predict",
    debug_render=False, 
)

Qs:  [ 0.00106263 -0.20661248 -0.00563563 -1.                -inf]
Ns:  [2. 4. 3. 1. 0.]
Qs:  [-0.00607645 -0.30274752 -0.00301075 -0.01531772        -inf]
Ns:  [2. 3. 3. 2. 0.]
Qs:  [-0.00607645 -0.3012348  -0.00412099 -0.0118949         -inf]
Ns:  [2. 3. 3. 2. 0.]
Qs_realizations: 
 [[-0.00607645 -0.30274752 -0.00301075 -0.01531772        -inf]
 [ 0.00106263 -0.20661248 -0.00563563 -1.                -inf]
 [-0.00607645 -0.3012348  -0.00412099 -0.0118949         -inf]]
Ns_realizations: 
 [[2. 3. 3. 2. 0.]
 [2. 4. 3. 1. 0.]
 [2. 3. 3. 2. 0.]]


(array([-0.00369676, -0.27019826, -0.00425579, -0.34240422,        -inf],
       dtype=float32),
 array([2.        , 3.33333333, 3.        , 1.66666667, 0.        ]))

### Verify with debug render and delayed processes that the transitions can be different for the same action state in different trees

We need to make sure of the following things:
- in each tree the **sampling of the next state given an action-state pair is independent from the sampling happening in the other trees** (this can be done implicitly by just checking or explicitly by creating in the master process a random seed for each tree and fixing it in each process).
- the real transition sampled from the real environment after playing the action decided inside the hop-mcts is independent from the sampling done inside the various trees.
- the real environment is left untouched during the mcts step (i.e. its internal state doesn't change); however the random seed CAN change and that is not a problem per-se, as long as the previous point holds.

In [9]:
def delayed_PV_MCTS_process_pipes(
    process_id,
    worker_end,
    frame, 
    env, 
    valid_actions, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations,
    dir_noise, 
    dirichlet_alpha, 
    exploration_fraction,
    ucb_method="AlphaGo",
    mode="predict",
    debug_render=True
):
    """
    Debug variant of the PV-MCTS worker: waits process_id*5 seconds before doing anything, 
    so that the outputs of the different workers appear one after the other and can be 
    interpreted, then runs the tree search and sends the root statistics to the master.

    The tuple sent through 'worker_end' is (Qs, Ns): the root's Q-values (computed with 
    'discount', as numpy array) and the visit counts of the root's children. Both are 
    also printed. Nothing is returned.
    """
    # stagger the workers by their id to keep the printed output in order
    time.sleep(process_id*5)

    search_tree = mcts.PV_MCTS(
        frame,
        env,
        valid_actions,
        ucb_C,
        discount,
        max_actions,
        pv_net,
        render=debug_render,
        ucb_method=ucb_method,
    )

    run_options = dict(
        mode=mode,
        dir_noise=dir_noise,
        dirichlet_alpha=dirichlet_alpha,
        exploration_fraction=exploration_fraction,
    )
    # the second value returned by run() is not needed here
    root_node, _ = search_tree.run(num_simulations, **run_options)

    # read the statistics of the root at the end of the run and hand them to the master
    q_values = root_node.get_Q_values(discount).cpu().numpy()
    visit_counts = root_node.get_children_visit_counts()
    print("Qs: ", q_values)
    print("Ns: ", visit_counts)
    worker_end.send((q_values, visit_counts))

In [10]:
def delayed_hop_pv_mcts_step(
    frame, 
    env, 
    valid_actions, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations, 
    num_trees,
    temperature=0.0,
    dir_noise=True, 
    dirichlet_alpha=1., 
    exploration_fraction=0.25,
    ucb_method="AlphaGo",
    mode="predict",
    debug_render=False, 
):
    """
    Executes one step of Hindsight Optimization Policy&Value MCTS, using the delayed worker 
    (delayed_PV_MCTS_process_pipes) so that the trees are grown one after the other and their 
    debug output does not interleave.
    
    Hindsight Optimization estimates Q values of many determinizations of a stochastic process through a 
    deterministic (in terms of transitions of the environment) planning algorithm in order to find an 
    approximation of the real Q values.  
    
    Returns the mean Q values and visit counts, obtained by averaging those quantities along 'num_trees' 
    trees.
    
    Notes
    -----
    - 'temperature' is accepted for signature compatibility but is not used inside this function.
    - Raises ValueError if num_trees < 1 and RuntimeError if a worker exits without sending its result.
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1, got %r" % (num_trees,))
    
    pv_net.share_memory()
    
    # Init num_trees pair of pipes: the master keeps the first end, the worker gets the second
    pipes = [mp.Pipe() for _ in range(num_trees)]
    master_ends = [master_end for master_end, _ in pipes]
    
    workers = []
    results = []
    try:
        for worker_id, (_, worker_end) in enumerate(pipes):
            p = mp.Process(target=delayed_PV_MCTS_process_pipes,
                           args=(
                                worker_id,
                                worker_end,
                                frame, 
                                env, 
                                valid_actions, 
                                ucb_C, 
                                discount, 
                                max_actions, 
                                pv_net,
                                num_simulations,
                                dir_noise, 
                                dirichlet_alpha, 
                                exploration_fraction,
                                ucb_method,
                                mode,
                                debug_render,
                           )
                          )
            p.start()
            # The child owns its own handle now. Dropping the parent's copy lets recv()
            # raise EOFError (instead of blocking forever) if the worker dies before sending.
            worker_end.close()
            workers.append(p)
        
        # Now that the workers have been started we need to get results from each pipe before joining the processes
        for tree_id, master_end in enumerate(master_ends):
            try:
                results.append(master_end.recv())
            except EOFError as err:
                raise RuntimeError(
                    "MCTS worker %d exited without sending its result" % tree_id
                ) from err
    finally:
        # Make sure all processes are closed before proceeding, even on failure
        for p in workers:
            p.join()
        for master_end in master_ends:
            master_end.close()
        
    # Separate Qs and Ns and make 2 arrays of shape (num_trees, n_actions)
    Qs_realizations = np.concatenate([Qs.reshape(1,-1) for Qs, _ in results])
    print("Qs_realizations: \n", Qs_realizations)
    
    Ns_realizations = np.concatenate([Ns.reshape(1,-1) for _, Ns in results])
    print("Ns_realizations: \n", Ns_realizations)
    
    mean_Qs = Qs_realizations.mean(axis=0) # average among different trees / realizations
    mean_Ns = Ns_realizations.mean(axis=0)
    
    return mean_Qs, mean_Ns

In [13]:
# Human-readable name of each action index, used when printing the selected action
action_dict = {
    0: "Stay",
    1: "Up",
    2: "Down",
    3: "Left",
    4: "Right",
}

In [14]:
# Run one delayed HOP-MCTS step with debug rendering, then play the selected action in the
# real simulator to compare the real transition with those sampled inside the trees.
num_simulations = 10
frame, valid_actions = game_simulator.reset()
game_simulator.render()

Qs, Ns = delayed_hop_pv_mcts_step(
            frame, 
            game_simulator, 
            valid_actions, 
            ucb_C, 
            discount, 
            max_actions, 
            pv_net,
            num_simulations, 
            num_trees,
            temperature,
            dir_noise, 
            dirichlet_alpha, 
            exploration_fraction,
            ucb_method="p-UCT-AlphaGo",
            mode="predict",
            debug_render=True, 
)

# softmax_Q takes only the Q values and the temperature
# (the extra `discount` argument previously passed here raises a TypeError)
action, probs = softmax_Q(Qs, temperature)
print("Action selected from HOP-MCTS: ", action, "({})".format(action_dict[action]))
frame, valid_actions, reward, done = game_simulator.step(action)
game_simulator.render()


██████
█?  !█
█    █
█  @ █
█y  n█
██████

Valid actions as child:  [0 1 2 3 4]
Prior over the children:  [0.2812942  0.16817763 0.19035386 0.21294744 0.14722684]
Weights over the children:  [0.23869912 0.18456723 0.19635922 0.20768574 0.17268872]
Terminal node:  False

Simulation 1 started.

██████
█?  !█
█    █
█  @ █
█y  n█
██████

actions:  [0 1 2 3 4]
value_terms:  [0 0 0 0 0]
exploration_terms:  [0.2812942  0.16817763 0.19035386 0.21294744 0.14722684]
ucb_values:  [0.2812942  0.16817763 0.19035386 0.21294744 0.14722684]
max_U:  0.28129419684410095
mask:  [ True False False False False]
best_actions:  [0]
Current tree depth:  1
Action selected:  0 Stay
Child node terminal:  False
Child node expanded:  False
Expansion phase started
valid_actions:  [0 1 2 3 4]
prior:  [0.26710996 0.18859784 0.1876245  0.21046345 0.14620419]
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Prior over the children:  [0.26710996 0.18859784 0.1876245  0.21046345 0.14620419]
Weights over the

Terminal node:  False

██████
█ ?  █
█    █
█ @ !█
█y  n█
██████

Value prediction/simulation phase started
Predicted/simulated value:  -0.018051013350486755
Backpropagation phase started
Simulation 7 done.

Simulation 8 started.

██████
█?  !█
█    █
█  @ █
█y  n█
██████

actions:  [0 1 2 3 4]
value_terms:  [-0.01578507 -0.01125397 -0.01954085 -0.01503211 -0.00907185]
exploration_terms:  [0.26520671 0.23783909 0.269201   0.20076877 0.20821019]
ucb_values:  [0.24942164 0.22658512 0.24966015 0.18573667 0.19913835]
max_U:  0.2496601508453384
mask:  [False False  True False False]
best_actions:  [2]

██████
█    █
█?  !█
█    █
█y @n█
██████

Current tree depth:  1
Action selected:  2 Down
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 3 4]
value_terms:  [0 0 0 0]
exploration_terms:  [0.34076208 0.2259195  0.26294556 0.17037289]
ucb_values:  [0.34076208 0.2259195  0.26294556 0.17037289]
max_U:  0.34076207876205444
mask:  [ True False False False]
best_actions:  [0]


exploration_terms:  [0.2812942  0.33635527 0.19035386 0.21294744 0.29445368]
ucb_values:  [0.26687612 0.33635527 0.18127031 0.19385705 0.29445368]
max_U:  0.3363552689552307
mask:  [False  True False False False]
best_actions:  [1]
Current tree depth:  1
Action selected:  1 Up
Child node terminal:  False
Child node expanded:  False
Expansion phase started
valid_actions:  [0 1 2 3 4]
prior:  [0.26629373 0.19001293 0.18333027 0.21786879 0.14249437]
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Prior over the children:  [0.26629373 0.19001293 0.18333027 0.21786879 0.14249437]
Weights over the children:  [0.2319932  0.19596855 0.19249164 0.20984195 0.1697046 ]
Terminal node:  False

██████
█  ! █
█? @ █
█    █
█y  n█
██████

Value prediction/simulation phase started
Predicted/simulated value:  -0.012504416517913342
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.

██████
█?  !█
█    █
█  @ █
█y  n█
██████

actions:  [0 1 2 3 4]
value_terms:  [-0.014418

max_U:  0.25465821335814126
mask:  [False  True False False False]
best_actions:  [1]

██████
█  ! █
█? @ █
█    █
█y  n█
██████

Current tree depth:  1
Action selected:  1 Up
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3 4]
value_terms:  [0 0 0 0 0]
exploration_terms:  [0.26629373 0.19001293 0.18333027 0.21786879 0.14249437]
ucb_values:  [0.26629373 0.19001293 0.18333027 0.21786879 0.14249437]
max_U:  0.2662937343120575
mask:  [ True False False False False]
best_actions:  [0]
Current tree depth:  2
Action selected:  0 Stay
Child node terminal:  False
Child node expanded:  False
Expansion phase started
valid_actions:  [0 1 2 3 4]
prior:  [0.2630446  0.17771338 0.18425365 0.22020388 0.15478458]
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Prior over the children:  [0.2630446  0.17771338 0.18425365 0.22020388 0.15478458]
Weights over the children:  [0.23036516 0.18934864 0.1928014  0.21077281 0.17671198]
Terminal node:  False

██████
█ !  █
█ 

### Full interaction cycle

In [30]:
# Full rollout with rendering. Only the arguments up to num_trees are passed, so the function's
# own defaults are used for the rest (temperature=0.0, dir_noise=True, dirichlet_alpha=1.,
# exploration_fraction=0.25), not the values of the configuration cell.
play_rollout_pv_net_hop_mcts(
    episode_length,
    object_ids,
    game_simulator, 
    ucb_C, 
    discount, 
    max_actions, 
    pv_net,
    num_simulations, 
    num_trees,
    render=True
)


██████
█?  n█
█!   █
█  y █
█ @  █
██████

Qs:  [-0.00103143 -0.4469593         -inf -0.00187705  0.40618083]
Ns:  [2. 2. 0. 2. 4.]
Qs:  [-0.00580754 -0.45008194        -inf -0.45702723 -0.00090242]
Ns:  [2. 2. 0. 2. 4.]
Qs:  [-0.00546341 -0.16254786        -inf -0.0023125   0.00084379]
Ns:  [1. 5. 0. 2. 2.]
Qs_realizations: 
 [[-0.00103143 -0.4469593         -inf -0.00187705  0.40618083]
 [-0.00580754 -0.45008194        -inf -0.45702723 -0.00090242]
 [-0.00546341 -0.16254786        -inf -0.0023125   0.00084379]]
Ns_realizations: 
 [[2. 2. 0. 2. 4.]
 [2. 2. 0. 2. 4.]
 [1. 5. 0. 2. 2.]]
Action selected from HOP-MCTS:  4 (Right)

██████
█   n█
█?   █
█  y █
█  @ █
██████

Qs:  [-1.3203463e-04 -4.5404807e-01           -inf -2.7055243e-01
 -3.9222743e-03]
Ns:  [2. 2. 0. 3. 3.]
Qs:  [-3.0375186e-03 -4.5404807e-01           -inf -4.5217898e-01
  1.4755425e-04]
Ns:  [2. 2. 0. 2. 4.]
Qs:  [-0.00293567 -0.00086584        -inf -0.45420122 -0.00313662]
Ns:  [1. 5. 0. 2. 2.]
Qs_realizations: 
 [[

Qs:  [-1.              -inf -1.              -inf -0.5005436]
Ns:  [1. 0. 2. 0. 7.]
Qs:  [-0.17823939        -inf -1.                -inf -0.44923458]
Ns:  [5. 0. 1. 0. 4.]
Qs:  [-0.28541514        -inf -1.                -inf -0.29561156]
Ns:  [6. 0. 1. 0. 3.]
Qs_realizations: 
 [[-0.17823939        -inf -1.                -inf -0.44923458]
 [-1.                -inf -1.                -inf -0.5005436 ]
 [-0.28541514        -inf -1.                -inf -0.29561156]]
Ns_realizations: 
 [[5. 0. 1. 0. 4.]
 [1. 0. 2. 0. 7.]
 [6. 0. 1. 0. 3.]]
Action selected from HOP-MCTS:  4 (Right)

██████
█ @ n█
█ ?y!█
█    █
█    █
██████

Qs:  [-1.                -inf -1.         -0.00320416  0.7717096 ]
Ns:  [1. 0. 1. 1. 7.]
Qs:  [-1.              -inf -1.        -0.0058542  0.7721163]
Ns:  [1. 0. 1. 1. 7.]
Qs:  [ 1.6457363e-04           -inf -1.0000000e+00 -3.9424207e-03
  7.7170962e-01]
Ns:  [1. 0. 1. 1. 7.]
Qs_realizations: 
 [[-1.0000000e+00           -inf -1.0000000e+00 -5.8542043e-03
   7.72116

Ns:  [4. 3. 2. 0. 1.]
Qs_realizations: 
 [[ 1.0000000e+00 -4.5096225e-01  9.9046878e-04           -inf
  -1.0000000e+00]
 [ 1.0000000e+00 -4.5096225e-01  1.5040984e-03           -inf
  -1.0000000e+00]
 [-2.2583315e-01 -3.0298212e-01 -4.5409477e-01           -inf
  -1.0000000e+00]]
Ns_realizations: 
 [[6. 2. 1. 0. 1.]
 [6. 2. 1. 0. 1.]
 [4. 3. 2. 0. 1.]]
Action selected from HOP-MCTS:  0 (Stay)

██████
█    █
█? n █
█    █
█    █
██████
blessed sword

New episode begins.

██████
█    █
█ ?y@█
█ !  █
█   n█
██████

Qs:  [ 1.0910917e-04 -2.9964939e-01  1.4149060e-03 -1.0000000e+00
           -inf]
Qs:  [-0.20283109 -0.6003696  -0.4473644  -1.                -inf]
Ns:  [3. 3. 3. 1. 0.]
Ns:  [4. 3. 2. 1. 0.]
Qs:  [-0.0057136  -0.30016044 -0.0009089   0.00634058        -inf]
Ns:  [2. 3. 3. 2. 0.]
Qs_realizations: 
 [[ 1.0910917e-04 -2.9964939e-01  1.4149060e-03 -1.0000000e+00
            -inf]
 [-2.0283109e-01 -6.0036957e-01 -4.4736439e-01 -1.0000000e+00
            -inf]
 [-5.7136030e-03 -3

(2, [{'name': tensor([[[[[  3,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]]],
   
   
            [[[  3,   0],
              [170,   0]],
   
             [[180,   0],
              [170,   0]],
   
             [[170,   0],
              [170,   0]],
   
             [[170,   0],
              [170,   0]],
   
             [[201,   0],
              [170,   0]],
   
             [[  3,   0],
              [170,   0]]],
   
   
            [[[  3,   0],
              [170,   0]],
   
             [[179,   0],
              [170,   0]],
   
             [[170,   0],
              [170,   0]],
   
             [[170,   0],
              [170,   0]],
   
             [[170,   0],
              [170,   0]],
   
          

## Next steps

### 1. Train standard PV-MCTS on stochastic environment and use it as a baseline

If implemented properly (still to be checked), the **most probable mode of failure is planning in hindsight for the wrong event**: basically you sample a transition, treat it as if it were the only possible transition, and find the best action in this new determinization of the environment (which is similar to re-planning in hindsight as if the transition had actually happened).
The problem with this is that you are planning while taking for granted an event that might not happen, without taking into account any other possibility or any uncertainty.

### 2. Train HOP-MCTS on stochastic environment with a single main process interacting with the environment

This should leave most of the code used for the training cycle almost unchanged, but it probably cannot work with the pv_net loaded on the GPU, because sharing the memory of such a model across processes is tricky.
So the idea is to test in this sub-optimal setting that the code works and the architecture can train, even if we have to do it on CPUs.

### 3. Set up a distributed system with many HOP-MCTS actors and a single central learner on GPU 

The idea here is to create experience faster by using multiple "actors" interacting each with a different environment in parallel. We then want to store all the trajectories in a central replay buffer and use a copy of the policy and value net loaded on the GPU to perform updates asynchronously w.r.t. the experience collection phase.

Example of allocation of resources:
- 4 actor processes on CPU
- each actor process uses tot_num_cpus//4 processes to run the MCTS in parallel at each decision step
- 1 learner process on GPU

Code for orchestrating actor and learner processes taken from Facebook's implementation of IMPALA on a single machine with torch.multiprocessing (https://github.com/facebookresearch/torchbeast).

What changes is that we use a memory to store trajectories and then we sample them (thus ours is an off-policy algorithm), whereas in IMPALA only batches of fresh trajectories are used for updates.

In IMPALA the buffer is quite small because every time we use some experience for an update we then discard it (thus the shape is approximately (batch_size, rollout_length) ). The buffer uses shared memory, so that every actor can write its trajectories directly into the buffer without passing the data to the main process through pipes; it also uses a queuing system to tell every process at which index of the buffer (i.e. the position along the first dimension) to write the next trajectory.

I'm not sure how scalable the buffer with shared memory is, but it shouldn't be a problem. Also the queuing system can probably be replaced with something simple, maybe acting with a lock while appending the trajectory in the last place of the replay buffer. If that doesn't work, we can still devise a queuing system like in IMPALA.

Lastly, the weights of the actors are updated in the following way: all the actors use the same shared model with a single set of weights; the learner process also has access to the actor model, so we perform the update on the learner model (which has the same architecture as the actors and is initialized with the same weights) and then, at the end of every update, we load its state dictionary (with the new weights) into the actor model.

In this way the actors never need to stop interacting with the environment in order to receive the new weights.
