In [1]:
import utils

In [2]:
import numpy as np
import copy
from rtfm import featurizer as X

In [3]:
verbose = True
vprint = print if verbose else lambda *args, **kwargs: None

In [4]:
class MCTS():
    def __init__(self, 
                 simulator,
                 valid_actions,
                 ucb_c,
                 discount,
                 max_actions,
                 root=None,
                 render=False):
        """
        Monte Carlo Tree Search assuming deterministic dynamics.
        
        simulator: 
            wrapper of the environment that returns a scalar reward, a list of valid actions 
            and a 'done' boolean flag when presented with an action
        valid_moves:
            list of valid moves for the root node
        ucb_c:
            Constantused in the UCB1 formula for trees
            UCB(s,a) = Q(s,a) + ucb_c*sqrt(log N(s,a)/(\sum_b N(s,b)))
        discount:
            discoung factor gamma of the MDP
        max_actions:
            number of actions to be taken at most from the root node to the end of a rollout
        root: 
            might be the child of an old root node; use it to keep all the cached computations 
            from previous searches with a different root node. 
        """
        self.simulator = simulator
        self.original_dict = simulator.save_state_dict()
        self.valid_actions = valid_actions
        self.action_space = len(valid_actions)
        self.ucb_c = ucb_c
        self.discount = discount
        self.max_actions = max_actions
        self.root = root
        self.render = render
    
    def get_subtree(self, action):
        """
        Returns the subtree whose root node is the current root's child corresponding to
        the given action.
        """
        return self.root.children[action]
    
    def run(self, num_simulations):
        """
        Runs num_simulations searches starting from the root node corresponding to the internal
        state of the simulator given during initialization.
        Returns the root node and an extra_info dictionary
        """
        if self.root is None:
            self.root = Node()
            self.root.expand(
                self.valid_actions,
                0, # reward to get to root
                False, # terminal node
                self.simulator # state of the simulator at the root node 
            )
            # not sure about this
            self.root.visit_count += 1
        
        max_tree_depth = 0
        root = self.root
        for n in range(num_simulations):
            ### Start of a simulation/search ###
            vprint("\nSimulation %d started."%(n+1))
            node = root
            search_path = [node]
            current_tree_depth = 0
            if self.render:
                node.render()
            ### Selection phase until leaf node is reached ###
            while node.expanded or (current_tree_depth<self.max_actions):
                current_tree_depth += 1
                action, node = self.select(node)
                if self.render and node.expanded:
                    node.render()
                vprint("Current tree depth: ", current_tree_depth)
                vprint("Action selected: ", action)
                vprint("Child node terminal: ", node.terminal)
                vprint("Child node expanded: ", node.expanded)
                if node.expanded or node.terminal:
                    search_path.append(node)
                    if node.terminal:
                        break
                else:
                    break
                
            ### Expansion of leaf node (if not terminal)###
            vprint("Expansion phase started")
            if not node.terminal:
                parent = search_path[-1] # last expanded node on the search path
                node = self.expand(node, parent, action)
                if self.render:
                    node.render()
                search_path.append(node)
            
            ### Simulation phase for self.max_actions - current_tree_depth steps ###
            vprint("Simulation  phase started")
            value = self.simulate(node, current_tree_depth)
            vprint("Simulated value: ", value)
            
            ### Backpropagation of the leaf node value along the seach_path ###
            vprint("Backpropagation phase started")
            self.backprop(search_path, value)
        
            max_tree_depth = max(max_tree_depth, current_tree_depth)
            vprint("Simulation %d done."%(n+1))
        extra_info = {
            "max_tree_depth": max_tree_depth
        }
        # just a check to see if root works as a shallow copy of self.root
        assert root.visit_count == self.root.visit_count, "self.root not updated during search"
        
        # make sure that the simulator internal state is reset to the original one
        self.simulator.load_state_dict(root.simulator_dict)
        return root, extra_info
        
    def select(self, node):
        """
        Use UCT formula on the input node; return the action selected and the corresponding child node 
        """
        actions = []
        ucb_values = []
        for action, child in node.children.items():
            actions.append(action)
            ucb_values.append(self.ucb_score(node, child))
        actions = np.array(actions)
        vprint("actions: ", actions)
        
        ucb_values = np.array(ucb_values)
        vprint("ucb_values: ", ucb_values)
        
        max_U = ucb_values.max()
        vprint("max_U: ", max_U)
        
        mask = (ucb_values==max_U)
        vprint("mask: ", mask)
        
        best_actions = actions[mask]
        vprint("best_actions: ", best_actions)
        
        action = np.random.choice(best_actions)
        return action, node.children[action]

    def ucb_score(self, parent, child, eps=1e-3):
        """
        The score for a node is based on its value, plus an exploration bonus.
        """
        exploration_term = self.ucb_c*np.sqrt(np.log(parent.visit_count)/(child.visit_count+eps))

        if child.visit_count > 0:
            # Mean value Q
            value_term = child.reward + self.discount*child.value() 
        else:
            value_term = 0

        return value_term + exploration_term
    
    def expand(self, node, parent, action):
        """
        Expand the node obtained by taking the given action from the parent node 
        """
        simulator = parent.get_simulator(self.simulator) # get a deepcopy of the simulator with the parent's state stored
        valid_actions, reward, done = simulator.step(action) # this also updates the simulator's internal state
        vprint("reward: ", reward)
        vprint("done: ", done)
        node.expand(valid_actions, reward, done, simulator)
        return node
    
    def simulate(self, node, current_depth):
        """
        Simulate a rollout with a random policy starting from the input node
        until the end of the episode or self.max_actions are reached 
        (also considering the current depth of the input node from the root)
        """
        if not node.terminal:
            simulator = node.get_simulator(self.simulator)
            valid_actions = node.valid_actions
            steps = self.max_actions - current_depth
            cum_discounted_reward = 0
            for i in range(steps):
                action = np.random.choice(valid_actions)
                valid_actions, reward, done = simulator.step(action)
                cum_discounted_reward += (self.discount**i)*reward
                if done:
                    break
        else:
            cum_discounted_reward = 0
        return cum_discounted_reward
            
    def backprop(self, search_path, value):
        """
        Update the value sum and visit count of all nodes along the search path.
        """
        for node in reversed(search_path):
            node.value_sum += value
            node.visit_count += 1
            value = node.reward + self.discount*value

In [5]:
class Node:
    def __init__(self):
        self.visit_count = 0
        self.value_sum = 0
        self.children = {}
        self.reward = 0
        self.simulator = None
        self.expanded = False
        self.terminal = False
        self.simulator_dict = None

    def value(self):
        if self.visit_count == 0:
            return 0
        return self.value_sum / self.visit_count

    def expand(self, valid_actions, reward, done, simulator):
        self.expanded = True
        vprint("Valid actions as child: ", valid_actions)
        vprint("Terminal node: ", done)
        self.reward = reward
        self.terminal = done
        self.valid_actions = valid_actions
        if not done:
            for action in valid_actions:
                self.children[action] = Node()
        self.simulator_dict = simulator.save_state_dict()
        
    def get_simulator(self, simulator):
        if self.simulator_dict is not None:
            # load a deepcoy of the simulator_dict, so that the internal variable remains unchanged
            simulator.load_state_dict(copy.deepcopy(self.simulator_dict)) 
            return simulator
        else:
            print("Trying to load simulator_dict, but it was never instantiated.")
            raise NotImplementedError()
    
    def best_action(self, discount):
        """
        Look among the children and take the one with higher Q-value. 
        Exclude children with 0 visits.
        """
        actions = []
        Qvalues = []
        for action, child in self.children.items():
            actions.append(action)
            Qvalues.append(child.reward + discount*child.value())
        actions = np.array(actions)
        Qvalues = np.array(Qvalues)
        max_Q = Qvalues.max()
        mask = (Qvalues==max_Q)
        best_actions = actions[mask]
        return np.random.choice(best_actions)
    
    def render(self):
        if self.simulator is not None:
            self.simulator.render()
        else:
            raise NotImplementedError()

In [6]:
class TrueSimulator():
    def __init__(self, env, featurizer=None):
        self.env = env
        self.action_space = len(gym_env.action_space)
        self.featurizer = featurizer
        
    def reset(self):
        frame = self.env.reset()
        valid_moves = frame['valid'].numpy().astype(bool) # boolean mask of shape (action_space)
        actions = np.arange(self.action_space)
        valid_actions = actions[valid_moves]
        return valid_actions
    
    def step(self, action):
        frame, reward, done, _ = self.env.step(int(action))
        valid_moves = frame['valid'].numpy().astype(bool) # boolean mask of shape (action_space)
        actions = np.arange(self.action_space)
        valid_actions = actions[valid_moves]
        return valid_actions, reward, done
    
    def render(self):
        self.featurizer.featurize(self.env)
        
    def save_state_dict(self):
        return self.env.save_state_dict()
        
    def load_state_dict(self, d):
        self.env.load_state_dict(d)

In [7]:
### Define parameters ###
ucb_C = 1.0
discount = 0.997
episode_length = 100
max_actions = 100
num_simulations = 5

flags = utils.Flags()
gym_env = utils.create_env(flags)
#gym_env = utils.create_env(flags, featurizer=X.Concat([X.Text(), X.ValidMoves(), X.Render()]))
featurizer = X.Render()
game_simulator = TrueSimulator(gym_env, featurizer)

In [8]:
game_simulator.render()


██████
█   y█
█? @ █
█    █
█  !n█
██████


In [9]:
def show_root_summary(root, discount):
    action_dict = {
        0:"Stay",
        1:"Up",
        2:"Down",
        3:"Left",
        4:"Right"
    }
    
    for action, child in root.children.items():
        Q =  child.reward + discount*child.value()
        visits = child.visit_count
        print("Action ", action_dict[action], ": Q-value=%.3f - Visit counts=%d"%(Q,visits))

In [10]:
def play_episode_v0(
    env,
    episode_length,
    ucb_C,
    discount,
    max_actions,
    num_simulations,
    render = True,
    debug_render=False
):
    action_dict = {
        0:"Stay",
        1:"Up",
        2:"Down",
        3:"Left",
        4:"Right"
    }
    valid_actions = env.reset()
    if render:
        env.render()
    total_reward = 0
    done = False
    for i in range(episode_length):
        mcts = MCTS(env, valid_actions, ucb_C, discount, max_actions, render=debug_render)
        print("Performing MCTS step")
        root, info = mcts.run(num_simulations)
        show_root_summary(root, discount)
        print("Tree info: ", info)
        action = root.best_action(discount)
        print("Action selected from MCTS: ", action, "({})".format(action_dict[action]))
        valid_actions, reward, done = env.step(action)
        if render:
            env.render()
        print("Reward received: ", reward)
        print("Done: ", done)
        total_reward += reward
        if done:
            break
    return total_reward

In [11]:
def play_episode_v1(
    env,
    episode_length,
    ucb_C,
    discount,
    max_actions,
    num_simulations,
    render = True,
    debug_render=False
):
    """
    W.r.t. version 0 it re-uses the information cached in the child node selected 
    """
    action_dict = {
        0:"Stay",
        1:"Up",
        2:"Down",
        3:"Left",
        4:"Right"
    }
    valid_actions = env.reset()
    if render:
        env.render()
    total_reward = 0
    done = False
    new_root = None
    for i in range(episode_length):
        mcts = MCTS(env, valid_actions, ucb_C, discount, max_actions, render=debug_render, root=new_root)
        print("Performing MCTS step")
        root, info = mcts.run(num_simulations)
        show_root_summary(root, discount)
        print("Tree info: ", info)
        action = root.best_action(discount)
        print("Action selected from MCTS: ", action, "({})".format(action_dict[action]))
        new_root = mcts.get_subtree(action)
        valid_actions, reward, done = env.step(action)
        if render:
            env.render()
        print("Reward received: ", reward)
        print("Done: ", done)
        total_reward += reward
        if done:
            break
    return total_reward

In [12]:
import time

In [13]:
%%time
R = play_episode_v0(
    game_simulator,
    episode_length,
    ucb_C,
    discount,
    max_actions,
    num_simulations,
    debug_render=True
)


██████
█! n █
█ @? █
█ y  █
█    █
██████
Performing MCTS step
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False

Simulation 1 started.


NotImplementedError: 

In [14]:
%%time
R = play_episode_v1(
    game_simulator,
    episode_length,
    ucb_C,
    discount,
    max_actions,
    num_simulations,
    debug_render=False
)


██████
█    █
█ y !█
█   @█
█  ?n█
██████
Performing MCTS step
Valid actions as child:  [0 1 2 3]
Terminal node:  False

Simulation 1 started.
actions:  [0 1 2 3]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 3]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  -1
done:  True
Valid actions as child:  [0 1 2 3 4]
Terminal node:  True
Simulation  phase started
Simulated value:  0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3]
ucb_values:  [-0.16786135 26.32768848 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [False  True  True  True]
best_actions:  [1 2 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  -1
done:  True
Valid actions as child:  [0 1 2 3 4]
Terminal node:  True
Simulation  phase started
Simulated value:  0

Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  -0.997
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=-1.000 - Visit counts=1
Action  Up : Q-value=-0.985 - Visit counts=2
Action  Left : Q-value=-0.994 - Visit counts=2
Action  Right : Q-value=-1.000 - Visit counts=1
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  1 (Up)

██████
█    █
█ y  █
█  @ █
█   !█
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3 4]
ucb_values:  [26.32768848 26.32768848 26.32768848 26.32768848 -0.16786135]
max_U:  26.327688477341592
mask:  [ True  True  True  True False]
best_actions:  [0 1 2 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 3 4]
ucb_values:  [1.33789742 1.33789742 1.33789742 1.33789742 1.33789742]
max_U:  1.337897417490716
mask:  [ True  True  True  True  True]
best_actions:  [0 1 2 3 4]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3 4]
ucb_values:  [0. 0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True  True]
best_actions:  [0 1 2 3 4]
Current tree depth:  2
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Up : Q-value=0.000 - Visit counts=1
Action  Down : Q-value=0.000 - Visit counts=1
Action  Left : Q-value=0.000 - Visit counts=1
Action  Rig

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 3 4]
ucb_values:  [ 1.2680024  40.11780044  1.2680024   1.2680024   1.2680024 ]
max_U:  40.11780044361979
mask:  [False  True False False False]
best_actions:  [1]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=1
Action  Down : Q-value=0.000 - Visit counts=1
Action  Left : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 1}
Action selected from MCTS:  0 (Stay)

██████
█    █
█    █
█ @  █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
ac

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 4]
ucb_values:  [1.2680024  1.2680024  0.89683711]
max_U:  1.2680023984014182
mask:  [ True  True False]
best_actions:  [0 1]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 4]
Current tree depth:  2
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=2
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  4 (Right)

██████
█    █
█    █
█    █
█ @  █
██████

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 4]
ucb_values:  [1.33789742 1.33789742 1.33789742 0.9462727 ]
max_U:  1.337897417490716
mask:  [ True  True  True False]
best_actions:  [0 1 2]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 4]
Current tree depth:  2
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selec

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 2 4]
ucb_values:  [0.9462727  0.9462727  1.33789742]
max_U:  1.337897417490716
mask:  [False False  True]
best_actions:  [4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  2
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=2
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  4 (Right)

██████
█ @  █
█    █
█    █
█    █
█████

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 4]
ucb_values:  [0.9462727  1.33789742 1.33789742 1.33789742]
max_U:  1.337897417490716
mask:  [False  True  True  True]
best_actions:  [1 2 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3 4]
ucb_values:  [0. 0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True  True]
best_actions:  [0 1 2 3 4]
Current tree depth:  2
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Up : Q-value=0.000 - Visit counts=1
Action  Down : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}

Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 2 4]
ucb_values:  [1.2680024 1.2680024 1.2680024 1.2680024]
max_U:  1.2680023984014182
mask:  [ True  True  True  True]
best_actions:  [0 1 2 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3 4]
ucb_values:  [0. 0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True  True]
best_actions:  [0 1 2 3 4]
Current tree depth:  2
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 4]
ucb_values:  [1.33789742 1.33789742 1.33789742 0.9462727 ]
max_U:  1.337897417490716
mask:  [ True  True  True False]
best_actions:  [0 1 2]
Current tree depth:  1
Action s

Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 2 4]
ucb_values:  [ 1.04762339  1.04762339 33.14532077]
max_U:  33.14532076580509
mask:  [False False  True]
best_actions:  [4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 2 4]
ucb_values:  [1.17682176 1.17682176 1.17682176]
max_U:  1.1768217586653564
mask:  [ True  True  True]
best_actions:  [0 2 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 

Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  0 (Stay)

██████
█@   █
█    █
█    █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 2 4]
ucb_values:  [26.32768848 26.32768848  0.83213865]
max_U:  26.327688477341592
mask:  [ True  True False]
best_actions:  [0 2]
Current tree depth:  1
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 2 4]
ucb_values:  [33.14532077  1.04762339  1.04762339]
max_U:  33.14532076580509
mask:  [ True False False]
best_acti

Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Up : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  0 (Stay)

██████
█    █
█@   █
█    █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 4]
ucb_values:  [26.32768848  0.83213865 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [ True False  True  True]
best_actions:  [0 2 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 4]
ucb_values:  [33.14532077  1.04762339 33.14


Simulation 5 started.
actions:  [0 2 4]
ucb_values:  [0.9462727  0.9462727  1.33789742]
max_U:  1.337897417490716
mask:  [False False  True]
best_actions:  [4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  2
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=2
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  4 (Right)

██████
█ @  █
█    █
█    █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 sta

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 2 4]
ucb_values:  [1.2680024  0.89683711 1.2680024 ]
max_U:  1.2680023984014182
mask:  [ True False  True]
best_actions:  [0 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  2
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Down : Q-value=0.000 - Visit counts=2
Action  Right : Q-value=0.000 - Visit counts=2
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  4 (Right)

██████
█ @  █
█    █
█    █
█    █


Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 2 3 4]
ucb_values:  [1.2680024 1.2680024 1.2680024 1.2680024]
max_U:  1.2680023984014182
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 4]
ucb_values:  [0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True]
best_actions:  [0 2 4]
Current tree depth:  2
Action selected:  0
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 2 3 4]
ucb_values:  [1.33789742 1.33789742 0.9462727  1.33789742]
max_U:  1.337897417490716
mask:  [ True  True False  True]
best_actions:  [0 2 4]
Current tree depth:  1
Action selected:  4
Child node termina

Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 2 3 4]
ucb_values:  [ 1.17682176  1.17682176 37.23297411  1.17682176]
max_U:  37.23297411059034
mask:  [False False  True False]
best_actions:  [3]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 2 3 4]
ucb_values:  [1.2680024 1.2680024 1.2680024 1.2680024]
max_U:  1.2680023984014182
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  2
Action selected:  4
Chi

Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 2 3]
ucb_values:  [ 1.04762339 33.14532077  1.04762339]
max_U:  33.14532076580509
mask:  [False  True False]
best_actions:  [2]
Current tree depth:  1
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 2 3]
ucb_values:  [1.17682176 1.17682176 1.17682176]
max_U:  1.1768217586653564
mask:  [ True  True  True]
best_actions:  [0 2 3]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  True
actions:  [0 2 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 2 3 4]
Current tree depth:  2
Action selected:  4
Child node terminal:  False
Child node e

Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3 4]
ucb_values:  [26.32768848 26.32768848  0.83213865 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [ True  True False  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 2 3 4]
ucb_values:  [33.14532077 33.14532077  1.04762339  1.04762339 33.14532077]
max_U:  33.14532076580509
mask:  [ True  True False False  True]
best_actions:  [0 1 4]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Sim

Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3 4]
ucb_values:  [26.32768848 26.32768848  0.83213865 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [ True  True False  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 2 3 4]
ucb_values:  [33.14532077  1.04762339  1.04762339 33.14532077 33.14532077]
max_U:  33.14532076580509
mask:  [ True False False  True  True]
best_actions:  [0 3 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3]
Terminal node:  False
Simul


██████
█    █
█    █
█   @█
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 3]
Current tree depth:  1
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3]
ucb_values:  [26.32768848 26.32768848  0.83213865 26.32768848]
max_U:  26.327688477341592
mask:  [ True  True False  True]
best_actions:  [0 1 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase s

reward:  0
done:  False
Valid actions as child:  [0 2 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=2
Action  Down : Q-value=0.000 - Visit counts=1
Action  Left : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  3 (Left)

██████
█    █
█  @ █
█    █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3 4]
ucb_values:  [0. 0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True  True]
best_actions:  [0 1 2 3 4]
Current tree depth:  1
Action selected:  2
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 star


██████
█    █
█   @█
█    █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3]
ucb_values:  [26.32768848 26.32768848 26.32768848  0.83213865]
max_U:  26.327688477341592
mask:  [ True  True  True False]
best_actions:  [0 1 2]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3]
ucb_values:  [ 1.04762339 33.14532077 33.14532077  1.04762339]
max_U:  33.14532076580509
mask:  [False  True  True False]
best_actions:  [1 2]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 2 3]
Terminal node:  False
Simulation  phase started

Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 3 4]
ucb_values:  [ 0.83213865 26.32768848 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [False  True  True  True]
best_actions:  [1 3 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3 4]
ucb_values:  [ 1.04762339 33.14532077 33.14532077  1.04762339]
max_U:  33.14532076580509
mask:  [False  True  True False]
best_actions:  [1 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
B

Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3 4]
ucb_values:  [33.14532077  1.04762339  1.04762339 33.14532077]
max_U:  33.14532076580509
mask:  [ True False False  True]
best_actions:  [0 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 3 4]
ucb_values:  [37.23297411  1.17682176  1.17682176  1.17682176]
max_U:  37.23297411059034
mask:  [ True False False False]
best_actions:  [0]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpr

Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3]
ucb_values:  [1.17682176 1.17682176 1.17682176]
max_U:  1.1768217586653564
mask:  [ True  True  True]
best_actions:  [0 1 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 3]
Current tree depth:  2
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 3]
ucb_values:  [1.2680024  0.89683711 1.2680024 ]
max_U:  1.2680023984014182
mask:  [ True False  True]
best_actions:  [0 3]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node

Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 3]
ucb_values:  [1.17682176 1.17682176 1.17682176]
max_U:  1.1768217586653564
mask:  [ True  True  True]
best_actions:  [0 1 3]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 2 3]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 2 3]
Current tree depth:  2
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 3]
ucb_values:  [1.2680024  0.89683711 1.2680024 ]
max_U:  1.2680023984014182
mask:  [ True False  True]
best_actions:  [0 3]
Current tree depth:  1
Action selected:  3
Child node ter

Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3 4]
ucb_values:  [ 1.17682176 37.23297411  1.17682176  1.17682176]
max_U:  37.23297411059034
mask:  [False  True False False]
best_actions:  [1]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 3 4]
ucb_values:  [1.2680024 1.2680024 1.2680024 1.2680024]
max_U:  1.2680023984014182
mask:  [ True  True  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  2
Action selected:  4
C

done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3 4]
ucb_values:  [ 1.04762339 33.14532077  1.04762339 33.14532077]
max_U:  33.14532076580509
mask:  [False  True False  True]
best_actions:  [1 4]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 3 done.

Simulation 4 started.
actions:  [0 1 3 4]
ucb_values:  [ 1.17682176  1.17682176  1.17682176 37.23297411]
max_U:  37.23297411059034
mask:  [False False False  True]
best_actions:  [4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions


██████
█    █
█    █
█    █
█  @ █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 3 4]
ucb_values:  [26.32768848 26.32768848  0.83213865 26.32768848]
max_U:  26.327688477341592
mask:  [ True  True False  True]
best_actions:  [0 1 4]
Current tree depth:  1
Action selected:  0
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 3 4]
ucb_values:  [ 1.04762339 33.14532077  1.04762339 33.14532077]
max_U:  33.14532076580509
mask:  [False  True False  True]
best_actions:  [1 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3]
Terminal node:  False
Simulation  phase started

Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=2
Action  Left : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  1 (Up)

██████
█    █
█    █
█  @ █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3 4]
ucb_values:  [ 0.83213865 26.32768848 26.32768848 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [False  True  True  True  True]
best_actions:  [1 2 3 4]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 3 4]
ucb_values:  [ 1.0476

Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 2 3 4]
ucb_values:  [ 1.2680024  40.11780044  1.2680024   1.2680024   1.2680024 ]
max_U:  40.11780044361979
mask:  [False  True False False False]
best_actions:  [1]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=1
Action  Down : Q-value=0.000 - Visit counts=1
Action  Left : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 1}
Action selected from MCTS:  4 (Right)

██████
█    █
█    █
█   @█
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
a

reward:  0
done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 4 done.

Simulation 5 started.
actions:  [0 1 3 4]
ucb_values:  [1.2680024 1.2680024 1.2680024 1.2680024]
max_U:  1.2680023984014182
mask:  [ True  True  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  3
Child node terminal:  False
Child node expanded:  True
actions:  [0 1 3 4]
ucb_values:  [0. 0. 0. 0.]
max_U:  0.0
mask:  [ True  True  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  2
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=1
Action  Left : Q-value=0.000 - 

best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  1
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 3 4]
ucb_values:  [26.32768848  0.83213865 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [ True False  True  True]
best_actions:  [0 3 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 2 done.

Simulation 3 started.
actions:  [0 1 3 4]
ucb_values:  [33.14532077  1.04762339 33.14532077  1.04762339]
max_U:  33.14532076580509
mask:  [ True False  True False]
best_actions:  [0

Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 5 done.
Action  Stay : Q-value=0.000 - Visit counts=1
Action  Up : Q-value=0.000 - Visit counts=2
Action  Left : Q-value=0.000 - Visit counts=1
Action  Right : Q-value=0.000 - Visit counts=1
Tree info:  {'max_tree_depth': 2}
Action selected from MCTS:  1 (Up)

██████
█    █
█    █
█ @  █
█    █
██████
Reward received:  0
Done:  False
Performing MCTS step

Simulation 1 started.
actions:  [0 1 2 3 4]
ucb_values:  [26.32768848 26.32768848  0.83213865 26.32768848 26.32768848]
max_U:  26.327688477341592
mask:  [ True  True False  True  True]
best_actions:  [0 1 3 4]
Current tree depth:  1
Action selected:  4
Child node terminal:  False
Child node expanded:  False
Expansion phase started
reward:  0
done:  False
Valid actions as child:  [0 1 2 3 4]
Terminal node:  False
Simulation  phase started
Simulated value:  0.0
Backpropagation phase started
Simulation 1 done.

Simulation 2 started.
actions:  [0 1 2 

## Introducing a value network

TODO
- get the state to predict the value
- define a target with which to train the value net
- choose on which data to train (whole trajectory? just one trajectory or many?)
- **make the simulations faster** (function to set the state of the simulator instead of having to make a deepcopy every time?)
- define training cycle

EXTRA
- use some muti-threaded application, like torch.multiprocessing, to run many episodes in parallel; adapt code from IMPALA