# Move Tree
This notebook implements a basic tree model covering changes in our item space, represented by `ItemSet`, which is a wrapper around an n-dimensional integer vector.

We wish to construct a tree for a collection of moves (which are represented by ItemSets),
where each node has a given state (an ItemSet).


OK, this started as original work, but I then discovered that I am essentially building a *Monte Carlo Tree Search* algorithm, so I read the literature. I think I can use [y_yoko](https://qiita.com/y_yoko/items/07b9e3e8d4a43c61d39f) as a reference.

In [1]:
import random
import math
from copy import copy

In [2]:
class ItemSet(object):
    """
    A vector of integer item counts -- the basic "space" of the game.

    The same type is used for an inventory, for a play/move (a delta that
    turns one inventory into another), and for a recipe (a target state).
    """
    def __init__(self, items):
        # Stored as given; no copy is made.
        self.items = items

    def is_positive(self):
        """Return True when no component is negative (zeros are allowed)."""
        return all(count >= 0 for count in self.items)

    def __add__(self, addend):
        """Return a new ItemSet holding the component-wise sum.

        Components are paired with zip(), so the result is as long as the
        shorter of the two operands.
        """
        summed = [left + right for left, right in zip(self.items, addend.items)]
        return ItemSet(summed)

    def __len__(self):
        """Return the sum of the absolute values of all components."""
        return sum(abs(count) for count in self.items)

    def __repr__(self):
        return f'({self.items})'

class State(object):
    """ represents the state of our game - inventory and history

    Two states are considered the same when their play histories have the
    same string form; this is the identity used for both hashing and
    equality, so states can be used reliably as dict keys / set members.
    """
    def __init__(self, play_history, inventory):
        # Both values are stored as given; no copies are made.
        self.inventory = inventory
        self.play_history = play_history

    def _key(self):
        """Return the value that identifies this state (its history as text)."""
        return str(self.play_history)

    def __eq__(self, other):
        # Must agree with __hash__: without this, two states with the same
        # history hash alike but fall back to identity comparison, so a
        # lookup by an equal-but-distinct State would always miss.
        if not isinstance(other, State):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())

    def __repr__(self):
        return f'State({self.inventory}, h={self.play_history})'
    
class Game(object):
    """ represents our stateless game, with available moves

    The game object itself holds no per-play data; every method takes the
    State it should operate on and returns a value or a new State.
    """

    # The moves a player may choose from; each is a delta added to the inventory.
    MOVES = [ItemSet([2,0]), ItemSet([-1,1])]

    def __init__(self, inventory=None):
        """Create a game starting from *inventory* (default: ItemSet([0,0]))."""
        # Compare against None explicitly: ItemSet defines __len__ as the sum
        # of absolute values, so an all-zero inventory is falsy and a plain
        # truthiness test would silently replace it with the default.
        self.initial_inventory = ItemSet([0,0]) if inventory is None else inventory

    def start(self):
        """Return the opening State: empty history and the initial inventory."""
        return State([], self.initial_inventory)

    def available_moves(self, state):
        """Return the moves that keep every inventory component non-negative."""
        return [m for m in self.MOVES if (state.inventory + m).is_positive()]

    def next_state(self, state, move):
        """Return a new State with *move* applied; *state* is left untouched."""
        # Build a fresh history list so states never share mutable history.
        new_history = [*state.play_history, move]
        return State(new_history, state.inventory + move)

    def reward(self, state):
        """ reward function for the state: 100. for a winning state, else 0. """
        return 100. if self.winner(state) else 0.

    def winner(self, state):
        """ return True if *state* is a winning state (second item count > 1) """
        return state.inventory.items[1] > 1

    def __repr__(self):
        return f'Game(init={self.initial_inventory}, moves={self.MOVES})'

    


In [3]:
# Walk the game forward by hand: report the opening position, then five times
# take the last available move and report the resulting position.
game = Game()
state = game.start()
moves = game.available_moves(state)
print(f'winner: {game.winner(state)}: {state} -> {moves}')

for _step in range(5):
    state = game.next_state(state, moves[-1])
    moves = game.available_moves(state)
    print(f'winner: {game.winner(state)}: {state} -> {moves}')


winner: False: State(([0, 0]), h=[]) -> [([2, 0])]
winner: False: State(([2, 0]), h=[([2, 0])]) -> [([2, 0]), ([-1, 1])]
winner: False: State(([1, 1]), h=[([2, 0]), ([-1, 1])]) -> [([2, 0]), ([-1, 1])]
winner: True: State(([0, 2]), h=[([2, 0]), ([-1, 1]), ([-1, 1])]) -> [([2, 0])]
winner: True: State(([2, 2]), h=[([2, 0]), ([-1, 1]), ([-1, 1]), ([2, 0])]) -> [([2, 0]), ([-1, 1])]
winner: True: State(([1, 3]), h=[([2, 0]), ([-1, 1]), ([-1, 1]), ([2, 0]), ([-1, 1])]) -> [([2, 0]), ([-1, 1])]


In [73]:
class MonteCarloTreeNode(object):
    """One node of the Monte Carlo search tree.

    A node records the action that led to it from its parent, the reward
    accumulated through it, and how often it has been visited.
    """
    def __init__(self, parent, action):
        # Starts at 1 so the per-visit averages below never divide by zero.
        self.num_visits = 1
        self.reward = 0.
        self.children = []
        self.parent = parent    # None for the root
        self.action = action    # None for the root

    def update(self, reward):
        """Add *reward* to this node and count one more visit."""
        self.reward += reward
        # Was `self.visits += 1`, which raised AttributeError: the counter
        # attribute is `num_visits`.
        self.num_visits += 1

    def ucb(self):
        """Return the UCB score: mean reward plus an exploration bonus.

        The bonus depends on the parent's visit count; the root has no
        parent, so for it only the mean reward is returned.
        """
        if self.parent is None:
            return self.reward_rate()
        exploration = math.sqrt(math.log(self.parent.num_visits) / self.num_visits)
        return self.reward_rate() + exploration

    def reward_rate(self):
        """Return the mean reward per visit."""
        return self.reward / self.num_visits

    def recurse(self):
        """Return a nested, single-line text dump of this node's subtree."""
        c = '(' + ','.join(child.recurse() for child in self.children) + ')' if self.children else 'None'
        return f'Node(a={self.action}, r={self.reward}, c={c})'

    def __repr__(self):
        # Note: the parent's repr is embedded, so the text grows with depth.
        s="{Reward/Visits =  %.1f/%.1f, Child %d}"%(self.reward, self.num_visits, len(self.children))
        return f'Node(p={self.parent}, a={self.action}, s={s})'

In [97]:
# Monte Carlo Tree Search driver.
#
# For every main loop a fresh tree is grown with `num_tree_search` rounds of
# selection -> expansion -> playout -> backpropagation, after which the
# highest-reward-rate path through the tree is replayed from the initial state
# and kept if it beats the best sequence seen so far.

SHOW_INTERMEDIATE_RESULTS = True  # set to False to silence the per-loop summary


def _pick_random(options):
    """Return one uniformly random element of the non-empty sequence *options*."""
    return random.choice(options)


game = Game()
initial_state = game.start()

num_mainloops = 1
max_playout_depth = 3
num_tree_search = 3 # small value for tracing; the original note suggests 180

best_sum_reward = -math.inf
best_action_sequence = []

for _ in range(num_mainloops):
    root = MonteCarloTreeNode(None, None)

    for run_no in range(num_tree_search):
        state = copy(initial_state)
        sum_reward = 0

        # 1) Selection: follow the best-UCB child down to a leaf, replaying
        #    the corresponding moves so `state` matches the selected node.
        current_node = root
        while current_node.children:
            current_node = max(current_node.children, key=MonteCarloTreeNode.ucb)
            state = game.next_state(state, current_node.action)
            sum_reward += game.reward(state)
        print(f'selecting node: {current_node} for expansion')

        # 2) Expansion: a non-terminal leaf gets one child per legal move.
        if not game.winner(state):
            possible_actions = game.available_moves(state)
            current_node.children = [MonteCarloTreeNode(current_node, action) for action in possible_actions]

        print(f'set current_node.children.action[] {current_node.parent} to {[c.action for c in current_node.children]}')

        # Routine for each child hereafter
        for c in current_node.children:
            # 3) Playout: apply the child's move, then play randomly until a
            #    win or until the depth limit is exceeded.
            sum_reward_playout = 0
            action_sequence = []

            child_state = game.next_state(state, c.action)
            sum_reward_playout += game.reward(child_state)
            action_sequence.append(c.action)

            while not game.winner(child_state):
                # Moves must be legal for the *playout* position. Sampling
                # from `state` (the expansion point) allowed moves that are
                # illegal in `child_state`.
                playout_moves = game.available_moves(child_state)
                if not playout_moves:
                    break  # dead end: nothing left to play
                action = _pick_random(playout_moves)
                child_state = game.next_state(child_state, action)
                sum_reward_playout += game.reward(child_state)
                action_sequence.append(action)

                if len(action_sequence) > max_playout_depth:
                    break

            if game.winner(child_state):
                print(f'Terminal {child_state} reached during a playout. #########')

            # 4) Backpropagate the result from the child up to the root.
            c_ = c
            while c_:
                c_.num_visits +=1
                c_.reward += sum_reward + sum_reward_playout
                c_ = c_.parent

        print(f'complete run {run_no}: state: {state}')

    print(f'make a decision on root: {root.recurse()}')

    # Decision: walk the tree greedily by mean reward to get the action list.
    current_node = root
    action_sequence = []
    sum_reward = 0

    while current_node.children:
        current_node = max(current_node.children, key=MonteCarloTreeNode.reward_rate)
        action_sequence.append(current_node.action)

    # Replay the chosen actions from the initial state, chaining each step.
    # Previously every action was applied to the stale `state` left over from
    # the last search run, so the reward did not describe this sequence.
    new_state = initial_state
    for action in action_sequence:
        new_state = game.next_state(new_state, action)
        sum_reward += game.reward(new_state)
        if game.winner(new_state):
            break

    if SHOW_INTERMEDIATE_RESULTS:
        print("Action sequence: ", str(action_sequence))
        print("Sum_reward: ", str(sum_reward))
        print("----------")

    if sum_reward > best_sum_reward:
        print(f'updated best: {current_node}')
        best_sum_reward = sum_reward
        best_action_sequence = action_sequence

print("Best Action sequence: ", str(best_action_sequence))
print("Action sequence length: ", str(len(best_action_sequence)))
print("Best Sum_reward: ", str(best_sum_reward))

selecting node: Node(p=None, a=None, s={Reward/Visits =  0.0/1.0, Child 0}) for expansion
set current_node.children.action[] None to [([2, 0])]
complete run 0: state: State(([0, 0]), h=[])
selecting node: Node(p=Node(p=None, a=None, s={Reward/Visits =  0.0/2.0, Child 1}), a=([2, 0]), s={Reward/Visits =  0.0/2.0, Child 0}) for expansion
set current_node.children.action[] Node(p=None, a=None, s={Reward/Visits =  0.0/2.0, Child 1}) to [([2, 0]), ([-1, 1])]
Terminal State(([2, 2]), h=[([2, 0]), ([-1, 1]), ([2, 0]), ([-1, 1])]) reached during a playout. #########
complete run 1: state: State(([2, 0]), h=[([2, 0])])
selecting node: Node(p=Node(p=Node(p=None, a=None, s={Reward/Visits =  100.0/4.0, Child 1}), a=([2, 0]), s={Reward/Visits =  100.0/4.0, Child 2}), a=([-1, 1]), s={Reward/Visits =  100.0/2.0, Child 0}) for expansion
set current_node.children.action[] Node(p=Node(p=None, a=None, s={Reward/Visits =  100.0/4.0, Child 1}), a=([2, 0]), s={Reward/Visits =  100.0/4.0, Child 2}) to [([2, 

In [6]:
# selecting node: 
# Node(
#     p=Node(
#         p=Node(p=None, a=None, s={Reward/Visits =  200.0/4.0, Child 1}), 
#         a=([2, 0]), 
#         s={Reward/Visits =  200.0/4.0, Child 2}
#     ), 
#     a=([2, 0]), 
#     s={Reward/Visits =  100.0/2.0, Child 0}
# )

# Node(
#     a=None, 
#     r=600.0, 
#     c=(
#         Node(
#             a=([2, 0]), 
#             r=600.0, 
#             c=(
#                 Node(
#                     a=([2, 0]),
#                     r=500.0, 
#                     c=(
#                         Node(
#                             a=([2, 0]), 
#                             r=300.0, 
#                             c=(
#                                 Node(
#                                     a=([2, 0]), 
#                                     r=100.0, 
#                                     c=None
#                                 ),
#                                 Node(
#                                     a=([-1, 1]), 
#                                     r=100.0, 
#                                     c=None
#                                 )
#                             )
#                     ),
#                     Node(
#                         a=([-1, 1]), 
#                         r=100.0, 
#                         c=None
#                     )
#                 )
#             ),
#             Node(
#                 a=([-1, 1]), 
#                 r=100.0, 
#                 c=None
#             )
#         )
#     )
# )
# )

SyntaxError: invalid syntax (<ipython-input-6-1212e9130a41>, line 3)

In [93]:
# Replay the best action sequence found by the search from a fresh game,
# printing the position and win flag after every move.
game = Game()
state = game.start()
print(f'{state}')

for action in best_action_sequence:
    # Each move produces a new State; the previous one is simply dropped.
    state = game.next_state(state, action)
    print(f'win: {game.winner(state)}: {state}')

State(([0, 0]), h=[])
win: False: State(([2, 0]), h=[([2, 0])])
win: False: State(([1, 1]), h=[([2, 0]), ([-1, 1])])


In [94]:
def _parent(node):
    """ returns all actions for the node up the parent chain

    The result is a comma-separated string starting with the node's own
    action and ending with 'None' once a node without a parent is reached
    (that node's own action is not included).
    """
    # Iterative rather than recursive so very deep trees cannot hit the
    # interpreter's recursion limit.
    actions = []
    while node.parent:
        actions.append(f'{node.action}')
        node = node.parent
    actions.append('None')
    return ', '.join(actions)


def bfs(tree): # tree has .children
    """Print every node reachable from *tree* in breadth-first order.

    Each node is printed as ``[a=<action>, p=[<action chain>], r=<reward>]``
    followed by a space, all on one line. Nodes need ``.children``,
    ``.parent``, ``.action`` and ``.reward`` attributes. Returns None.
    """
    from collections import deque

    # deque gives O(1) pops from the left (list.pop(0) is O(n)); the id-set
    # gives O(1) "already queued?" checks instead of scanning a list.
    seen = {id(tree)}
    queue = deque([tree])

    while queue:
        s = queue.popleft()
        p = _parent(s)
        print(f'[a={s.action}, p=[{p}], r={s.reward}]', end=' ')
        for nbh in s.children:
            if id(nbh) not in seen:
                seen.add(id(nbh))
                queue.append(nbh)
                
# Print every node of the search tree rooted at `root`, in breadth-first order.
bfs(root)
    



[a=None, p=[None], r=100.0] [a=([2, 0]), p=[([2, 0]), None], r=100.0] [a=([2, 0]), p=[([2, 0]), ([2, 0]), None], r=0.0] [a=([-1, 1]), p=[([-1, 1]), ([2, 0]), None], r=100.0] 