In [51]:
from gym.envs.registration import register  # https://github.com/openai/gym/blob/master/gym/envs/registration.py
import gym
import numpy as np

# Id under which the customised FrozenLake environment is registered below.
MY_ENV_NAME='FrozenLakeNonskid4x4-v3'

# Action encoding: 0 = left; 1 = down; 2 = right;  3 = up
# Reward is 1 on success, no reward otherwise.
# `done` can be True even without success, because of the timestep limit.

# Register FrozenLakeEnv on the 4x4 map with is_slippery=False under MY_ENV_NAME.
# NOTE(review): re-running this cell in a live kernel may fail because the id
# is already registered -- confirm against the installed gym version.
# NOTE(review): reward_threshold=0.78 looks copied from the slippery variant;
# confirm it is the intended value for the non-slippery environment.
register(
    id=MY_ENV_NAME,
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False},
    reward_threshold=0.78)

# Module-level environment instance shared by every cell below.
env = gym.make(MY_ENV_NAME)

In [92]:
class Node(object):
    """
    Per-state bookkeeping record used by the search.

    One Node is kept for every state we have encountered. It stores the
    two statistics the UCB score is computed from:

    * ``q`` -- accumulated reward of the rollouts that passed through
      the state
    * ``n`` -- number of rollouts that passed through the state

    Both counters start at zero and are updated by the caller during
    backpropagation.
    """
    def __init__(self):
        # accumulated reward and play count, both starting from zero
        self.q, self.n = 0, 0

def _find_valid_actions(s1, env):
    """return a list of valid (action, state2) pairs for a given state.
    """
    action_state = []
    for a in range(3):  # 4 possible actions at any grid
        s2, _, _, _ = env.step(a)
        
        # scenario 1: action does not lead to a new state, so ignore
        if s2 == s1:
            pass
        # scenario 2: action leads to a new state, so keep
        else:
            action_state.append((a, s2))
        env.s = s1  # return env to original state
    return action_state
        
def select_action(s, memory, env):
    """
    Choose the next action to take from state ``s``.

    Selection rule:

    1. Find every action that leads to a different state.
    2. If one of those states has never been seen (is not a key of
       ``memory``), return the first such action.
    3. Otherwise return the action whose resulting state has the highest
       UCB score, provided that score is strictly positive. Ties go to the
       lowest-numbered action.
    4. If no score is positive, pick uniformly at random among the
       candidate actions.

    Parameters
    ----------
    s : int
        Current state; must be a key of ``memory`` when step 3 is reached.
    memory : dict
        Maps state -> Node for every state seen so far.
    env : gym environment
        Environment used to probe the available moves.

    Returns
    -------
    The selected action.

    Raises
    ------
    ValueError
        If no action leads away from ``s``.
    """
    # step 1: find all actions that would lead to a valid state
    action_state = _find_valid_actions(s, env)
    if not action_state:
        raise ValueError("no valid actions available from state {}".format(s))

    # step 2a: prefer a child we have not seen before
    for a, s2 in action_state:
        if s2 not in memory:
            return a

    # step 2b: every child is known, so rank them by UCB score
    parent = memory[s]  # same for every child, so look it up once
    action = None
    max_ucb = 0
    for a, s2 in action_state:
        score = calculate_ucb(parent, memory[s2])
        if score > max_ucb:
            action = a
            max_ucb = score  # new score to beat
    if action is not None:
        return action

    # no clear candidate: randomly select among all possible actions
    return np.random.choice([a for a, _ in action_state])
    
def calculate_ucb(s1, s2, c_param=.14):
    """Return the UCB score of child node ``s2`` as seen from parent ``s1``.

    The score is ``q/n + c_param * sqrt(2 * ln(N) / n)`` where ``q`` and
    ``n`` are the child's accumulated reward and play count and ``N`` is
    the parent's play count.

    Parameters
    ----------
    s1 : Node-like
        Parent node; only ``n`` is read.
    s2 : Node-like
        Child node; ``q`` and ``n`` are read.
    c_param : float, optional
        Weight of the exploration term (default ``.14``).

    Returns
    -------
    float
        ``inf`` when the child has never been played, so that it outranks
        every played sibling instead of raising ``ZeroDivisionError``.
    """
    if s2.n == 0:
        return float("inf")
    term_1 = s2.q / s2.n  # can be interpreted as the value estimate
    # A parent with zero recorded plays would give log(0) = -inf and a NaN
    # score; treat it as one play so the exploration term is simply 0.
    parent_plays = max(s1.n, 1)
    term_2 = c_param * np.sqrt(2 * np.log(parent_plays) / s2.n)
    return term_1 + term_2

In [122]:
# Rollout configuration, read by the rollout loop in the next cell.
starting_state = 14  # root state every rollout starts from
n_rollouts = 10  # number of rollouts to run
max_steps = 20  # upper bound on steps within a single rollout

In [123]:
# memory maps state -> Node for every state seen so far; we keep it ourselves
# because we can't "color" the environment.
memory = {starting_state: Node()}

for rollout in range(n_rollouts):

    # starting_state is the root node; ultimately we want to know which action we should take from the root node
    trace = [starting_state]
    env.s = starting_state  # manually set env to the state we want
    s = starting_state  # s will be updated as we play out each rollout
    done = False
    steps = 0
    # Reset the reward for every rollout so that a rollout which takes no
    # steps (e.g. max_steps <= 0) neither hits an unbound name nor reuses
    # the previous rollout's reward during backpropagation.
    r = 0.0

    while not done and steps < max_steps:
        a = select_action(s, memory, env)  # select actions based on UCB scores
        s, r, done, _ = env.step(a)
        steps += 1

        # first visit to state s: create its node
        if s not in memory:
            memory[s] = Node()

        # add the new state to trace so we can backpropagate from terminal through root
        trace.append(s)

    # now that we're done with the rollout...
    print("trace", trace)
    print("completed rollout", rollout, "with reward", r)

    # backpropagate the final reward to every distinct state in the trajectory
    for t in set(trace):
        node = memory[t]  # retrieve node for corresponding state
        node.q += r  # update reward
        node.n += 1  # update games played
    print("-" * 20)

trace [14, 13, 12]
completed rollout 0 with reward 0.0
--------------------
trace [14, 15]
completed rollout 1 with reward 1.0
--------------------
trace [14, 15]
completed rollout 2 with reward 1.0
--------------------
trace [14, 15]
completed rollout 3 with reward 1.0
--------------------
trace [14, 15]
completed rollout 4 with reward 1.0
--------------------
trace [14, 15]
completed rollout 5 with reward 1.0
--------------------
trace [14, 15]
completed rollout 6 with reward 1.0
--------------------
trace [14, 15]
completed rollout 7 with reward 1.0
--------------------
trace [14, 15]
completed rollout 8 with reward 1.0
--------------------
trace [14, 15]
completed rollout 9 with reward 1.0
--------------------


In [125]:
# 0 = left; 1 = down; 2 = right;  3 = up
# Now that we're done sampling, pick the move to play from the root.
# NOTE(review): select_action returns an action leading to an unseen child if
# there is one, otherwise the child with the highest UCB score -- this is not
# the same as "the child with the most plays"; confirm which is intended.
best = select_action(starting_state, memory, env)
print(">> best move", best)
# Apply the chosen move to the shared environment and show the result.
s2, _, _, _ = env.step(best)
print(">> new state", s2)
env.render()

>> best move 2
>> new state 15
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
