In [1]:
import numpy as np
from snake import *

In [2]:
class MonteCarlo(object):
    @staticmethod
    def _monte_carlo_eval(agent, env, epsilon):
        '''update action-value function'''
        # get (state, action, reward) list
        s = env.reset()
        sample = list()
        while True:
            a = agent.play(s, epsilon)
            s, r, done, _ = env.step(a)
            sample.append((s, a, r))
            if done:
                break
        
        # discounted reward
        new_sample = list()
        return_val = 0
        for s, a, r in reversed(sample):
            return_val = return_val * agent.gamma + r
            new_sample.append((s, a, return_val))
        sample = reversed(new_sample)
        
        # update update action-value function
        for s, a, r in sample:
            agent.value_n[s, a] += 1 # increase by 1
            N = agent.value_n[s, a]
            agent.value_sa[s, a] += (r - agent.value_sa[s, a]) / N
        
    @staticmethod
    def _policy_improvement(agent):
        '''update policy'''
        new_policy = np.zeros_like(agent.pi) # (action_size, state_size, state_size)
        for s in range(1, agent.state_size):
            new_policy[s] = np.argmax(agent.value_sa[s, :]) # select the max action !!! not [s, a], but [s, :]
        if np.all(np.equal(new_policy, agent.pi)):
            return True # converge
        else:
            agent.pi = new_policy
            return False # not converge
    
    @staticmethod
    def monte_carlo_opt(agent, env, epsilon=0.0):
        for i in range(10):
            for j in range(100):
                MonteCarlo._monte_carlo_eval(agent, env, epsilon=epsilon)
            MonteCarlo._policy_improvement(agent)

### Test

Case 1: naive monte carlo

In [3]:
np.random.seed(3)
env = SnakeEnv(10, [3, 6])
agent = ModelFreeAgent(env)
MonteCarlo.monte_carlo_opt(agent, env)
print('return:', eval_game(env, agent))
print(agent.pi)

return: 86
[0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0]


Case 2: $\epsilon$-greedy: getting more real samples

In [4]:
np.random.seed(3)
env = SnakeEnv(10, [3, 6])
agent = ModelFreeAgent(env)
MonteCarlo.monte_carlo_opt(agent, env, epsilon=0.05)
print('return:', eval_game(env, agent))
print(agent.pi)

return: 95
[0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1
 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0]
