# Monte Carlo

## Objective
Estimate the action-value function by incrementally updating a running average of the sampled returns.
$$ q_t^N = \frac{1}{N}\sum_{i=1}^N \hat{q}_t^i = \frac{1}{N}\left((N-1)\,q_t^{N-1} + \hat{q}_t^N\right) = q_t^{N-1} + \frac{1}{N}\left(\hat{q}_t^N - q_t^{N-1}\right) $$



### Notations
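+ N: number of sampled returns averaged so far (the visit count of the state-action pair)
+ t: current time step (the position in the list of (s, a, r))
+ $q_t^N$: estimate of the action-value function at time step t after N samples
+ $\hat{q}_t^i$: the i-th sampled discounted future reward (return) from time step t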

In [1]:
import numpy as np
from snake import *

In [2]:
class MonteCarlo(object):
    '''Monte Carlo control: alternate episode-based evaluation and greedy improvement.

    All methods are static and operate on an ``agent`` that exposes
    ``play(s, epsilon)``, ``gamma``, ``pi``, ``state_size`` and the tables
    ``value_sa`` / ``value_n`` (both indexed by ``[state, action]``), and on an
    ``env`` that exposes ``reset()`` and ``step(a)``.
    '''

    @staticmethod
    def _monte_carlo_eval(agent, env, epsilon):
        '''Play one episode and fold its returns into the action-value table.

        Args:
            agent: object whose ``value_sa`` and ``value_n`` tables are updated
                in place; ``agent.play(s, epsilon)`` chooses each action.
            env: environment; ``reset()`` returns the first state and
                ``step(a)`` returns ``(next_state, reward, done, info)``.
            epsilon: forwarded unchanged to ``agent.play``.
        '''
        # Roll out one complete episode, recording (state, action, reward)
        # where `state` is the state the action was taken in.
        state = env.reset()
        episode = []
        done = False
        while not done:
            action = agent.play(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state

        # Walk the episode backwards so each step's discounted return is
        # reward + gamma * (return of the following step).
        returns = []
        future_reward = 0
        for state, action, reward in reversed(episode):
            future_reward = future_reward * agent.gamma + reward
            returns.append((state, action, future_reward))
        returns.reverse()  # restore chronological order

        # Update the action-value function with an incremental mean:
        # q <- q + (sample - q) / N, applied on every visit of (state, action).
        for state, action, f_r in returns:
            agent.value_n[state, action] += 1
            n = agent.value_n[state, action]
            agent.value_sa[state, action] += (f_r - agent.value_sa[state, action]) / n

    @staticmethod
    def _policy_improvement(agent):
        '''Make the policy greedy with respect to ``agent.value_sa``.

        Returns:
            True if the greedy policy equals the current ``agent.pi`` (nothing
            changed); False otherwise, in which case ``agent.pi`` is replaced.
        '''
        # Same shape and dtype as the current policy: one action per state.
        new_policy = np.zeros_like(agent.pi)
        # State 0 is skipped, so its entry keeps the zero default.
        # NOTE(review): presumably state 0 is never visited by the env - confirm.
        for s in range(1, agent.state_size):
            # Best action for this state: argmax over the whole action row
            # value_sa[s, :], not a single [s, a] entry.
            new_policy[s] = np.argmax(agent.value_sa[s, :])

        converged = bool(np.array_equal(new_policy, agent.pi))
        if not converged:
            agent.pi = new_policy
        return converged

    @staticmethod
    def monte_carlo_opt(agent, env, epsilon=0.0, n_iterations=10, n_episodes=100):
        '''Run Monte Carlo policy iteration on ``agent``.

        Each iteration evaluates the current policy on ``n_episodes`` sampled
        episodes and then performs one greedy policy improvement. The
        convergence flag returned by ``_policy_improvement`` is ignored, so all
        ``n_iterations`` rounds always run.

        Args:
            agent: agent to train; modified in place.
            env: environment to sample episodes from.
            epsilon: forwarded to ``agent.play`` for every action choice.
            n_iterations: number of evaluate/improve rounds (default 10).
            n_episodes: episodes sampled per round (default 100).
        '''
        for _ in range(n_iterations):
            for _ in range(n_episodes):
                MonteCarlo._monte_carlo_eval(agent, env, epsilon=epsilon)
            MonteCarlo._policy_improvement(agent)

### Test

Case 1: naive Monte Carlo (purely greedy action selection, $\epsilon = 0$)

In [3]:
# Case 1: train with the default epsilon and report the resulting policy.
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(0)
# NOTE(review): the meaning of the SnakeEnv arguments is defined in the
# `snake` module, which is not shown here - confirm before changing them.
env = SnakeEnv(10, [3, 6])
agent = ModelFreeAgent(env)
# epsilon is left at monte_carlo_opt's default of 0.0.
MonteCarlo.monte_carlo_opt(agent, env)
# Print the score reported by eval_game, then the learned policy array.
print('return:', eval_game(env, agent))
print(agent.pi)

return: 91
[0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1
 1 1 0 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


Case 2: $\epsilon$-greedy: occasional random actions let the agent collect samples for more state-action pairs

In [4]:
# Case 2: same setup as Case 1, but trained with epsilon=0.05.
# Re-seed NumPy's global RNG so this run starts from the same seed as Case 1.
np.random.seed(0)
# Fresh environment and agent so nothing learned in the previous cell is reused.
env = SnakeEnv(10, [3, 6])
agent = ModelFreeAgent(env)
# epsilon=0.05 is forwarded to agent.play for every action choice.
MonteCarlo.monte_carlo_opt(agent, env, epsilon=0.05)
# Print the score reported by eval_game, then the learned policy array.
print('return:', eval_game(env, agent))
print(agent.pi)

return: 88
[0 1 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 0 0
 0 0 1 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 0 0]
