## Monte Carlo Methods

- MC Prediction using Every-Visit
- MC Prediction using First-Visit
- MC Control - On Policy

## 1. MC Prediction using Every-Visit

In [1]:
import gym
import pandas as pd
from collections import defaultdict

In [2]:
def policy(state):
    """Pick an action from the first component of ``state``.

    Returns 0 when ``state[0]`` is greater than 19 and 1 otherwise.
    NOTE(review): in Gym's Blackjack these presumably mean 0 = stick and
    1 = hit, with ``state[0]`` the player's current sum -- confirm against
    the environment documentation.
    """
    if state[0] > 19:
        return 0
    return 1

# Upper bound on the number of steps taken in a single episode.
num_timestep = 100


def generate_episode(policy):
    """Roll out one episode in the global ``env`` by following ``policy``.

    The environment is reset, then ``policy`` is queried for an action at
    every step until the environment reports ``done`` or ``num_timestep``
    steps have been taken.

    Args:
        policy: callable mapping a state to an action.

    Returns:
        A list of ``(state, action, reward)`` tuples, where ``state`` is the
        state in which ``action`` was taken and ``reward`` is the reward
        returned by that step.
    """
    trajectory = []
    current = env.reset()
    for _ in range(num_timestep):
        chosen = policy(current)
        # Only the successor state, reward and termination flag are needed;
        # the fourth value returned by step() is discarded.
        successor, reward, done, _info = env.step(chosen)
        # Record the transition against the state the action was taken in.
        trajectory.append((current, chosen, reward))
        if done:
            break
        current = successor
    return trajectory

In [3]:
# Create the Blackjack environment; generate_episode reads this global `env`.
env = gym.make('Blackjack-v0')
# Start a fresh episode and display its initial state.
# NOTE(review): the state is presumably (player sum, dealer's showing card,
# usable-ace flag) -- confirm against the Gym Blackjack documentation.
state = env.reset()
print(state)

(15, 7, False)


In [4]:
# Action the policy selects for the initial state: 0 if state[0] > 19, else 1.
print(policy(state))

1


In [5]:
# Sample a single episode; the result is a list of (state, action, reward) tuples.
generate_episode(policy)

[((6, 5, False), 1, 0.0),
 ((9, 5, False), 1, 0.0),
 ((18, 5, False), 1, 0.0),
 ((20, 5, False), 0, 1.0)]

In [6]:
# Every-visit Monte Carlo prediction: for each state, accumulate the return
# observed after *every* occurrence of that state, plus the occurrence count.
num_iterations = 50000
total_return = defaultdict(float)  # state -> sum of returns seen from it
N = defaultdict(int)               # state -> number of times it was visited

for _ in range(num_iterations):
    # An episode is the list of (state, action, reward) tuples collected
    # until the agent reaches the terminal state.
    episode = generate_episode(policy)
    states, actions, rewards = zip(*episode)

    for t, state in enumerate(states):
        # Undiscounted return: sum of rewards from step t to the end.
        total_return[state] += sum(rewards[t:])
        N[state] += 1

# Tabulate both accumulators and join them on the state so that each row
# holds a state's total return alongside its visit count.
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])
N = pd.DataFrame(N.items(), columns=['state', 'N'])
df = total_return.merge(N, on="state")
df.head()

Unnamed: 0,state,total_return,N
0,"(5, 7, False)",-18.0,56
1,"(15, 7, False)",-382.0,567
2,"(12, 10, False)",-1056.0,1853
3,"(20, 10, False)",1236.0,2951
4,"(14, 10, False)",-1228.0,1959


In [7]:
# (rows, columns) of the merged table; there is one row per distinct state.
df.shape

(280, 3)

In [8]:
# Value estimate of a state = its average return = total return / visit count.
df['value'] = df['total_return']/df['N']
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(5, 7, False)",-18.0,56,-0.321429
1,"(15, 7, False)",-382.0,567,-0.673721
2,"(12, 10, False)",-1056.0,1853,-0.569887
3,"(20, 10, False)",1236.0,2951,0.418841
4,"(14, 10, False)",-1228.0,1959,-0.62685
5,"(13, 10, False)",-1124.0,1832,-0.613537
6,"(17, 10, False)",-1440.0,2076,-0.693642
7,"(14, 7, False)",-285.0,480,-0.59375
8,"(18, 7, False)",-407.0,528,-0.770833
9,"(9, 1, False)",-80.0,126,-0.634921


In [9]:
# Estimated value of the state (21, 9, False).
df[df['state']==(21,9,False)]['value'].values

array([0.95064935])

In [10]:
# Estimated value of the state (5, 8, False).
df[df['state']==(5,8,False)]['value'].values

array([-0.62962963])

## 2. MC Prediction using First-Visit

In [11]:
# First-visit Monte Carlo prediction: within each episode only the first
# occurrence of a state contributes its return and increments its count.
total_return = defaultdict(float)  # state -> sum of first-visit returns
N = defaultdict(int)               # state -> number of episodes counted for it
num_iterations = 50000

for i in range(num_iterations):

    # episode - all (state, action, reward) tuples recorded until the agent
    # reaches the terminal state
    episode = generate_episode(policy)
    states, actions, rewards = zip(*episode)

    # States already counted in this episode. A set gives a constant-time
    # membership test instead of re-scanning a copy of states[0:t] each step.
    visited = set()
    for t, state in enumerate(states):
        # skip the state if it was already visited earlier in this episode
        if state in visited:
            continue
        visited.add(state)
        # sum rewards - starting from step t until the terminal state
        R = sum(rewards[t:])
        total_return[state] += R
        N[state] += 1

# Tabulate both accumulators and join them on the state so that each row
# holds a state's total return alongside its first-visit count.
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])
N = pd.DataFrame(N.items(), columns=['state', 'N'])
df = pd.merge(total_return, N, on="state")
df.head()

Unnamed: 0,state,total_return,N
0,"(21, 10, True)",788.0,877
1,"(11, 10, False)",-79.0,886
2,"(13, 10, False)",-1082.0,1895
3,"(5, 10, False)",-81.0,185
4,"(16, 10, True)",-99.0,253


In [12]:
# (rows, columns) of the merged first-visit table; one row per distinct state.
df.shape

(280, 3)

In [13]:
# Value estimate of a state = its average return = total return / count.
df['value'] = df['total_return']/df['N']
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(21, 10, True)",788.0,877,0.898518
1,"(11, 10, False)",-79.0,886,-0.089165
2,"(13, 10, False)",-1082.0,1895,-0.570976
3,"(5, 10, False)",-81.0,185,-0.437838
4,"(16, 10, True)",-99.0,253,-0.391304
5,"(15, 10, False)",-1256.0,1974,-0.636272
6,"(16, 10, False)",-1401.0,2057,-0.681089
7,"(20, 10, False)",1355.0,2953,0.458855
8,"(16, 4, False)",-345.0,513,-0.672515
9,"(16, 1, False)",-381.0,537,-0.709497


In [14]:
# Estimated value of the state (21, 9, False).
df[df['state']==(21,9,False)]['value'].values

array([0.95180723])

In [15]:
# Estimated value of the state (5, 8, False).
df[df['state']==(5,8,False)]['value'].values

array([-0.37931034])