In [1]:
import gym
import pandas as pd
from collections import defaultdict

In [2]:
# Blackjack environment shared by the cells below, which call reset/step on it directly.
env = gym.make('Blackjack-v0')

> **We have 2 methods:**

* First-Visit MC
* Every-Visit MC

>  In first-visit MC, we compute the return only for the first time a state is visited in an episode, whereas in every-visit MC we compute the return every time the state is visited in the episode. (This notebook estimates state values, so visits are counted per state; for action values the same rule applies to state-action pairs.)

In [3]:
def policy(state):
  """Fixed policy under evaluation: stand on a high player sum, otherwise hit.

  `state[0]` is the player's current card sum. Returns 0 (stand) when that
  sum is above 19, and 1 (hit) for every other sum.
  """
  player_sum = state[0]
  if player_sum > 19:
    return 0
  return 1

> Action 0 stands for **STAND**, and action 1 stands for **HIT**.

In [4]:
# Start a new hand; the observation is (player sum, dealer's showing card, usable ace).
state = env.reset()
print(state)

(21, 2, True)


> The state `(21, 2, True)` reads as follows: 21 is the sum of our cards, 2 is the dealer's one face-up card, and `True` means that we have a usable ace (it would be `False` if we did not have a usable ace).

In [5]:
# Action the policy selects for the state drawn by the previous cell.
print(policy(state))

0


In [6]:
# Apply action 0 (stand) to the current hand. In the recorded run, from state
# (21, 2, True), this returned reward 1.0 with done=True.
env.step(0)

((21, 2, True), 1.0, True, {})

**In the prediction method, we are given an input policy and we predict the value function of that policy. So we first define a policy function which acts as the input policy; that is, we define the input policy whose value function will be predicted in the upcoming steps.**

In [7]:
# Default cap on the number of steps sampled per episode.
num_timestep = 100

def generate_episode(policy, environment=None, max_steps=None):
  """Roll out one episode by following `policy`.

  Args:
    policy: callable mapping a state to an action.
    environment: object exposing `reset()` -> state and
      `step(action)` -> (next_state, reward, done, info). When omitted, the
      notebook-level `env` is used, which keeps existing calls working.
    max_steps: upper bound on the number of steps; defaults to the
      notebook-level `num_timestep`.

  Returns:
    A list of (state, action, reward) tuples, one per step taken. The list
    ends at the step where the environment reported `done`, or after
    `max_steps` steps if that never happened.
  """
  # Fall back to the notebook globals only when no override is supplied, so the
  # function can also be driven by a stand-in environment.
  sim = env if environment is None else environment
  step_limit = num_timestep if max_steps is None else max_steps

  episode = []
  state = sim.reset()

  for _ in range(step_limit):
    action = policy(state)
    next_state, reward, done, info = sim.step(action)

    # Record the state the action was taken in, not the state it led to.
    episode.append((state, action, reward))

    if done:
      break

    state = next_state

  return episode



In [8]:
# Sample one episode; the result is a list of (state, action, reward) tuples.
generate_episode(policy)

[((15, 10, True), 1, 0.0), ((15, 10, False), 1, -1.0)]

> **Computing the value function**

In [9]:
# Per-state accumulators for the Monte Carlo estimate:
# total_return[state] sums the returns observed from that state,
# N[state] counts how many times the state was visited.
total_return = defaultdict(float)
N = defaultdict(int)

In [10]:
# Number of episodes to sample for the every-visit estimate.
num_iterations = 500000


In [11]:
# Every-visit Monte Carlo prediction: each occurrence of a state in an episode
# contributes its return to that state's running total.
for i in range(num_iterations):
  episode = generate_episode(policy)
  states, actions, rewards = zip(*episode)

  for t in range(len(states)):
    state = states[t]

    # Undiscounted return: every reward from step t to the end of the episode.
    R = sum(rewards[t:])

    total_return[state] += R
    N[state] += 1

In [13]:
# Turn the {state: summed return} mapping into a two-column table.
# NOTE: this rebinds `total_return` from a dict to a DataFrame, so the cell is
# not safe to re-run without first re-creating the defaultdict.
total_return = pd.DataFrame(list(total_return.items()), columns=['state', 'total_return'])

In [14]:
# Turn the {state: visit count} mapping into a two-column table.
# NOTE: this rebinds `N` from a dict to a DataFrame, so the cell is not safe
# to re-run without first re-creating the defaultdict.
N = pd.DataFrame(list(N.items()), columns=['state', 'N'])

In [18]:
# Both tables are filled from the same loop, so their row counts should match.
len(N), len(total_return)

(280, 280)

In [15]:
# Join summed returns and visit counts on the shared 'state' column.
df = total_return.merge(N, on='state')

In [16]:
# Preview the first four rows of the merged table.
df.head(4)

Unnamed: 0,state,total_return,N
0,"(13, 6, True)",-139.0,446
1,"(14, 6, True)",-139.0,540
2,"(18, 6, True)",-298.0,744
3,"(12, 6, False)",-2452.0,4543


In [17]:
# Row count of the merged table (one row per state).
len(df)

280

In [19]:
# Monte Carlo value estimate: average return per visit of each state.
df['value'] = df['total_return'].div(df['N'])


In [20]:
# Preview the table again, now including the 'value' column.
df.head(4)

Unnamed: 0,state,total_return,N,value
0,"(13, 6, True)",-139.0,446,-0.311659
1,"(14, 6, True)",-139.0,540,-0.257407
2,"(18, 6, True)",-298.0,744,-0.400538
3,"(12, 6, False)",-2452.0,4543,-0.539731


In [21]:
# Estimated value of holding 21 against a dealer 9 with no usable ace.
df.loc[df['state'] == (21, 9, False), 'value'].values


array([0.93418202])

In [22]:
# Estimated value of holding 5 against a dealer 10 with no usable ace.
df.loc[df['state'] == (5, 10, False), 'value'].values


array([-0.49080087])

## Now for First-Visit MC method

In [32]:
# Fresh accumulators for the first-visit run. This also replaces the DataFrames
# that `total_return` and `N` were rebound to in the every-visit section.
total_return = defaultdict(float)
N = defaultdict(int)

In [33]:
# Confirm both accumulators start out empty.
total_return, N

(defaultdict(float, {}), defaultdict(int, {}))

In [34]:
# First-visit Monte Carlo prediction, run over a smaller batch of episodes than
# the every-visit section.
num_iterations = 10000

for i in range(num_iterations):
  episode = generate_episode(policy)
  states, actions, rewards = zip(*episode)

  for t in range(len(states)):
    state = states[t]

    # Only the first occurrence of a state within an episode contributes.
    if state in states[:t]:
      continue

    # Undiscounted return: every reward from step t to the end of the episode.
    R = sum(rewards[t:])

    total_return[state] += R
    N[state] += 1

In [35]:
# Turn the {state: summed return} mapping into a two-column table.
# NOTE: this rebinds `total_return` from a dict to a DataFrame, so the cell is
# not safe to re-run without first re-creating the defaultdict.
total_return = pd.DataFrame(list(total_return.items()), columns=['state', 'total_return'])


In [36]:
# Turn the {state: visit count} mapping into a two-column table.
# NOTE: this rebinds `N` from a dict to a DataFrame, so the cell is not safe
# to re-run without first re-creating the defaultdict.
N = pd.DataFrame(list(N.items()), columns=['state', 'N'])


In [37]:
# Join summed returns and first-visit counts on the shared 'state' column.
df = total_return.merge(N, on="state")


In [38]:
# Preview the first ten rows of the merged first-visit table.
df.head(10)


Unnamed: 0,state,total_return,N
0,"(21, 10, True)",177.0,193
1,"(8, 6, False)",-21.0,33
2,"(14, 6, False)",-51.0,86
3,"(16, 10, True)",-22.0,65
4,"(18, 10, False)",-321.0,425
5,"(12, 1, False)",-65.0,86
6,"(19, 1, False)",-80.0,103
7,"(16, 10, False)",-282.0,426
8,"(19, 10, False)",-366.0,496
9,"(20, 4, False)",100.0,162


In [39]:
# First-visit value estimate: average return per counted visit of each state.
df['value'] = df['total_return'].div(df['N'])


In [40]:
# Preview the table again, now including the 'value' column.
df.head(10)


Unnamed: 0,state,total_return,N,value
0,"(21, 10, True)",177.0,193,0.917098
1,"(8, 6, False)",-21.0,33,-0.636364
2,"(14, 6, False)",-51.0,86,-0.593023
3,"(16, 10, True)",-22.0,65,-0.338462
4,"(18, 10, False)",-321.0,425,-0.755294
5,"(12, 1, False)",-65.0,86,-0.755814
6,"(19, 1, False)",-80.0,103,-0.776699
7,"(16, 10, False)",-282.0,426,-0.661972
8,"(19, 10, False)",-366.0,496,-0.737903
9,"(20, 4, False)",100.0,162,0.617284


In [41]:
# First-visit estimate for holding 21 against a dealer 9 with no usable ace.
df.loc[df['state'] == (21, 9, False), 'value'].values


array([0.93846154])

In [42]:
# First-visit estimate for holding 5 against a dealer 10 with no usable ace.
df.loc[df['state'] == (5, 10, False), 'value'].values


array([-0.44827586])