In [1]:
import gym

In [2]:
# Build the Blackjack environment that the cells below interact with.
env = gym.make("Blackjack-v0")

In [3]:
# Reset the environment and print the initial state it returns.
print(env.reset())

(16, 2, False)


In [5]:
#State
#1. The value of the sum of our cards
#2. The face value of one of the dealer's cards
#3. Boolean value—True if we have a usable ace and False if we don't have a
#usable ace

In [4]:
# Print the environment's action space to see which actions are available.
print(env.action_space)

Discrete(2)


In [6]:
#Action
# The action stand is represented by 0
# The action hit is represented by 1

In [7]:
#Reward
# +1.0 reward if we win the game
# -1.0 reward if we lose the game
# 0 reward if the game is a draw

In [8]:
import pandas as pd
from collections import defaultdict

In [9]:
#Policy: if state[0], the sum of our cards, is greater than 19, return action 0
#(stand); otherwise return action 1 (hit).

def policy(state):
    """Threshold policy keyed on the first entry of ``state``.

    Args:
        state: Indexable whose element 0 is compared against 19.

    Returns:
        0 when ``state[0]`` is strictly greater than 19, otherwise 1.
    """
    player_sum = state[0]
    if player_sum > 19:
        return 0
    return 1

In [10]:
# Reset the environment and keep the initial state for the next cells.
state = env.reset()

In [11]:
# Display the state obtained from the reset.
state

(13, 4, False)

In [12]:
# Action the policy selects for the current state.
policy(state)

1

In [14]:
num_timesteps=100
def generate_episode(policy, environment=None, max_steps=None):
    """Roll out a single episode by following ``policy``.

    Args:
        policy: Callable that maps a state to an action.
        environment: Object exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done, info)``. When omitted,
            the notebook-level ``env`` is used.
        max_steps: Upper bound on the number of steps taken. When omitted,
            the notebook-level ``num_timesteps`` is used.

    Returns:
        A list of ``(state, action, reward)`` tuples in the order the steps
        were taken. The list is cut short at ``max_steps`` if the environment
        has not reported ``done`` by then.
    """
    # Resolve the defaults at call time so that generate_episode(policy)
    # keeps using whatever `env` / `num_timesteps` currently are.
    if environment is None:
        environment = env
    if max_steps is None:
        max_steps = num_timesteps

    episode = []
    state = environment.reset()

    for _ in range(max_steps):
        action = policy(state)
        next_state, reward, done, _info = environment.step(action)
        # Record the state the action was taken FROM, not the one it led to.
        episode.append((state, action, reward))

        if done:
            break
        state = next_state

    return episode

In [22]:
# Print one sample episode: a list of (state, action, reward) tuples.
print(generate_episode(policy))

[((5, 8, False), 1, 0.0), ((10, 8, False), 1, 0.0), ((17, 8, False), 1, -1.0)]


In [24]:
# Monte Carlo prediction: for every state, accumulate the sum of the returns
# observed after visiting it and the number of such visits.
total_return = defaultdict(float)
N = defaultdict(int)
num_iterations = 500000

for _ in range(num_iterations):
    states, _actions, rewards = zip(*generate_episode(policy))

    for step, visited in enumerate(states):
        # Return from this step = sum of every reward from here to episode end.
        total_return[visited] += sum(rewards[step:])
        N[visited] += 1

# Turn both dictionaries into data frames and join them on the state column.
total_return = pd.DataFrame(list(total_return.items()), columns=['State', 'total_return'])
N = pd.DataFrame(list(N.items()), columns=['State', 'N'])
df = total_return.merge(N, on="State")
df.head(10)

Unnamed: 0,State,total_return,N
0,"(20, 10, False)",12820.0,29855
1,"(18, 3, False)",-3836.0,5469
2,"(13, 10, False)",-11215.0,18792
3,"(18, 10, False)",-15912.0,21844
4,"(14, 10, False)",-12302.0,19378
5,"(19, 10, False)",-16626.0,22223
6,"(21, 10, True)",8356.0,9361
7,"(7, 2, False)",-554.0,974
8,"(18, 2, True)",-310.0,762
9,"(16, 2, False)",-3368.0,5106


In [25]:
# Estimated state value = accumulated return divided by the visit count.
df['value'] = df['total_return'].div(df['N'])

In [26]:
# Show the first ten rows of the table.
df.head(10)

Unnamed: 0,State,total_return,N,value
0,"(20, 10, False)",12820.0,29855,0.429409
1,"(18, 3, False)",-3836.0,5469,-0.701408
2,"(13, 10, False)",-11215.0,18792,-0.596797
3,"(18, 10, False)",-15912.0,21844,-0.728438
4,"(14, 10, False)",-12302.0,19378,-0.634844
5,"(19, 10, False)",-16626.0,22223,-0.748144
6,"(21, 10, True)",8356.0,9361,0.89264
7,"(7, 2, False)",-554.0,974,-0.568789
8,"(18, 2, True)",-310.0,762,-0.406824
9,"(16, 2, False)",-3368.0,5106,-0.659616


In [28]:
# Estimated value of the state (21, 9, False).
df.loc[df['State'] == (21, 9, False), 'value'].values

array([0.9340515])

In [29]:
# Estimated value of the state (5, 8, False).
df.loc[df['State'] == (5, 8, False), 'value'].values

array([-0.40646651])