# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran



In [1]:
import gym
import pandas as pd
from collections import defaultdict

Create a blackjack environment:

In [9]:
# Build the Gym 'Blackjack-v1' environment. This single `env` object is shared by the
# cells below (observation_space, reset) and is read as a global by generate_episode.
env = gym.make('Blackjack-v1')

In [10]:
# Inspect the observation space: a tuple of three discrete components (see the output below).
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [11]:
def policy(state):
    """Fixed threshold policy keyed on the first component of ``state``.

    Returns action 1 while ``state[0]`` is at most 19, and action 0 once it
    exceeds 19.

    NOTE(review): in Gym's Blackjack, ``state[0]`` is presumably the player's
    hand total and the actions presumably mean 0 = stick, 1 = hit -- confirm
    against the environment documentation.
    """
    if state[0] <= 19:
        return 1
    return 0

In [12]:
# env.reset() returns a tuple here; element 0 is the initial observation (the same
# element generate_episode reads via env.reset()[0]).
# NOTE(review): despite its name, `state` holds the whole reset() result, not just the
# observation -- hence the [0] when printing it and when passing it to policy().
state = env.reset()
print(state[0])

(11, 4, False)


In [13]:
# Ask the policy for its action on the initial observation (element 0 of the reset() result).
print(policy(state[0]))

1


In [14]:
# Upper bound on the number of steps in one episode; read as a global by generate_episode's loop.
num_timestep = 100

In [18]:
def generate_episode(policy, environment=None, max_steps=None):
    """Roll out one episode by following ``policy`` and return its transitions.

    Args:
        policy: Callable that maps a state to an action.
        environment: Optional environment to sample from. It must provide
            ``reset()``, whose result has the initial state at index 0, and
            ``step(action)``, which returns the 5-tuple
            ``(next_state, reward, done, truncate, info)``. When omitted, the
            notebook-level ``env`` is used.
        max_steps: Optional cap on the number of steps taken. When omitted,
            the notebook-level ``num_timestep`` is used.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, where
        ``state`` is the state in which ``action`` was chosen.
    """
    #fall back to the notebook globals only when no override is given, so that
    #existing generate_episode(policy) calls behave exactly as before
    environment = env if environment is None else environment
    max_steps = num_timestep if max_steps is None else max_steps

    #list that collects the (state, action, reward) tuples of the episode
    episode = []

    #initialize the state by resetting the environment
    state = environment.reset()[0]

    #then for each time step
    for _ in range(max_steps):

        #select the action according to the given policy
        action = policy(state)

        #perform the action and store the next state information
        next_state, reward, done, truncate, _info = environment.step(action)

        #record the state the action was taken in, not the state it led to
        episode.append((state, action, reward))

        #stop as soon as the environment reports the episode finished or was cut short
        if done or truncate:
            break

        state = next_state

    return episode

In [19]:
# Sample a single episode to inspect the (state, action, reward) tuples it returns.
generate_episode(policy)

  if not isinstance(terminated, (bool, np.bool8)):


[((12, 10, False), 1, 0.0), ((14, 10, False), 1, -1.0)]

In [20]:
# Per-state accumulators filled by the sampling loop below, keyed by the state tuple:
# total_return sums the returns credited to a state, N counts how many times it was credited.
# defaultdict lets the loop update a state it has not seen before without initialising it.
total_return = defaultdict(float)
N = defaultdict(int)

In [21]:
# Number of episodes the sampling loop below generates.
num_iterations = 10000

In [22]:
#then, for every iteration
for i in range(num_iterations):

    #generate an episode by following the policy function we defined earlier
    episode = generate_episode(policy)

    #an episode with no steps contributes nothing (and zip(*episode) could not be unpacked)
    if not episode:
        continue

    #split the episode into parallel tuples of states, actions and rewards
    states, actions, rewards = zip(*episode)

    #states already credited in this episode, for the first-visit check
    seen_states = set()

    #then, for each step in the episode; the loop variable is deliberately not named
    #`state`, so the notebook-level `state` set by the env.reset() cell is not overwritten
    for t, visited_state in enumerate(states):

        #only the first occurrence of a state within the episode is counted
        if visited_state in seen_states:
            continue
        seen_states.add(visited_state)

        #compute the return R of the state as the sum of the rewards from step t onward
        R = sum(rewards[t:])

        #update the total_return of the state
        total_return[visited_state] += R

        #update the number of times the state has been counted
        N[visited_state] += 1

In [23]:
# Turn the {state: summed return} mapping into a two-column table.
# NOTE(review): this rebinds `total_return` from the defaultdict to a DataFrame, so the
# cell is not idempotent -- re-create the defaultdict before re-running the sampling loop
# or this cell.
total_return = pd.DataFrame(total_return.items(),columns=['state', 'total_return'])

In [24]:
# Turn the {state: count} mapping into a two-column table.
# NOTE(review): this rebinds `N` from the defaultdict to a DataFrame, so the cell is not
# idempotent -- re-create the defaultdict before re-running the sampling loop or this cell.
N = pd.DataFrame(N.items(),columns=['state', 'N'])

In [25]:
# Join the two tables on `state` so each row carries both the summed return and the count.
df = pd.merge(total_return, N, on="state")

In [26]:
# Preview the first ten rows of the merged table.
df.head(10)

Unnamed: 0,state,total_return,N
0,"(18, 1, False)",-82.0,118
1,"(20, 10, False)",257.0,575
2,"(11, 10, False)",-23.0,171
3,"(14, 10, False)",-250.0,390
4,"(17, 4, False)",-96.0,128
5,"(20, 4, False)",81.0,138
6,"(15, 10, False)",-292.0,428
7,"(13, 10, False)",-255.0,411
8,"(16, 10, False)",-282.0,429
9,"(17, 10, False)",-331.0,456


In [27]:
# Estimated value of each state: the average return, i.e. summed return divided by the count.
df['value'] = df['total_return']/df['N']

In [28]:
# Preview the table again, now including the `value` column.
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(18, 1, False)",-82.0,118,-0.694915
1,"(20, 10, False)",257.0,575,0.446957
2,"(11, 10, False)",-23.0,171,-0.134503
3,"(14, 10, False)",-250.0,390,-0.641026
4,"(17, 4, False)",-96.0,128,-0.75
5,"(20, 4, False)",81.0,138,0.586957
6,"(15, 10, False)",-292.0,428,-0.682243
7,"(13, 10, False)",-255.0,411,-0.620438
8,"(16, 10, False)",-282.0,429,-0.657343
9,"(17, 10, False)",-331.0,456,-0.725877


In [29]:
# Estimated value of state (21, 9, False): pick the matching rows and the `value`
# column in a single .loc call instead of chaining two indexing steps.
df.loc[df['state'] == (21, 9, False), 'value'].values

array([0.91549296])

In [30]:
# Estimated value of state (5, 8, False): pick the matching rows and the `value`
# column in a single .loc call instead of chaining two indexing steps.
df.loc[df['state'] == (5, 8, False), 'value'].values

array([-0.69230769])