# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran



In [49]:
import gym
import pandas as pd
from collections import defaultdict

In [50]:
# build the 'Blackjack-v1' environment through gym.make; the resulting `env`
# is the module-level environment that generate_episode resets and steps
env = gym.make('Blackjack-v1')

In [51]:
def policy(state):
    """Return action 0 when ``state[0]`` is above 19, otherwise action 1.

    Only the first element of ``state`` is inspected. For Blackjack-v1 this
    presumably means "stick on 20 or 21, hit on anything lower" -- confirm the
    action encoding against the environment documentation.
    """
    threshold = 19
    if state[0] > threshold:
        return 0
    return 1

In [52]:
# reset the environment; the whole return value of reset() is kept in `state`,
# and its first element (the initial observation) is what gets printed
state = env.reset()
print(state[0])

(5, 1, False)


In [53]:
# show which action the policy picks for the observation stored in state[0]
print(policy(state[0]))

1


In [54]:
# upper bound on the number of steps generate_episode takes in one episode
num_timestep = 100

In [55]:
# display the environment's observation space
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [56]:
def generate_episode(policy):
    """Roll out one episode in the module-level ``env`` using ``policy``.

    ``policy`` is called with the current state and must return the action
    that is passed to ``env.step``. At most ``num_timestep`` steps are taken;
    the rollout ends earlier as soon as the environment reports ``done`` or
    ``truncated``.

    Returns a list of ``(state, action, reward)`` tuples, one per step taken,
    where ``state`` is the state in which the action was chosen.
    """
    trajectory = []

    # element 0 of the reset() result is used as the initial state
    current = env.reset()[0]

    for _ in range(num_timestep):
        chosen = policy(current)

        # step() yields (next_state, reward, done, truncated, info)
        following, gained, finished, cut_short, _info = env.step(chosen)

        # record the transition before checking for the end of the episode,
        # so the reward of the final step is always part of the result
        trajectory.append((current, chosen, gained))

        if finished or cut_short:
            return trajectory

        # not finished yet: continue from the state we just reached
        current = following

    return trajectory

In [57]:
# sample a single episode under the policy to inspect its
# (state, action, reward) entries
generate_episode(policy)

  if not isinstance(terminated, (bool, np.bool8)):


[((12, 8, False), 1, -1.0)]

In [58]:
# per-state accumulators for the Monte Carlo loop; defaultdict makes a state
# that has not been seen yet start from zero
# total_return: sum of the returns observed from each state
total_return = defaultdict(float)
# N: number of times each state has been visited
N = defaultdict(int)

In [59]:
# number of episodes sampled by the Monte Carlo accumulation loop
num_iterations = 500000

In [60]:
# Monte Carlo prediction: sample num_iterations episodes and accumulate, for
# every state occurrence, the return that followed it
for _iteration in range(num_iterations):

    # roll out one episode with the fixed policy and split it into parallel
    # sequences of states, actions (not needed here) and rewards
    visited_states, _actions, step_rewards = zip(*generate_episode(policy))

    for step, visited in enumerate(visited_states):

        # undiscounted return: sum of the rewards from this step to the end
        episode_return = sum(step_rewards[step:])

        # add the return to the state's running total and count the visit
        total_return[visited] += episode_return
        N[visited] += 1

In [61]:
# flatten the per-state return totals into a two-column table;
# note that this rebinds `total_return` from the dict to a DataFrame
total_return = pd.DataFrame(
    list(total_return.items()),
    columns=['state', 'total_return'],
)

In [62]:
# flatten the per-state visit counts into a two-column table;
# note that this rebinds `N` from the dict to a DataFrame
N = pd.DataFrame(
    list(N.items()),
    columns=['state', 'N'],
)

In [63]:
# combine the two tables on the shared 'state' column so that every row
# carries both the total return and the visit count of a state
df = pd.merge(left=total_return, right=N, on="state")

In [64]:
# preview the first 10 rows of the merged table
df.head(10)

Unnamed: 0,state,total_return,N
0,"(19, 10, False)",-16194.0,21948
1,"(20, 8, False)",5922.0,7463
2,"(13, 10, False)",-11167.0,18763
3,"(18, 10, True)",-1410.0,3095
4,"(16, 10, False)",-14122.0,20691
5,"(20, 1, False)",1080.0,7630
6,"(13, 9, True)",-76.0,484
7,"(20, 9, True)",743.0,966
8,"(19, 3, False)",-4049.0,5532
9,"(18, 1, False)",-4217.0,5458


In [65]:
# value estimate of a state = its total return divided by its visit count
df['value'] = df['total_return'].div(df['N'])

In [66]:
# preview the first 10 rows of df, which now include the 'value' column
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(19, 10, False)",-16194.0,21948,-0.737835
1,"(20, 8, False)",5922.0,7463,0.793515
2,"(13, 10, False)",-11167.0,18763,-0.595161
3,"(18, 10, True)",-1410.0,3095,-0.455574
4,"(16, 10, False)",-14122.0,20691,-0.682519
5,"(20, 1, False)",1080.0,7630,0.141547
6,"(13, 9, True)",-76.0,484,-0.157025
7,"(20, 9, True)",743.0,966,0.769151
8,"(19, 3, False)",-4049.0,5532,-0.731923
9,"(18, 1, False)",-4217.0,5458,-0.772627


In [67]:
# look up the estimated value of state (21, 9, False)
df.loc[df['state'] == (21, 9, False), 'value'].values

array([0.94389262])

In [68]:
# look up the estimated value of state (5, 8, False)
df.loc[df['state'] == (5, 8, False), 'value'].values

array([-0.46205357])