# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran



In [1]:
import gym
import pandas as pd
from collections import defaultdict
import random

In [2]:
# Blackjack environment created through gym; the policy and the episode
# generator in this file read it as a module-level global.
env = gym.make('Blackjack-v1')

In [3]:
# Q table keyed by (state, action) tuples; a pair that has never been
# written reads as 0.0 because of the float default factory.
Q = defaultdict(float)

In [4]:
# Running sum of the returns accumulated for each (state, action) pair;
# starts at 0.0 for a pair that has not been seen yet.
total_return = defaultdict(float)

In [5]:
# Number of returns accumulated so far for each (state, action) pair;
# used as the divisor when averaging total_return into Q.
N = defaultdict(int)

In [6]:
def epsilon_greedy_policy(state, Q, epsilon=0.5):
    """Select an action for ``state`` using an epsilon-greedy rule.

    A value is drawn uniformly from [0, 1]. If it is below ``epsilon`` a
    random action is sampled from the action space of the module-level
    ``env`` (exploration); otherwise the action with the largest
    ``Q[(state, action)]`` is returned (exploitation). When several actions
    share the maximum, the lowest action index wins because ``max`` keeps
    the first maximal element it encounters.

    Args:
        state: Current environment state; combined with an action to form
            the lookup key ``(state, action)`` in ``Q``.
        Q: Mapping from ``(state, action)`` to the estimated value. It must
            return a value for every action index of ``env``; a
            ``defaultdict(float)`` does so and creates the missing entries
            on lookup.
        epsilon: Probability of exploring. Defaults to 0.5, the value that
            used to be hard-coded in this function.

    Returns:
        The chosen action.
    """
    #explore: with probability epsilon pick a random action
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    #exploit: pick the action whose Q value is the largest for this state
    return max(range(env.action_space.n), key=lambda action: Q[(state, action)])

In [7]:
# Upper bound on the number of steps taken while generating one episode.
num_timesteps = 100

In [10]:
def generate_episode(Q):
    """Play one episode in ``env`` and record what happened at every step.

    Actions are chosen by ``epsilon_greedy_policy`` using the supplied ``Q``.
    The rollout stops as soon as the environment reports that the episode
    ended or was truncated, or after ``num_timesteps`` steps, whichever
    comes first.

    Args:
        Q: Mapping from ``(state, action)`` to estimated value, forwarded
            unchanged to ``epsilon_greedy_policy``.

    Returns:
        A list of ``(state, action, reward)`` tuples in the order visited.
    """
    trajectory = []

    #reset returns a sequence whose first element is the initial state
    current = env.reset()[0]

    for _ in range(num_timesteps):
        #ask the policy for an action and apply it to the environment
        chosen = epsilon_greedy_policy(current, Q)
        successor, payoff, finished, cut_short, _info = env.step(chosen)

        #record the state we acted from, not the one we landed in
        trajectory.append((current, chosen, payoff))

        #the episode is over: nothing more to collect
        if finished or cut_short:
            return trajectory

        current = successor

    #step budget exhausted without reaching a final state
    return trajectory

In [11]:
# Number of episodes generated by the Monte Carlo update loop.
num_iterations = 50000

In [12]:
#first-visit Monte Carlo update: for every generated episode, the return that
#follows the first occurrence of each (state, action) pair is folded into the
#running average stored in Q
for i in range(num_iterations):

    #generate an episode with the current Q estimates
    episode = generate_episode(Q)

    #rewards in time order, used to compute the return from any step onwards
    rewards = [r for (s, a, r) in episode]

    #state-action pairs already encountered earlier in this episode; a set
    #gives a constant-time membership test instead of copying and scanning a
    #list slice at every step
    visited = set()

    #for each step in the episode
    for t, (state, action, reward) in enumerate(episode):
        pair = (state, action)

        #only the first occurrence of a pair within the episode contributes
        if pair in visited:
            continue
        visited.add(pair)

        #return R of the pair: the plain (undiscounted) sum of the rewards
        #from this step to the end of the episode
        R = sum(rewards[t:])

        #accumulate the return and the visit count of the pair
        total_return[pair] += R
        N[pair] += 1

        #Q is the average of all returns recorded for the pair so far
        Q[pair] = total_return[pair] / N[pair]

  if not isinstance(terminated, (bool, np.bool8)):


In [13]:
# Tabulate the Q table: one row per entry, holding the (state, action) key
# and the value stored for it.
df = pd.DataFrame(Q.items(),columns=['state_action pair','value'])

In [15]:
# Display the first 20 rows of the table.
df.head(20)

Unnamed: 0,state_action pair,value
0,"((16, 6, False), 0)",-0.160714
1,"((12, 10, False), 0)",-0.563895
2,"((15, 2, False), 0)",-0.225352
3,"((15, 2, False), 1)",-0.56
4,"((20, 5, False), 1)",-0.934783
5,"((5, 6, False), 0)",0.0
6,"((12, 8, False), 0)",-0.536842
7,"((14, 1, False), 0)",-0.8125
8,"((14, 1, False), 1)",-0.666667
9,"((20, 3, False), 0)",0.604278
