In [1]:
import sys
import gym
import numpy as np
from collections import defaultdict

In [3]:
# Create the Blackjack environment and print the structure of its observation space.
env = gym.make('Blackjack-v0')
print(env.observation_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))


#### Random policy check

In [22]:
# Play a few episodes with randomly sampled actions to sanity-check the environment.
for i_episode in range(5):
    state = env.reset()
    while True:
        print(state)
        action = env.action_space.sample()
        # Action 0 is reported as "Stick"; any other action is reported as "hit".
        print('Stick' if action == 0 else 'hit')
        state, reward, done, info = env.step(action)
        if done:
            print('Reward', reward)
            # A terminal reward of zero is a draw, so it must not be reported as a loss.
            if reward > 0:
                print('You won')
            elif reward < 0:
                print('You lost')
            else:
                print('Draw')
            break

(12, 2, False)
Stick
Reward -1.0
You lost
(12, 8, False)
hit
(21, 8, False)
Stick
Reward 1.0
You won
(17, 10, False)
hit
Reward -1
You lost


## Monte Carlo Prediction of Action Value Function

#### Policy: stick with 80% probability when sum > 18; hit with 80% probability when sum <= 18

In [4]:
def generate_episode(bj_env):
    """Play one episode under a fixed stochastic policy and record it.

    The policy looks only at the first component of the state: when it is
    greater than 18, action 0 is chosen with probability 0.8 and action 1
    with probability 0.2; otherwise the probabilities are swapped.

    Parameters
    ----------
    bj_env : environment object
        Must provide ``reset()`` returning the initial state and
        ``step(action)`` returning ``(next_state, reward, done, info)``.

    Returns
    -------
    list
        ``(state, action, reward)`` tuples in the order they occurred, where
        ``state`` is the state in which the action was taken.
    """
    trajectory = []
    current = bj_env.reset()
    finished = False

    while not finished:
        # Pick the action distribution from the first state component.
        if current[0] > 18:
            weights = [0.8, 0.2]
        else:
            weights = [0.2, 0.8]

        chosen = np.random.choice(np.arange(2), p=weights)
        successor, payoff, finished, _ = bj_env.step(chosen)

        # Record the state the action was taken from, not the resulting state.
        trajectory.append((current, chosen, payoff))
        current = successor

    return trajectory

In [5]:
# Sample and print three episodes to inspect the (state, action, reward) format.
for _ in range(3):
    sampled_episode = generate_episode(env)
    print(sampled_episode)

[((13, 10, False), 1, -1)]
[((9, 10, False), 1, 0), ((16, 10, False), 1, 0), ((18, 10, False), 1, -1)]
[((15, 3, False), 0, -1.0)]


In [6]:
def action_val_pred(env, num_episodes, generate_episode, gamma=1.0):
    """Estimate action values with every-visit Monte Carlo prediction.

    For each sampled episode, the discounted return following every
    (state, action) occurrence is added to a running sum, and the estimate
    for that pair is the running sum divided by its visit count.

    Parameters
    ----------
    env : environment object
        Must expose ``action_space.n``; it is passed unchanged to
        ``generate_episode``.
    num_episodes : int
        Number of episodes to sample.
    generate_episode : callable
        Called as ``generate_episode(env)``; must return an iterable of
        ``(state, action, reward)`` tuples, with hashable states and actions
        usable as array indices. An empty episode is skipped.
    gamma : float, optional
        Discount factor applied per time step (default 1.0).

    Returns
    -------
    defaultdict
        Maps each visited state to an array of length ``env.action_space.n``
        holding the estimated value of each action; unvisited entries are 0.
    """
    n_actions = env.action_space.n

    # Per-state arrays: accumulated returns, visit counts, and value estimates.
    returns_sum = defaultdict(lambda: np.zeros(n_actions))
    N = defaultdict(lambda: np.zeros(n_actions))
    Q = defaultdict(lambda: np.zeros(n_actions))

    for i_episode in range(1, num_episodes + 1):

        # monitor progress
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        episode = list(generate_episode(env))
        if not episode:
            # Nothing was visited, so there is nothing to update.
            continue

        # Compute the return from every time step in one backward pass:
        # G_t = r_t + gamma * G_{t+1}, with G past the final step equal to 0.
        returns = [0.0] * len(episode)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][2] + gamma * G
            returns[t] = G

        # Every-visit update, applied in the order the steps occurred.
        for (state, action, _reward), G_t in zip(episode, returns):
            returns_sum[state][action] += G_t
            N[state][action] += 1.0
            Q[state][action] = returns_sum[state][action] / N[state][action]

    return Q

In [9]:
# Estimate action values from 500 episodes sampled with generate_episode (gamma left at its default).
Q = action_val_pred(env, 500, generate_episode)


In [10]:
# Display the estimates: each state maps to an array indexed by action.
Q

defaultdict(<function __main__.action_val_pred.<locals>.<lambda>()>,
            {(20, 3, False): array([ 0.75, -1.  ]),
             (13, 10, False): array([-0.5       , -0.73684211]),
             (17, 10, False): array([-0.25      , -0.33333333]),
             (21, 10, False): array([ 0.72727273, -1.        ]),
             (14, 1, False): array([ 0.        , -0.66666667]),
             (8, 5, False): array([ 0., -1.]),
             (17, 5, False): array([ 0., -1.]),
             (17, 1, True): array([0., 0.]),
             (19, 1, True): array([0., 0.]),
             (21, 6, True): array([ 0., -1.]),
             (16, 6, False): array([-1., -1.]),
             (17, 7, False): array([ 0. , -0.2]),
             (21, 7, False): array([1., 0.]),
             (21, 1, True): array([ 0., -1.]),
             (12, 1, False): array([-1. , -0.8]),
             (16, 1, False): array([ 0., -1.]),
             (17, 1, False): array([-1., -1.]),
             (12, 2, True): array([ 0., -1.]),
    