In [2]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [3]:
# Single environment instance shared by the training call further down.
env = BlackjackEnv()

In [4]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Build an epsilon-greedy action-probability function on top of a Q-table.

    Args:
        Q: Mapping from state to a numpy array of nA action-values.
            It is looked up on every call, so later updates to Q are
            reflected by the returned policy.
        epsilon: Probability mass spread uniformly over all actions.
            Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that maps an observation to a numpy array of length nA
        holding the selection probability of each action.
    """
    def policy_fn(observation):
        # Every action gets an equal share of the exploration mass ...
        action_probs = np.full(nA, 1.0, dtype=float) * epsilon / nA
        # ... and the greedy action (first index on ties) gets the rest.
        greedy_action = np.argmax(Q[observation])
        action_probs[greedy_action] += (1.0 - epsilon)
        return action_probs

    return policy_fn

In [5]:
# Query the action probabilities for a single observation.
# NOTE(review): `policy` is only bound by the mc_control_epsilon_greedy call
# in a later cell, so on a fresh kernel this cell raises NameError (see the
# recorded output). Run it after training, or move it below that cell.
policy((20,12,True))

NameError: name 'policy' is not defined

In [6]:
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    On-policy first-visit Monte Carlo control using epsilon-greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI gym style environment exposing reset(), step(action)
            and action_space.n. Observations must be hashable because they
            are used directly as dictionary keys.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> numpy array of action values.
        policy is a function that takes an observation as an argument and
        returns action probabilities.
    """

    # Running sum and count of first-visit returns for each (state, action)
    # pair, used to maintain an average. We could store every return
    # (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # Maps state -> numpy array with one action-value per action.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following. It reads Q on every call, so updating Q
    # below improves the policy implicitly.
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(1, num_episodes + 1):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is a list of (state, action, reward) tuples,
        # truncated after 100 steps if the environment has not terminated.
        episode = []
        state = env.reset()
        for _step in range(100):
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Compute the discounted return following every time step in a single
        # backward pass: G_t = r_t + gamma * G_{t+1}. Because we walk the
        # episode backwards, the last value written for a (state, action)
        # pair comes from its earliest time step, i.e. its first visit.
        G = 0.0
        first_visit_return = {}
        for visited_state, visited_action, visited_reward in reversed(episode):
            G = visited_reward + discount_factor * G
            first_visit_return[(visited_state, visited_action)] = G

        # Fold each first-visit return into the running average for its pair.
        for sa_pair, first_return in first_visit_return.items():
            visited_state, visited_action = sa_pair
            returns_sum[sa_pair] += first_return
            returns_count[sa_pair] += 1.0
            Q[visited_state][visited_action] = returns_sum[sa_pair] / returns_count[sa_pair]

        # The policy is improved implicitly by changing the Q dictionary

    return Q, policy

In [22]:
# Train for 500k episodes; binds the notebook-level `Q` and `policy` that the
# inspection and plotting cells rely on.
Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

Episode 500000/500000.

In [7]:
# Display the learned action-value table.
# NOTE(review): needs the training cell to have run in the current kernel —
# the recorded NameError comes from executing this cell on a fresh kernel.
Q

NameError: name 'Q' is not defined

In [8]:
# For plotting: Create value function from action-value function
# by picking the best action at each state
# For plotting: build one value surface per action index from the
# action-value function, V_a(s) = Q[s][a], and plot each one separately.
# A fresh dict is used for every action so no values leak between plots.
# NOTE(review): indices 0 and 1 presumably map to the two Blackjack actions —
# confirm the meaning of each index against BlackjackEnv.
for action_idx in (0, 1):
    V = defaultdict(float)
    for state, actions in Q.items():
        V[state] = actions[action_idx]
    plotting.plot_value_function(
        V, title="Action-Value Function (action {})".format(action_idx))

NameError: name 'Q' is not defined

In [26]:
import matplotlib.pyplot as plt

In [1]:
# Flatten the learned Q-table into coordinate arrays: one point per state,
# using the first two state components as the Y / X axes.
# NOTE(review): states appear to be 3-tuples; states that differ only in
# s[2] share the same (X, Y) point, so consider filtering on s[2] — confirm.
Y = np.array([s[0] for s, a in Q.items()])
X = np.array([s[1] for s, a in Q.items()])
Z = np.array([np.max(a) for s, a in Q.items()])   # greedy value per state
Z1 = np.array([a[0] for s, a in Q.items()])       # value of action 0
Z2 = np.array([a[1] for s, a in Q.items()])       # value of action 1

fig = plt.figure(figsize=(16,10))
# Figure.gca(projection=...) is deprecated/removed in recent Matplotlib;
# add_subplot creates the 3-D axes explicitly.
ax = fig.add_subplot(111, projection='3d')

# X, Y, Z* are 1-D scattered samples, so use plot_trisurf (which triangulates
# them); plot_surface requires 2-D gridded arrays and rejects this data.
# Translucency lets both action surfaces stay visible where they cross.
ax.plot_trisurf(X, Y, Z1, alpha=0.6)
ax.plot_trisurf(X, Y, Z2, alpha=0.6)

NameError: name 'np' is not defined

In [None]:
# Optional Mayavi-based 3-D rendering (extra dependency, not needed for the
# matplotlib plots above).
from mayavi import mlab
# Presumably switches Mayavi to in-notebook rendering — confirm against the
# Mayavi Jupyter documentation for the installed version.
mlab.init_notebook()
# fig = mlab.figure()

# x = np.arange(-2, 2, 0.1)
# y = np.arange(-2, 2, 0.1)
# mx, my = np.meshgrid(x, y, indexing='ij')
# mz1 = np.abs(mx) + np.abs(my)
# mz2 = mx ** 2 + my ** 2

# ax_ranges = [-2, 2, -2, 2, 0, 8]
# ax_scale = [1.0, 1.0, 0.4]
# ax_extent = ax_ranges * np.repeat(ax_scale, 2)

# surf3 = mlab.surf(mx, my, mz1, colormap='Blues')
# surf4 = mlab.surf(mx, my, mz2, colormap='Oranges')

# surf3.actor.actor.scale = ax_scale
# surf4.actor.actor.scale = ax_scale
# mlab.view(60, 74, 17, [-2.5, -4.6, -0.3])
# mlab.outline(surf3, color=(.7, .7, .7), extent=ax_extent)
# mlab.axes(surf3, color=(.7, .7, .7), extent=ax_extent,
#           ranges=ax_ranges,
#           xlabel='x', ylabel='y', zlabel='z')
# surf3.actor.property.opacity = 0.5
# surf4.actor.property.opacity = 0.5
# fig.scene.renderer.use_depth_peeling = 1

In [None]:
# Render the greedy values with Mayavi. Relies on X, Y, Z built in the
# matplotlib 3-D cell and on `mlab` from the previous cell.
# NOTE(review): plot3d presumably draws a line through the points in the
# given order; points3d or a triangular mesh may be closer to the intent —
# confirm.
mlab.plot3d(X,Y,Z)
mlab.show()