In [None]:
import matplotlib.pyplot as plt
from util import to_onehot, moving_average, Agent
import gym
from gym.envs.registration import register, spec

# Apply the 'fivethirtyeight' style sheet to the figures drawn in this notebook.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
MY_ENV_NAME = 'FrozenLakeNonskid4x4-v1'

# Register a non-slippery 4x4 FrozenLake variant under our own id.
# gym's registry rejects a second registration of the same id, so look the id
# up first: this keeps the cell safe to re-run without restarting the kernel.
try:
    spec(MY_ENV_NAME)
except gym.error.Error:
    # the id is not known to the registry yet -> register it now
    register(
        id=MY_ENV_NAME,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        timestep_limit=500,
        reward_threshold=0.8196
    )
env = gym.make(MY_ENV_NAME)

In [None]:
# problem dimensions, read straight from the environment
OBS_SPACE = env.observation_space.n    # number of discrete states
ACTION_SPACE = env.action_space.n      # number of actions: 0 = left; 1 = down; 2 = right;  3 = up

# training settings
N_EPISODES = 1000         # episodes run by the agent and critic training loops
SAMPLE_BATCH_SIZE = 32    # batch size passed to the replay() calls

# scenario 1: single agent training
the agent is responsible for learning value and policy simultaneously. 

the original Atari approach used a single DQN agent. later work found that an actor/critic approach works better.

In [None]:
# NOTE(review): this import rebinds `Agent`, shadowing the `Agent` already imported
# from `util` in the first cell — confirm that `agent.Agent` is the intended class.
from agent import Agent
# The single agent for scenario 1, constructed from the state- and action-space sizes.
# It is also the one that picks actions while the critic is trained further down.
agent = Agent(OBS_SPACE, ACTION_SPACE)

In [None]:
# per-episode bookkeeping, plotted in the cells that follow:
#   history   -> 1 when the episode ended on the last state index, otherwise 0
#   num_steps -> how many steps the episode took
history, num_steps = [], []

for episode in range(N_EPISODES):

    # env.reset() returns the start position as an integer state index
    state, done, steps = env.reset(), False, 0

    while not done:

        # one-hot encode the current position and let the agent choose an action
        curr_state_encoded = to_onehot(OBS_SPACE, state).reshape(1, OBS_SPACE)
        action = agent.act(curr_state_encoded)

        # apply the action and observe the outcome
        next_state, reward, done, transmit_prob = env.step(action)
        next_state_encoded = to_onehot(OBS_SPACE, next_state).reshape(1, OBS_SPACE)

        # keep the transition for experience replay (guards against catastrophic forgetting)
        agent.remember(curr_state_encoded, action, reward, next_state_encoded, done)

        # advance to the state we landed in
        steps += 1
        state = next_state

    # once the episode is over, replay stored experience
    agent.replay(SAMPLE_BATCH_SIZE)

    # success flag: did the episode finish on the final state?
    history.append(int(state == (OBS_SPACE - 1)))
    num_steps.append(steps)

In [None]:
# `history` holds one 0/1 success flag per episode, so its moving average
# (second argument 500 — presumably the window length) reads as a success rate.
plt.plot(moving_average(history, 500))
plt.ylabel("probability of successful outcome")
plt.xlabel("episodes")
plt.title("RL");  # trailing ';' hides the Text(...) repr in the cell output

In [None]:
# `num_steps` records the step count of every episode (successful or not),
# so the label describes steps per episode rather than steps "for success".
plt.plot(moving_average(num_steps, 500))
plt.ylabel("number of steps per episode")
plt.xlabel("episodes")
plt.title("RL");  # trailing ';' hides the Text(...) repr in the cell output

# scenario 2: actor/critic model

a critic network (aka value network) predicts a single value for a specific location.

the value network will place higher values near the final winning position (and low values in the "hole" positions). it will place the highest value on the final winning position.

the value network expresses the most efficient path to the policy network. it is no different from a critic telling an actor how to act (and, similarly, it is up to the actor to follow the critic's suggestions).

the actor (aka policy network) predicts the best action from the current state. the best action is quantified as (1) reward + (2) discounted next value - (3) predicted value. the actor will no longer select the < action/next-state > with the highest value; instead it will select the < action/next-state > that results in the greatest change in value.

* http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html

### critic

In [None]:
from critic import Critic
import math
import numpy as np

In [None]:
# discount factor used when building the critic's training targets
gamma = 0.9

# value network, constructed from the state- and action-space sizes
critic = Critic(OBS_SPACE, ACTION_SPACE)

# side length of the square board, and an all-zero board of that size
OBS_SQR = int(math.sqrt(OBS_SPACE))
STATEGRID = np.zeros((OBS_SQR,) * 2)
STATEGRID

In [None]:
# Train the critic on the states visited by `agent`, the scenario-1 agent created
# earlier in this notebook; the critic itself never chooses actions here.
for episode in range(N_EPISODES):

    state = env.reset()
    done = False

    while not done:

        # ask critic to predict value for current state
        curr_state_encoded = to_onehot(OBS_SPACE, state).reshape(1, OBS_SPACE)
        orig_val = critic.predict(curr_state_encoded)

        # take an action (chosen by the scenario-1 agent)
        action = agent.act(curr_state_encoded)

        # what are the consequences of taking that action?
        next_state, reward, done, transmit_prob = env.step(action)
        next_state_encoded = to_onehot(OBS_SPACE, next_state).reshape(1, OBS_SPACE)

        # get critic's prediction on next state
        new_val = critic.predict(next_state_encoded)

        # determine target value:
        # discounted next-state value, or discounted reward once the episode has ended
        if not done:
            target = (gamma * new_val)
        else:
            target = (gamma * reward)  # max value is discounted reward at final location
        # keep whichever is larger: the discounted current estimate or the new target
        best_val = max((orig_val*gamma), target)

        # each state is encoded with its "best" target value
        # the target value is equal to the discounted next value
        # we dont care which action led to this value!
        critic.remember((curr_state_encoded, best_val))

        # if terminal, append another replay for final location (otherwise final loc will never be recorded)
        if done:
            critic.remember((next_state_encoded, reward))

        # move on to next state and continue
        state = next_state

    # replay stored (state, value) pairs once per episode
    critic.replay(SAMPLE_BATCH_SIZE)

    # lightweight progress indicator
    if episode % 500 == 0:
        print(episode)

In [None]:
# Plot what the critic has learned, passing the all-zero board template STATEGRID.
# NOTE(review): plot_value lives in critic.py — presumably it draws one predicted
# value per board cell; confirm against that module.
critic.plot_value(STATEGRID)

In [None]:
# Show the environment's board for comparison with the value plot above.
# NOTE(review): this renders whatever state the last training episode left the env in.
env.render()

### actor

In [None]:
from actor import Actor
# The actor (policy network), constructed from the state- and action-space sizes.
actor = Actor(OBS_SPACE, ACTION_SPACE)

In [None]:
ACTOR_EPISODES = 500     # number of actor training episodes
ACTOR_LOG_EVERY = 250    # print the episode index this often

# Train the actor using the (already trained) critic's state values as feedback.
for episode in range(ACTOR_EPISODES):

    state = env.reset()
    done = False

    while not done:
        # get value for current state from critic
        curr_state_encoded = to_onehot(OBS_SPACE, state).reshape(1, OBS_SPACE)
        orig_val = critic.predict(curr_state_encoded)

        # propose an action
        action = actor.act(curr_state_encoded)

        # what are the consequences of taking that action?
        next_state, reward, done, transmit_prob = env.step(action)
        next_state_encoded = to_onehot(OBS_SPACE, next_state).reshape(1, OBS_SPACE)

        # get value for next state from critic
        new_val = critic.predict(next_state_encoded)

        # The actor is updated
        # by using the difference of the value the critic
        # placed on the old state vs. the value the critic
        # places on the new state.. encouraging the actor
        # to move into more valuable states.

        # actor learns to predict a set of actor deltas
        actor_delta = new_val - orig_val
        actor.remember([curr_state_encoded, action, actor_delta])

        # move on to next state and continue
        state = next_state

        # TODO: inside while loop or not?
        # (currently replays after every step, unlike the per-episode replay of agent/critic)
        actor.replay()

    # lightweight progress indicator
    if episode % ACTOR_LOG_EVERY == 0:
        print(episode)

In [None]:
# Visualise, for every board cell, which output index the actor scores highest.
# The board size is taken from the STATEGRID template instead of a hard-coded 4.
obs_sqr = STATEGRID.shape[0]
np_w_cri_r = np.zeros((obs_sqr, obs_sqr))
for x in range(obs_sqr):
    for y in range(obs_sqr):
        # one-hot board with the player placed at grid index [x, y]
        my_state = STATEGRID.copy()
        my_state[x, y] = 1

        # have the actor model predict its outputs with the player in that
        # location, and record the index of the largest one
        value = actor.predict(my_state.reshape(1, OBS_SPACE))
        np_w_cri_r[x, y] = np.argmax(value)

plt.pcolor(np_w_cri_r)
plt.title("ACTION Network")
plt.colorbar()
plt.xlabel("X")
plt.ylabel("Y")
plt.gca().invert_yaxis()  # put grid row 0 at the top, like the rendered board
plt.draw()

In [None]:
# 0 = left; 1 = down; 2 = right;  3 = up

In [None]:
# Show the environment's board for comparison with the action plot above.
# NOTE(review): this renders whatever state the last training episode left the env in.
env.render()