In [None]:
#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

# Digging deeper: approximate crossentropy with neural nets

In this section we will train a neural network policy for a game with a continuous state space.

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Create the CartPole environment. As the tips section of this notebook notes,
# taking `.env` is a way to get rid of gym's TimeLimit wrapper.
cart_env = gym.make("CartPole-v0").env  #if you see "<classname> has no attribute .env", remove .env or update gym
cart_env.reset()
# Number of discrete actions; later used to sample actions and to initialize the agent.
cart_n_actions = cart_env.action_space.n

# Show one rendered frame of the freshly reset environment.
plt.imshow(cart_env.render("rgb_array"))

In [None]:
#create agent
from sklearn.neural_network import MLPClassifier
# The policy: a scikit-learn MLP classifier. generate_session samples actions
# from the class probabilities returned by its predict_proba.
cart_agent = MLPClassifier(hidden_layer_sizes=(20,20),
                           activation='tanh',
                           warm_start=True, #keep progress between .fit(...) calls
                           max_iter=1 #make only 1 iteration on each .fit(...)
                          )

# Initialize the agent to the dimension of the state and the number of actions:
# fit once on the initial observation repeated cart_n_actions times, giving
# each copy a different action label (0 .. cart_n_actions - 1).
cart_initial_observation = cart_env.reset()
cart_agent.fit(X=[cart_initial_observation]*cart_n_actions, 
               y=list(range(cart_n_actions)))

In [None]:
def generate_session(agent, env, n_actions, t_max=1000):
    """
    Play a single session with the agent and record what happened

    In every step an action is sampled from the probabilities the agent
    predicts for the current state. The session ends when the environment
    reports that it is done or after t_max steps, whichever comes first.

    Parameters
    ----------
    agent : MLPClassifier
        The policy; its predict_proba is queried for action probabilities
    env : gym.envs
        The environment to play in
    n_actions : int
        The number of available actions
    t_max : int
        Maximum number of steps to play

    Returns
    -------
    states : list
        The state observed before each action
    actions : list
        The action taken in each of those states
    total_reward : float
        The sum of the rewards collected during the session
    """

    visited_states, taken_actions = [], []
    total_reward = 0

    observation = env.reset()

    for _ in range(t_max):
        # probability of every action in the current state
        action_probs = agent.predict_proba([observation])[0]
        action = np.random.choice(n_actions, p=action_probs)

        next_observation, reward, done, _info = env.step(action)

        # log the state the action was taken in, not the one it led to
        visited_states.append(observation)
        taken_actions.append(action)
        total_reward += reward

        if done:
            break
        observation = next_observation

    return visited_states, taken_actions, total_reward

### CEM steps
Deep CEM uses exactly the same strategy as the regular CEM, so you can copy your function code from the previous notebook.

The only difference is that now each observation is not a number but a float32 vector.

In [None]:
def select_elites(states_batch,
                  actions_batch,
                  rewards_batch,
                  percentile=50):
    """
    Select states and actions from games that have rewards >= percentile
    
    Notes
    -----
    States are not assumed to be integers; they may be arbitrary objects
    (e.g. float vectors) and are returned untouched.
    
    Parameters
    ----------
    states_batch : list
        List of list of states given as
        >>> states_batch[session_i][t]
        Where session_i is the session and t is the step
    actions_batch : list
        List of list of actions given as
        >>> actions_batch[session_i][t]
        Where session_i is the session and t is the step
    rewards_batch : list
        List (or 1-d array) of total rewards, one per session
    percentile : float
        The percentile to select the elites from
        We are selecting states from games that have rewards >= percentile
    
    Returns
    -------
    elite_states : list
        A list of the states where the elite actions took place
        Sorted by session number and timestep within session
    elite_actions : list
        A list of the elite actions
        Sorted by session number and timestep within session
        Both lists are empty when rewards_batch is empty
    """
    
    # Convert explicitly so plain lists and arrays behave the same
    rewards = np.asarray(rewards_batch)
    if rewards.size == 0:
        # np.percentile is undefined for an empty batch
        return [], []
    
    reward_threshold = np.percentile(rewards, percentile)
    
    # One boolean per session; avoids searching an index array for every state
    is_elite = rewards >= reward_threshold
    
    elite_states = [state
                    for session_states, keep in zip(states_batch, is_elite) if keep
                    for state in session_states]
    elite_actions = [action
                     for session_actions, keep in zip(actions_batch, is_elite) if keep
                     for action in session_actions]
    
    return elite_states, elite_actions

# Training loop
Generate sessions, select N best and fit to those.

In [None]:
from IPython.display import clear_output

def show_progress(batch_rewards, log, percentile, reward_range=[-990,+10]):
    """
    Redraw the training dashboard for the latest batch of sessions.

    Appends [mean reward, percentile threshold] of the batch to log, clears
    the previous cell output, prints both numbers and draws two panels:
    the history stored in log (left) and a histogram of the batch rewards
    with the threshold marked by a red vertical line (right).
    """

    avg_reward = np.mean(batch_rewards)
    cutoff = np.percentile(batch_rewards, percentile)
    log.append([avg_reward, cutoff])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(avg_reward, cutoff))

    # transpose the log: one sequence per logged quantity
    history = list(zip(*log))

    plt.figure(figsize=[8,4])

    # left panel: how mean reward and threshold evolved
    plt.subplot(1,2,1)
    plt.plot(history[0], label='Mean rewards')
    plt.plot(history[1], label='Reward thresholds')
    plt.legend()
    plt.grid()

    # right panel: reward distribution of the current batch
    plt.subplot(1,2,2)
    plt.hist(batch_rewards, range=reward_range)
    plt.vlines([cutoff], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    plt.show()

In [None]:
cart_n_sessions = 100
cart_percentile = 70
cart_log = []

for epoch in range(100):
    # Play a fresh batch of sessions with the current policy.
    cart_sessions = [generate_session(cart_agent, cart_env, cart_n_actions)
                     for _ in range(cart_n_sessions)]

    # Sessions differ in length, so states and actions stay as tuples of
    # per-session lists (stacking them would create a ragged array, which
    # recent NumPy versions reject). Only the scalar rewards become an array.
    cart_batch_states, cart_batch_actions, cart_batch_rewards = zip(*cart_sessions)
    cart_batch_rewards = np.array(cart_batch_rewards)

    # Use the same percentile for selection that show_progress plots.
    cart_elite_states, cart_elite_actions = select_elites(cart_batch_states,
                                                          cart_batch_actions,
                                                          cart_batch_rewards,
                                                          cart_percentile)

    # Train the network one epoch with the new information
    cart_agent.fit(X=cart_elite_states, y=cart_elite_actions)
    show_progress(cart_batch_rewards, cart_log, cart_percentile, reward_range=[0, np.max(cart_batch_rewards)])

    # Stop once the batch average clears the target score.
    if np.mean(cart_batch_rewards) > 190:
        print("You Win! I'll stop for you.")
        break

# Results

In [None]:
#record sessions
import gym.wrappers
# Play 100 sessions in a Monitor-wrapped environment that writes its
# recordings to the "videos" directory.
cart_v_env = gym.wrappers.Monitor(gym.make("CartPole-v0"), directory="videos", force=True)
try:
    cart_v_sessions = [generate_session(cart_agent, cart_v_env, cart_n_actions) for _ in range(100)]
finally:
    # Close the monitor even if a session fails or is interrupted.
    cart_v_env.close()

In [None]:
#show video
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order; sorting makes "the last video"
# a deterministic choice instead of whatever the file system lists last.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))
if not video_names:
    raise FileNotFoundError("no .mp4 files found in ./videos/ - record sessions first")

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #last name in sorted order; try other indices for other recordings

### Now what?

By now you should have scored enough on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). It's time to upload the result and move on to something harder.

_if you have any trouble with CartPole-v0 and feel stuck, take a look at the forums_

* Pick one of environments: MountainCar-v0 or LunarLander-v2.
  * For MountainCar, get average reward of __at least -150__
  * For LunarLander, get average reward of __at least +50__

See the tips section below, it's kinda important.
__Note:__ If your agent is below the target score, you'll still get most of the points depending on the result, so don't be afraid to submit it.
  
  
* Bonus quest: Devise a way to speed up training at least 2x against the default version
  * Obvious improvement: use [joblib](https://www.google.com/search?client=ubuntu&channel=fs&q=joblib&ie=utf-8&oe=utf-8)
  * Try re-using samples from 3-5 last iterations when computing threshold and training
  * Experiment with amount of training iterations and learning rate of the neural network (see params)
  
  
### Tips & tricks
* Gym page: [mountaincar](https://gym.openai.com/envs/MountainCar-v0), [lunarlander](https://gym.openai.com/envs/LunarLander-v2)
* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.
 * Also it may be a good idea to cut rewards via ">" and not ">=". If 90% of your sessions get a reward of -10k and 10% are better, then if you use the 20th percentile as the threshold, R >= threshold __fails to cut off bad sessions__ while R > threshold works alright.
* _issue with gym_: Some versions of gym limit game time to 200 ticks. This will prevent CEM training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make("MountainCar-v0").env` or otherwise get rid of the TimeLimit wrapper.
* If you use old _swig_ lib for LunarLander-v2, you may get an error. See this [issue](https://github.com/openai/gym/issues/100) for solution.
* If it won't train it's a good idea to plot reward distribution and record sessions: they may give you some clue. If they don't, call course staff :)
* 20-neuron network is probably not enough, feel free to experiment.
* __Please upload the results to openai gym and send links to all submissions in the e-mail__

# Solving the mountain car problem

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Create the MountainCar environment. As the tips section of this notebook
# notes, taking `.env` is a way to get rid of gym's TimeLimit wrapper so that
# sessions can run for the full t_max.
mountain_env = gym.make("MountainCar-v0").env  #if you see "<classname> has no attribute .env", remove .env or update gym
mountain_env.reset()
# Number of discrete actions; read as a global by mountain_generate_session.
mountain_n_actions = mountain_env.action_space.n

# Show one rendered frame of the freshly reset environment.
plt.imshow(mountain_env.render("rgb_array"))

In [None]:
# The policy: a scikit-learn MLP classifier. mountain_generate_session reads
# it as a global and samples actions from its predict_proba output.
mountain_agent = MLPClassifier(hidden_layer_sizes=(20, 20),
                               activation='tanh',
                               warm_start=True, #keep progress between .fit(...) calls
                               max_iter=1 #make only 1 iteration on each .fit(...)
                              )

# Initialize the agent to the dimension of the state and the number of actions:
# fit once on the initial observation repeated mountain_n_actions times, giving
# each copy a different action label (0 .. mountain_n_actions - 1).
mountain_initial_observation = mountain_env.reset()
mountain_agent.fit(X=[mountain_initial_observation]*mountain_n_actions, 
                   y=list(range(mountain_n_actions)))

In [None]:
def mountain_generate_session(t_max=10000):
    """
    Play a single MountainCar session and record what happened

    Notes
    -----
    The agent, the environment and the number of actions are read from the
    globals mountain_agent, mountain_env and mountain_n_actions.
    That is poor style, but it is the signature the grader accepts.

    Parameters
    ----------
    t_max : int
        Maximum number of steps to play

    Returns
    -------
    states : list
        The state observed before each action
    actions : list
        The action taken in each of those states
    total_reward : float
        The sum of the rewards collected during the session
    """

    state_log, action_log = [], []
    total_reward = 0

    current_state = mountain_env.reset()

    for _step in range(t_max):
        # sample an action from the policy's probabilities for this state
        action_probs = mountain_agent.predict_proba([current_state])[0]
        chosen_action = np.random.choice(mountain_n_actions, p=action_probs)

        # NOTE: For each step taken, the reward is -1
        following_state, step_reward, finished, _info = mountain_env.step(chosen_action)

        # log the state the action was taken in, not the one it led to
        state_log.append(current_state)
        action_log.append(chosen_action)
        total_reward += step_reward

        if finished:
            break
        current_state = following_state

    return state_log, action_log, total_reward

In [None]:
mountain_n_sessions = 100
mountain_percentile = 70
mountain_log = []
mountain_t_max = 10000
# Training stops once both the percentile threshold and the mean reward exceed this.
mountain_target_reward = -140

for epoch in range(350):
    # Play a fresh batch of sessions with the current policy.
    mountain_sessions = [mountain_generate_session(t_max=mountain_t_max)
                         for _ in range(mountain_n_sessions)]

    # Sessions differ in length, so states and actions stay as tuples of
    # per-session lists (stacking them would create a ragged array, which
    # recent NumPy versions reject). Only the scalar rewards become an array.
    mountain_batch_states, mountain_batch_actions, mountain_batch_rewards = zip(*mountain_sessions)
    mountain_batch_rewards = np.array(mountain_batch_rewards)

    threshold = np.percentile(mountain_batch_rewards, mountain_percentile)
    reward_mean = np.mean(mountain_batch_rewards)

    # Every step is rewarded with -1, so a threshold of -t_max means that the
    # sessions at the chosen percentile ran out of steps.
    nothing_to_learn = np.isclose(threshold, -mountain_t_max)
    solved = (not nothing_to_learn
              and threshold > mountain_target_reward
              and reward_mean > mountain_target_reward)

    if not nothing_to_learn and not solved:
        mountain_elite_states, mountain_elite_actions = select_elites(mountain_batch_states,
                                                                      mountain_batch_actions,
                                                                      mountain_batch_rewards,
                                                                      mountain_percentile)
        # Train the network one epoch with the new information
        mountain_agent.fit(X=mountain_elite_states, y=mountain_elite_actions)

    show_progress(mountain_batch_rewards,
                  mountain_log,
                  mountain_percentile,
                  reward_range=[np.min(mountain_batch_rewards), np.max(mountain_batch_rewards)])

    # Print status only after show_progress: it clears the cell output, so a
    # message printed before it would be wiped immediately.
    if nothing_to_learn:
        print('Not enough samples reached the destination, will not train')
    elif solved:
        print('You successfully trained the algorithm')
        break

In [None]:
#record sessions
# mountain_generate_session reads the global mountain_env, so recording works
# by rebinding that global to a Monitor-wrapped environment that writes to the
# "videos" directory. The global stays rebound after this cell, so every later
# call of mountain_generate_session uses the wrapped environment too.
mountain_env = gym.wrappers.Monitor(gym.make("MountainCar-v0"), directory="videos", force=True)
try:
    mountain_sessions = [mountain_generate_session() for _ in range(100)]
finally:
    # Close the monitor even if a session fails or is interrupted.
    mountain_env.close()

In [None]:
#show video
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order; sorting makes "the last video"
# a deterministic choice instead of whatever the file system lists last.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))
if not video_names:
    raise FileNotFoundError("no .mp4 files found in ./videos/ - record sessions first")

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #last name in sorted order; try other indices for other recordings

### Submit to Coursera

In [None]:
from submit import submit_mountain_car
from functools import partial

import getpass
import os

# Credentials must not be stored in the notebook: read them from the
# environment, and fall back to prompting (the token prompt does not echo).
EMAIL = os.environ.get("COURSERA_EMAIL") or input("Coursera e-mail: ")
TOKEN = os.environ.get("COURSERA_TOKEN") or getpass.getpass("Coursera assignment token: ")

submit_mountain_car(mountain_generate_session, EMAIL, TOKEN)