## Welcome to the FUN track of the WiDS "Reinforcement Learning in a Nutshell" tutorial!

In this track, you will play with several RL libraries that provide
- Standard environments to train and compare different algorithms
- Easy-to-use pre-implemented algorithms

Now let's get started!

First, we will use a trick to display the rendered animation from a remote notebook. You don't need to do this when running the notebook on your local machine.

In [19]:
# Install the Xvfb virtual X server, the OpenGL bindings and ffmpeg, plus the
# pyvirtualdisplay package that is imported a few cells further down.
!apt-get install -y xvfb python-opengl ffmpeg
!pip install pyvirtualdisplay

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
ffmpeg is already the newest version (7:3.4.4-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.


In [0]:
import glob
import io
import base64
from IPython import display as ipythondisplay
from IPython.display import HTML

import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# Libraries needed to make rendering work on a server that has no physical display
from pyvirtualdisplay import Display
# Start an invisible (visible=0) virtual display with a 1400x900 screen.
display = Display(visible=0, size=(1400, 900))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1005'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed a recorded episode from ./video inline in the notebook.

  Searches for ``video/*.mp4`` relative to the current working directory.
  When at least one file matches, the first match is read, base64-encoded
  and displayed as an autoplaying, looping HTML5 ``<video>`` element.
  When nothing matches, "Could not find video" is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is all that is needed here, and the context manager
  # guarantees the file handle is closed even if reading fails.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a ``gym.wrappers.Monitor`` that targets ./video.

  The monitor is always created with ``force=True``.
  NOTE(review): presumably this lets earlier recordings in ./video be
  replaced - confirm against the gym documentation.
  """
  return gym.wrappers.Monitor(env, './video', force=True)

Set up TensorBoard and expose it through an ngrok tunnel.

In [4]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip 

--2019-03-27 11:49:56--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.86.186.182, 35.172.177.65, 52.203.102.189, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.86.186.182|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14977695 (14M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2019-03-27 11:49:57 (18.0 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [14977695/14977695]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [14]:
# Directory TensorBoard reads from; the same path is passed as tensorboard_log
# when the PPO2 model is created further down.
log_path = "./log/"

# Launch TensorBoard in the background, listening on port 6006.
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(log_path)
)

# Start an ngrok HTTP tunnel to that port, also in the background.
get_ipython().system_raw('./ngrok http 6006 &')

# Query ngrok's local API on port 4040 and print the first tunnel's public URL.
# NOTE(review): ngrok was only just started above; if it is not up yet this
# lookup may fail - re-run the cell in that case.
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

http://c822a17e.ngrok.io


In [17]:
# List the working directory to check what the previous cells created.
!ls 

log  ngrok  ngrok-stable-linux-amd64.zip  sample_data


Install the dependencies.

In [8]:
# System and Python packages for the rest of the tutorial.
# (swig is presumably required to build box2d - confirm.)
!apt install swig
!pip install box2d

!pip install gym
!pip install gym[atari]

!pip install stable_baselines
!pip install tensorforce
# NOTE(review): none of these installs is version-pinned, while the cells below
# rely on gym.wrappers.Monitor, stable_baselines.PPO2 and
# tensorforce.contrib.openai_gym. Presumably they need releases from around the
# time this notebook was written - confirm and pin.

Reading package lists... Done
Building dependency tree       
Reading state information... Done
swig is already the newest version (3.0.12-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.


In [0]:
import numpy as np
import time

In [10]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists
mkdir: cannot create directory ‘log’: File exists


### OpenAIGym

In [0]:
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

In [0]:
def render(env_id, use_vec_env=False, model=None, max_step=500):
    """Simulate an agent in a gym environment, record it and show the video.

    Args:
        env_id: Environment id passed to ``gym.make``.
        use_vec_env: Wrap the environment in a ``DummyVecEnv``. Set this to
            True for models trained on vectorized environments, such as PPO
            and A2C.
        model: Object whose ``predict(observation)`` returns
            ``(action, states)``. When None, actions are sampled at random
            from the environment's action space.
        max_step: Number of environment steps to simulate.
    """
    base_env = gym.make(env_id)
    base_env = wrap_env(base_env)  # Comment out this line if running locally
    if use_vec_env:
        # Note: vectorized environments allow multiprocess training.
        # In this tutorial we only use one process, so DummyVecEnv, which is
        # just a simple wrapper, is enough. The factory closes over
        # `base_env`, a name that is never rebound, so it can never end up
        # returning the vectorized wrapper itself.
        env = DummyVecEnv([lambda: base_env])
    else:
        env = base_env

    try:
        observation = env.reset()

        for _ in range(max_step):
            env.render()
            if model is None:
                # No model provided: sample a random action, wrapped in a
                # list (a batch of one) for the vectorized environment.
                action = env.action_space.sample()
                if use_vec_env:
                    action = [action]
            else:
                action, _states = model.predict(observation)

            observation, _reward, done, _info = env.step(action)

            # A VecEnv resets a finished sub-environment inside step() and
            # already returns the fresh observation, so a manual reset is only
            # needed for a plain gym environment.
            if not use_vec_env and done:
                observation = env.reset()
    finally:
        # Close the underlying environment even when stepping fails, so it is
        # never left open. This is the same object the original closed via
        # env.envs[0] / env.
        base_env.close()

    show_video()

In [0]:
# Environment id used for both rendering and training.
# Uncomment the next line to use CartPole instead.
#env_id = "CartPole-v1"
env_id = "MsPacman-v0"

# Training parameters
policy = "MlpPolicy"      # policy name handed to PPO2
max_train_step = 10000    # passed to model.learn
learning_rate = 0.0001    # passed to PPO2 as learning_rate

# File the trained model is saved to and later loaded from
model_path = "./models/MsPacman_PPO.model"

In [15]:
# Baseline: no model is given, so render() samples random actions.
render(env_id, model=None)

To make life easier, we use a variation of the original OpenAI baselines: [stable baselines](https://github.com/hill-a/stable-baselines).

In [0]:
# Build a PPO2 agent for `env_id` that logs to the TensorBoard directory.
model = PPO2(
    policy,
    env_id,
    tensorboard_log=log_path,
    learning_rate=learning_rate,
)
# Train; the log name is the environment id followed by the current timestamp.
model.learn(max_train_step, tb_log_name="{}{}".format(env_id, time.time()))
# Save the agent
model.save(model_path)
# Drop the in-memory model so the next cell demonstrates loading from disk
del model

In [17]:
# Enjoy trained agent
# `load` is called on the class itself: it rebuilds the model from the saved
# file, so constructing a throwaway PPO2(policy, env_id) first (and building
# its network and environment) is wasted work.
model = PPO2.load(model_path)
render(env_id, use_vec_env=True, model=model)

### Tensorforce

[Tensorforce](https://github.com/tensorforce/tensorforce) is an open-source library that provides modular APIs for reinforcement learning. As the name suggests, it is built on top of TensorFlow.

In [0]:
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

In [0]:
# Create an OpenAIgym environment for CartPole-v0
# NOTE(review): visualize=True presumably renders the environment while it
# runs - confirm against the Tensorforce documentation.
env = OpenAIGym('CartPole-v0', visualize=True)

In [0]:
# Network as list of layers: two dense layers of 32 units with tanh activation
network_spec = [
    dict(type='dense', size=32, activation='tanh'),
    dict(type='dense', size=32, activation='tanh')
]

# PPO agent built from the environment's state and action specifications.
# The short headings inside the call group the keyword arguments (presumably
# by the Tensorforce class that consumes them - confirm).
agent = PPOAgent(
    states=env.states,
    actions=env.actions,
    network=network_spec,
    batching_capacity=4096,
    # BatchAgent
#    keep_last_timestep=True,
    # PPOAgent
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-3
    ),
    optimization_steps=10,
    # Model
    scope='ppo',
    discount=0.99,
    # DistributionModel
#    distributions_spec=None,
    entropy_regularization=0.01,
    # PGModel
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    # Optional TensorBoard summaries, disabled. The comma after 'losses' was
    # missing, which would have silently merged it with 'variables'.
#    summarizer=dict(directory="./board/",
#                    steps=50,
#                    labels=['graph',
#                            'configuration',
#                            'gradients_scalar',
#                            'regularization',
#                            'inputs',
#                            'losses',
#                            'variables'
#                           ])
#    summary_spec=None,
#    distributed_spec=None
)

In [0]:
# Callback function printing episode statistics
def episode_finished(r):
    """Print a one-line summary of the episode that just ended.

    Reads ``episode``, ``episode_timestep`` and the last entry of
    ``episode_rewards`` from ``r``. Always returns True.
    """
    summary = "Finished episode {ep} after {ts} timesteps (reward: {reward})"
    episode_number = r.episode
    timesteps = r.episode_timestep
    latest_reward = r.episode_rewards[-1]
    print(summary.format(ep=episode_number, ts=timesteps, reward=latest_reward))
    return True

In [0]:
# Create the runner from the agent and the environment
runner = Runner(environment=env, agent=agent)

# Start learning; episode_finished is called back with the runner's statistics
runner.run(
    episode_finished=episode_finished,
    max_episode_timesteps=200,
    episodes=100,
)

# Print statistics
print(
    "Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
        ar=np.mean(runner.episode_rewards[-100:]),
        ep=runner.episode,
    )
)

In [0]:
# Save the trained agent's model under ./agents/, then close the runner.
runner.agent.save_model(directory="./agents/")
runner.close()

In [0]:
# Show the printed representation of the runner's agent object.
print(runner.agent)