In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import *  # NOQA
from future import standard_library
standard_library.install_aliases()  # NOQA
import argparse
import logging
import sys

import chainer
from chainer import functions as F
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import gym
gym.undo_logger_setup()  # NOQA
import gym.wrappers


from osim.env import ProstheticsEnv


from chainerrl.agents import a3c
from chainerrl.agents import PPO
from chainerrl import experiments
from chainerrl import links
from chainerrl import misc
from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay
from chainerrl import policies

style.use('ggplot')



In [2]:
# environment settings

In [2]:
# Chainer's settings
seed=1
gpu=0

In [3]:
# Network Setting

#actor_hidden_layers=3
#actor_hidden_units=300
actor_lr=1e-4


In [4]:
# other settings

number_of_episodes=5000
max_episode_length=1000

update_interval=4

number_of_eval_runs=10
eval_interval=10 ** 4

epochs=10
gamma=0.995
batch_size=128
entropy_coef=0.0


In [5]:
# Helpers classes


class A3CFFSoftmax(chainer.ChainList, a3c.A3CModel):
    """An example of A3C feedforward softmax policy."""

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        self.pi = policies.SoftmaxPolicy(
            model=links.MLP(ndim_obs, n_actions, hidden_sizes))
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        return self.pi(state), self.v(state)


class A3CFFMellowmax(chainer.ChainList, a3c.A3CModel):
    """An example of A3C feedforward mellowmax policy."""

    def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)):
        self.pi = policies.MellowmaxPolicy(
            model=links.MLP(ndim_obs, n_actions, hidden_sizes))
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        return self.pi(state), self.v(state)


class A3CFFGaussian(chainer.Chain, a3c.A3CModel):
    """An example of A3C feedforward Gaussian policy."""

    def __init__(self, obs_size, action_space,
                 n_hidden_layers=2, n_hidden_channels=64,
                 bound_mean=None, normalize_obs=None):
        assert bound_mean in [False, True]
        assert normalize_obs in [False, True]
        super().__init__()
        hidden_sizes = (n_hidden_channels,) * n_hidden_layers
        self.normalize_obs = normalize_obs
        with self.init_scope():
            self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
                obs_size, action_space.low.size,
                n_hidden_layers, n_hidden_channels,
                var_type='diagonal', nonlinearity=F.tanh,
                bound_mean=bound_mean,
                min_action=action_space.low, max_action=action_space.high,
                mean_wscale=1e-2)
            self.v = links.MLP(obs_size, 1, hidden_sizes=hidden_sizes)
            if self.normalize_obs:
                self.obs_filter = links.EmpiricalNormalization(
                    shape=obs_size
                )

    def pi_and_v(self, state):
        if self.normalize_obs:
            state = F.clip(self.obs_filter(state, update=False),
                           -5.0, 5.0)

        return self.pi(state), self.v(state)

In [6]:
# Helper's functions

# Linearly decay the learning rate to zero
def lr_setter(env, agent, value):
    agent.optimizer.alpha = value

# Linearly decay the clipping parameter to zero
def clip_eps_setter(env, agent, value):
    agent.clip_eps = value


def clip_action_filter(a):
    return np.clip(a, action_space.low, action_space.high)

def reward_filter(r):
    return r * 1


def phi(obs):
    obs=np.array(obs)
    return obs.astype(np.float32)


def make_env(test,render=False):
    env = ProstheticsEnv(visualize=render)
    # Use different random seeds for train and test envs
    env_seed = 2 ** 32 - 1 - seed if test else seed
    env.seed(env_seed)

    if not test:
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
    if render and not test:
        misc.env_modifiers.make_rendered(env)
    return env

In [7]:
# Set a random seed used in ChainerRL
misc.set_random_seed(seed)

In [8]:
env = make_env(test=False,render=False)
obs_space=env.observation_space
obs_size = obs_space.low.size
action_space = env.action_space

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Could not seed environment <ProstheticsEnv<0>>[0m


In [9]:
model = A3CFFGaussian(obs_size, action_space,bound_mean=True,normalize_obs=True)

In [10]:
opt = chainer.optimizers.Adam(alpha=actor_lr, eps=1e-5)
opt.setup(model)

<chainer.optimizers.adam.Adam at 0x7f60039fe898>

In [11]:
agent = PPO(model, opt,
                #gpu=args.gpu,
                phi=phi,
                update_interval=update_interval,
                minibatch_size=batch_size, epochs=epochs,
                clip_eps_vf=None, entropy_coef=entropy_coef,
                #standardize_advantages=args.standardize_advantages,
)

In [12]:
#lr_decay_hook = experiments.LinearInterpolationHook(number_of_steps, actor_lr, 0, lr_setter)

#clip_eps_decay_hook = experiments.LinearInterpolationHook(number_of_steps, 0.2, 0, clip_eps_setter)

eval_env = make_env(test=True,render=False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Could not seed environment <ProstheticsEnv<0>>[0m


In [None]:
G=[]
G_mean=[]
for ep in range(1, number_of_episodes+ 1):
    if ep%100:
        agent.save("PPO_Prosthetic_5000")
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards)
    t = 0  # time step
    episode_rewards=[]
    while not done and t < max_episode_length:
        # Uncomment to watch the behaviour
        #env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        episode_rewards.append(reward)
        t += 1
        
    if done or t >= max_episode_length :
            
            # Calculate sum of the rewards
        episode_rewards_sum = sum(episode_rewards)     
        G.append(episode_rewards_sum)
        total_G = np.sum(G)
        maximumReturn = np.amax(G)
        print("%f" % (episode_rewards_sum), file=open("PPO_Prosthetic_5000.txt", "a"))
        if ep % 10 == 0:
                
            print("==========================================")
            print("Episode: ", ep)
            print("Rewards: ", episode_rewards_sum)
            print("Max reward so far: ", maximumReturn)
            # Mean reward
            total_reward_mean = np.divide(total_G, ep+1)
            G_mean.append(total_reward_mean)
            print("Mean Reward", total_reward_mean)
            print("%f" % (total_reward_mean), file=open("PPO_MEAN_Prosthetic_5000.txt", "a"))
                     
    agent.stop_episode_and_train(obs, reward, done)
    
    
print('Finished.')
plt.xlabel('episdes')
plt.ylabel('reword')
plt.plot(G)   
plt.savefig('PPO_prosthetic_5000', dpi = 1000)


plt.plot(G_mean)
plt.ylabel('Average of Returns')
plt.xlabel('Number of episodes/10')

plt.savefig("ReturnsAverage_VS_Episodes_PPO_prosthetic_5000", dpi = 1000)

Episode:  10
Rewards:  -540.4483801846826
Max reward so far:  -525.5893674184133
Mean Reward -491.9189284847414
Episode:  20
Rewards:  -539.7766174618978
Max reward so far:  -200.8862833155811
Mean Reward -486.6985488763451


NameError: name 'agent' is not defined

In [None]:
plt.plot(G)
plt.ylabel('Returns')
plt.xlabel('Number of episodes')

In [None]:
plt.plot(G_mean)
plt.ylabel('Average of Returns ')
plt.xlabel('Number of episodes')