In [1]:
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()  # NOQA
import argparse
import logging
import sys

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import gym
gym.undo_logger_setup()  # NOQA
from gym import spaces
import gym.wrappers


import chainer
from chainer import optimizers
from chainerrl.agents.ddpg import DDPG
from chainerrl.agents.ddpg import DDPGModel
from chainerrl import experiments
from chainerrl import explorers
from chainerrl import misc
from chainerrl import policy
from chainerrl import q_functions
from chainerrl import replay_buffer

style.use('ggplot')



In [2]:
# environment settings

#env_name='BipedalWalker-v2'
env_name='Pendulum-v0'

In [3]:
# Chainer's settings
seed=0
gpu=0

In [4]:
# Network Setting

actor_hidden_layers=3
actor_hidden_units=300
actor_lr=1e-4


critic_hidden_layers=3
critic_hidden_units=300
critic_lr=1e-3

In [5]:
# other settings

number_of_episodes=10000
max_episode_length=200

replay_buffer_size=5 * 10 ** 5
replay_start_size=50000
number_of_update_times=1

target_update_interval=1
target_update_method='soft'

soft_update_tau=1e-2
update_interval=4
number_of_eval_runs=100
eval_interval=10 ** 5

final_exploration_steps=10 ** 6

gamma=0.995
minibatch_size=128


In [6]:
# Helper's functions

def clip_action_filter(a):
    return np.clip(a, action_space.low, action_space.high)

def reward_filter(r):        # reward scale is: 1
    return r * 1


def phi(obs):
    return obs.astype(np.float32)

def random_action():
    a = action_space.sample()
    if isinstance(a, np.ndarray):
        a = a.astype(np.float32)
    return a


def make_env(test,env_name,render=False):
    env = gym.make(env_name)
    # Use different random seeds for train and test envs
    env_seed = 2 ** 32 - 1 - seed if test else seed
    env.seed(env_seed)
    #if args.monitor:
        #env = gym.wrappers.Monitor(env, args.outdir)
    if isinstance(env.action_space, spaces.Box):
        misc.env_modifiers.make_action_filtered(env, clip_action_filter)
    if not test:
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
    if render and not test:
        misc.env_modifiers.make_rendered(env)
    return env

In [7]:
# Set a random seed used in ChainerRL
misc.set_random_seed(seed)

In [13]:
env = make_env(test=False,env_name=env_name,render=False)
timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space

action_size = np.asarray(action_space.shape).prod()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [14]:
# Critic Network

q_func = q_functions.FCSAQFunction(
            obs_size, 
            action_size,
            n_hidden_channels=critic_hidden_units,
            n_hidden_layers=critic_hidden_layers)

# policy Network

pi = policy.FCDeterministicPolicy(
            obs_size, 
            action_size=action_size,
            n_hidden_channels=actor_hidden_units,
            n_hidden_layers=actor_hidden_layers,
            min_action=action_space.low, 
            max_action=action_space.high,
            bound_action=True)


In [15]:
# The Model

model = DDPGModel(q_func=q_func, policy=pi)
opt_actor = optimizers.Adam(alpha=actor_lr)
opt_critic = optimizers.Adam(alpha=critic_lr)
opt_actor.setup(model['policy'])
opt_critic.setup(model['q_function'])
opt_actor.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

rbuf = replay_buffer.ReplayBuffer(replay_buffer_size)
ou_sigma = (action_space.high - action_space.low) * 0.2

explorer = explorers.AdditiveOU(sigma=ou_sigma)

In [16]:
# The agent
agent = DDPG(model, opt_actor, opt_critic, rbuf, gamma=gamma,
                 explorer=explorer, replay_start_size=replay_start_size,
                 target_update_method=target_update_method,
                 target_update_interval=target_update_interval,
                 update_interval=update_interval,
                 soft_update_tau=soft_update_tau,
                 n_times_update=number_of_update_times,
                 phi=phi,minibatch_size=minibatch_size
            )

In [None]:
G=[]
G_mean=[]
for ep in range(1, number_of_episodes+ 1):
    if ep%100:
        agent.save("DDPG_Pendulum_10000")
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards)
    t = 0  # time step
    episode_rewards=[]
    while not done and t < max_episode_length:
        # Uncomment to watch the behaviour
        #env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        episode_rewards.append(reward)
        t += 1
        
    if done or t >= max_episode_length :
            
            # Calculate sum of the rewards
        episode_rewards_sum = sum(episode_rewards)     
        G.append(episode_rewards_sum)
        total_G = np.sum(G)
        maximumReturn = np.amax(G)
        print("%f" % (episode_rewards_sum), file=open("DDPG_Pendulum_reward_10000.txt", "a"))
        if ep % 10 == 0:
                
            print("==========================================")
            print("Episode: ", ep)
            print("Rewards: ", episode_rewards_sum)
            print("Max reward so far: ", maximumReturn)
            # Mean reward
            total_reward_mean = np.divide(total_G, ep+1)
            G_mean.append(total_reward_mean)
            print("Mean Reward", total_reward_mean)
            print("%f" % (total_reward_mean), file=open("DDPG_Pendulum_MEAN_Reward_10000.txt", "a"))    
                
    agent.stop_episode_and_train(obs, reward, done)
    
    
print('Finished.')


plt.xlabel('episdes')
plt.ylabel('reword')    
plt.plot(G)   
plt.savefig('DDPG_Pendulum_10000episodes.png',dpi=1000)


plt.plot(G_mean)
plt.ylabel('Average of Returns')
plt.xlabel('Number of episodes/10')
plt.savefig("ReturnsAverage_VS_Episodes DDPG_Pendulum_10000",dpi=1000)

Episode:  10
Rewards:  -1420.1520712496447
Max reward so far:  -1035.1203805818989
Mean Reward -1192.7269671117201
Episode:  20
Rewards:  -1149.2969592483244
Max reward so far:  -959.8272729705341
Mean Reward -1226.1514915300652
Episode:  30
Rewards:  -1533.982744557468
Max reward so far:  -959.8272729705341
Mean Reward -1253.7982974671856
Episode:  40
Rewards:  -1160.1434068889396
Max reward so far:  -959.8272729705341
Mean Reward -1286.0403882683024
Episode:  50
Rewards:  -890.2554837456627
Max reward so far:  -890.2554837456627
Mean Reward -1268.5837102460523
Episode:  60
Rewards:  -1100.3916348131195
Max reward so far:  -890.2554837456627
Mean Reward -1276.4830958213768
Episode:  70
Rewards:  -1211.5650274173609
Max reward so far:  -890.2554837456627
Mean Reward -1259.9717898043377
Episode:  80
Rewards:  -1206.5719342005498
Max reward so far:  -778.9234707114813
Mean Reward -1244.8458231802724
Episode:  90
Rewards:  -1250.815086093162
Max reward so far:  -778.9234707114813
Mean Rew

In [None]:
agent.save("DDPG_Pendulum_model")

In [None]:
plt.plot(G)
plt.ylabel('Returns')
plt.xlabel('Number of episodes')
plt.savefig("Returns_VS_Episodes")

In [None]:
plt.plot(G_mean)
plt.ylabel('Average of Returns ')
plt.xlabel('Number of episodes')
plt.savefig("ReturnsAverage_VS_Episodes")