# [PPO](https://arxiv.org/abs/1707.06347)

## Import libraries (ChainerRL, OpenSim)

In [None]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import *  # NOQA
from future import standard_library
standard_library.install_aliases()  # NOQA
import argparse
import logging
import sys

import chainer
from chainer import functions as F
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
# import gym
# gym.undo_logger_setup()  # NOQA
# import gym.wrappers


from osim.env import ProstheticsEnv


from chainerrl.agents import a3c
from chainerrl.agents import PPO
from chainerrl import experiments
from chainerrl import links
from chainerrl import misc
from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay
from chainerrl import policies

style.use('ggplot')

## For all experiments we use the same random seed
    gpu = -1 if you use the CPU
    gpu = a non-negative device ID (e.g. 0) if you use a GPU

In [None]:
# Chainer's settings
seed = 1  # random seed used for ChainerRL and for the environments
gpu = -1  # device ID; a negative value selects the CPU

## Setting Hyper-parameters, Numbers of episodes and timesteps

In [None]:
# Network settings
actor_lr = 1e-4  # learning rate given to the Adam optimizer

# Training-loop settings
number_of_episodes = 5000
max_episode_length = 1000  # upper bound on time steps within one episode

update_interval = 4

# Evaluation settings
number_of_eval_runs = 10
eval_interval = 10000

# PPO settings
epochs = 10  # number of epochs
gamma = 0.995  # discount factor
batch_size = 128
entropy_coef = 0.0


## Defining reward, environment, random actions, Action value functions

In [None]:
# Helper's functions

def lr_setter(env, agent, value):
    """Overwrite the learning rate of the agent's optimizer.

    This only assigns ``value`` to ``agent.optimizer.alpha``. The original
    comment describes it as part of a linear decay to zero; the schedule
    itself is not computed here and must come from the caller.

    Args:
        env: Unused.
        agent: Object exposing ``optimizer.alpha``.
        value: New learning rate.
    """
    optimizer = agent.optimizer
    optimizer.alpha = value

def clip_eps_setter(env, agent, value):
    """Overwrite the agent's clipping parameter.

    This only assigns ``value`` to ``agent.clip_eps``. The original comment
    describes it as part of a linear decay to zero; the schedule itself is
    not computed here and must come from the caller.

    Args:
        env: Unused.
        agent: Object exposing a ``clip_eps`` attribute.
        value: New clipping parameter.
    """
    setattr(agent, "clip_eps", value)


def clip_action_filter(a):
    """Clip an action to the bounds of the module-level ``action_space``.

    Args:
        a: Action (array-like) to clip.

    Returns:
        ``a`` limited element-wise to ``[action_space.low, action_space.high]``.
    """
    lower = action_space.low
    upper = action_space.high
    return np.clip(a, lower, upper)

def reward_filter(r):
    """Return the reward unchanged (identity filter).

    Args:
        r: Reward value.

    Returns:
        The same object that was passed in.
    """
    unchanged = r
    return unchanged


def phi(obs):
    """Convert an observation to a ``float32`` NumPy array.

    Args:
        obs: Observation (array-like).

    Returns:
        A new ``numpy.ndarray`` of dtype ``float32`` holding the values of
        ``obs``.
    """
    return np.array(obs).astype(np.float32)


def make_env(test, render=False):
    """Create and seed a ``ProstheticsEnv``.

    Args:
        test: If true, the environment is seeded with ``2 ** 32 - 1 - seed``
            and returned without modifiers. Otherwise it is seeded with
            ``seed`` and ``reward_filter`` is installed on it.
        render: Passed to ``ProstheticsEnv`` as ``visualize``. For a
            non-test environment it additionally applies
            ``misc.env_modifiers.make_rendered``.

    Returns:
        The configured environment.
    """
    env = ProstheticsEnv(visualize=render)

    # Train and test environments are seeded differently.
    if test:
        env_seed = 2 ** 32 - 1 - seed
    else:
        env_seed = seed
    env.seed(env_seed)

    # The modifiers below apply to training environments only.
    if test:
        return env

    misc.env_modifiers.make_reward_filtered(env, reward_filter)
    if render:
        misc.env_modifiers.make_rendered(env)
    return env

In [None]:
# Set the random seed used in ChainerRL
misc.set_random_seed(seed)

# Training environment and the spaces read from it
env = make_env(test=False, render=False)
obs_space = env.observation_space
action_space = env.action_space
obs_size = obs_space.low.size

In [None]:
# NOTE(review): A3CFFGaussian is neither defined nor imported in the visible
# part of this file -- confirm it is available before this cell runs.
model = A3CFFGaussian(
    obs_size,
    action_space,
    bound_mean=True,
    normalize_obs=True,
)

# Setting up the Adam optimizer

In [None]:
# Adam optimizer attached to the model
opt = chainer.optimizers.Adam(
    alpha=actor_lr,
    eps=1e-5,
)
opt.setup(model)

## Agent (algorithm) function 

In [None]:
# PPO agent built from the model, the optimizer and the hyperparameters set
# in this file. `gamma` and `gpu` are passed explicitly; without them the
# configured discount factor and device setting would be ignored and PPO
# would fall back to its own defaults.
agent = PPO(
    model,
    opt,
    gpu=gpu,  # a negative ID keeps the model on the CPU
    phi=phi,
    gamma=gamma,
    update_interval=update_interval,
    minibatch_size=batch_size,
    epochs=epochs,
    clip_eps_vf=None,
    entropy_coef=entropy_coef,
)

In [None]:
# Separate environment built in test mode (make_env gives it another seed)
eval_env = make_env(test=True, render=False)

## Training, save reward in Text file and plot results

In [None]:
# G collects the return of every episode; G_mean collects the running mean
# of those returns, sampled once every 10 episodes.
G = []
G_mean = []
for ep in range(1, number_of_episodes + 1):
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards) of the current episode
    t = 0  # time step
    while not done and t < max_episode_length:
        # Uncomment to watch the behaviour
        # env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1

    # The loop above only exits once the episode has ended or hit the step
    # limit, so the bookkeeping below runs unconditionally.
    G.append(R)
    # A context manager closes the log file after every write; passing a
    # bare open(...) to print() left one handle open per episode.
    with open("PPO_Prosthetic_5000.txt", "a") as returns_log:
        print("%f" % (R), file=returns_log)

    if ep % 10 == 0:
        print("==========================================")
        print("Episode: ", ep)
        print("Rewards: ", R)
        print("Max reward so far: ", np.amax(G))
        # G holds exactly `ep` returns, so the mean is taken over len(G)
        # entries (the previous code divided by ep + 1).
        total_reward_mean = np.mean(G)
        G_mean.append(total_reward_mean)
        print("Mean Reward", total_reward_mean)
        with open("PPO_MEAN_Prosthetic_5000.txt", "a") as mean_log:
            print("%f" % (total_reward_mean), file=mean_log)

    agent.stop_episode_and_train(obs, reward, done)

    # Checkpoint after every 100th completed episode. The previous test
    # `if ep % 100:` was inverted (it saved on every episode except
    # multiples of 100) and ran before the episode, so the last episode's
    # training never reached the saved model.
    if ep % 100 == 0:
        agent.save("PPO_Prosthetic_5000")
    
    
print('Finished.')

# Per-episode returns on their own figure.
plt.figure()
plt.plot(G)
plt.xlabel('episodes')
plt.ylabel('reward')
plt.savefig('PPO_prosthetic_5000', dpi=1000)

# Running mean on a separate figure. Without a new figure the curve would be
# drawn on top of the per-episode plot and the second image would contain
# both curves under the wrong axis labels.
plt.figure()
plt.plot(G_mean)
plt.xlabel('Number of episodes/10')
plt.ylabel('Average of Returns')
plt.savefig("ReturnsAverage_VS_Episodes_PPO_prosthetic_5000", dpi=1000)