In [35]:
import numpy as np
import time

import gym

from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras import backend as K
from keras.optimizers import Adam

import numba as nb
from tensorboardX import SummaryWriter

In [53]:
from gym import error, spaces

class Env(object):
    """Minimal Gym-style toy environment with a constant one-dimensional observation.

    The observation is fixed at ``[0.2]`` for the whole episode. The reward is
    largest (1.0) when the first action component equals the observation and
    decays as the absolute difference grows. An episode is reported as done
    once more than 100 steps have been taken.
    """

    def __init__(self):
        self.reward_range = (-np.inf, np.inf)
        self.action_space = spaces.Box(low=0., high=1., shape=(1,))
        self.observation_space = spaces.Box(low=0., high=1., shape=(1,))
        # Create the per-episode state right away so that step() and render()
        # work on a fresh instance instead of failing with AttributeError.
        self.reset()

    def step(self, action):
        """Advance the episode by one step.

        Args:
            action: indexable whose first element is the chosen action value.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` is an
            empty dict. ``observation`` is the environment's own array object
            (not a copy).
        """
        self.step_num += 1
        self.actions.append(action)

        # Dense reward: 1.0 on an exact match, approaching 0 as the action
        # moves away from the observation.
        # Alternatives tried by the author (kept here for reference only):
        #   - sparse reward: 1. if abs(observation - action) < 0.001 else 0.
        #   - drawing a new random observation after every step
        reward = 0.001 / (abs(self.observation[0] - action[0]) + 0.001)

        # The episode ends on the first step whose counter exceeds 100.
        done = self.step_num > 100

        return self.observation, reward, done, {}

    def reset(self):
        """Start a new episode and return the initial observation ``[0.2]``."""
        self.step_num = 0
        self.observation = np.array([0.2])
        # Alternative tried by the author: np.random.rand(1) as the start state.
        self.actions = []
        return self.observation

    def render(self, mode='human', close=False):
        """Print the mean of all actions taken in the current episode.

        ``mode`` and ``close`` are accepted but not used. With no recorded
        actions NumPy's mean of an empty list is nan.
        """
        print(np.mean(self.actions))

    def close(self):
        """Release resources; this environment holds none."""
        pass

    def seed(self, seed=None):
        """Accepted for interface compatibility; the value is ignored."""
        pass

    def configure(self, *args, **kwargs):
        """Not supported by this environment."""
        raise NotImplementedError()

    def __del__(self):
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)

In [None]:
# --- Run length -------------------------------------------------------------
EPISODES = 1000000  # training stops once this many episodes have been played

# --- PPO / optimisation settings ---------------------------------------------
# Author's note: only the clipped surrogate loss is implemented; the paper
# reported it as the best variant.
LOSS_CLIPPING = 0.2
EPOCHS = 100   # gradient updates performed on every collected batch
NOISE = 0.01   # standard deviation of the exploration noise added to actions
GAMMA = 0.99   # discount factor applied when turning rewards into returns
BATCH_SIZE = 256
# Author's note: a lower learning rate stabilises training greatly.
LR = 1e-4
# Author's note: does not converge without entropy penalty.
# NOTE(review): this constant is not referenced by any code in this notebook.
ENTROPY_LOSS = 5 * 1e-3

# --- Network dimensions -------------------------------------------------------
NUM_ACTIONS = 1
NUM_STATE = 1
HIDDEN_SIZE = 256

# Placeholder inputs fed to the actor's extra inputs when only a prediction is
# needed (see Agent.get_action_continuous).
DUMMY_ACTION = np.zeros((1, NUM_ACTIONS))
DUMMY_VALUE = np.zeros((1, 1))

def proximal_policy_optimization_loss_continuous(advantage, old_prediction):
    """Create the clipped PPO surrogate loss for a fixed-variance Gaussian policy.

    Args:
        advantage: advantage estimates that weight the probability ratio.
        old_prediction: action means predicted by the policy that collected
            the data.

    Returns:
        A Keras-style ``loss(y_true, y_pred)`` function, where ``y_true`` is
        the action that was taken and ``y_pred`` is the current policy mean.
    """
    def loss(y_true, y_pred):
        variance = K.square(NOISE)
        normaliser = K.sqrt(2 * 3.1415926 * variance)

        def density(mean):
            # Gaussian density of the taken action around the given mean.
            return K.exp(- K.square(y_true - mean) / (2 * variance)) / normaliser

        # Probability ratio new/old; the small constant guards against
        # division by zero when the old density underflows.
        ratio = density(y_pred) / (density(old_prediction) + 1e-10)
        clipped_ratio = K.clip(ratio,
                               min_value=1 - LOSS_CLIPPING,
                               max_value=1 + LOSS_CLIPPING)

        # Take the more pessimistic of the clipped and unclipped objectives,
        # negated because Keras minimises the loss.
        return -K.mean(K.minimum(ratio * advantage, clipped_ratio * advantage))
    return loss


class Agent:
    """Actor/critic trainer using a clipped PPO surrogate loss on the toy ``Env``.

    Attributes (all set in ``__init__``):
        critic, actor: compiled Keras models.
        env: the environment instance being interacted with.
        episode: number of finished episodes.
        observation: latest observation returned by the environment.
        reward: per-step rewards of the running episode; ``transform_reward``
            turns them into discounted returns in place.
        reward_over_time: undiscounted total reward of every finished episode.
        writer: tensorboardX ``SummaryWriter`` used for logging.
        gradient_steps: number of batches trained on so far (logging step).
    """

    def __init__(self):
        self.critic = self.build_critic()
        self.actor = self.build_actor_continuous()

        self.env = Env()
        print(self.env.action_space, 'action_space', self.env.observation_space, 'observation_space')
        self.episode = 0
        self.observation = self.env.reset()
        self.reward = []
        self.reward_over_time = []
        self.writer = SummaryWriter('AllRuns/continuous/' + str(int(time.time())))
        self.gradient_steps = 0

    def build_actor_continuous(self):
        """Build and compile the policy network.

        The model takes three inputs: the state, the advantage and the old
        prediction. Only the state feeds the layers; the other two are passed
        to the loss function.

        NOTE(review): Dropout presumably behaves differently in ``predict``
        and ``train_on_batch``; confirm this is intended for the PPO ratio.
        """
        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        x = Dropout(0.5)(x)
        x = Dense(HIDDEN_SIZE, activation='relu')(x)
        x = Dropout(0.5)(x)

        out_actions = Dense(NUM_ACTIONS, name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[proximal_policy_optimization_loss_continuous(
                          advantage=advantage,
                          old_prediction=old_prediction)])
        model.summary()

        return model

    def build_critic(self):
        """Build and compile the value network (state -> scalar, MSE loss)."""
        state_input = Input(shape=(NUM_STATE,))
        x = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        x = Dropout(0.5)(x)
        x = Dense(HIDDEN_SIZE, activation='relu')(x)
        x = Dropout(0.5)(x)

        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')

        return model

    def reset_env(self):
        """Count the finished episode and start a new one."""
        self.episode += 1
        self.observation = self.env.reset()
        self.reward = []

    def get_action_continuous(self):
        """Sample an action for the current observation.

        Returns:
            ``(action, action_matrix, p)``: ``action`` and ``action_matrix``
            are the same array (first row of the prediction plus Gaussian
            noise with standard deviation ``NOISE``); ``p`` is the raw actor
            prediction.
        """
        # The advantage / old-prediction inputs only matter for the loss, so
        # placeholders are sufficient for a forward pass.
        p = self.actor.predict([self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION])
        action = action_matrix = p[0] + np.random.normal(loc=0, scale=NOISE, size=p[0].shape)
        return action, action_matrix, p

    def transform_reward(self):
        """Log the finished episode and convert its rewards to discounted returns.

        Every 100th episode a summary line is printed. The average shown
        covers the up-to-100 episodes finished before the current one and is
        ``nan`` when there are none yet.
        """
        episode_return = np.array(self.reward).sum()
        if self.episode % 100 == 0:
            recent_returns = self.reward_over_time[-100:]
            # np.mean([]) yields nan but also emits RuntimeWarnings; report
            # nan directly when there is no history (first report).
            average_return = np.mean(recent_returns) if recent_returns else float('nan')
            print('Episode #', self.episode, '\tfinished with reward', episode_return,
                  '\tAverage reward of last 100 episode :', average_return)
        self.reward_over_time.append(episode_return)
        self.writer.add_scalar('Episode reward', episode_return, self.episode)
        # Backward pass: reward[j] becomes the discounted return from step j.
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        """Play whole episodes until at least ``BATCH_SIZE`` transitions exist.

        Transitions are only moved into the batch when an episode finishes,
        so the returned batch can be larger than ``BATCH_SIZE``.

        Returns:
            ``(obs, action, pred, reward)`` as NumPy arrays; ``pred`` is
            flattened from three to two dimensions and ``reward`` (discounted
            returns) is reshaped to a column.
        """
        batch = [[], [], [], []]

        tmp_batch = [[], [], []]
        while len(batch[0]) < BATCH_SIZE:
            action, action_matrix, predicted_action = self.get_action_continuous()
            observation, reward, done, _ = self.env.step(action)
            self.reward.append(reward)

            # Store the observation the action was chosen for, then advance.
            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation

            if done:
                # After this call self.reward[i] is the discounted return of
                # step i, aligned with the entries of tmp_batch.
                self.transform_reward()
                for i in range(len(tmp_batch[0])):
                    obs, action, pred = tmp_batch[0][i], tmp_batch[1][i], tmp_batch[2][i]
                    r = self.reward[i]
                    batch[0].append(obs)
                    batch[1].append(action)
                    batch[2].append(pred)
                    batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs, action, pred, reward = np.array(batch[0]), np.array(batch[1]), np.array(batch[2]), np.reshape(np.array(batch[3]), (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward

    def run(self):
        """Train until ``EPISODES`` episodes have been played.

        Each iteration collects a batch, computes advantages as discounted
        return minus the critic's prediction, performs ``EPOCHS`` updates of
        actor and critic on that batch and logs the mean losses.
        """
        while self.episode < EPISODES:
            obs, action, pred, reward = self.get_batch()
            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values

            actor_loss = []
            critic_loss = []
            for _ in range(EPOCHS):
                actor_loss.append(self.actor.train_on_batch([obs, advantage, old_prediction], [action]))
                critic_loss.append(self.critic.train_on_batch([obs], [reward]))
            self.writer.add_scalar('Actor loss', np.mean(actor_loss), self.gradient_steps)
            self.writer.add_scalar('Critic loss', np.mean(critic_loss), self.gradient_steps)
            self.writer.add_scalars('Action Observation',
                                    {'action': np.mean(action),
                                     'observation': np.mean(obs)}, self.gradient_steps)

            self.gradient_steps += 1

In [None]:
# Build the agent (models, environment, TensorBoard writer) and start training.
# run() keeps looping until ag.episode reaches EPISODES.
ag = Agent()
ag.run()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_66 (InputLayer)        (None, 1)                 0         
_________________________________________________________________
dense_84 (Dense)             (None, 256)               512       
_________________________________________________________________
dropout_67 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_85 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_68 (Dropout)         (None, 256)               0         
_________________________________________________________________
output (Dense)               (None, 1)                 257       
Total params: 66,561
Trainable params: 66,561
Non-trainable params: 0
_________________________________________________________________
[33mW

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode # 0 	finished with reward 0.48166654779542317 	Average reward of last 100 episode : nan
Episode # 100 	finished with reward 0.5022418277375246 	Average reward of last 100 episode : 0.49906782058428223
Episode # 200 	finished with reward 0.504733924109995 	Average reward of last 100 episode : 0.5080676660050569
Episode # 300 	finished with reward 0.5251881125053428 	Average reward of last 100 episode : 0.5128887165166031
Episode # 400 	finished with reward 0.5335635742130193 	Average reward of last 100 episode : 0.528298173602544
Episode # 500 	finished with reward 0.5287803967248712 	Average reward of last 100 episode : 0.5280077922009377
Episode # 600 	finished with reward 0.5136729661410431 	Average reward of last 100 episode : 0.5245889480784267
Episode # 700 	finished with reward 0.517708411517717 	Average reward of last 100 episode : 0.5170960958936796
Episode # 800 	finished with reward 0.5442012074629949 	Average reward of last 100 episode : 0.5357967412119198
Episode # 

Episode # 7300 	finished with reward 0.5917861470226322 	Average reward of last 100 episode : 0.5930995583734046
Episode # 7400 	finished with reward 0.6402919563655224 	Average reward of last 100 episode : 0.6108240792144372
Episode # 7500 	finished with reward 0.6463244508728695 	Average reward of last 100 episode : 0.6582492179648005
Episode # 7600 	finished with reward 0.6221637212963631 	Average reward of last 100 episode : 0.6380780851345995
Episode # 7700 	finished with reward 0.639034430219936 	Average reward of last 100 episode : 0.6287793167901452
Episode # 7800 	finished with reward 0.6739374050864619 	Average reward of last 100 episode : 0.6554394622609692
Episode # 7900 	finished with reward 0.6843694930732882 	Average reward of last 100 episode : 0.6906970374953738
Episode # 8000 	finished with reward 0.6724673605265818 	Average reward of last 100 episode : 0.6795789074593384
Episode # 8100 	finished with reward 0.7232292871420578 	Average reward of last 100 episode : 0.7

In [25]:
# Sample one noisy action for the agent's current observation and print it
# next to the raw actor prediction it was derived from.
action, action_matrix, predicted_action = ag.get_action_continuous()
print(action, predicted_action)

[-0.00934278] [[-0.0090393]]


In [17]:
# Inspect the observation the agent currently holds.
# NOTE(review): the recorded output differs from the constant [0.2] that
# Env.reset() assigns; presumably it stems from an earlier version of Env.
# Re-run the notebook top to bottom to refresh it.
ag.observation

array([0.86121804])

In [24]:
# Apply the previously sampled action to the environment by hand.
# NOTE(review): the recorded "R: ..." output is not printed by the Env.step
# shown in this notebook; presumably it comes from an earlier version.
observation, reward, done, info = ag.env.step(action)

R: 0.17527005408275634


In [20]:
# Show the observation returned by the manual step above.
observation

array([0.00263249])