In [98]:
import numpy as np

import gym

from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras import backend as K
from keras.optimizers import Adam

import matplotlib.pyplot as plt

In [48]:
EPISODES = 1000000

LOSS_CLIPPING = 0.2 # Only implemented clipping for the surrogate loss, paper said it was best
EPOCHS = 10
GAMMA = 0.99

BATCH_SIZE = 256
NUM_ACTIONS = 2
NUM_STATE = 1
HIDDEN_SIZE = 256
ENTROPY_LOSS = 5 * 1e-3 # Does not converge without entropy penalty
LR = 1e-4 # Lower lr stabilises training greatly

DUMMY_ACTION, DUMMY_VALUE = np.zeros((1, NUM_ACTIONS)), np.zeros((1, 1))

In [240]:
class Environment:
    def __init__(self):
        self.action_space = (1, )
        self.observation_space = (1, )
    
    def reset(self):
        self.count = 0
        self.state = np.array(np.random.randint(0,2))
        self.state = np.array(0)
        return np.array([self.state])
    
    def step(self, action):
        if action == self.state:
            reward = 1.
        else:
            reward = -1.
            
        #self.state = np.array(np.random.randint(0,2))
        self.count += 1
        done = False
        
        if self.count >= 10:
            done = True
        return self.state, reward, done, []

In [247]:
def proximal_policy_optimization_loss(advantage, old_prediction):
    def loss(y_true, y_pred):
        prob = K.sum(y_true * y_pred)
        old_prob = K.sum(y_true * old_prediction)
        r = prob/(old_prob + 1e-10)

        return -K.mean(K.minimum(r * advantage, K.clip(r, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantage)) + ENTROPY_LOSS * (prob * K.log(prob + 1e-10))
    return loss

class Agent:
    def __init__(self):
        self.critic = self.build_critic()
        self.actor = self.build_actor()

        self.env = Environment()
        self.episode = 0
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.gradient_steps = 0

    def build_actor(self):

        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        x = Dropout(0.5)(x)
        x = Dense(HIDDEN_SIZE, activation='relu')(x)
        x = Dropout(0.5)(x)

        out_actions = Dense(NUM_ACTIONS, activation='softmax', name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[proximal_policy_optimization_loss(
                          advantage=advantage,
                          old_prediction=old_prediction)])

        return model


    def build_critic(self):

        state_input = Input(shape=(NUM_STATE,))
        x = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        x = Dropout(0.5)(x)
        x = Dense(HIDDEN_SIZE, activation='relu')(x)
        x = Dropout(0.5)(x)

        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')

        return model

    def reset_env(self):
        self.episode += 1
        if self.episode % 100 == 0:
            self.val = True
        else:
            self.val = False
        self.observation = self.env.reset()
        self.reward = []

    def get_action(self):
        p = self.actor.predict([self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION])
        if self.val is False:
            action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(p[0]))
        else:
            action = np.argmax(np.nan_to_num(p[0]))
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, p

    def transform_reward(self):
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        batch = [[], [], [], []]

        tmp_batch = [[], [], []]
        while len(batch[0]) < BATCH_SIZE:
            action, action_matrix, predicted_action = self.get_action()
            observation, reward, done, info = self.env.step(action)
            self.reward.append(reward)

            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation

            if done:
                self.transform_reward()
                for i in range(len(tmp_batch[0])):
                    obs, action, pred = tmp_batch[0][i], tmp_batch[1][i], tmp_batch[2][i]
                    r = self.reward[i]
                    batch[0].append(obs)
                    batch[1].append(action)
                    batch[2].append(pred)
                    batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs, action, pred, reward = np.array(batch[0]), np.array(batch[1]), np.array(batch[2]), np.reshape(np.array(batch[3]), (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward

    def run(self):
        while self.episode < EPISODES:
            print(self.episode)
            obs, action, pred, reward = self.get_batch()
            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values

            actor_loss = []
            critic_loss = []
            for e in range(EPOCHS):
                actor_loss.append(self.actor.train_on_batch([obs, advantage, old_prediction], [action]))
                critic_loss.append(self.critic.train_on_batch([obs], [reward]))

            self.gradient_steps += 1
            
            for i in range(2):
                obs = np.array(i)
                p = self.actor.predict([obs.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION])
                print(obs, p)
                
            print()

In [None]:
EPISODES = 10000

LOSS_CLIPPING = 0.2 # Only implemented clipping for the surrogate loss, paper said it was best
EPOCHS = 10
GAMMA = 0.99

BATCH_SIZE = 256
NUM_ACTIONS = 2
NUM_STATE = 1
HIDDEN_SIZE = 256
ENTROPY_LOSS = 5 * 1e-3 # Does not converge without entropy penalty
LR = 1e-6 # Lower lr stabilises training greatly

DUMMY_ACTION, DUMMY_VALUE = np.zeros((1, NUM_ACTIONS)), np.zeros((1, 1))

agent = Agent()
agent.run()
env = Environment()

0
0 [[0.50000465 0.49999532]]
1 [[0.4895404 0.5104596]]

26
0 [[0.5000061  0.49999392]]
1 [[0.4895418 0.5104582]]

52
0 [[0.5000072  0.49999282]]
1 [[0.48954293 0.51045704]]

78
0 [[0.5000047  0.49999532]]
1 [[0.4895404 0.5104596]]

104
0 [[0.50000304 0.49999693]]
1 [[0.48953876 0.5104612 ]]

130
0 [[0.49999988 0.5000001 ]]
1 [[0.48953563 0.5104644 ]]

156
0 [[0.49999422 0.5000058 ]]
1 [[0.48952997 0.51047003]]

182
0 [[0.49999693 0.50000304]]
1 [[0.48953268 0.5104673 ]]

208
0 [[0.4999984 0.5000016]]
1 [[0.48953414 0.51046586]]

234
0 [[0.49999908 0.50000095]]
1 [[0.48953483 0.5104652 ]]

260
0 [[0.4999999  0.50000006]]
1 [[0.48953566 0.5104643 ]]

286
0 [[0.5000024  0.49999765]]
1 [[0.48953807 0.5104619 ]]

312
0 [[0.5000052  0.49999484]]
1 [[0.48954087 0.51045907]]

338
0 [[0.5000069  0.49999312]]
1 [[0.48954263 0.5104574 ]]

364
0 [[0.50000674 0.4999933 ]]
1 [[0.48954245 0.5104575 ]]

390
0 [[0.5000037  0.49999633]]
1 [[0.48953938 0.51046056]]

416
0 [[0.5000064 0.4999936]]
1 [[0.4