In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import nn
import numpy as np
import gym
import random
import math

In [32]:
tf.keras.backend.set_floatx('float64')

class Actor:
    def __init__(self, num_layers, input_dim, layer_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        model = Sequential()
        model.add(Dense(layer_dim, input_dim=input_dim, activation='relu'))
        for layer in range(num_layers):
            model.add(Dense(layer_dim, activation='relu'))
            model.add(Dense(output_dim))
        self.model = model
        
    def get_policy(self, inputs):
        return nn.softmax(self.model(inputs[None, :]))

    def sample_action(self, inputs):
        policy = self.get_policy(inputs)
        action = tf.random.categorical(policy, num_samples=1)
        return action[0][0]


In [63]:
class Trainer:
    def __init__(self, actor=None):
        self.env = gym.make('CartPole-v0')
        self.actor = actor if actor else \
            Actor(1, self.env.observation_space.shape[0], 100, 2)
        self.episode_length = 0
        self.opt = tf.keras.optimizers.Adam(learning_rate=0.1)
        self.variables = self.actor.model.trainable_variables
        self.trainable_weights = self.actor.model.trainable_weights
        self.discount_factor = 0.99

    def reset(self):
        self.epsiode_length = 0

    def record_episode(self, iterations):
        done = False
        inital_state = self.env.reset()
        states = np.ones([iterations, self.env.observation_space.shape[0]])
        states[0] = inital_state
        rewards = np.zeros([iterations])
        actions = np.ones([iterations, self.env.action_space.n])
        gradients = []
            
        for i in range(iterations-1):
            if done:
                break
            self.episode_length = i
            with tf.GradientTape() as tape:
                policy = self.actor.get_policy(states[i])
                action = tf.random.categorical(policy, num_samples=1).numpy()[0][0]
                action_log_prob = tf.math.log(policy[:, action] + 1e-50)

            gradients.append(tape.gradient(action_log_prob, self.variables))
            state, rewards[i], done, _ = self.env.step(action)
            states[i+1] = state
            actions[i] = action

        self.env.close()
        self.states = states
        self.rewards = rewards
        self.actions = actions
        self.gradients = gradients
        return states, rewards, actions, gradients
    
    def train(self):
        states = self.states[:self.episode_length]
        rewards = self.rewards[:self.episode_length]
        actions = self.actions[:self.episode_length]
        gradients = self.gradients[:self.episode_length]
        discounted_rewards, mean_score = self.discount_rewards(rewards)
        gradients = gradients*discounted_rewards[:, None]
        for grad in gradients:
            self.opt.apply_gradients(zip(grad, self.trainable_weights))
        return mean_score

    def discount_rewards(self, rewards):
        for i, reward in enumerate(rewards):
            rewards[i] = math.pow(self.discount_factor, len(rewards) - 1 - i) * -1 * reward
        mean = rewards.mean()
        return rewards - mean, mean

In [54]:
trainer = Trainer()
trainer.record_episode(100)
trainer.train()

NUM_EPISODES = 100
NUM_STEPS = 100

for i in range(NUM_EPISODES):
    trainer.record_episode(NUM_STEPS)
    score = trainer.train()
    print('-----------------------------')
    print('episode:', i)
    print('    score:', score)
    print('    length', trainer.episode_length)
    trainer.reset()
    

-----------------------------
episode: 0
    score: -0.9561792499119551
    length 10
-----------------------------
episode: 1
    score: -0.9561792499119551
    length 10
-----------------------------
episode: 2
    score: -0.9561792499119551
    length 10
-----------------------------
episode: 3
    score: -0.9375299087930121
    length 14
-----------------------------
episode: 4
    score: -0.9514704158298506
    length 11
-----------------------------
episode: 5
    score: -0.9656913196509987
    length 8
-----------------------------
episode: 6
    score: -0.9561792499119551
    length 10
-----------------------------
episode: 7
    score: -0.96091947240399
    length 9
-----------------------------
episode: 8
    score: -0.9561792499119551
    length 10
-----------------------------
episode: 9
    score: -0.9329443023914099
    length 15
-----------------------------
episode: 10
    score: -0.9467927356989226
    length 12
-----------------------------
episode: 11
    score: -0.9

-----------------------------
episode: 95
    score: -0.9149019798127013
    length 19
-----------------------------
episode: 96
    score: -0.9375299087930121
    length 14
-----------------------------
episode: 97
    score: -0.9467927356989226
    length 12
-----------------------------
episode: 98
    score: -0.9514704158298506
    length 11
-----------------------------
episode: 99
    score: -0.9421459769310155
    length 13
