In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import nn
import numpy as np
import gym
import random
import math

In [32]:
tf.keras.backend.set_floatx('float64')

class Actor:
    def __init__(self, num_layers, input_dim, layer_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        model = Sequential()
        model.add(Dense(layer_dim, input_dim=input_dim, activation='relu'))
        for layer in range(num_layers):
            model.add(Dense(layer_dim, activation='relu'))
            model.add(Dense(output_dim))
        self.model = model
        
    def get_policy(self, inputs):
        return nn.softmax(self.model(inputs[None, :]))

    def sample_action(self, inputs):
        policy = self.get_policy(inputs)
        action = tf.random.categorical(policy, num_samples=1)
        return action[0][0]


In [44]:
class Trainer:
    def __init__(self, actor=None):
        self.env = gym.make('CartPole-v0')
        self.actor = actor if actor else \
            Actor(1, self.env.observation_space.shape[0], 100, 2)
        self.episode_length = 0
        self.opt = tf.keras.optimizers.Adam(learning_rate=0.1)
        self.variables = self.actor.model.trainable_variables
        self.trainable_weights = self.actor.model.trainable_weights
        self.discount_factor = 0.99

    def reset(self):
        self.epsiode_length = 0

    def record_episode(self, iterations):
        done = False
        inital_state = self.env.reset()
        states = np.ones([iterations, self.env.observation_space.shape[0]])
        states[0] = inital_state
        rewards = np.zeros([iterations])
        actions = np.ones([iterations, self.env.action_space.n])
        gradients = []
            
        for i in range(iterations-1):
            if done:
                break
            self.episode_length = i
            with tf.GradientTape() as tape:
                policy = self.actor.get_policy(states[i])
                print('--------------')
                tf.print(states[i])
                tf.print(policy)
                action = tf.random.categorical(policy, num_samples=1).numpy()[0][0]
                action_log_prob = tf.math.log(policy[:, action] + 1e-50)

            gradients.append(tape.gradient(action_log_prob, self.variables))
            state, rewards[i], done, _ = self.env.step(action)
            states[i+1] = state
            actions[i] = action

        self.env.close()
        self.states = states
        self.rewards = rewards
        self.actions = actions
        self.gradients = gradients
        return states, rewards, actions, gradients
    
    def train(self):
        states = self.states[:self.episode_length]
        rewards = self.rewards[:self.episode_length]
        actions = self.actions[:self.episode_length]
        gradients = self.gradients[:self.episode_length]
        discounted_rewards, mean_score = self.discount_rewards(rewards)
        gradients = gradients*discounted_rewards[:, None]
        for grad in gradients:
            self.opt.apply_gradients(zip(grad, self.trainable_weights))
        return mean_score

    def discount_rewards(self, rewards):
        for i, reward in enumerate(rewards):
            rewards[i] = math.pow(self.discount_factor, len(rewards) - 1 - i) * -1 * reward
        mean = rewards.mean()
        return rewards - mean, mean

In [45]:
trainer = Trainer()
trainer.record_episode(100)
trainer.train()

NUM_EPISODES = 10 
NUM_STEPS = 100

for i in range(NUM_EPISODES):
    trainer.record_episode(NUM_STEPS)
    score = trainer.train()
    trainer.reset()
    print('-----------------------------')
    print(i)
    print(score)
    

--------------
array([-0.04299282, -0.00121561, -0.0191124 ,  0.04666647])
[[0.50144476987498388 0.49855523012501618]]
--------------
array([-0.04301713,  0.19417511, -0.01817907, -0.25198479])
[[0.49971478642984657 0.50028521357015343]]
--------------
array([-0.03913363, -0.0006826 , -0.02321877,  0.03490912])
[[0.50103560571360783 0.49896439428639222]]
--------------
array([-0.03914728,  0.19476449, -0.02252059, -0.26500819])
[[0.49913237776703723 0.50086762223296277]]
--------------
array([-3.52519950e-02, -2.89107138e-05, -2.78207514e-02,  2.04873198e-02])
[[0.50043722712384564 0.49956277287615436]]
--------------
array([-0.03525257, -0.19474106, -0.027411  ,  0.30426428])
[[0.5131561495191207 0.48684385048087925]]
--------------
array([-0.03914739,  0.00076058, -0.02132572,  0.00306415])
[[0.49972268076968229 0.50027731923031771]]
--------------
array([-0.03913218,  0.19618178, -0.02126444, -0.29627028])
[[0.4985676477772859 0.501432352222714]]
--------------
array([-0.03520855,  

InvalidArgumentError: slice index 2 of dimension 1 out of bounds. [Op:StridedSlice] name: strided_slice/