In [None]:
import os
import gym
import pylab
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque
import tensorflow_probability as tfp
from keras.layers import Input, Dense
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop



class Callback(tf.keras.callbacks.Callback):
    """Keras callback that keeps a running count of training batches.

    The callback remembers the epoch index most recently reported by Keras
    and increments ``counter`` after every training batch.  After the first
    two epochs (``epoch > 1``) the counter is reset to zero each time it
    reaches ``SHOW_NUMBER`` and is then incremented as usual.  Nothing is
    printed or logged; the counter is pure bookkeeping.
    """

    # Counter value at which the batch counter is reset (once epoch > 1).
    SHOW_NUMBER = 10
    # Class-level defaults; assignments through ``self`` shadow them per instance.
    counter = 0
    epoch = 0

    def on_epoch_begin(self, epoch, logs=None):
        """Record the index of the epoch that is about to start."""
        self.epoch = epoch

    def on_train_batch_end(self, batch, logs=None):
        """Advance the batch counter, resetting it first when it is due."""
        reached_interval = self.counter == self.SHOW_NUMBER
        # Resets only happen after the first two epochs (indices 0 and 1).
        if self.epoch > 1 and reached_interval:
            self.counter = 0
        self.counter += 1



# Single seed shared by TensorFlow, NumPy and the environment reset below.
RANDOM_SEED = 6
tf.random.set_seed(RANDOM_SEED)

# CartPole environment; with new_step_api=True, main() unpacks five values
# from each env.step() call.
env = gym.make('CartPole-v1',new_step_api=True)
np.random.seed(RANDOM_SEED)
env.reset(seed = RANDOM_SEED)
# Number of discrete actions: used as the actor's output width and as the
# population size when sampling an action in main().
action_shape = env.action_space.n
# Shape of one observation: used as input_shape of both networks' first layer.
state_shape = env.observation_space.shape


# Number of episodes run by the training loop in main().
train_episodes = 300
# Learning rate passed to the Adam optimizer of both the actor and the critic.
learning_rate = 0.001
# Discount factor applied to the critic's next-state value in the TD target.
gamma = .9


# Reset Keras' global state before any model in this file is built.
tf.keras.backend.clear_session()


def actor_model(state_shape, action_shape):
    """Build and compile the actor (policy) network.

    The network is a stack of three dense layers: 24 and 12 LeakyReLU units
    followed by a softmax layer with one unit per action.  It is compiled
    with categorical cross-entropy and an Adam optimizer whose learning rate
    is read from the module-level ``learning_rate``.

    Args:
        state_shape: Shape of a single observation; passed as ``input_shape``
            of the first layer.
        action_shape: Number of units in the softmax output layer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # One initializer instance is shared by all three layers.
    weight_init = tf.keras.initializers.HeUniform()

    network = keras.Sequential([
        keras.layers.Dense(
            24,
            input_shape=state_shape,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer=weight_init,
        ),
        keras.layers.Dense(
            12,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer=weight_init,
        ),
        keras.layers.Dense(
            action_shape,
            activation='softmax',
            kernel_initializer=weight_init,
        ),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network

def critic_model(state_shape):
    """Build and compile the critic (state-value) network.

    The network is a stack of three dense layers: 24 and 12 LeakyReLU units
    followed by a single linear output unit.  It is compiled with a
    mean-squared-error loss and an Adam optimizer whose learning rate is read
    from the module-level ``learning_rate``.

    Args:
        state_shape: Shape of a single observation; passed as ``input_shape``
            of the first layer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # One initializer instance is shared by all three layers.
    weight_init = tf.keras.initializers.HeUniform()

    network = keras.Sequential([
        keras.layers.Dense(
            24,
            input_shape=state_shape,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer=weight_init,
        ),
        keras.layers.Dense(
            12,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer=weight_init,
        ),
        keras.layers.Dense(
            1,
            activation='linear',
            kernel_initializer=weight_init,
        ),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # NOTE(review): 'accuracy' is tracked alongside an MSE loss on a scalar
    # output; it looks uninformative here -- confirm whether it is needed.
    network.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network



def one_hot_encode_action(action, n_actions):
    """Return a float32 vector of zeros with a one at position ``action``.

    Args:
        action: Index to set to one (standard NumPy indexing rules apply).
        n_actions: Length of the returned vector.

    Returns:
        A new ``numpy.ndarray`` of dtype ``float32``.
    """
    one_hot = np.zeros(n_actions, dtype=np.float32)
    one_hot[action] = 1
    return one_hot

def main():
    """Train an actor and a critic online on the module-level environment.

    For each of ``train_episodes`` episodes the loop samples an action from
    the actor's output distribution, steps ``env``, computes a one-step TD
    target and advantage with the critic, and then updates both networks on
    that single transition.  The number of steps per episode is printed when
    the episode ends.

    Reads the module-level ``env``, ``state_shape``, ``action_shape``,
    ``train_episodes`` and ``gamma``.  The environment is closed when training
    finishes, including when an exception interrupts it.
    """
    actor = actor_model(state_shape, action_shape)
    critic = critic_model(state_shape)

    # Scale of the advantage-weighted step added to the actor's own output
    # when building its training target.
    actor_step_scale = .0001

    try:
        for episode in range(train_episodes):
            state = env.reset()
            done = False
            num_steps = 0
            while not done:
                num_steps += 1

                # Models expect a leading batch axis: (1, state_dim).
                state_batch = state.reshape([1, state.shape[0]])
                # Calling the model directly avoids the per-call overhead of
                # Model.predict() for a single sample.
                action_probs = actor(state_batch, training=False).numpy().flatten()

                # Sample from the policy distribution instead of using argmax.
                action = np.random.choice(action_shape, 1, p=action_probs)[0]
                encoded_action = one_hot_encode_action(action, action_shape)

                next_state, reward, terminated, truncated, _ = env.step(action)
                # The episode is over on failure (terminated) or when the
                # time limit is hit (truncated); ignoring the latter would keep
                # stepping an environment that has already ended.
                done = terminated or truncated
                next_state_batch = next_state.reshape([1, next_state.shape[0]])

                # One-step TD target and advantage.
                v_curr = critic(state_batch, training=False).numpy().item()
                v_next = critic(next_state_batch, training=False).numpy().item()
                # Only a true terminal state has zero future value; a
                # time-limit truncation still bootstraps from the next state.
                if terminated:
                    td_target = reward
                else:
                    td_target = reward + gamma * v_next
                advantage = td_target - v_curr

                # Critic regresses towards the TD target; vstack gives (1, 1).
                critic.train_on_batch(state_batch, np.vstack([td_target]))

                # Actor is fit towards a soft target: its own probabilities
                # nudged towards the sampled action in proportion to the
                # advantage.  Broadcasting yields shape (1, n_actions).
                gradient = encoded_action - action_probs
                actor_target = (
                    actor_step_scale * gradient * np.vstack([advantage])
                    + action_probs
                )
                actor.train_on_batch(state_batch, actor_target)

                state = next_state

            print('Episode => {} ::: Total steps=> {}'.format(episode,num_steps))
    finally:
        # Release the environment even if training is interrupted.
        env.close()

# Start training only when run as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()

Episode => 0 ::: Total steps=> 23
Episode => 1 ::: Total steps=> 13
Episode => 2 ::: Total steps=> 37
Episode => 3 ::: Total steps=> 46
Episode => 4 ::: Total steps=> 60
Episode => 5 ::: Total steps=> 25
Episode => 6 ::: Total steps=> 23
Episode => 7 ::: Total steps=> 11
Episode => 8 ::: Total steps=> 11
Episode => 9 ::: Total steps=> 13
Episode => 10 ::: Total steps=> 9
Episode => 11 ::: Total steps=> 8
Episode => 12 ::: Total steps=> 13
Episode => 13 ::: Total steps=> 14
Episode => 14 ::: Total steps=> 12
Episode => 15 ::: Total steps=> 9
Episode => 16 ::: Total steps=> 16
Episode => 17 ::: Total steps=> 9
Episode => 18 ::: Total steps=> 12
Episode => 19 ::: Total steps=> 14
Episode => 20 ::: Total steps=> 11
Episode => 21 ::: Total steps=> 9
Episode => 22 ::: Total steps=> 26
Episode => 23 ::: Total steps=> 15
Episode => 24 ::: Total steps=> 10
Episode => 25 ::: Total steps=> 9
Episode => 26 ::: Total steps=> 10
Episode => 27 ::: Total steps=> 8
Episode => 28 ::: Total steps=> 10
Ep