In [1]:
import numpy as np
import tensorflow as tf
import gym
from collections import namedtuple
from gym.wrappers import Monitor
import scipy.signal
import os

  from ._conv import register_converters as _register_converters


In [27]:
env = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]
num_actions = env.action_space.n
hidden_layers = [64,64]
epoch = 50
steps_per_epoch = 4000
gamma = 0.99
gae_lambda = 0.97
train_v = 60
video_freq = 10
save_freq = 5
checkpointDir = "checkpoint"
monitorDir = "monitor"
obs_dim,num_actions

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


(4, 2)

In [3]:
def policy_estimator(x,action,hidden_layers,num_actions,output_activation,activation):
    for l in hidden_layers:
        x = tf.layers.dense(x,units=l,activation=activation)
    logits = tf.layers.dense(x,units=num_actions,activation=output_activation)
    
    probs = tf.nn.log_softmax(logits)
    pi_a = tf.squeeze(tf.multinomial(logits,1), axis=1)
    prob_a = tf.reduce_sum(tf.one_hot(action, depth=num_actions) * probs, axis=1)
    prob_pi = tf.reduce_sum(tf.one_hot(pi_a, depth=num_actions) * probs, axis=1)
    return pi_a,prob_pi,prob_a

In [4]:
def value_estimator(x,hidden_layers,output_activation=None,activation=tf.tanh):
    for l in hidden_layers:
        x = tf.layers.dense(x,units=l,activation=activation)
    logits = tf.layers.dense(x,units=1,activation=output_activation)
    return tf.squeeze(logits,axis=1)

In [5]:
def actor_critic(x,act,hidden_layers,num_actions,output_activation=None,activation=tf.tanh):
    pi_a,prob_pi,prob_a = policy_estimator(x,act,hidden_layers,num_actions,output_activation=None,activation=tf.tanh)
    v = value_estimator(x,hidden_layers,output_activation=None,activation=tf.tanh)
    return pi_a,prob_pi,prob_a,v

In [6]:
x = tf.placeholder(dtype = tf.float32,shape = (None,obs_dim),name="observations")
actions = tf.placeholder(dtype = tf.int32,shape = (None,),name="actions")
ret = tf.placeholder(dtype = tf.float32,shape = (None,),name="ret")
advantages = tf.placeholder(dtype = tf.float32,shape = (None,),name="advs")

pi_a,prob_pi,prob_a,value = actor_critic(x,actions,hidden_layers,num_actions)

policy_loss = -tf.reduce_mean(prob_a * advantages)
value_loss = tf.reduce_mean((ret - value)**2)

In [7]:
optimize_policy = tf.train.AdamOptimizer(learning_rate=3e-4).minimize(policy_loss)
optimize_value = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(value_loss)

In [8]:
def calculate_advantage(rews,values,final_value):
    values = np.append(values,final_value)
    rews = np.append(rews,final_value)
    dels = rews[:-1] + gamma * values[1:] - values[:-1]
    return scipy.signal.lfilter([1], [1, float(-gamma*gae_lambda)], dels[::-1], axis=0)[::-1]
    

In [9]:
def rewards_to_go(rews,final_value):
    rews = np.append(rews,final_value)
    return scipy.signal.lfilter([1], [1, float(-gamma)], rews[::-1], axis=0)[::-1][:-1]

In [13]:
def update_policy(sess,obs_memory,action_memory,rtgs_memory,adv_memory):
    feed_dict = {x:obs_memory,actions:action_memory,ret:rtgs_memory,advantages:adv_memory}
    
    policy_loss_e,value_loss_e = sess.run([policy_loss,value_loss],feed_dict=feed_dict)
    
    sess.run(optimize_policy,feed_dict=feed_dict)
    
    for i in range(train_v):
        sess.run(optimize_value,feed_dict=feed_dict)

    return policy_loss_e,value_loss_e
    

In [28]:
def train():
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    env = gym.make('CartPole-v0')
    
    if not os.path.exists(checkpointDir):
        os.makedirs(checkpointDir)
    #if not os.path.exists(monitorDir):
     #   os.makedirs(monitorDir)
        
    checkpoint = os.path.join(checkpointDir,"model")
    #monitor = os.path.join(monitorDir,"game")
    
    saver = tf.train.Saver()
        
        
    #env = Monitor(env, directory=monitor, video_callable=lambda e: e % \
    #                  video_freq == 0, resume=True)
    
    ckpt = tf.train.latest_checkpoint(checkpointDir)
    if ckpt:
        saver.restore(sess,ckpt)
        print("Existing checkpoint {} restored...".format(ckpt))
            
    obs,rew,done = env.reset(),0,False
    total_rew = 0
    episode_length = 0
    episode_stats = []
    losses = []
    obs_memory = np.zeros((steps_per_epoch,obs_dim), dtype=np.float32)
    action_memory = np.zeros(steps_per_epoch, dtype=np.int32)
    rew_memory = np.zeros(steps_per_epoch, dtype=np.float32)
    value_memory = np.zeros(steps_per_epoch, dtype=np.float32)
    prob_pi_memory = np.zeros(steps_per_epoch, dtype=np.float32)
    adv_memory = np.zeros(steps_per_epoch, dtype=np.float32)
    rtgs_memory = np.zeros(steps_per_epoch, dtype=np.float32)

    for e in range(epoch):
        buf_head = 0
        for t in range(steps_per_epoch):
            pi_a_t,value_t,prob_pi_t = sess.run([pi_a,value,prob_pi],feed_dict={x:obs.reshape(1,-1)})
            
            obs_memory[t],action_memory[t],rew_memory[t],\
            value_memory[t],prob_pi_memory[t] = obs,pi_a_t,rew,value_t,prob_pi_t
            
            obs,rew,done,_ = env.step(pi_a_t[0])
            total_rew += rew
            episode_length += 1
            
            if done or (t==steps_per_epoch-1):
                if not done:
                    print("Alert:Final episode terminated without completion...")
                    final_value = sess.run([value],feed_dict={x:obs.reshape(1,-1)})
                else:
                    final_value = rew
                
                
                adv_memory[buf_head:t] = calculate_advantage(rew_memory[buf_head:t],\
                                                             value_memory[buf_head:t],final_value)
                rtgs_memory[buf_head:t] = rewards_to_go(rew_memory[buf_head:t],final_value)
                buf_head = t
                episode_stats.append([total_rew,episode_length])
                obs,rew,done,total_rew,episode_length = env.reset(),0,False,0,0
        
        if(e%save_freq == 0) or (e == epoch-1):
            saver.save(sess, checkpoint)
            
        policy_loss_e,value_loss_e = update_policy(sess,obs_memory,action_memory,rtgs_memory,adv_memory)
        print(f"\n\nEpoch : {e}\nTotal Episodes : {len(episode_stats)}\n\
Total rewards : {np.mean(episode_stats,axis=0)[0]}\nAverage episode Length : {np.mean(episode_stats,axis=0)[1]}")
        
        print(f"Policy Loss : {policy_loss_e} Value Loss : {value_loss_e}\n")
        episode_stats = []

In [29]:
train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Restoring parameters from checkpoint/model
Existing checkpoint checkpoint/model restored...
Alert:Final episode terminated without completion...


Epoch : 0
Total Episodes : 198
Total rewards : 20.2020202020202
Average episode Length : 20.2020202020202
Policy Loss : 6.782705307006836 Value Loss : 220.69940185546875

Alert:Final episode terminated without completion...


Epoch : 1
Total Episodes : 191
Total rewards : 20.94240837696335
Average episode Length : 20.94240837696335
Policy Loss : 3.926971435546875 Value Loss : 134.0321502685547

Alert:Final episode terminated without completion...


Epoch : 2
Total Episodes : 184
Total rewards : 21.73913043478261
Average episode Length : 21.73913043478261
Policy Loss : 0.2852747142314911 Value Loss : 81.26679992675781

Alert:Final episode terminated without completion...


Epoch : 3
Total Episodes : 181
Total rewards : 2