In [None]:
import minority_agent
import gym
import tensorflow as tf
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Optional TensorFlow CLI debugger import, left disabled.
# from tensorflow.python import debug as tf_debug # debug
import importlib # debug
# Reload minority_agent so edits to the module are picked up without
# restarting the kernel.
importlib.reload(minority_agent) # debug
# Reset TensorFlow's global default graph so re-running this cell starts clean.
tf.reset_default_graph()
# CartPole-v0 environment shared by the baseline, training and test loops.
env = gym.make('CartPole-v0')

In [None]:
# Step size handed to the gradient-descent optimizer.
learning_rate = 0.01
# Transition buffer, one independent list per field:
# [states | actions | rewards | next_states]
rollout = [[], [], [], []]

In [None]:
# Build the REINFORCE_MG agent under the name 'Tester'. The state size is read
# from the environment's observation space; the trainer is plain gradient
# descent running at `learning_rate`.
REINFORCE = minority_agent.REINFORCE_MG(
    name='Tester',
    s_size=env.observation_space.shape[0],
    # NOTE(review): presumably one action value emitted per step -- confirm
    # against REINFORCE_MG.
    a_size=1,
    trainer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
)

In [None]:
# Open a session bound to the agent's own graph (REINFORCE.graph) rather than
# the global default graph.
sess = tf.Session(graph = REINFORCE.graph)
# sess = tf_debug.LocalCLIDebugWrapperSession(sess) # debug

# NOTE(review): presumably runs the agent's variable initializers inside this
# session -- confirm in minority_agent.
REINFORCE.init_graph(sess)
# sess.run(tf.global_variables_initializer())
# sess.run(REINFORCE.init_var())
def process_state(state):
    """Return `state` with a new length-1 axis in front, e.g. (4,) -> (1, 4).

    Every episode loop in this notebook passes the result to
    REINFORCE.generate_action.
    """
    return np.asanyarray(state)[np.newaxis, ...]

In [None]:
# Baseline: run the untrained policy for 1000 episodes and record how many
# steps each episode lasts before the environment reports `done`.
print('Baselining untrained pnet...')
uplen0 = []
for k in range(1000):
    state = env.reset()
    done = False
    rewards = []
    # Truthiness test instead of `done is False`: an identity check only
    # matches the built-in False object, so a numpy.bool_ flag from env.step
    # would end the episode after a single step.
    while not done:
        # env.render() # cannot use this on OSX because OpenAI GYM causes segfaults
        state = process_state(state)  # add the leading batch axis
        action = REINFORCE.generate_action(sess, state)
        next_state, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)
        state = next_state
    # One reward is appended per step, so the list length is the episode length.
    uplen0.append(len(rewards))
#assert sum(rewards)==len(rewards), "env error?!"
base_perf = np.mean(uplen0)
print("Cartpole stays up for an average of {} steps".format(base_perf))
#plt.plot(uplen0)

In [None]:
# Train pnet on cartpole episodes
num_episodes = 750
episode_rewards = np.zeros(num_episodes)  # summed reward per episode
episode_lengths = np.zeros(num_episodes)  # number of steps per episode

for i_episode in range(num_episodes):
    state = env.reset()
    # Step through the environment until it reports the episode is done.
    for t in itertools.count():
        state = process_state(state)  # add the leading batch axis
        action = REINFORCE.generate_action(sess, state)
        next_state, reward, done, _ = env.step(np.squeeze(action))

        # Keep track of the transition. `state` is stored with the batch axis
        # added by process_state; `next_state` is stored exactly as returned
        # by env.step.
        rollout[0].append(state)
        rollout[1].append(action)
        rollout[2].append(reward)
        rollout[3].append(next_state)

        # Update statistics. `t` is a zero-based step index, so t + 1 steps
        # have been taken; this matches the len(rewards) step count used by
        # the evaluation loops.
        episode_rewards[i_episode] += reward
        episode_lengths[i_episode] = t + 1

        if done:
            break
        state = next_state

    # Go through the episode and make policy updates.
    # NOTE(review): `rollout` is never cleared in this notebook, so unless
    # REINFORCE.train empties it, each call also sees every earlier episode's
    # transitions -- confirm in minority_agent.
    # NOTE(review): the third argument (1.0) is presumably the discount
    # factor -- confirm against REINFORCE.train.
    REINFORCE.train(rollout, sess, 1.0)
    # Progress report on episodes 1, 51, 101, ...
    if (i_episode % 50) == 1:
        print("\rCartpole up for {} steps on Episode {}".format(episode_lengths[i_episode], i_episode),
              end="")

In [None]:
# Now test it!
# Run the trained policy for 1000 episodes, the same way the baseline was
# measured, and compare the mean episode length against `base_perf`.
print('Testing...')
uplen = []
for k in range(1000):
    state = env.reset()
    done = False
    rewards = []
    # Truthiness test instead of `done is False`: an identity check only
    # matches the built-in False object, so a numpy.bool_ flag from env.step
    # would end the episode after a single step.
    while not done:
        # env.render() # cannot use this on OSX because OpenAI GYM causes segfaults
        state = process_state(state)  # add the leading batch axis
        action = REINFORCE.generate_action(sess, state)
        next_state, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)
        state = next_state
    # One reward is appended per step, so the list length is the episode length.
    uplen.append(len(rewards))
#assert sum(rewards)==len(rewards), "env error?!"
trained_perf = np.mean(uplen)
print("Cartpole stays up for an average of {} steps compared to baseline {} steps".format(
    trained_perf, base_perf)
     )

In [None]:
# Compare the episode-length distributions before and after training, one
# horizontal box plot per row on a shared x-axis.
fig, axs = plt.subplots(2, 1, sharex=True)
# Pass the samples as `x=` explicitly: the meaning of boxplot's first
# positional parameter differs between seaborn releases, and the keyword keeps
# the boxes laid out along the shared x-axis.
sns.boxplot(x=uplen0, ax=axs[0])
axs[0].set_title('Baseline Episode Lengths')
sns.boxplot(x=uplen, ax=axs[1])
axs[1].set_title('Trained Episode Lengths')
axs[1].set_xlabel('Steps per episode')
# Keep the lower panel's title from colliding with the upper panel.
fig.tight_layout()