In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf



In [2]:
# Create the CartPole-v0 environment that the rollout cells below step through.
env = gym.make('CartPole-v0')
# Presumably the output directory for recorded runs; it is not referenced by
# any other cell visible in this notebook.
path_to_recording = './tmp/CartPole-v0-gradient'

[2017-01-14 11:33:45,777] Making new env: CartPole-v0


In [3]:
def DQN():
    """
    Build the TensorFlow graph of a small fully connected Q-network.

    Layout: 4 inputs -> 10 tanh units -> 10 tanh units -> 2 tanh outputs
    (one estimated value per action).

    Returns, in this order:
        train op      Adam (learning rate 0.0001) minimising the L2 loss
        state         float32 placeholder of shape (None, 4)
        Q_actuals     float32 placeholder of shape (None, 2) holding the targets
        Q_est         network output tensor
        loss          tf.nn.l2_loss of (Q_est - Q_actuals)
        w1, w2, w3    the three weight matrices
    """
    n_inputs, n_hidden, n_actions = 4, 10, 2

    # Graph inputs: a batch of observations and the matching target values.
    state = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs))
    Q_actuals = tf.placeholder(dtype=tf.float32, shape=(None, n_actions))

    # First hidden layer.
    w1 = tf.Variable(tf.truncated_normal([n_inputs, n_hidden], stddev=0.1))
    b1 = tf.Variable(tf.truncated_normal([n_hidden], stddev=0.1))
    hidden_1 = tf.nn.tanh(tf.matmul(state, w1) + b1)

    # Second hidden layer.
    w2 = tf.Variable(tf.truncated_normal([n_hidden, n_hidden], stddev=0.1))
    b2 = tf.Variable(tf.truncated_normal([n_hidden], stddev=0.1))
    hidden_2 = tf.nn.tanh(tf.matmul(hidden_1, w2) + b2)

    # Output layer: unit-variance normal weights, zero bias.
    w3 = tf.Variable(tf.random_normal([n_hidden, n_actions]))
    b3 = tf.Variable(tf.zeros([n_actions]))
    # NOTE(review): tanh keeps every estimate inside (-1, 1); confirm the
    # targets fed through Q_actuals are meant to live in that range.
    Q_est = tf.tanh(tf.matmul(hidden_2, w3) + b3)

    residual = Q_est - Q_actuals
    loss = tf.nn.l2_loss(residual)
    train_op = tf.train.AdamOptimizer(0.0001).minimize(loss)
    return train_op, state, Q_actuals, Q_est, loss, w1, w2, w3


class ExperienceReplay:
    """
    Bounded first-in-first-out store of transitions with uniform random
    mini-batch sampling.

    Every stored item is the tuple
    (step, obs, action, reward, is_done, new_obs, future_reward).
    """

    def __init__(self, max_size=5000, batch_size=250):
        """
        max_size:   maximum number of transitions kept; the oldest go first
        batch_size: number of transitions returned by get_mini_batch
        """
        self.memory = []
        self.max_size = max_size
        self.batch_size = batch_size

    def add_observation(self, step, obs, action, reward, is_done, new_obs, future_reward):
        """Store one transition, evicting the oldest so len(memory) <= max_size."""
        self.memory.append((step, obs, action, reward, is_done, new_obs, future_reward))

        # Trim from the front (oldest entries). The previous code referenced an
        # undefined `memory` name and popped the newest element instead.
        overflow = len(self.memory) - self.max_size
        if overflow > 0:
            del self.memory[:overflow]

    def get_mini_batch(self):
        """
        Return a list of `batch_size` stored transitions drawn uniformly at
        random with replacement, or an empty list when nothing is stored yet.
        """
        if not self.memory:
            return []

        # Sample positions rather than the items themselves: np.random.choice
        # only accepts a 1-D array (or an int), not a list of transition tuples.
        picks = np.random.choice(len(self.memory), self.batch_size)
        return [self.memory[i] for i in picks]

In [4]:
# Seed both random number generators before the graph is built so that the
# initial weights and the sampled actions are repeatable.
tf.set_random_seed(42)
np.random.seed(42)

# Build the network graph, then open a session and initialise its variables.
(dqn_opt, dqn_state, dqn_q_actuals, dqn_q_est,
 dqn_loss, w1, w2, w3) = DQN()
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Bookkeeping filled in by the training / rollout cells.
TOTAL_NUMBER_OF_TRAINING_EPISODES = 0
dqn_loss_hist, reward_hist, steps_hist = [], [], []
max_prob_eps_mean, max_prob_eps_std = [], []
mean_advantage = []

In [5]:
# Roll out `num_episodes` episodes with actions sampled from a softmax over the
# network's Q estimates, then attach a standardised discounted return to every
# transition and record per-episode monitoring statistics.
num_episodes = 1
gamma = 0.97

# Softmax over an array of Q-values (normalises over all elements).
softmax = lambda np_arr : np.exp(np_arr)/np.sum(np.exp(np_arr))

for episode in range(num_episodes):
    obs = env.reset()
    total_reward = 0.0
    total_steps = 0.0
    episode_history = []
    states = []
    max_prob = []
    while True:

        q_est = sess.run(dqn_q_est, feed_dict={dqn_state: np.expand_dims(obs, axis=0)})
        q_est = q_est[0] #unpack the values

        #need to define a better exploration_function
        q_est_softmax = softmax(q_est)
        max_prob.append(q_est_softmax.max())
        action = np.random.choice([0, 1], p = q_est_softmax)
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("%s %s" % (action, q_est_softmax))
        # Named target instead of `_`, which IPython uses for the last cell output.
        next_obs, reward, is_done, step_info = env.step(action)

        # The return is only known once the episode has finished; filled in below.
        unk_future_reward = None
        episode_history.append([total_steps, obs, action, reward, is_done, next_obs, unk_future_reward])
        total_reward += reward
        total_steps += 1.0
        obs = next_obs
        if is_done:
            break

    # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards.
    # The previous form weighted rewards by gamma**(distance from episode end),
    # which is only correct when every reward is equal. Reading the reward by
    # index also avoids overwriting total_steps/obs/reward, which used to make
    # steps_hist record 0.0 for every episode.
    future_rewards = []
    cumm_reward = 0.0
    for trans in reversed(episode_history):
        cumm_reward = trans[3] + gamma * cumm_reward
        future_rewards.append(cumm_reward)
    future_rewards = np.array(future_rewards[::-1]) # computed back-to-front, flipped to time order

    # Standardise to zero mean / unit variance; a constant series (for example
    # a one-step episode) has zero spread, so skip the division there.
    fr_mu, fr_sigma = future_rewards.mean(), future_rewards.std()
    future_rewards = (future_rewards - fr_mu) / (fr_sigma if fr_sigma > 0 else 1.0)

    for trans, future_reward in zip(episode_history, future_rewards):
        trans[-1] = future_reward
        print("%s %s %s %s %s" % (trans[0], trans[2], trans[3], trans[4], trans[-1]))


    #load monitoring dbs
    max_prob_eps_mean.append(np.array(max_prob).mean())
    max_prob_eps_std.append(np.array(max_prob).std())
    reward_hist.append( total_reward )
    steps_hist.append( total_steps )

0 [ 0.58397526  0.41602477]
1 [ 0.57893062  0.42106941]
1 [ 0.58393383  0.41606623]
1 [ 0.58886158  0.41113847]
0 [ 0.59369504  0.4063049 ]
0 [ 0.58897781  0.41102222]
0 [ 0.58408457  0.41591537]
1 [ 0.57904118  0.42095888]
1 [ 0.58401978  0.41598022]
1 [ 0.58891571  0.41108435]
0 [ 0.59370989  0.40629011]
1 [ 0.58899695  0.41100302]
1 [ 0.59378433  0.40621567]
0 [ 0.59845114  0.40154886]
0 [ 0.59395075  0.40604928]
0 [ 0.58925498  0.41074502]
0 [ 0.5843842   0.41561577]
0 [ 0.5793584  0.4206416]
0 [ 0.57419723  0.42580274]
0 [ 0.5689202  0.4310798]
1 [ 0.5635466   0.43645337]
0 [ 0.56840211  0.43159789]
0 [ 0.56296897  0.437031  ]
0 [ 0.55745757  0.44254246]
0 [ 0.55188936  0.44811064]
1 [ 0.54628664  0.45371336]
0 [ 0.5507918   0.44920817]
0 [ 0.54510796  0.45489204]
1 [ 0.53942376  0.46057624]
0 [ 0.54373538  0.45626464]
1 [ 0.53799266  0.46200734]
0 [ 0.54227424  0.45772573]
0 [ 0.53650147  0.4634985 ]
1 [ 0.53079313  0.46920681]
1 [ 0.53487241  0.46512762]
0.0 0 1.0 False 1.415739

In [17]:
# Scratch: take a fresh initial observation to experiment with in the cells below.
obs = env.reset()

In [19]:
# Scratch: the observation is a flat vector (the recorded output is (4,)).
obs.shape

(4,)

In [20]:
# Scratch: one way to add a leading batch axis, turning (4,) into (1, 4).
obs.reshape(1, obs.shape[0])

array([[-0.01165618,  0.02830083, -0.03131809, -0.00572727]])

In [22]:
# Scratch: compare where expand_dims puts the new axis.
# axis=0 yields a single row, axis=1 a single column.
print(np.expand_dims(obs, axis=0))
print(np.expand_dims(obs, axis=1))

[[-0.01165618  0.02830083 -0.03131809 -0.00572727]]
[[-0.01165618]
 [ 0.02830083]
 [-0.03131809]
 [-0.00572727]]


In [34]:
# Scratch: inspect the raw network output left in the kernel.
# NOTE(review): the recorded value has shape (1, 2), i.e. it still carries the
# batch axis that the rollout loop strips with q_est[0] -- this cell depends on
# kernel state that is not reproduced by running the notebook top to bottom.
q_est

array([[ 0.59836054,  0.48353952]], dtype=float32)

In [36]:
# Scratch check of the softmax helper; this repeats the definition already
# made in the rollout cell. np.sum has no axis argument, so the normalisation
# runs over every element of whatever array is passed in.
softmax = lambda np_arr : np.exp(np_arr)/np.sum(np.exp(np_arr))
softmax(q_est)

array([[ 0.52867377,  0.47132623]], dtype=float32)

In [47]:
# Scratch: empirical check of the sampling -- draw 100000 actions from
# softmax(q_est[0]) and count how many of them are action 1.
sum([ np.random.choice([0, 1], p = softmax(q_est[0])) for _ in range(100000)])

47258

In [58]:
# Scratch: display NumPy's floating-point positive infinity constant.
np.inf

inf

In [61]:
# Scratch: `_` is IPython's "previous output" variable, so this result depends
# on whichever cell was executed last and is not reproducible on a fresh run.
[1,2, 3, _]

[1, 2, 3, [1, 2, 3, [1, 2, 3, inf]]]