In [1]:
import gym
from keras.layers import Input, Dense
import tensorflow as tf
import numpy as np
import time
from collections import deque
import matplotlib.pyplot as plt
from keras.optimizers import Adam
from keras.models import Model

# Step threshold for one policy update: train() keeps collecting whole
# episodes until the number of stored states exceeds this value.
batch_size = 5000
# Gym environment instance shared by the train() and test() loops.
env = gym.make("CartPole-v0")

Using TensorFlow backend.


In [2]:
class Memory():
    """Buffer of experience gathered over one or more complete episodes.

    Per-step data (``actions``, ``states``, ``weights``) is stored in flat
    lists that span all finished episodes; per-episode statistics
    (``cum_rews``, ``cum_lens``) get one entry per call to ``finish``.
    """

    def __init__(self, batch_size):
        # batch_size is accepted for call compatibility but is not stored.
        self.reset()

    def reset(self):
        """Drop everything collected so far and start with empty lists."""
        self.actions, self.states = [], []
        self.cum_rews, self.cum_lens = [], []
        self.episode_rews = []
        self.weights = []

    def add_to_trajectory(self, trajectory):
        """Record one step given as an ``(action, state, reward)`` triple."""
        act, obs, rew = trajectory
        self.actions.append(act)
        self.states.append(obs)
        self.episode_rews.append(rew)

    def finish(self):
        """Close the current episode.

        Stores the episode's total reward and length, and gives every step
        of the episode a weight equal to that total reward.
        """
        episode_return = np.sum(self.episode_rews)
        episode_length = len(self.episode_rews)

        self.cum_rews.append(episode_return)
        self.cum_lens.append(episode_length)
        self.weights.extend([episode_return] * episode_length)

        # Start the next episode with a clean reward list.
        self.episode_rews = []

    def check_limit(self, limit):
        """Return True once strictly more than ``limit`` states are stored."""
        return len(self.states) > limit

    def sample(self):
        """Return the stored ``(actions, states, weights)`` lists."""
        return self.actions, self.states, self.weights

In [3]:
import keras.backend as K

class Agent():
    """Softmax policy network trained with a simple policy-gradient loss.

    The network maps a state vector to one unnormalised logit per action.
    Training minimises ``-mean(weight * log pi(action | state))`` with Adam,
    where the per-step weights are supplied by the caller.
    """

    def __init__(self, state_space, action_space):
        """Build the network, the loss, the optimiser op and a TF session.

        Args:
            state_space: length of the state vector fed to the network.
            action_space: number of discrete actions (one logit each).
        """
        self.action_dim = action_space

        # Policy network: state -> 128 -> 64 -> action logits (no activation).
        self.input = Input((state_space,))
        self.first_layer = Dense(128, activation='relu')(self.input)
        self.second_layer = Dense(64, activation='relu')(self.first_layer)
        self.logits = Dense(action_space, activation=None)(self.second_layer)

        # Placeholders fed at training time: one weight and one action index
        # per recorded step.
        self.weights = K.placeholder(shape=(None,))
        self.actions = K.placeholder(shape=(None,), dtype=tf.int32)

        # Select the log-probability of the action actually taken by masking
        # the per-action log-probabilities with a one-hot of the action index.
        action_masks = K.one_hot(self.actions, action_space)
        log_probs = K.sum(action_masks * self.log_softmax(self.logits), axis=1)
        self.loss = -K.mean(self.weights * log_probs)

        self.train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.loss)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    def predict(self, state):
        """Sample one action index for a single state from the current policy.

        Args:
            state: a single state vector (wrapped into a batch of one here).

        Returns:
            The sampled action index in ``range(self.action_dim)``.
        """
        logits = self.sess.run(self.logits, {self.input: np.array([state])})[0]
        distrib = self.softmax(logits)
        action = np.random.choice(self.action_dim, p=distrib, size = 1)
        return action[0]

    def train(self, _, sample):
        """Run one optimiser step on a batch of recorded steps.

        Args:
            _: ignored; kept so existing callers that pass an extra first
                argument keep working.
            sample: ``(actions, states, weights)`` sequences of equal length.

        Returns:
            The loss value fetched in the same session run as the update.
        """
        actions, states, weights = sample
        feed = {self.input: np.array(states),
                self.actions: np.array(actions),
                self.weights: np.array(weights)}
        loss_value = self.sess.run([self.loss, self.train_op], feed_dict=feed)[0]
        return loss_value

    def softmax(self, x):
        """Numerically stable NumPy softmax over all elements of ``x``.

        The maximum is subtracted before exponentiating so that large logits
        do not overflow to ``inf`` (which would turn the result into NaN).
        The computation is done in float64 so the probabilities sum to 1
        closely enough for ``np.random.choice``.
        """
        x = np.asarray(x, dtype=np.float64)
        if x.size == 0:
            # Nothing to normalise; mirror the empty input.
            return x
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x)

    def log_softmax(self,x):
        """Row-wise log-softmax of a ``(batch, n_actions)`` logits tensor.

        ``keepdims=True`` keeps the reduced axis so the subtraction
        broadcasts across any number of actions, not only two.
        """
        return x - K.logsumexp(x, axis=1, keepdims=True)

In [4]:
# Size the policy network from the environment: the first dimension of the
# observation shape is the input width, action_space.n the number of logits.
agent = Agent(env.observation_space.shape[0], env.action_space.n)

In [5]:
def train():
    """Collect a batch of episodes with the current policy and update once.

    Whole episodes are played with actions sampled from ``agent`` until the
    buffer holds more than ``batch_size`` steps; then a single
    ``agent.train`` call is made, the environment is closed and the
    function returns.
    """
    memory = Memory(batch_size)
    state = env.reset()

    while True:
        action = agent.predict(state)
        next_state, reward, done, _ = env.step(action)
        memory.add_to_trajectory((action, state, reward))

        if not done:
            # Episode still running: advance to the next state.
            state = next_state
            continue

        # Episode ended: fold its return into the per-step weights.
        memory.finish()
        if memory.check_limit(batch_size):
            agent.train(memory.states, memory.sample())
            env.close()
            return

        # Not enough data yet: start another episode.
        state = env.reset()

In [6]:
def test():
    """Play one rendered episode with the current policy.

    Actions are sampled via ``agent.predict``. The environment is closed
    when the episode ends.

    Returns:
        The sum of the rewards received during the episode.
    """
    total_reward = 0
    state = env.reset()
    done = False

    while not done:
        env.render()
        action = agent.predict(state)
        state, step_reward, done, _ = env.step(action)
        total_reward += step_reward

    env.close()
    return total_reward

In [7]:
# Alternate one policy update (train) with one rendered evaluation episode
# (test) and track the running mean of the evaluation returns.
n_iterations = 50

reward = 0      # sum of evaluation returns seen so far
rewards = []    # running mean of the evaluation return after each iteration

for i in range(1, n_iterations + 1):
    train()
    print(i)
    reward += test()
    rewards.append(reward/i)

# Plot against the 1-based iteration number and actually show the label.
plt.plot(range(1, len(rewards) + 1), rewards, label = "rewards")
plt.xlabel("training iteration")
plt.ylabel("running mean of test return")
plt.legend()
plt.show()

1
2


KeyboardInterrupt: 