In [1]:
# %load cartpole_reinforce.py
import sys
import gym
import pylab
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

EPISODES = 1000


# Policy-gradient agent for the CartPole environment.
# This example uses the REINFORCE algorithm, which applies a Monte Carlo update rule.
class REINFORCEAgent:
    """REINFORCE (Monte Carlo policy gradient) agent with a small Keras policy network.

    The agent records the states, actions and rewards of one episode via
    ``append_sample`` and performs one training pass over them in
    ``train_model``, after which the recorded episode is discarded.
    """

    def __init__(self, state_size, action_size, load_model=False, save_model=False):
        """Build the policy network and optionally restore saved weights.

        Args:
            state_size: number of inputs of the policy network.
            action_size: number of discrete actions (network outputs).
            load_model: if True, load weights from
                ``./save_model/cartpole_reinforce.h5`` after building the model.
            save_model: stored as ``self.save``; the training script reads it
                to decide whether to checkpoint periodically.
        """
        # if you want to see Cartpole learning, then change to True
        self.render = False
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size
        self.save = save_model

        # These are hyper parameters for the Policy Gradient
        self.discount_factor = 0.99
        # 0.001 is the alternative value previously tried here.
        # NOTE(review): the training log recorded in this notebook degrades to
        # scores of ~7-10 after a few hundred episodes; the step size may be
        # too large -- confirm by retrying 0.001.
        self.learning_rate = 0.01
        self.hidden1, self.hidden2 = 24, 12

        # create model for policy network
        self.model = self.build_model()

        # lists for the states, actions and rewards
        self.states, self.actions, self.rewards = [], [], []

        if load_model:
            self.model.load_weights("./save_model/cartpole_reinforce.h5")

    # approximate policy using Neural Network
    # state is input and probability of each action is output of network
    def build_model(self):
        """Create and compile the policy network (two ReLU layers + softmax).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
        model.summary()
        # Using categorical crossentropy as a loss is a trick to easily
        # implement the policy gradient. Categorical cross entropy is defined
        # as H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, you set
        # p_a = advantage. q_a is the output of the policy network, which is
        # the probability of taking the action a, i.e. policy(s, a).
        # All other p_i are zero, thus H(p, q) = -A * log(policy(s, a)), and
        # minimising this loss raises the probability of actions with A > 0.
        model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate))
        return model

    # using the output of policy network, pick action stochastically
    def get_action(self, state):
        """Sample an action index from the policy's output distribution.

        Args:
            state: input handed to ``model.predict``; the callers in this
                notebook reshape it to ``[1, state_size]`` first.

        Returns:
            An index in ``range(self.action_size)`` drawn with the predicted
            probabilities.
        """
        policy = self.model.predict(state, batch_size=1).flatten()
        return np.random.choice(self.action_size, 1, p=policy)[0]

    # In Policy Gradient, Q function is not available.
    # Instead agent uses sample returns for evaluating policy
    def discount_rewards(self, rewards):
        """Compute the discounted return for every step of an episode.

        Args:
            rewards: sequence of per-step rewards, in time order.

        Returns:
            A float64 array ``G`` with
            ``G[t] = rewards[t] + discount_factor * G[t + 1]``.
        """
        # Force a float result: with an all-integer reward list zeros_like
        # would otherwise create an int array and truncate every return.
        discounted_rewards = np.zeros_like(rewards, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    # save <s, a ,r> of each step
    def append_sample(self, state, action, reward):
        """Record one transition of the current episode."""
        self.states.append(state)
        self.rewards.append(reward)
        self.actions.append(action)

    # update policy network every episode
    def train_model(self):
        """Run one training pass on the recorded episode, then clear it.

        The discounted returns are standardised (zero mean, unit standard
        deviation) and used as the target weight of the action that was
        actually taken at each step. Does nothing when no samples were
        recorded.
        """
        episode_length = len(self.states)
        if episode_length == 0:
            # Nothing recorded; mean/std of an empty array would be NaN.
            return

        discounted_rewards = self.discount_rewards(self.rewards)
        discounted_rewards -= np.mean(discounted_rewards)
        spread = np.std(discounted_rewards)
        # A one-step (or constant-return) episode has zero spread; dividing
        # would turn every target into NaN and corrupt the network weights.
        if spread > 0:
            discounted_rewards /= spread

        # Each stored state is flattened into one row of the input batch.
        update_inputs = np.asarray(self.states, dtype=np.float64).reshape(
            episode_length, self.state_size)
        # Row i is zero everywhere except at the action taken in step i.
        advantages = np.zeros((episode_length, self.action_size))
        advantages[np.arange(episode_length), self.actions] = discounted_rewards

        self.model.fit(update_inputs, advantages, epochs=1, verbose=0)
        self.states, self.actions, self.rewards = [], [], []

if __name__ == "__main__":
    # Training script: run up to EPISODES CartPole episodes, update the policy
    # after every episode, log/plot the score and checkpoint the weights.
    import os  # only this script section needs it, so it is imported locally

    graph_path = "./save_graph/cartpole_reinforce.png"
    weights_path = "./save_model/cartpole_reinforce.h5"
    # savefig / save_weights fail if the target directory does not exist.
    for out_path in (graph_path, weights_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # In case of CartPole-v1, you can play until 500 time step
    env = gym.make('CartPole-v1')
    # get size of state and action from environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # make REINFORCE agent
    agent = REINFORCEAgent(state_size, action_size, load_model=False, save_model=True)
    agent.render = True

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)

            # only every 100th episode is rendered
            if agent.render and e % 100 == 0:
                env.render()

            next_state = np.reshape(next_state, [1, state_size])
            # An episode that ends before the running score reached 499 is
            # penalised with -100 on its final step.
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r> to the memory
            agent.append_sample(state, action, reward)

            score += reward
            state = next_state

            if done:
                # every episode, agent learns from sample returns
                agent.train_model()

                # remove the -100 penalty again so the logged score is not
                # distorted by it
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                # Clear the figure first: otherwise every episode adds one
                # more line artist and each savefig redraws all of them.
                pylab.clf()
                pylab.plot(episodes, scores, 'b')
                pylab.savefig(graph_path)
                print("episode:", e, "  score:", score)

                # if the mean score of the last 10 episodes (or of all
                # episodes so far, if fewer) exceeds 490, stop training
                if np.mean(scores[-10:]) > 490:
                    agent.model.save_weights(weights_path)
                    print("Final model saved")
                    sys.exit()

        # save the model every 50 episodes
        if e % 50 == 49 and agent.save:
            agent.model.save_weights(weights_path)
            print("Model saved")


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 12)                300       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 26        
Total params: 446
Trainable params: 446
Non-trainable params: 0
_________________________________________________________________
episode: 0   score: 58.0
episode: 1   score: 22.0
episode: 2   score: 13.0
episode: 3   score: 20.0
episode: 4   score: 16.0
episode: 5   score: 9.0
episode: 6   score: 17.0
episode: 7   score: 23.0
episode: 8   score: 33.0
episode: 9   score: 37.0
episode: 10   score: 18.0
episode: 11   score: 59.0
episode: 12   score: 35.0
episode: 13   score: 12.0
episode: 14   score: 20.0
episode: 15   score: 34.0
epi

episode: 282   score: 8.0
episode: 283   score: 8.0
episode: 284   score: 9.0
episode: 285   score: 9.0
episode: 286   score: 8.0
episode: 287   score: 8.0
episode: 288   score: 9.0
episode: 289   score: 10.0
episode: 290   score: 9.0
episode: 291   score: 9.0
episode: 292   score: 8.0
episode: 293   score: 9.0
episode: 294   score: 9.0
episode: 295   score: 7.0
episode: 296   score: 8.0
episode: 297   score: 8.0
episode: 298   score: 8.0
episode: 299   score: 9.0
Model saved
episode: 300   score: 9.0
episode: 301   score: 7.0
episode: 302   score: 9.0
episode: 303   score: 9.0
episode: 304   score: 9.0
episode: 305   score: 9.0
episode: 306   score: 9.0
episode: 307   score: 9.0
episode: 308   score: 7.0
episode: 309   score: 8.0
episode: 310   score: 8.0
episode: 311   score: 9.0
episode: 312   score: 9.0
episode: 313   score: 8.0
episode: 314   score: 8.0
episode: 315   score: 7.0
episode: 316   score: 9.0
episode: 317   score: 9.0
episode: 318   score: 8.0
episode: 319   score: 9.0

episode: 594   score: 8.0
episode: 595   score: 7.0
episode: 596   score: 9.0
episode: 597   score: 9.0
episode: 598   score: 7.0
episode: 599   score: 8.0
Model saved
episode: 600   score: 10.0
episode: 601   score: 8.0
episode: 602   score: 8.0
episode: 603   score: 7.0
episode: 604   score: 8.0
episode: 605   score: 9.0
episode: 606   score: 8.0
episode: 607   score: 9.0
episode: 608   score: 8.0
episode: 609   score: 8.0
episode: 610   score: 7.0
episode: 611   score: 7.0
episode: 612   score: 8.0
episode: 613   score: 9.0
episode: 614   score: 8.0
episode: 615   score: 8.0
episode: 616   score: 8.0
episode: 617   score: 9.0
episode: 618   score: 8.0
episode: 619   score: 7.0
episode: 620   score: 8.0
episode: 621   score: 8.0
episode: 622   score: 8.0
episode: 623   score: 9.0
episode: 624   score: 8.0
episode: 625   score: 8.0
episode: 626   score: 9.0
episode: 627   score: 9.0
episode: 628   score: 9.0
episode: 629   score: 8.0
episode: 630   score: 9.0
episode: 631   score: 9.0

KeyboardInterrupt: 

In [2]:
# Manual probe: take a single step with action 0 on the current `env`.
# The output recorded for this cell warns that the environment had already
# returned done = True, so the transition shown is undefined behaviour;
# uncomment the reset below to start a fresh episode before stepping.
#env.reset()
env.step(0)


You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


(array([-0.23971613, -2.17364323,  0.30046116,  3.43871043]), 0.0, True, {})

In [6]:
# Evaluation cell: load the saved weights and play one CartPole episode,
# printing '<' or '>' for every action taken.
env = gym.make('CartPole-v1')
# get size of state and action from environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# make REINFORCE agent
agent = REINFORCEAgent(state_size, action_size, load_model=True, save_model=False)
agent.render = True

done = False
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
act = {0:'<', 1:'>'}

try:
    while not done:
        if agent.render:
            env.render()

        # The stepping logic must run whether or not rendering is enabled;
        # nested under the render check it never executed with render off,
        # so `done` never changed and the loop could not terminate.
        # get action for the current state and go one step in environment
        action = agent.get_action(state)
        print(act[action], end='')
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # same early-termination penalty as in the training cell
        reward = reward if not done or score == 499 else -100

        # save the sample <s, a, r> to the memory
        agent.append_sample(state, action, reward)

        score += reward
        state = next_state
finally:
    # release the environment (and its render window) even on interruption
    env.close()

if score>499:
    print("Mission complished!")
else:
    print(":(")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_14 (Dense)             (None, 12)                300       
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 26        
Total params: 446
Trainable params: 446
Non-trainable params: 0
_________________________________________________________________
<<<<<<<<<<:(
