In [1]:
import os
import random
from collections import deque

import gym
import keras
import numpy as np
import pylab
import tensorflow as tf
import tflearn
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.optimizers import Adagrad


# Cap this process at roughly one third of each visible GPU's memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
# Building GPUOptions on its own changes nothing: the options only take effect
# once they are part of the config of the session Keras actually runs on.
keras.backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

Using TensorFlow backend.


## Task: fill in the blanks in the following agent code

In [2]:
class DeepQAgent:
    """Deep Q-Network agent with experience replay and a separate target network.

    The online network (``self.model``) is trained on mini-batches sampled from
    a bounded replay memory; the target network (``self.target_model``) supplies
    the bootstrap values and is only refreshed by ``update_target_model``.
    Exploration is epsilon-greedy with a linear decay applied once per stored
    transition.
    """

    def __init__(self, state_size, action_size, render=True, hidden_units=24):
        """Create the agent and both of its networks.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions the agent chooses between.
            render: Stored on the instance for callers to consult; the agent
                itself never renders.
            hidden_units: Width of the single hidden layer.
        """
        # Tip: if you are training this on AWS the best way is to turn off rendering
        # and load it later with the serialized model
        self.render = render
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_units = hidden_units

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        # Linear decay: epsilon reaches epsilon_min after 50000 stored transitions.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        # No training happens until the memory holds this many transitions.
        self.train_start = 1000
        # replay memory
        self.memory = deque(maxlen=10000)

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Build and compile a network mapping a state to one Q-value per action.

        Q-values are unbounded regression targets (reward plus discounted
        bootstrap), so the output layer is linear and the loss is mean squared
        error. A softmax output with cross-entropy would force the outputs to be
        a probability distribution and cannot fit such targets.

        Returns:
            A compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        # A non-linear hidden layer; without an activation the whole network
        # collapses to a linear function of the state.
        model.add(Dense(self.hidden_units, input_dim=self.state_size, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adagrad(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Update your target model to the model you are currently learning at regular time intervals"""
        print("update")
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Choose an action with the epsilon-greedy policy of the online network.

        Args:
            state: Observation in the batch shape the model expects; the callers
                in this notebook pass shape ``(1, state_size)``.

        Returns:
            An action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return int(np.argmax(q_value[0]))

    def replay_memory(self, state, action, reward, next_state, done):
        """Save <s, a, r, s'> to replay memory and decay epsilon by one step.

        Args:
            state: Observation before the step.
            action: Action taken. The value 2 is stored as index 1, so callers
                may pass either the agent's own index or environment action 2.
            reward: Reward received for the step.
            next_state: Observation after the step.
            done: Whether the step ended the episode.
        """
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            # Clamp so the last decrement cannot push epsilon below its floor.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def train_replay(self):
        """Fit the online network on one random mini-batch from replay memory.

        Does nothing until the memory holds at least ``train_start``
        transitions. For each sampled transition only the Q-value of the action
        actually taken is replaced, by ``reward`` when the episode ended and by
        ``reward + discount_factor * max_a' Q_target(s', a')`` otherwise.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        # Stack the batch so each network is evaluated once per training step
        # instead of once per sample.
        update_input = np.array(
            [transition[0] for transition in mini_batch], dtype=float
        ).reshape(batch_size, self.state_size)
        next_states = np.array(
            [transition[3] for transition in mini_batch], dtype=float
        ).reshape(batch_size, self.state_size)
        actions = np.array([transition[1] for transition in mini_batch], dtype=int)
        rewards = np.array([transition[2] for transition in mini_batch], dtype=float)
        dones = np.array([transition[4] for transition in mini_batch], dtype=bool)

        # Start from the online network's own predictions so the actions that
        # were not taken contribute zero error.
        update_target = np.array(self.model.predict(update_input), dtype=float)
        # The bootstrap value comes from the target model, not the online one.
        next_q = np.asarray(self.target_model.predict(next_states), dtype=float)
        bootstrapped = rewards + self.discount_factor * np.amax(next_q, axis=1)
        update_target[np.arange(batch_size), actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)

    def load_model(self, name):
        """Load a model written by ``save_model`` and sync the target network.

        Keras models have no ``load_model`` method; loading is done by the
        module-level ``keras.models.load_model``. The saved architecture must
        match the one produced by ``build_model``, otherwise copying the weights
        into the target network fails.

        Args:
            name: Path of the saved model file.
        """
        self.model = keras.models.load_model(name)
        self.update_target_model()

    def save_model(self, name):
        """Save the online network to ``name`` with Keras ``Model.save``."""
        self.model.save(name)


In [3]:
# Run-level settings.
N_EPISODES = 4000
# The agent picks between two actions only; the training loop decides which
# environment actions those two indices stand for.
ACTION_SIZE = 2

# Environment, and the length of one observation vector (should be equal 2).
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]

agent = DeepQAgent(state_size, ACTION_SIZE)
# Optionally resume from a previous checkpoint:
#agent.load_model("./save_model/model")

# Per-episode history filled in by the training loop.
scores = []
episodes = []

update


In [4]:
# Each chosen action is held for this many consecutive environment steps.
ACTION_REPEAT = 4
# Agent action index -> environment action: 0 is "left" and 1 ("do nothing")
# is skipped on purpose, so the agent's second action is sent as 2.
ENV_ACTIONS = (0, 2)
# The model is written here every SAVE_EVERY episodes.
SAVE_PATH = "./save_model/model"
SAVE_EVERY = 50

# Saving fails if the target directory is missing, so create it up front.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)

for e in range(N_EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    print("state", state)

    # Steps taken in this episode; a new action is chosen every ACTION_REPEAT steps.
    step = 0

    while not done:
        # Rendering is left off here; call env.render() at this point if needed.

        # Choose on the very first step too, so the opening steps of an episode
        # follow the policy rather than a hard-coded action.
        if step % ACTION_REPEAT == 0:
            action = agent.get_action(state)
        step += 1

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(ENV_ACTIONS[action])
        next_state = np.reshape(next_state, [1, state_size])

        # Save <s, a, r, s'> to replay memory (using the agent's own action index)
        agent.replay_memory(state, action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()
        score += reward
        state = next_state

    # The episode is over: copy the learning model to the target model.
    # No env.reset() here; the next iteration resets the environment itself.
    agent.update_target_model()

    # Record the total reward collected in this episode.
    scores.append(score)
    episodes.append(e)
    print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
          "  epsilon:", agent.epsilon)

    # Save model for every 50 episodes
    if e % SAVE_EVERY == 0:
        agent.save_model(SAVE_PATH)

state [[-0.4177859  0.       ]]
update
episode: 0   score: -200.0   memory length: 200   epsilon: 0.9960200000000077
state [[-0.42205977  0.        ]]
update
episode: 1   score: -200.0   memory length: 400   epsilon: 0.9920400000000154
state [[-0.54851713  0.        ]]
update
episode: 2   score: -200.0   memory length: 600   epsilon: 0.988060000000023
state [[-0.49966982  0.        ]]
update
episode: 3   score: -200.0   memory length: 800   epsilon: 0.9840800000000307
state [[-0.45597145  0.        ]]
update
episode: 4   score: -200.0   memory length: 1000   epsilon: 0.9801000000000384
state [[-0.59382136  0.        ]]
update
episode: 5   score: -200.0   memory length: 1200   epsilon: 0.9761200000000461
state [[-0.5371984  0.       ]]
update
episode: 6   score: -200.0   memory length: 1400   epsilon: 0.9721400000000537
state [[-0.53666873  0.        ]]
update
episode: 7   score: -200.0   memory length: 1600   epsilon: 0.9681600000000614
state [[-0.40517908  0.        ]]


KeyboardInterrupt: 