In [None]:
from __future__ import division, print_function

*Original post:* [Simple Reinforcement Learning with Tensorflow](https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df)
## Double Dueling Deep Q Network Model
Model with laser vector input and output action scores

**Separated network**
* the target network estimates the target Q-value used in training (the label)
* the main network estimates the predicted Q-value used in training (the output)

Why not just use one network for both estimations? The issue is that at every step of training, the Q-network's values shift, and if we are using a constantly shifting set of values to adjust our network values, then the value estimations can easily spiral out of control. The network can become destabilized by falling into feedback loops between the target and estimated Q-values. In order to mitigate that risk, the target network's weights are fixed, and only periodically or slowly updated towards the primary Q-network's values. In this way training can proceed in a more stable manner.

**Double Q**

$$ Q_{target} = r + \gamma Q(s', \operatorname{arg\,max}_a Q(s', a, \theta), \theta ') $$
* $\theta$ is the parameters of the main model
* $\theta'$ is the parameters of the target model

**Dueling Q**

$$ Q(s, a) = V(s) + A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') $$

* $V(s)$ tells how good the state is
* $A(s, a)$ tells how much better taking a certain action would be compared to the others (the mean advantage is subtracted, as in the model code below)


In [None]:
import numpy
import keras
from keras import optimizers
from keras.models import Model
from keras.layers import Input, Dense, Lambda
import keras.backend as K

class Qnet:
    """Double dueling deep Q-network backed by two identically built Keras models.

    ``model`` is the main (online) network that is trained and used to pick
    actions; ``target_model`` is only used to evaluate the bootstrap target and
    is moved towards the main network by ``update_target_model``.
    """

    def __init__(self, numstate, num_actions):
        """Build the main and target networks.

        Args:
            numstate: sequence whose first element is the length of the state vector.
            num_actions: number of discrete actions (width of the Q-value output).
        """
        self.input_size = numstate
        self.output_size = num_actions
        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Create and compile one dueling Q-network.

        Returns:
            A compiled Keras ``Model`` mapping a state vector to one Q-value per action.
        """
        inp = Input(shape=(self.input_size[0], ))
        inner = Dense(300, activation='relu')(inp)
        inner = Dense(300, activation='relu')(inner)
        # One extra unit: column 0 is the state value, columns 1.. are the advantages.
        out = Dense(self.output_size + 1)(inner)

        # out = value + advantage - mean(advantage)
        out = Lambda(lambda a: K.expand_dims(a[:, 0], axis=-1) + a[:, 1:] - K.mean(a[:, 1:], keepdims=True, axis=1))(out)

        model = Model(inp, out)

        model.summary()

        optimizer = optimizers.Adam(0.0001)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def get_qvalues(self, states):
        """Return the main network's Q-values for a batch of states."""
        return self.model.predict(states)

    def get_target_qvalues(self, states):
        """Return the target network's Q-values for a batch of states."""
        return self.target_model.predict(states)

    def get_actions(self, states):
        """Return the greedy action index (argmax over Q-values) for each state."""
        qvalues = self.get_qvalues(states)
        # Use the `numpy` module imported in this cell rather than relying on an
        # `np` alias that is only defined by a later cell.
        return numpy.argmax(qvalues, axis=1)

    def update_target_model(self, tau):
        """Soft-update the target network: target = (1 - tau) * target + tau * main."""
        main_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            (1 - tau) * target + tau * main
            for main, target in zip(main_weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save(self, path):
        """Save both networks as ``target.h5`` and ``main.h5`` under ``path``."""
        self.target_model.save(path + "/target.h5")
        self.model.save(path + "/main.h5")

    def load(self, path):
        """Load the weights of both networks from ``target.h5`` / ``main.h5`` under ``path``."""
        self.target_model.load_weights(path + "/target.h5")
        self.model.load_weights(path + "/main.h5")

    def learn_on_minibatch(self, minibatch, gamma):
        """Run one double-Q training step on the main network.

        Args:
            minibatch: array of shape (n, 5) whose columns are
                [state, action, reward, done, newstate].
            gamma: discount factor applied to the bootstrapped next-state value.

        Returns:
            The loss reported by ``train_on_batch``.
        """
        states = numpy.vstack(minibatch[:, 0])
        # The minibatch is an object array; cast the scalar columns so that the
        # indexing and arithmetic below run on proper numeric dtypes.
        actions = minibatch[:, 1].astype(int)
        rewards = minibatch[:, 2].astype(float)
        dones = minibatch[:, 3].astype(float)
        newstates = numpy.vstack(minibatch[:, 4])

        rows = numpy.arange(states.shape[0])

        # Double Q: the main network chooses the next action, the target network scores it.
        actions_newstate = self.get_actions(newstates)
        target_qvalues_newstate = self.get_target_qvalues(newstates)
        double_q = target_qvalues_newstate[rows, actions_newstate]

        # Terminal transitions (done == 1) get no bootstrapped value.
        target_q = rewards + gamma * double_q * (1.0 - dones)

        # Only the Q-value of the action actually taken is moved towards the target;
        # the remaining outputs keep their current prediction (zero error).
        qvalues = self.get_qvalues(states)
        qvalues[rows, actions] = target_q

        return self.model.train_on_batch(states, qvalues)

In [None]:
# Build the agent: a 100-element state vector in, Q-values for 11 actions out.
qnet = Qnet(numstate=[100], num_actions=11)

## Experience Replay
Experience replay stores the agent's experiences and then randomly draws batches of them to train the network. By sampling the experiences at random, we prevent the network from learning only from its most recent actions and allow it to learn from a much wider range of past experience.

Each of these experiences is stored as a tuple of `<state, action, reward, done, next_state>`

In [None]:
import numpy as np
import random

class ExperienceReplay:
    """
    First-in-first-out store of agent experiences.

    One experience is a length-5 sequence [state, action, reward, done, newstate].
    """
    def __init__(self, output_dir, buffer_size):
        """
        Args:
            output_dir: directory that ``save`` writes the buffer into.
            buffer_size: maximum number of experiences kept in the buffer.
        """
        self.buffer = []
        self.buffer_size = buffer_size
        self.output_dir = output_dir

    @staticmethod
    def _pack(experiences):
        """Copy a list of experiences into an (n, 5) object array.

        Filling element by element avoids ``np.array`` on ragged input (state
        vectors mixed with scalars), which recent NumPy versions reject.
        """
        packed = np.empty((len(experiences), 5), dtype=object)
        for row, experience in enumerate(experiences):
            for col in range(5):
                packed[row, col] = experience[col]
        return packed

    def add(self, experience):
        """Append an iterable of experiences, discarding the oldest ones on overflow."""
        self.buffer.extend(experience)
        # Trim after extending so the buffer never exceeds its capacity, even when
        # a single call adds more than ``buffer_size`` experiences.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, size):
        """Draw up to ``size`` distinct experiences uniformly at random.

        Returns:
            Object array of shape (k, 5) with k = min(size, len(buffer)), so a
            buffer holding fewer than ``size`` experiences yields a smaller batch
            instead of a reshape error.
        """
        count = min(size, len(self.buffer))
        return self._pack(random.sample(self.buffer, count))

    def save(self):
        """Write the buffer to ``<output_dir>/experience.npy`` (pickled object array)."""
        np.save(self.output_dir + "/experience", self._pack(self.buffer))

    def load(self, path):
        """Replace the buffer with the contents of ``<path>/experience.npy``."""
        # The file holds a pickled object array, so ``allow_pickle=True`` is required.
        # NOTE(security): unpickling executes arbitrary code -- only load trusted files.
        self.buffer = np.load(path + "/experience.npy", allow_pickle=True).tolist()

### Load the game environment
The custom game environments:

![env3](screenshots/env3.png)

In [None]:
import gym
import gym_gazebo

In [None]:
# Create the environment the agent is trained in, looked up by its gym id.
# NOTE(review): presumably this id is registered by importing `gym_gazebo` -- confirm.
env = gym.make('GazeboTurtlebotMazeColor-v0')

### Training the network
Setting the hyper parameters

In [None]:
# --- Learning ---
gamma = 0.95                 # discount factor handed to qnet.learn_on_minibatch
tau = 0.001                  # soft-update rate handed to qnet.update_target_model
batch_size = 32              # number of experiences sampled from the replay buffer per update

# --- Exploration (epsilon-greedy, linearly annealed) ---
start_epsilon = 1.0
end_epsilon = 0.05
annealing_steps = 100000     # steps over which epsilon goes from start to end

# --- Schedule ---
num_episode = 3000           # training episodes (counted once pre-training is over)
num_training_step = 1000     # maximum steps per episode
num_pretrain_step = 10000    # purely random steps taken before any training
online_update_freq = 4       # train the main network every N steps
target_update_freq = 4       # soft-update the target network every N steps
save_model_freq = 50         # save the model and the replay buffer every N episodes

# --- Storage ---
buffer_size = 100000         # capacity of the replay buffer
output_dir = "output/test"   # where the model and the replay buffer are written

Training...

In [None]:
# Training loop: epsilon-greedy interaction with `env`, experience replay, and
# periodic updates of the main and target networks of `qnet`.
# `sys` (progress line) and `os` (output directory) are not imported anywhere
# else in the notebook, so import them here to keep this cell runnable.
import os
import sys

epsilon = start_epsilon
# Linear annealing: amount subtracted from epsilon on every training-phase step.
epsilon_decay = (start_epsilon - end_epsilon)/annealing_steps

episode = 0
total_step = 0

replay = ExperienceReplay(output_dir, buffer_size)

while episode < num_episode:
    state = env.reset()
    # Experiences of the current episode; merged into `replay` when the episode ends.
    replay_ep = ExperienceReplay(output_dir, buffer_size)
    total_reward = 0
    num_random_step = 0
    total_loss = 0
    num_training = 0
    start_step = total_step

    # Episodes are only counted once the random pre-training phase is over.
    if(total_step >= num_pretrain_step):
        episode += 1

    for i in range(num_training_step):

        # get action: random during pre-training or with probability epsilon, else greedy
        if(total_step < num_pretrain_step or np.random.rand(1) < epsilon):
            action = np.random.randint(env.num_action)
            num_random_step += 1

        else:
            action = qnet.get_actions(state.reshape(1, -1))[0]

        # get after take action
        newstate, reward, done, _ = env.step(action)
        # An empty observation ends the episode. `np.size` works for lists and
        # arrays alike, whereas `newstate == []` is ambiguous for ndarrays.
        if(np.size(newstate) == 0):
            print("Terminate")
            break

        # Fill the (1, 5) object row element by element: `np.array` on this ragged
        # list (vectors mixed with scalars) is rejected by recent NumPy versions.
        experience = np.empty((1, 5), dtype=object)
        for col, item in enumerate((state, action, reward, done, newstate)):
            experience[0, col] = item
        replay_ep.add(experience)

        # train
        if total_step > num_pretrain_step:
            if epsilon > end_epsilon:
                epsilon -= epsilon_decay

            # Skip the update while the shared buffer is still empty (nothing to sample).
            if total_step % online_update_freq == 0 and len(replay.buffer) > 0:
                train_batch = replay.sample(batch_size)
                loss = qnet.learn_on_minibatch(train_batch, gamma)
                total_loss += loss
                num_training += 1
                sys.stdout.write("\rTrain step at {}th step | loss {} | epsilon {}".format(total_step, loss, epsilon))
                sys.stdout.flush()

            if total_step % target_update_freq == 0:
                qnet.update_target_model(tau)

        total_step += 1
        total_reward += reward
        state = newstate
        if done:
            break

    replay.add(replay_ep.buffer)

    # Avoid dividing by zero in the average-loss report when no update ran this episode.
    if(num_training == 0):
        num_training = 1

    print("\nDone epoch in {} steps, {} random steps, Total reward: {}, Total step: {}, Average loss: {}".format(total_step - start_step, num_random_step, total_reward, total_step, total_loss/num_training))

    # save model
    if(episode % save_model_freq == 0 and total_step > num_pretrain_step):
        # Saving fails if the target directory is missing, so create it on demand.
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        qnet.save(output_dir)
        replay.save()
        print("Save model at {}".format(output_dir))