In [None]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import random
import time
import os
# Number GPUs by PCI bus position so the indices used in CUDA_VISIBLE_DEVICES are stable.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# An empty device list hides every GPU, so the notebook runs on the CPU.
# To use a GPU, comment out the next line or set it to an explicit index (e.g. "0");
# with several GPUs, name the one to use explicitly.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# tf has a nasty habit of taking up ALL available memory

# gym stuff
import gym
import gym_gazebo

# keras stuff
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

import action_heatmap

model_save_path = './models/'

## Make environment and set parameters


In [None]:
SEED = 0                 # seed shared by every random number generator used below
MAX_PATH_LENGTH = 5000   # cap on the number of steps in one episode
NUM_EPISODES = 1000      # number of training episodes

# Seed before anything else runs so the whole notebook is reproducible.
# NumPy drives the epsilon-greedy action choice; the stdlib `random` module
# drives replay-buffer sampling, so both have to be seeded.
np.random.seed(SEED)
random.seed(SEED)

# Make the turtlebot-in-empty-world environment
env = gym.make('GazeboEmptyTurtlebotLidarNn-v0')
env.reset()
env._max_episode_steps = MAX_PATH_LENGTH
# NOTE(review): the simulator itself is not seeded here; if the environment
# exposes a seeding hook, call it with SEED as well — TODO confirm.


## Create an experience replay buffer
Stores state transitions and returns uniformly random samples of them. Sampling at random breaks the correlation between consecutive transitions, which is vital when training a neural network.

In [None]:
from collections import namedtuple, deque

class ReplayMemory:
    """Fixed-capacity store of transitions with uniform random sampling.

    Once ``capacity`` transitions are held, pushing a new one discards the
    oldest one.
    """

    def __init__(self, capacity = 10000):
        # Record type describing a single step of experience.
        self.Transition = namedtuple(
            'Transition',
            ('state', 'action', 'reward', 'next_state', 'done'))
        # A deque with maxlen evicts from the left when it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition given as (state, action, reward, next_state, done)."""
        record = self.Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random.

        If the buffer holds fewer than ``batch_size`` transitions, a message
        is printed and ``-1`` is returned instead of a list.
        """
        if batch_size > len(self):
            print('Tried to sample more samples than are in buffer')
            return -1
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

## Define a DQN class

Has methods for creating the main and target networks, sampling actions, and syncing weights

In [None]:
class QNetwork:
    """DQN agent: a main (online) Q-network, a target Q-network and a replay buffer.

    Parameters
    ----------
    state_size : int
        Number of features in one observation (input width of the networks).
    action_size : int
        Number of discrete actions (output width of the networks).
    activation_type : str
        Activation used for every hidden layer.
    learning_rate : float
        Passed to Adam as ``lr``.
    alpha_decay : float
        Passed to Adam as ``decay``.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    batch_size : int
        Number of transitions drawn from the replay buffer per ``train`` call.
    """

    # Location used by save_weights() when no explicit path is given.
    DEFAULT_SAVE_PATH = './models/turtlebot_dqn_2.h5'

    def __init__(self, state_size, action_size, activation_type, learning_rate, 
                 alpha_decay, gamma, batch_size):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.alpha_decay = alpha_decay
        self.activation_type = activation_type
        self.gamma = gamma
        self.replay_buffer = ReplayMemory()
        self.batch_size = batch_size
        
    def create_model(self, hidden_layers):
        """Build and compile a fully connected network mapping states to Q-values.

        ``hidden_layers`` is a sequence of hidden-layer widths; an empty
        sequence yields a single linear layer. The output layer is always
        linear with ``action_size`` units. Sizes come from the instance, not
        from notebook globals, so several agents with different shapes can
        coexist.
        """
        model = Sequential()
        if len(hidden_layers) == 0:
            # input_dim takes the plain integer width; input_shape would need a tuple.
            model.add(Dense(self.action_size, input_dim=self.state_size, activation="linear"))
        else:
            for i, hid in enumerate(hidden_layers):
                if i == 0:
                    # Only the first layer has to declare the input width.
                    model.add(Dense(hid, input_dim=self.state_size, activation=self.activation_type))
                else:
                    model.add(Dense(hid, activation=self.activation_type))
            model.add(Dense(self.action_size, activation='linear'))
        optimizer = Adam(lr=self.learning_rate, decay=self.alpha_decay)
        model.compile(loss='mse', optimizer=optimizer)
        return model
        
    def init_models(self, hidden_layers):
        """Create the main and target networks and start them with equal weights."""
        self.mainDQN = self.create_model(hidden_layers)
        self.targetDQN = self.create_model(hidden_layers)
        self.sync_weights()
    
    def model_descriptions(self):
        """Print the Keras summary of both networks."""
        print('Main DQN model architecture --->')
        self.mainDQN.summary()
        
        print('Target DQN model architecture --->')
        self.targetDQN.summary()
        
    def sync_weights(self):
        """Copy the main network's weights into the target network."""
        self.targetDQN.set_weights(self.mainDQN.get_weights())
        
    def save_weights(self, path=None):
        """Save the main network to ``path`` (default: ``DEFAULT_SAVE_PATH``).

        Note that this calls the model's ``save`` method rather than a
        weights-only save. The parent directory is created when missing so a
        fresh checkout does not fail on the first checkpoint.
        """
        if path is None:
            path = self.DEFAULT_SAVE_PATH
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.mainDQN.save(path)
        
    def action_sampler(self, state, exploration_rate):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a uniformly random action index
        is returned; otherwise the index of the largest Q-value predicted by
        the main network for ``state``. ``state`` is passed straight to
        ``predict``, so it must already carry a leading batch axis.
        """
        # Use the argument, not a notebook-level `epsilon`, so the method
        # works regardless of what globals happen to exist.
        if exploration_rate > np.random.rand():
            
            # Make a random action
            action = np.random.randint(0, self.action_size)
        else:
            
            # Get action from Q-network
            Qs = self.mainDQN.predict(state)[0]
            action = np.argmax(Qs)
        return action
    
    def train(self):
        """Run one fitting pass of the main network on a sampled mini-batch.

        Targets are the main network's current predictions with the entry of
        the taken action replaced by
        ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)``.
        Afterwards the target network is synced and the model is saved.
        Does nothing when the buffer holds fewer than ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            # Not enough experience yet; sampling would fail.
            return

        transitions = self.replay_buffer.sample(self.batch_size)
        batch = self.replay_buffer.Transition(*zip(*transitions))

        # vstack keeps a (batch, state_size) shape even when batch_size or
        # state_size is 1, which squeeze() would collapse.
        x_batch = np.vstack(batch.state)
        next_batch = np.vstack(batch.next_state)
        actions = np.asarray(batch.action, dtype=int)
        rewards = np.asarray(batch.reward, dtype=float)
        not_done = 1.0 - np.asarray(batch.done, dtype=float)

        y_batch = self.mainDQN.predict(x_batch)
        # One forward pass for the whole batch instead of one per transition.
        next_q = self.targetDQN.predict(next_batch)
        rows = np.arange(len(transitions))
        y_batch[rows, actions] = rewards + self.gamma * np.amax(next_q, axis=1) * not_done
   
        # NOTE(review): syncing on every call means the target network only
        # ever lags the main network by one fit; confirm this is intended.
        self.sync_weights()
        
        self.mainDQN.fit(x_batch, y_batch, epochs=1, verbose=0)
        
        self.save_weights()
        
               

## Set hyperparameters and initialize DQN

Print the architecture of both networks to check that they were built as intended.

In [None]:
# --- Network shape ---
state_size, action_size = 2, 21    # input width / number of discrete actions
hidden_size = [100, 100]           # width of each hidden layer
activation_type = 'relu'           # activation of the hidden layers

# --- Optimisation ---
learning_rate = 0.01               # Adam step size
alpha_decay = 0.01                 # Adam learning-rate decay
gamma = 0.99                       # discount applied to the next-state value

# --- Replay ---
batch_size = 64                    # transitions sampled per training call
learn_start = 64                   # buffer size required before training begins

DQN = QNetwork(
    state_size=state_size,
    action_size=action_size,
    activation_type=activation_type,
    learning_rate=learning_rate,
    alpha_decay=alpha_decay,
    gamma=gamma,
    batch_size=batch_size,
)
DQN.init_models(hidden_size)
DQN.model_descriptions()

## Start training!

Epsilon is set to decay at an exponential rate.

In [None]:
# Exploration parameters
min_epsilon = 0.01           # minimum exploration probability
decay_rate = 20/NUM_EPISODES # exponent of the per-episode epsilon decay
report_every = 10            # episodes between heatmaps / progress summaries
heatmap_dir = './heatmaps/'

# returns[ep] holds the return of episode `ep` (1-based); slot 0 is unused.
returns = np.zeros(NUM_EPISODES + 1)

# savefig does not create missing directories.
os.makedirs(heatmap_dir, exist_ok=True)

# Run exactly NUM_EPISODES episodes, numbered 1..NUM_EPISODES.
for ep in range(1, NUM_EPISODES + 1):
    # Start new episode
    state = env.reset()
    state = np.reshape(state, [1, state.size])

    total_reward = 0
    episode_step = 0
    done = False

    # Exponential decay from ~1.0 towards min_epsilon as episodes progress.
    epsilon = min_epsilon + (1.0 - min_epsilon)*np.exp(-decay_rate*ep)

    while not done:
        episode_step += 1
        
        action = DQN.action_sampler(state, epsilon)
        
        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state.size])

        total_reward += reward
        
        # add to replay buffer
        DQN.replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
    
    # Progress updates
    returns[ep] = total_reward

    # Plot policy learning process
    if ep % report_every == 0:
        ax = action_heatmap.main(env, DQN.mainDQN)
        fig = ax.get_figure()
        fig.savefig(heatmap_dir + 'action_heatmap_ep_' + str(ep) + '.png')
        # Release the figure; otherwise one figure per report stays in memory.
        # (assumes `fig` is a matplotlib Figure, as get_figure/savefig suggest)
        plt.close(fig)
    
    print('Return recorded for episode ' + str(ep) + ' : ', total_reward)
    if ep % report_every == 0 or ep == NUM_EPISODES: #print out gradual improvement
        # Average over the most recent episodes *including* this one, and
        # never reach into the unused slot 0.
        first_ep = max(1, ep - report_every + 1)
        recent = returns[first_ep:ep + 1]
        print('Episode {}'.format(first_ep),'..{}'.format(ep),'  Avg reward: {}'.format(recent.mean()),'  Epsilon: {:.4f}'.format(epsilon))
    
    # Train network!
    if len(DQN.replay_buffer) >= learn_start:
        DQN.train()



In [None]:
# plot average returns
# Episodes are averaged in consecutive blocks of `avg_window`; with a window
# of 1 every episode's own return is plotted.
avg_window = 1

# The training loop writes returns[1..ep]; slot 0 is never filled.
# If training was interrupted mid-episode, the last entry is still 0.
completed_returns = returns[1:ep + 1]
n_blocks = len(completed_returns) // avg_window

# Mean return of each full block, and the episode number closing that block.
returns_over_each_episode = (
    completed_returns[:n_blocks * avg_window]
    .reshape(n_blocks, avg_window)
    .mean(axis=1)
)
x = np.arange(1, n_blocks + 1) * avg_window

plt.plot(x, returns_over_each_episode, '.-r')
plt.ylabel('Average Returns per Episode')
plt.xlabel('Episodes')
plt.show()
    

## Ignore the rest of this file

Called only for testing helper scripts

In [None]:
import importlib

# Re-import the helper so edits made to action_heatmap.py since the first
# import take effect without restarting the kernel.
importlib.reload(action_heatmap)

print(env)

# Draw the heatmap for the current main network and save it under the
# number of the last episode that ran.
ax = action_heatmap.main(env, DQN.mainDQN)
fig = ax.get_figure()
fig.savefig(
    './heatmaps/action_heatmap_ep_' + str(ep) + '.png'
)


In [None]:
 %matplotlib qt
# The magic above switches matplotlib to the Qt backend, so figures open in
# their own window instead of inline.
import reward_surface
import importlib
# Reload right after importing so edits to reward_surface.py are picked up
# when this cell is re-run in a live kernel.
importlib.reload(reward_surface)
# Build the reward-surface plot for the current environment and keep handles
# to its axes and figure for further inspection.
ax = reward_surface.main(env)
fig = ax.get_figure()