In [1]:
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# https://docs.python.org/2/library/collections.html#collections.deque
from collections import deque

Using TensorFlow backend.


In [78]:
class Model(object):
    """Thin wrapper around a small fully-connected Keras network.

    The network maps a state vector of length ``state_size`` to
    ``action_size`` outputs through two hidden layers of 30 ReLU units.

    Args:
        state_size: number of input features (length of one state vector).
        action_size: number of output units (one per action).
        learning_rate: step size handed to the Adam optimizer. Defaults to
            0.001, the value that was previously hard-coded.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Keras network; returns the compiled model."""
        # Sequential() creates the foundation of the layers
        model = Sequential()

        # 'Dense' is the basic form of a neural network layer.
        # Input layer of `state_size` features feeding a hidden layer of 30 nodes.
        model.add(Dense(units=30, input_dim=self.state_size, activation='relu'))

        # Second hidden layer with 30 nodes
        model.add(Dense(units=30, activation='relu'))

        # Output layer with one node per action.
        # NOTE(review): softmax makes the outputs sum to 1, which Dqn.sampler
        # relies on when it samples an action. The regression targets built in
        # Dqn.learn (reward + gamma * max) are not restricted to [0, 1], so
        # confirm this pairing with the 'mse' loss is intended.
        model.add(Dense(units=self.action_size, activation='softmax'))

        # Create the model based on the information above.
        # `lr` is the keyword this file was written against; newer Keras
        # releases name it `learning_rate` -- TODO confirm installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def predict(self, state):
        """Forward pass: return the network output for ``state``."""
        return self.model.predict(state)

    def fit(self, state, target, **kwargs):
        """Train the underlying network on ``(state, target)``.

        Dqn.learn calls ``fit`` on this wrapper, so it must exist here.
        Extra keyword arguments (e.g. ``epochs``, ``verbose``) are forwarded
        unchanged to the Keras model's ``fit``; its return value is passed
        back to the caller.
        """
        return self.model.fit(state, target, **kwargs)

    def save(self, name):
        """Write the network weights to the file ``name``."""
        self.model.save_weights(name)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

In [79]:
class Memory(object):
    """Bounded replay buffer of (state, action, reward, next_state) tuples.

    Backed by a ``deque`` whose ``maxlen`` is ``capacity``: once the buffer is
    full, appending a new transition drops the oldest one.
    """

    def __init__(self, capacity):
        # Second positional argument of deque is maxlen.
        self.memory = deque([], capacity)

    def push(self, state, action, reward, next_state):
        """Remember one transition (a.k.a. the "remember" function)."""
        transition = (state, action, reward, next_state)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions, chosen at random.

        Raises ValueError (from ``random.sample``) when ``batch_size`` exceeds
        the number of stored transitions.
        """
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)

In [80]:
class Dqn(object):
    """Deep Q-learning agent: a network, a replay memory and an update loop.

    Args:
        state_size: length of one state vector.
        action_size: number of discrete actions.
        gamma: discount factor applied to the best next-state value.
    """

    def __init__(self, state_size, action_size, gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.model = Model(state_size=state_size, action_size=action_size)
        self.memory = Memory(capacity=100000)
        # Placeholder "previous step" used by the very first update() call.
        # Sized from state_size so it matches the network input width.
        self.last_state = np.zeros((1, state_size))
        self.last_action = 0
        self.last_reward = 0.0
        # Sliding window of the most recent rewards, read by score().
        self.reward_bag = deque(maxlen=1000)

    def act(self, state):
        """Pick an action for ``state`` by sampling from the network output.

        The first row of the prediction is treated as a probability
        distribution over actions (see sampler), so the choice is stochastic
        rather than a plain argmax.
        """
        state = np.array(state)
        act_values = self.model.predict(state)
        return self.sampler(act_values[0])

    def sampler(self, distrbution):
        """Draw an index from ``distrbution`` by inverse-CDF sampling.

        Args:
            distrbution: iterable of non-negative weights, expected to sum to
                (approximately) 1.

        Returns:
            An ``int`` index, usable directly for array indexing. If the
            random draw is not below the accumulated total (e.g. the weights
            sum to slightly less than 1 through rounding), the last index is
            returned instead of falling through to ``None``. An empty
            iterable yields 0.
        """
        r = random.random()
        cumulative = 0.0
        action = 0
        for action, probability in enumerate(distrbution):
            cumulative += probability
            if r < cumulative:
                return action
        return action

    def learn(self, batch_size):
        """Run one training pass over ``batch_size`` sampled transitions.

        Each transition is fitted individually, in the order it was sampled.
        """
        minibatch = self.memory.sample(batch_size)
        # self.model is the Model wrapper; training goes through the Keras
        # network it holds in its `model` attribute.
        network = self.model.model
        for state, action, reward, next_state in minibatch:

            # the original Q-learning formula
            # http://neuro.cs.ut.ee/demystifying-deep-reinforcement-learning/

            # first feed forward pass to get the predicted Q-values of all actions
            target_predicted = self.model.predict(state)

            # second feed forward pass to get the maximum Q-value over all network outputs
            target_q = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

            # Set the Q-value for the taken action to target_q. Every other
            # action keeps the value from the first pass, so its error is 0.
            # int() guards against an action that was stored as a float.
            target_predicted[0][int(action)] = target_q

            network.fit(state, target_predicted, epochs=1, verbose=0)

    def update(self, reward, new_state):
        """Record the latest transition, choose the next action, maybe train.

        Args:
            reward: reward passed in for this step.
            new_state: the newly observed state vector.

        Returns:
            The action chosen for ``new_state`` (also stored in
            ``self.last_action``).
        """
        # Wrap in an outer list so the state gains a leading axis of length 1.
        new_state = np.array([new_state])
        # NOTE(review): the stored reward is self.last_reward (the value given
        # to the previous update() call), not this call's `reward` -- confirm
        # this pairing is what the caller intends.
        self.memory.push(self.last_state, self.last_action, self.last_reward, new_state)
        action = self.act(new_state)
        # Start training only once more than 100 transitions are stored.
        if len(self.memory) > 100:
            self.learn(batch_size=100)

        self.last_action = action
        self.last_state = new_state
        self.last_reward = reward
        self.reward_bag.append(reward)
        return action

    def score(self):
        """Return the sum of the recent rewards divided by (count + 1).

        The extra 1 in the denominator keeps the division defined while the
        window is empty; the result is therefore slightly below the plain mean.
        """
        return sum(self.reward_bag)/(len(self.reward_bag)+1)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save(name)