In [None]:

from collections import deque, namedtuple
from PIL import Image
import itertools
import numpy as np
import gym
import random

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import RMSprop
from keras import backend as K

# Seed both random number generators the notebook draws from, for
# reproducibility: NumPy is used for action selection and the stdlib
# `random` module is used to sample minibatches from the replay memory.
# NOTE(review): the Gym environment and the Keras backend keep their own
# random state, which is not seeded here.
np.random.seed(1337)
random.seed(1337)


In [None]:
#set hyperparameters
# Number of transitions sampled from the replay memory for each training step.
MINI_BATCH_SIZE = 32
# Maximum length of the replay-memory deque.
REPLAY_MEMORY_SIZE = 1000000
# Number of consecutive preprocessed frames stacked to form one state.
AGENT_HISTORY_LENGTH = 4
# The target network's weights are overwritten with the training network's
# weights whenever the frame counter is a multiple of this value.
TARGET_NETWORK_UPDATE_FREQUENCY = 10000
# Factor applied to the max next-state Q-value when building training targets.
DISCOUNT_FACTOR = 0.99
# Value assigned to env.frameskip when the environment is created.
ACTION_REPEAT = 4
# A training step runs whenever the frame counter is a multiple of this value.
UPDATE_FREQUENCY = 4
# Learning rate handed to the RMSprop optimizer in build_model.
LEARNING_RATE = 0.00025
# The next three values are not referenced by the visible cells: the
# optimizer is constructed with only lr=LEARNING_RATE.
GRADIENT_MOMENTUM = 0.95
SQUARED_GRADIENT_MOMENTUM = 0.95
MIN_SQUARED_GRADIENT = 0.01
# Epsilon is interpolated linearly from INITIAL_EXPLORATION to
# FINAL_EXPLORATION over FINAL_EXPLORATION_FRAME frames, then held constant.
INITIAL_EXPLORATION = 1
FINAL_EXPLORATION = 0.1
FINAL_EXPLORATION_FRAME = 1000000
# Random-policy transitions collected before training starts (the warm-up
# loop stops at the end of the first episode that exceeds this count).
REPLAY_START_SIZE = 50000
# Not referenced by the visible cells.
NOOP_MAX = 30

# Gym environment id.
GAME = "Breakout-v0"
# Size passed to Image.resize in preprocess_state (PIL order: width, height).
FRAME_WIDTH = 150 
FRAME_HEIGHT = 170 
# Pixels cropped off each frame before resizing: CROP_SIDE from both the left
# and right edges, CROP_TOP from the top, CROP_BOTTOM from the bottom.
CROP_SIDE = 5
CROP_TOP = 30
CROP_BOTTOM = 10
# Number of training episodes; also the length of the per-episode stat arrays.
NUM_EPISODES = 10000

In [None]:

#setup game env
# Create the Gym environment named by GAME.
env = gym.envs.make(GAME)
# Presumably makes every env.step() advance ACTION_REPEAT emulator frames.
# NOTE(review): this only works if the object returned by make() is the one
# that reads `frameskip`; if make() returns a wrapper, the attribute is set on
# the wrapper instead — confirm against the installed gym version.
env.frameskip = ACTION_REPEAT
# Number of discrete actions; used for random action draws and as the width
# of the network's output layer.
NUMBER_OF_ACTIONS = env.action_space.n


In [None]:
# Loss functions for the Q-network.

def mean_squared_error(y_true, y_pred):
    """Return the squared error between prediction and target, averaged over the last axis.

    Both arguments are backend tensors; the result has one value per sample.
    """
    squared_error = K.square(y_pred - y_true)
    return K.mean(squared_error, axis=-1)

def cliped_mean_squared_error(y_true, y_pred):
    """Return the per-sample mean squared error, clipped to the range [-1, 1].

    A mean of squares is never negative, so only the upper bound of the clip
    can ever take effect.
    """
    per_sample_mse = K.mean(K.square(y_pred - y_true), axis=-1)
    return K.clip(per_sample_mse, -1, 1)


In [None]:
# Frame preprocessing
def preprocess_state(state):
    """Crop, resize and grey-scale a single raw frame.

    `state` is whatever `Image.fromarray` accepts (here: the array returned by
    the environment). CROP_SIDE pixels are removed from the left and right
    edges, CROP_TOP from the top and CROP_BOTTOM from the bottom; the result
    is resized to (FRAME_WIDTH, FRAME_HEIGHT) in PIL's (width, height) order
    and converted to 8-bit greyscale (mode 'L').

    Returns a PIL image, not an array; callers convert it with np.array().
    """
    frame = Image.fromarray(state)
    full_width, full_height = frame.size
    crop_box = (CROP_SIDE, CROP_TOP, full_width - CROP_SIDE, full_height - CROP_BOTTOM)
    resized = frame.crop(box=crop_box).resize((FRAME_WIDTH, FRAME_HEIGHT))
    return resized.convert('L')

In [None]:
# build the model
def build_model():
    """Build and compile the convolutional Q-network.

    Input: a stack of AGENT_HISTORY_LENGTH greyscale frames with shape
    (AGENT_HISTORY_LENGTH, FRAME_HEIGHT, FRAME_WIDTH). Height comes before
    width because the states are built with np.array() from PIL images, and
    a PIL image of size (width, height) converts to an array of shape
    (height, width).

    Architecture: three ReLU convolution layers, one 512-unit ReLU dense
    layer, and a linear output layer with one Q-value per action
    (NUMBER_OF_ACTIONS units).

    Returns the model compiled with `mean_squared_error` as the loss and
    RMSprop(lr=LEARNING_RATE) as the optimizer.
    """
    model = Sequential()
    # Must match the array layout of the stacked states: (frames, rows, cols).
    input_shape = (AGENT_HISTORY_LENGTH, FRAME_HEIGHT, FRAME_WIDTH)
    if K.image_dim_ordering() == 'tf':
        # Move the frame axis last: (height, width, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # Already (channels, height, width); the identity Permute only serves
        # to declare the input shape.
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    model.add(Convolution2D(32, 8, 8, subsample=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(NUMBER_OF_ACTIONS))
    model.add(Activation('linear'))
    model.compile(loss=mean_squared_error, optimizer=RMSprop(lr=LEARNING_RATE))
    return model


In [None]:

# ---- Shared training state ----

# Per-episode statistics, one slot per training episode.
episode_rewards = np.zeros(NUM_EPISODES)
episode_lengths = np.zeros(NUM_EPISODES)
loss = np.zeros(NUM_EPISODES)

# Global frame counter and best-episode bookkeeping.
total_frame = max_reward = max_ep = 0

# One replay-memory entry. `state` and `next_state` are stacks of
# AGENT_HISTORY_LENGTH frames.
Transition = namedtuple("Transition", "state action reward next_state done")

# Bounded replay memory.
replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

# Sliding window holding the most recent frames.
state_history = deque(maxlen=AGENT_HISTORY_LENGTH)

# Linearly decaying exploration rate, indexed by frame number.
epsilons = np.linspace(INITIAL_EXPLORATION, FINAL_EXPLORATION, num=FINAL_EXPLORATION_FRAME)

# Two identically shaped networks; the target network starts out as a copy
# of the training network.
train_model = build_model()
target_model = build_model()
target_model.set_weights(train_model.get_weights())

In [None]:
# Warm up the replay memory by playing whole episodes with uniformly random
# actions.

replay_size = 0

for i in itertools.count():

    # New episode: the frame history starts as AGENT_HISTORY_LENGTH copies of
    # the first preprocessed frame.
    state = env.reset()
    life = 0
    state = np.array(preprocess_state(state)).astype('uint8')
    state_history.extend([state] * AGENT_HISTORY_LENGTH)
    state = np.array(state_history)

    while True:
        # Uniformly random action.
        action = np.random.randint(NUMBER_OF_ACTIONS)
        next_state, reward, done, info = env.step(action)

        # Losing a life replaces the step reward with a penalty.
        lives_left = info['ale.lives']
        if lives_left < life:
            reward = -5
        life = lives_left

        # Clip the reward into [-5, 1].
        reward = max(-5, min(1, reward))

        # Slide the newest frame into the history and snapshot the stack.
        next_state = np.array(preprocess_state(next_state)).astype('uint8')
        state_history.append(next_state)
        next_state = np.array(state_history)

        # Record the transition.
        replay_memory.append(Transition(state, action, reward, next_state, done))
        replay_size += 1

        if done:
            break
        state = next_state

    # Keep playing until more than REPLAY_START_SIZE transitions are stored;
    # the check runs only between episodes.
    if replay_size <= REPLAY_START_SIZE:
        continue
    print("Done init replay memory (ep:%i)" % (i))
    break


In [None]:
# Main training loop. Each episode plays epsilon-greedy with train_model,
# stores every transition in replay_memory, fits train_model on a random
# minibatch every UPDATE_FREQUENCY frames, and copies its weights into
# target_model every TARGET_NETWORK_UPDATE_FREQUENCY frames.
for i_episode in range(NUM_EPISODES):

    # New episode: the frame history starts as AGENT_HISTORY_LENGTH copies of
    # the first preprocessed frame.
    state = env.reset()
    state = preprocess_state(state)
    state = np.array(state).astype('uint8')
    for _ in range(AGENT_HISTORY_LENGTH):
        state_history.append(state)
    state = np.array(state_history)

    count_ran = 0   # actions chosen at random this episode
    count_q = 0     # actions chosen greedily from the network this episode
    life = 0        # lives reported by the previous step (0 until the first step)

    for t in itertools.count():
        # Epsilon-greedy action selection; epsilon follows the linear
        # schedule and stays at its final value once the schedule ends.
        epsilon = epsilons[min(total_frame, FINAL_EXPLORATION_FRAME - 1)]
        if np.random.random() < epsilon:
            count_ran += 1
            action = np.random.randint(NUMBER_OF_ACTIONS)
        else:
            count_q += 1
            q_values = train_model.predict(np.array([state]))[0]
            action = np.argmax(q_values)

        next_state, reward, done, info = env.step(action)

        # Losing a life replaces the step reward with a penalty.
        if life > info['ale.lives']:
            reward = -5
        life = info['ale.lives']

        # Clip the reward into [-5, 1].
        reward = max(-5, min(1, reward))

        # Slide the newest frame into the history and snapshot the stack.
        next_state = preprocess_state(next_state)
        next_state = np.array(next_state).astype('uint8')
        state_history.append(next_state)
        next_state = np.array(state_history)

        # Save transition to replay memory.
        replay_memory.append(Transition(state, action, reward, next_state, done))

        # Update statistics. total_frame is at least 1 from here on, so the
        # modulo checks below need no separate "!= 0" guard.
        total_frame += 1
        episode_rewards[i_episode] += reward
        episode_lengths[i_episode] = t

        # Train the network.
        if total_frame % UPDATE_FREQUENCY == 0:
            # Sample a minibatch from the replay memory.
            samples = random.sample(replay_memory, MINI_BATCH_SIZE)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Targets equal the current predictions everywhere except the
            # action actually taken, which gets
            # reward + DISCOUNT_FACTOR * max_a Q_target(next_state, a);
            # the bootstrap term is zeroed for terminal transitions.
            q_values = train_model.predict(states_batch)
            q_values_next = target_model.predict(next_states_batch)
            not_done = np.invert(done_batch).astype(np.float32)
            new_q_values_batch = reward_batch + not_done * DISCOUNT_FACTOR * np.amax(q_values_next, axis=1)
            q_values[np.arange(MINI_BATCH_SIZE), action_batch] = new_q_values_batch
            targets_batch = q_values

            # Gradient step; only the last loss of the episode is kept.
            loss[i_episode] = train_model.train_on_batch(states_batch, targets_batch)

        # Sync the target network. This runs before the termination check so
        # the sync is not skipped when the scheduled frame is also the last
        # frame of an episode.
        if total_frame % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
            target_model.set_weights(train_model.get_weights())

        # Check if terminated.
        if done:
            break
        state = next_state

    ####### EPISODE END

    # Track the best episode seen so far.
    if max_reward < episode_rewards[i_episode]:
        max_reward = episode_rewards[i_episode]
        max_ep = i_episode
    # Print statistics.
    print('Ep:%i\treward:%i\trandom_act:%i\tpredict_act:%i' % (i_episode, episode_rewards[i_episode], count_ran, count_q))





In [None]:
# Report the best episode and save the trained network.
summary = "max score:{} at episode:{}".format(max_reward, max_ep)
print(summary)
save_name = 'dqn.h5'
train_model.save(save_name)