## Deep Reinforcement Learning for Atari Breakout (BreakoutDeterministic-v4) with an Improvised Architecture

### Import Required Packages

In [1]:
import numpy as np
import tensorflow as tf

from __future__ import division
from PIL import Image

# From openai.gym (pip install gym)
import gym

# From Keras (pip install keras); the Keras-RL imports (pip install keras-rl) follow below

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Dropout, concatenate, Permute
from keras.layers import Input, Conv2D
from keras.activations import relu, linear
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor

Using TensorFlow backend.


### Defining a class for pre-processing the game frames

In [2]:
class GameProcess(Processor):
    """Processor that converts raw game frames, state batches and rewards
    into the form used for training.

    Relies on the notebook-level ``frame_shape`` being defined before any
    observation is processed.
    """

    def process_observation(self, observation):
        """Resize one frame to ``frame_shape``, convert it to 8-bit greyscale
        ('L' mode) and return it as a ``uint8`` array."""
        frame = Image.fromarray(observation).resize(frame_shape).convert('L')
        pixels = np.array(frame)
        # Kept as uint8 here; the float conversion happens per batch in
        # process_state_batch.
        return pixels.astype('uint8')

    def process_state_batch(self, batch):
        """Cast the batch to ``float32`` and divide every value by 255."""
        as_float = batch.astype('float32')
        return as_float / 255.

    def process_reward(self, reward):
        """Clip the reward into the closed interval [-1, 1]."""
        lower, upper = -1., 1.
        return np.clip(reward, lower, upper)

### Breakout Environment

In [3]:
# Create the Breakout environment through gym.
env = gym.make('BreakoutDeterministic-v4')

# random seed to reproduce game states
# (the same value seeds both NumPy's global RNG and the environment)
np.random.seed(125)
env.seed(125)

# Reset the environment, then render it once.
env.reset()
env.render()

#### 1. Number of possible actions

In [4]:
# Size of the environment's action space; later used as the width of the
# network's output layer and passed to the DQN agent.
nb_actions = env.action_space.n
nb_actions

4

#### 2. Taking a stack of 4 consecutive frames

In [5]:
# Target size of each pre-processed frame (read by GameProcess.process_observation).
frame_shape = (84, 84)
# Number of consecutive frames stacked into one network input.
window_length = 4
# Network input shape: (window_length, 84, 84), i.e. frames-first.
input_shape = (window_length,) + frame_shape
input_shape

(4, 84, 84)

#### Define fire_incept modules

In [6]:
def fire_incept(x, squeeze=16, expand=64):
    """Build one fire/inception-style module on top of tensor ``x``.

    A 3x3 "squeeze" convolution with ``squeeze`` filters is followed by two
    parallel "expand" convolutions (3x3 and 5x5, ``expand`` filters each,
    'same' padding). Every convolution is followed by a ReLU, and the two
    branches are concatenated along axis 3.

    Returns the concatenated tensor.
    """
    # Squeeze stage (default padding, so the spatial size is not preserved).
    squeezed = Conv2D(squeeze, kernel_size=(3, 3))(x)
    squeezed = Activation('relu')(squeezed)

    # Expand branch with the 3x3 kernel.
    branch_3x3 = Conv2D(expand, kernel_size=(3, 3), padding='same')(squeezed)
    branch_3x3 = Activation('relu')(branch_3x3)

    # Expand branch with the 5x5 kernel.
    branch_5x5 = Conv2D(expand, kernel_size=(5, 5), padding='same')(squeezed)
    branch_5x5 = Activation('relu')(branch_5x5)

    # 'same' padding keeps both branches the same spatial size, so they can
    # be joined along axis 3.
    return concatenate([branch_3x3, branch_5x5], axis=3)

### New Architecture

In [7]:
# Network input: a stack of frames with shape `input_shape`.
game_input = Input(shape=input_shape)

# Move the frame axis last: (4, 84, 84) -> (84, 84, 4).
x = Permute(dims=(2, 3, 1))(game_input)

# Two strided convolutions: 84x84 -> 26x26 -> 11x11, 64 filters each.
x = Conv2D(filters=64, kernel_size=(7, 7), strides=(3, 3), activation='relu')(x)
x = Conv2D(filters=64, kernel_size=(5, 5), strides=(2, 2), activation='relu')(x)

# Two stacked fire/inception modules.
x = fire_incept(x, squeeze=16, expand=64)
x = fire_incept(x, squeeze=32, expand=64)
x = Dropout(rate=0.4)(x)

# Final convolution, then the dense head.
x = Conv2D(filters=64, kernel_size=(3, 3), strides=(1, 1), activation='relu')(x)
x = Flatten()(x)
x = Dense(units=512, activation='relu')(x)
x = Dropout(rate=0.3)(x)

# One linear output unit per action.
out = Dense(units=nb_actions, activation='linear')(x)

model_fire_incept = Model(inputs=game_input, outputs=out)
model_fire_incept.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 4, 84, 84)     0                                            
____________________________________________________________________________________________________
permute_1 (Permute)              (None, 84, 84, 4)     0           input_1[0][0]                    
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 26, 26, 64)    12608       permute_1[0][0]                  
____________________________________________________________________________________________________
conv2d_2 (Conv2D)                (None, 11, 11, 64)    102464      conv2d_1[0][0]                   
___________________________________________________________________________________________

### Configuring the Agent

#### 1. Allocating memory for experience replay

In [9]:
# Experience-replay memory with limit=1,000,000; it uses the same
# window_length as the network input.
memory = SequentialMemory(limit=1000000, window_length=window_length)

#### 2. Policy: Boltzmann Exploration
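Exploration is gradually decreased by linearly annealing the Boltzmann temperature `tau` from 1.0 to 0.2 over the first 1M steps (a fixed `tau` of 0.1 is used during testing).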

In [10]:
# Boltzmann exploration whose `tau` attribute is annealed from value_max
# down to value_min over nb_steps; value_test is the value used when testing.
policy = LinearAnnealedPolicy(
    BoltzmannQPolicy(),
    attr='tau',
    value_max=1.,
    value_min=.2,
    value_test=.1,
    nb_steps=1000000,
)

#### 3. Compiling DQN Agent

In [11]:
# Assemble the DQN agent from the network, exploration policy, replay
# memory and frame processor defined above.
dqn = DQNAgent(
    model=model_fire_incept,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=GameProcess(),
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

In [12]:
# Compile the agent with the Adam optimizer (learning rate 0.00025) and
# track mean absolute error as a metric.
dqn.compile(Adam(lr=.00025), metrics=['mae'])

### Training the model

#### 1. Check whether the agent is learning during the first 0.5M steps

In [13]:
# First training run: 500,000 environment steps.
dqn.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
54 episodes - episode_reward: 1.241 [0.000, 5.000] - ale.lives: 2.906

Interval 2 (10000 steps performed)
55 episodes - episode_reward: 1.236 [0.000, 6.000] - ale.lives: 2.941

Interval 3 (20000 steps performed)
54 episodes - episode_reward: 1.370 [0.000, 6.000] - ale.lives: 2.841

Interval 4 (30000 steps performed)
56 episodes - episode_reward: 1.089 [0.000, 6.000] - ale.lives: 2.902

Interval 5 (40000 steps performed)
56 episodes - episode_reward: 1.161 [0.000, 5.000] - ale.lives: 2.934

Interval 6 (50000 steps performed)
57 episodes - episode_reward: 1.000 [0.000, 3.000] - loss: 0.003 - mean_absolute_error: 0.010 - mean_q: 0.013 - mean_tau: 0.956 - ale.lives: 2.966

Interval 7 (60000 steps performed)
57 episodes - episode_reward: 1.053 [0.000, 4.000] - loss: 0.003 - mean_absolute_error: 0.013 - mean_q: 0.016 - mean_tau: 0.948 - ale.lives: 2.962

Interval 8 (70000 steps performed)
60 episodes - episode_reward: 0.850 [0.000,

<keras.callbacks.History at 0x1ea7bb18a58>

#### 2. Saving the weights

In [14]:
# Save the weights after the first 0.5M steps. overwrite=True keeps a
# re-run of the notebook from stopping at the interactive "overwrite?"
# prompt when the file already exists, and matches the final save below.
dqn.save_weights('fire_incept_weights.h5f', overwrite=True)

#### 3. Loading the saved weights (of 0.5M steps)

In [15]:
# Restore the weights written by the save cell above.
dqn.load_weights('fire_incept_weights.h5f')

#### 4. Re-Training the model (for 2M steps)

In [16]:
env.reset()

# Build a fresh agent around the same model, policy and replay memory
# objects used for the first run.
# NOTE(review): nb_steps_warmup is not passed here (the first agent used
# 50000), so the library default applies - confirm this is intended.
dqn = DQNAgent(
    model=model_fire_incept,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=GameProcess(),
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

# Same optimizer settings as the first compile.
dqn.compile(
    Adam(lr=0.00025),
    metrics=['mae'],
)

In [17]:
# Second training run: 2,000,000 further environment steps.
dqn.fit(env, nb_steps=2000000)

#### 5. Saving final weights

In [18]:
# Save the final weights to the same file as the 0.5M-step save, replacing it.
dqn.save_weights('fire_incept_weights.h5f', overwrite=True)

### Testing the Model

In [19]:
# Load the saved weights before evaluating the agent.
dqn.load_weights('fire_incept_weights.h5f')

In [20]:
# Reset the environment, then run the agent for 2 test episodes with
# visualization enabled.
env.reset()
dqn.test(env, nb_episodes=2, visualize=True)

In [21]:
# Close the environment once testing is finished.
env.close()

In [22]:
# Trained on: Intel® Xeon® Processor E5, 2.40 GHz, Nvidia Quadro K4200
# Bhartendu Thakur, Machine Learning & Computing
# https://in.mathworks.com/matlabcentral/profile/authors/10083740-bhartendu?&detail=fileexchange
# https://in.linkedin.com/in/bhartendu-thakur-56bb6285