### Import dependencies

In [12]:
import random
import gym
import numpy as np
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
from keras.optimizers import Adam

### Set parameters

In [13]:
# Create the Gym environment that the agent interacts with.
env = gym.make('Pong-v0')

In [14]:
# Observation space object reported by the environment. It is handed to
# DQNAgent, which stores it; the network itself uses a fixed (80, 80, 1) input.
state_size = env.observation_space
state_size  # bare expression: shows the value when run as a notebook cell

In [15]:
# Number of actions in the environment's action space; this becomes the
# width of the network's output layer (one Q-value per action).
action_size = env.action_space.n
action_size  # bare expression: shows the value when run as a notebook cell

In [16]:
# Number of stored transitions sampled by each agent.replay() call.
batch_size = 32
# NOTE: despite the name, the training loop compares this value against its
# running frame counter, so it caps the total number of frames, not games.
n_episodes = 10000000

In [17]:
# Directory used for saved model weights. exist_ok=True makes the creation
# idempotent and avoids the race between checking for the directory and
# creating it.
output_dir = 'model_output/pong'
os.makedirs(output_dir, exist_ok=True)

### Preprocessing functions

In [18]:
def rgb2gray(rgb):
    """Collapse the colour channels of *rgb* into a single grayscale channel.

    Only the first three entries along the last axis are used (weighted as
    red, green, blue), so any further channel such as alpha is ignored.
    The result has the shape of *rgb* without its last axis.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

In [19]:
def preprocess_frame(frame):
    """Turn a raw colour frame into a single-sample batch for the network.

    Steps: convert to grayscale, keep every second row and column, add a
    leading batch axis and a trailing channel axis, then crop to rows
    25..109 and columns 0..79 (slice bounds past the end are clipped).
    Assuming a 210x160 RGB frame (TODO confirm against the environment),
    the result has shape (1, 80, 80, 1).
    """
    gray = rgb2gray(frame)
    downsampled = gray[::2, ::2]
    # Insert the batch axis in front and the channel axis at position 3.
    batched = downsampled[np.newaxis, :, :, np.newaxis]
    return batched[0:1, 25:110, 0:80, 0:1]

### Define agent

In [20]:
class DQNAgent:
    """Deep Q-learning agent with a small convolutional Q-network.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily, and fits its network towards one-step Q-learning
    targets computed from sampled transitions.
    """

    def __init__(self, state_size, action_size):
        """Store the configuration and build the Q-network.

        Args:
            state_size: Description of the observation space. It is only
                stored; the network input shape is fixed at (80, 80, 1).
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Replay memory; the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=40000)

        # Discount factor applied to the bootstrapped next-state value.
        self.gamma = 0.95

        # Exploration rate: starts fully random and is reduced linearly by
        # epsilon_decay per remembered transition until it hits epsilon_min.
        self.epsilon = 1.0
        self.epsilon_decay = 0.0000009
        self.epsilon_min = 0.1

        self.learning_rate = 0.001

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the convolutional network that predicts Q-values.

        Returns:
            A compiled Keras ``Sequential`` model mapping an (80, 80, 1)
            frame to ``action_size`` linear outputs, trained with MSE loss.
        """
        model = Sequential()

        model.add(Conv2D(1, kernel_size=3, activation='relu', input_shape=(80, 80, 1)))
        model.add(Conv2D(16, kernel_size=8, strides=4, activation='relu'))
        model.add(Conv2D(32, kernel_size=4, strides=2, activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        # Linear output: one unbounded Q-value estimate per action.
        model.add(Dense(self.action_size, activation='linear'))

        # NOTE(review): `lr` is the legacy argument name; newer Keras
        # releases call it `learning_rate` — confirm against the installed
        # version before upgrading.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append a transition to replay memory and decay epsilon one step.

        Epsilon is clamped so that it never drops below ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decay)

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Input for ``model.predict``; only needed when the greedy
                branch is taken.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Fit the network on transitions sampled from replay memory.

        At most ``batch_size`` transitions are drawn; if the memory holds
        fewer, all of them are used (and nothing happens when it is empty).
        Each sampled transition triggers its own single-epoch ``fit`` call.
        """
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)

        for state, action, reward, next_state, done in minibatch:
            # Terminal transitions have no future value to bootstrap from.
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # Keep the current predictions for all other actions so that
            # only the taken action contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

    def load(self, name):
        """Load network weights from the file *name*."""
        self.model.load_weights(name)

    def save(self, name):
        """Write the network weights to the file *name*."""
        self.model.save_weights(name)


In [21]:
# Instantiate the agent; this also builds and compiles its Q-network.
agent = DQNAgent(state_size, action_size)

### Interact with environment

In [22]:
# Main training loop: play games until the total frame budget is used up,
# storing transitions as we go and running one replay step after each game.
done = False
time = 0       # frames stepped so far, across all games
episode = 0    # number of finished games
max_score = 0  # best player score seen in a single game
k = 4          # a transition is stored on every k-th frame
while time < n_episodes:
    done = False
    state = preprocess_frame(env.reset())

    player_score = 0
    enemy_score = 0
    # Reward collected since the last stored transition. Rewards arriving on
    # frames that are not stored would otherwise never reach replay memory.
    pending_reward = 0

    while not done:
        #env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Positive rewards are counted for the player, negative for the enemy.
        if reward > 0:
            player_score += reward
        else:
            enemy_score -= reward

        pending_reward += reward
        # Store every k-th frame, and always store the final frame so the
        # terminal flag of the game is not lost.
        if time % k == 0 or done:
            next_state = preprocess_frame(next_state)
            agent.remember(state, action, pending_reward, next_state, done)
            state = next_state
            pending_reward = 0
        time += 1
    episode += 1

    if player_score > max_score:
        max_score = player_score

    # Report the game's score and the agent's current exploration rate.
    print("frame: {}/{}    enemy_score: {}    player_score: {}    max_score: {}    e: {:.2}"
          .format(time, n_episodes, enemy_score, player_score, max_score, agent.epsilon))

    if len(agent.memory) > batch_size:
        agent.replay(batch_size)  # train on transitions sampled from replay memory

    if episode % 50 == 0:  # save weights every 50th game
        # os.path.join inserts the path separator so the file is written
        # inside output_dir rather than beside it.
        agent.save(os.path.join(output_dir, "weights_" + '{:04d}'.format(episode) + ".hdf5"))