### Import dependencies

In [1]:
import random
import gym
import numpy as np
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
from keras.optimizers import Adam

Using TensorFlow backend.


### Set parameters

In [3]:
env = gym.make('Pong-v0')

In [4]:
state_size = env.observation_space
state_size

Box(210, 160, 3)

In [5]:
action_size = env.action_space.n
action_size

6

In [6]:
batch_size = 32
n_episodes = 10000000

In [7]:
output_dir = 'model_output/cartpols'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

### Define agent

In [8]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        self.memory = deque(maxlen=30000)
        
        self.gamma = 0.95
        
        self.epsilon = 1.0
        self.epsilon_decay = 0.0000009
        self.epsilon_min = 0.1
        
        self.learning_rate = 0.001
        
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
    
        model.add(Conv2D(1, kernel_size=3, activation='relu', input_shape=(105, 80, 1)))
        model.add(Conv2D(16, kernel_size=8, strides=4, activation='relu'))
        model.add(Conv2D(32, kernel_size=4, strides=2, activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = np.expand_dims(state, axis=0)
        state = np.expand_dims(state, axis=3)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def rgb2gray(self, rgb):
        return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state = np.expand_dims(next_state, axis=0)
                next_state = np.expand_dims(next_state, axis=3)
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            state = np.expand_dims(state, axis=0)
            state = np.expand_dims(state, axis=3)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            
            self.model.fit(state, target_f, epochs=1, verbose=0)
            

            
    
    def load(self, name):
        self.model.load_weights(name)
        
    def save(self, name):
        self.model.save_weights(name)


In [9]:
agent = DQNAgent(state_size, action_size)

### Preprocessing functions

In [10]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])

### Interact with environment

In [11]:
state = env.reset()
state = rgb2gray(state)
state = state[::2,::2]
state.shape

(105, 80)

In [12]:
done = False
time = 0
episode = 0
while time < n_episodes:
    state = env.reset()
    state = rgb2gray(state)
    state = state[::2,::2]
    
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = rgb2gray(next_state)
        next_state = next_state[::2,::2]
        #next_state = next_state.reshape(-1, next_state.shape[0])
        reward = reward
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        time += 1
    episode += 1
    print("frame: {}/{}, score: {}, e: {:.2}" # print the episode's score and agent's epsilon
      .format(time, n_episodes, reward, agent.epsilon))
    done = False
    if len(agent.memory) > batch_size:
        agent.replay(batch_size) # train the agent by replaying the experiences of the episode
    if episode % 50 == 0: # save weights every 50th episode (game)
        agent.save(output_dir + "weights_" + '{:04d}'.format(time) + ".hdf5")

frame: 1224/10000000, score: -1.0, e: 1.0
frame: 2434/10000000, score: -1.0, e: 1.0
frame: 3919/10000000, score: -1.0, e: 1.0
frame: 5134/10000000, score: -1.0, e: 1.0
frame: 6327/10000000, score: -1.0, e: 0.99
frame: 7916/10000000, score: -1.0, e: 0.99
frame: 9009/10000000, score: -1.0, e: 0.99
frame: 10425/10000000, score: -1.0, e: 0.99
frame: 11958/10000000, score: -1.0, e: 0.99
frame: 13157/10000000, score: -1.0, e: 0.99
frame: 14365/10000000, score: -1.0, e: 0.99
frame: 15673/10000000, score: -1.0, e: 0.99
frame: 17052/10000000, score: -1.0, e: 0.98
frame: 18298/10000000, score: -1.0, e: 0.98
frame: 19705/10000000, score: -1.0, e: 0.98
frame: 21152/10000000, score: -1.0, e: 0.98
frame: 22345/10000000, score: -1.0, e: 0.98
frame: 23463/10000000, score: -1.0, e: 0.98
frame: 24789/10000000, score: -1.0, e: 0.98
frame: 26055/10000000, score: -1.0, e: 0.98
frame: 27066/10000000, score: -1.0, e: 0.98
frame: 28156/10000000, score: -1.0, e: 0.97
frame: 29641/10000000, score: -1.0, e: 0.97

KeyboardInterrupt: 