In [5]:
import gym
import numpy as np
from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import Adam

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Create the Gym environment registered under the id 'Pong-v0'.
# `env` is read by later cells (action-space size, reset/step in training).
env = gym.make('Pong-v0')

In [4]:
# --- Training schedule -------------------------------------------------
EPISODES = 300   # number of episodes the training loop iterates over
GAMMA = 0.99     # discount factor applied to the bootstrapped next-state value

# --- Epsilon-greedy exploration ----------------------------------------
# The training loop interpolates linearly from EXPLORE_INIT towards
# EXPLORE_FINAL as the episode index approaches EPISODES.
EXPLORE_INIT = 1.0
EXPLORE_FINAL = 0.01

# --- Replay memory -----------------------------------------------------
MEMORY_SIZE = 17500        # maximum number of stored experiences
MEMORY_START_SIZE = 7500   # experiences required before any fitting starts
# Backward-compatible alias: the original, misspelled name is still what
# the training cell references, so it must keep resolving.
MEMORT_START_SIZE = MEMORY_START_SIZE

BATCH_SIZE = 4                     # experiences sampled per fitting step
ACTION_SIZE = env.action_space.n   # number of discrete actions = Q-network outputs

Crop away the area outside the playing field, keep every third pixel along each axis, and normalize pixel values to the range 0–1.

In [5]:
def preprocess(image):
    """Crop, downsample and rescale a single raw frame.

    Rows 34..193 and columns 0..159 are kept, taking every third pixel
    along both axes, together with the first three channels; the retained
    values are then divided by 255.

    The crop is applied *before* the division so that only the retained
    pixels are converted to floating point, instead of building a
    full-frame float copy that is mostly thrown away. The returned values
    are identical to dividing first and cropping afterwards.

    Args:
        image: Array indexable as ``image[rows, cols, channels]``.
            Assumed to be a 210x160x3 uint8 Atari frame (TODO confirm);
            for that input the result has shape (54, 54, 3), which is the
            input shape the Q-network is built with.

    Returns:
        A new floating-point array holding the cropped pixels divided by
        255. The input array is not modified.
    """
    cropped = image[34:194:3, 0:160:3, 0:3]
    return cropped / 255

In [6]:
def create_Q_model(learning_rate=0.001):
    """Build and compile the convolutional Q-value network.

    Architecture, in order: three 3x3 'same'-padded ReLU convolutions with
    16, 24 and 32 filters, each followed by a default ``MaxPooling2D``;
    a flatten; two 128-unit ReLU dense layers; and a linear dense output
    with one unit per action (``ACTION_SIZE``). The first layer declares
    an input shape of (54, 54, 3).

    Args:
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        The Keras ``Sequential`` model, compiled with mean-squared-error
        loss and Adam.
    """
    # Settings shared by every convolutional layer.
    conv_settings = dict(kernel_size=(3, 3), padding='same', activation='relu')

    stack = [
        Conv2D(16, input_shape=(54, 54, 3), **conv_settings),
        MaxPooling2D(),
        Conv2D(24, **conv_settings),
        MaxPooling2D(),
        Conv2D(32, **conv_settings),
        MaxPooling2D(),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(ACTION_SIZE, activation='linear'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(loss='mse', optimizer=Adam(lr=learning_rate))
    return network

In [7]:
class Memory():
    """Bounded FIFO store of experiences with uniform random sampling.

    Backed by a ``deque`` with ``maxlen=max_size``: once full, appending a
    new experience drops the oldest one.
    """

    def __init__(self, max_size):
        """Create an empty store holding at most ``max_size`` experiences."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append ``experience`` at the newest end of the store."""
        self.buffer.append(experience)

    def size(self):
        """Return how many experiences are currently stored."""
        return len(self.buffer)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences, chosen uniformly.

        Positions are drawn without replacement through NumPy's global
        random state, so asking for more items than are stored raises
        ``ValueError``.
        """
        # An integer population is equivalent to arange(len(self.buffer)).
        positions = np.random.choice(
            len(self.buffer), size=batch_size, replace=False
        )
        return [self.buffer[pos] for pos in positions]

In [8]:
# Instantiate the Q-network (default learning rate) and an empty replay
# memory capped at MEMORY_SIZE experiences; both are used by the training cell.
model = create_Q_model()
memory = Memory(max_size = MEMORY_SIZE)

In [9]:
# Open log.txt for writing (mode "w" truncates any existing file). The
# training cell writes one line per episode to this handle and closes it.
f = open("log.txt", "w")

In [None]:
def _fit_minibatch(q_model, minibatch):
    """Run one Q-learning fitting step of ``q_model`` on sampled experiences.

    Each experience is a ``(state, action, reward, done, next_state)`` tuple
    whose states carry a leading batch axis of length 1.

    The regression target for every sample starts as the network's current
    prediction for the sampled *state*, so untouched actions contribute zero
    error. Only the entry of the action actually taken is replaced, by
    ``reward`` for terminal transitions and by
    ``reward + GAMMA * max_a Q(next_state, a)`` otherwise.
    """
    states_b = np.concatenate([exp[0] for exp in minibatch], axis=0)
    next_states_b = np.concatenate([exp[4] for exp in minibatch], axis=0)

    # Two batched forward passes instead of two predict() calls per sample.
    targets = q_model.predict(states_b)
    best_next_q = np.amax(q_model.predict(next_states_b), axis=1)

    for i, (_, action_b, reward_b, done_b, _) in enumerate(minibatch):
        if done_b:
            targets[i, action_b] = reward_b
        else:
            targets[i, action_b] = reward_b + GAMMA * best_next_q[i]

    q_model.fit(states_b, targets, epochs=1, verbose=0)


explore_rate = EXPLORE_INIT
frame = 0  # running count of environment steps taken across all episodes

try:
    for ep in range(0, EPISODES):
        # Start each episode from the observation that reset() returns.
        state = np.expand_dims(preprocess(env.reset()), axis=0)

        total_reward = 0

        # Linear anneal from EXPLORE_INIT towards EXPLORE_FINAL over the run.
        explore_rate = EXPLORE_INIT - (EXPLORE_INIT - EXPLORE_FINAL) * (ep + 1) / EPISODES

        done = False
        while not done:
            # Epsilon-greedy: exploit the network with probability 1 - explore_rate.
            if np.random.rand() > explore_rate:
                action = int(np.argmax(model.predict(state)[0]))
            else:
                action = env.action_space.sample()

            # Execute the action that was selected above (and is stored below).
            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(preprocess(next_state), axis=0)
            frame += 1

            # env.render()

            total_reward += reward

            # Record the transition *before* advancing, so the stored
            # `state` is the observation the action was chosen from.
            memory.add((state, action, reward, done, next_state))
            state = next_state

            # MEMORT_START_SIZE: name spelled as defined in the config cell.
            if memory.size() >= MEMORT_START_SIZE:
                _fit_minibatch(model, memory.sample(BATCH_SIZE))

        log_message = "Episode {}, Total Reward {}, Explore Rate {}".format(ep + 1, total_reward, explore_rate)
        f.write(log_message + "\n")
        f.flush()
        print(log_message)

        # Periodic checkpoint.
        if ep % 5 == 0:
            model.save('model.h5')

    # Always persist the weights of the final episode, whatever its index.
    model.save('model.h5')
finally:
    # Release the log file even if training raises or is interrupted.
    f.close()

Episode 1, Total Reward -20.0, Explore Rate 0.99802
Episode 2, Total Reward -21.0, Explore Rate 0.99604
Episode 3, Total Reward -19.0, Explore Rate 0.99406
Episode 4, Total Reward -20.0, Explore Rate 0.99208
Episode 5, Total Reward -21.0, Explore Rate 0.9901
Episode 6, Total Reward -19.0, Explore Rate 0.98812
Episode 7, Total Reward -20.0, Explore Rate 0.98614
Episode 8, Total Reward -19.0, Explore Rate 0.98416
Episode 9, Total Reward -21.0, Explore Rate 0.98218
Episode 10, Total Reward -21.0, Explore Rate 0.9802
Episode 11, Total Reward -21.0, Explore Rate 0.97822
Episode 12, Total Reward -21.0, Explore Rate 0.97624
Episode 13, Total Reward -21.0, Explore Rate 0.97426
Episode 14, Total Reward -21.0, Explore Rate 0.97228
Episode 15, Total Reward -20.0, Explore Rate 0.9703
Episode 16, Total Reward -20.0, Explore Rate 0.96832
Episode 17, Total Reward -20.0, Explore Rate 0.96634
Episode 18, Total Reward -20.0, Explore Rate 0.96436
Episode 19, Total Reward -21.0, Explore Rate 0.96238
Episo

Episode 154, Total Reward -21.0, Explore Rate 0.6950799999999999
Episode 155, Total Reward -19.0, Explore Rate 0.6931
Episode 156, Total Reward -21.0, Explore Rate 0.69112
Episode 157, Total Reward -19.0, Explore Rate 0.68914
Episode 158, Total Reward -21.0, Explore Rate 0.68716
Episode 159, Total Reward -21.0, Explore Rate 0.68518
Episode 160, Total Reward -21.0, Explore Rate 0.6832
Episode 161, Total Reward -21.0, Explore Rate 0.68122
Episode 162, Total Reward -21.0, Explore Rate 0.6792400000000001
Episode 163, Total Reward -18.0, Explore Rate 0.67726
Episode 164, Total Reward -20.0, Explore Rate 0.6752800000000001
Episode 165, Total Reward -20.0, Explore Rate 0.6733
Episode 166, Total Reward -21.0, Explore Rate 0.6713199999999999
Episode 167, Total Reward -20.0, Explore Rate 0.66934
Episode 168, Total Reward -19.0, Explore Rate 0.66736
Episode 169, Total Reward -21.0, Explore Rate 0.66538
Episode 170, Total Reward -20.0, Explore Rate 0.6634
Episode 171, Total Reward -21.0, Explore R

Episode 301, Total Reward -21.0, Explore Rate 0.40401999999999993
Episode 302, Total Reward -20.0, Explore Rate 0.40203999999999995
Episode 303, Total Reward -20.0, Explore Rate 0.4000600000000001
Episode 304, Total Reward -19.0, Explore Rate 0.39808
Episode 305, Total Reward -21.0, Explore Rate 0.3961
Episode 306, Total Reward -21.0, Explore Rate 0.39412
Episode 307, Total Reward -21.0, Explore Rate 0.39213999999999993
Episode 308, Total Reward -21.0, Explore Rate 0.39015999999999995
Episode 309, Total Reward -21.0, Explore Rate 0.38817999999999997
Episode 310, Total Reward -19.0, Explore Rate 0.3862000000000001
Episode 311, Total Reward -20.0, Explore Rate 0.38422
Episode 312, Total Reward -21.0, Explore Rate 0.38224
Episode 313, Total Reward -17.0, Explore Rate 0.38026000000000004
Episode 314, Total Reward -20.0, Explore Rate 0.37827999999999995
Episode 315, Total Reward -19.0, Explore Rate 0.37629999999999997
Episode 316, Total Reward -19.0, Explore Rate 0.3743200000000001
Episode 

Episode 434, Total Reward -21.0, Explore Rate 0.14068000000000003
Episode 435, Total Reward -19.0, Explore Rate 0.13870000000000005
Episode 436, Total Reward -21.0, Explore Rate 0.13672000000000006
Episode 437, Total Reward -21.0, Explore Rate 0.13473999999999997
Episode 438, Total Reward -21.0, Explore Rate 0.13276
Episode 439, Total Reward -21.0, Explore Rate 0.13078
Episode 440, Total Reward -20.0, Explore Rate 0.12879999999999991
Episode 441, Total Reward -20.0, Explore Rate 0.12682000000000004
Episode 442, Total Reward -21.0, Explore Rate 0.12484000000000006
Episode 443, Total Reward -21.0, Explore Rate 0.12285999999999997
Episode 444, Total Reward -21.0, Explore Rate 0.12087999999999999
Episode 445, Total Reward -20.0, Explore Rate 0.1189
Episode 446, Total Reward -20.0, Explore Rate 0.11691999999999991
Episode 447, Total Reward -19.0, Explore Rate 0.11494000000000004
Episode 448, Total Reward -19.0, Explore Rate 0.11296000000000006
Episode 449, Total Reward -20.0, Explore Rate 0

In [None]:
# Prints a lone "!" — presumably a marker that the preceding training cell
# has finished; NOTE(review): looks like leftover scratch, confirm or remove.
print("!")