In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense,LSTM,Flatten,Conv2D,MaxPooling2D,Dropout, Activation,TimeDistributed
from keras.optimizers import Adam
import cv2
import os # for creating directories
from atari_wrappers import *

Using TensorFlow backend.


In [2]:
def wrap_dqn(env):
    """Wrap a raw Atari environment with the preprocessing stack used here.

    The wrappers are applied innermost-first in this order:
    ``ProcessFrame84``, ``NoopResetEnv`` (``noop_max=30``),
    ``ClippedRewardsWrapper`` and ``MaxAndSkipEnv`` (``skip=4``).
    The outermost wrapper is returned.
    """
    processed = ProcessFrame84(env)
    with_noops = NoopResetEnv(processed, noop_max=30)
    clipped = ClippedRewardsWrapper(with_noops)
    return MaxAndSkipEnv(clipped, skip=4)

# Create the Pong environment and apply the DQN wrapper stack to it.
raw_env = gym.make('PongNoFrameskip-v4')
env = wrap_dqn(raw_env)


In [3]:
# Observation shape as reported by the wrapped environment.
state_size = env.observation_space.shape
# Network input: the first two observation dimensions plus a single channel.
frame_height, frame_width = state_size[0], state_size[1]
input_shape = (frame_height, frame_width, 1)
input_shape

(84, 84, 1)

In [4]:

# Number of Q-network outputs. DQNAgent.act adds 2 to the chosen output index,
# so the agent only ever sends environment actions 2 and 3.
action_size = 2

In [5]:
# Training configuration.
batch_size = 32  # number of transitions sampled from replay memory per training call
n_episodes = 20000  # n games we want agent to play (default 1001)
output_dir = 'model_output/pong/'  # directory the weight checkpoints are written to
# exist_ok=True replaces the check-then-create pattern: it has no race between
# the existence test and the creation, and the cell stays safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [6]:
class DQNAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    The Q-network has one output per action index ``0 .. action_size - 1``.
    Environment action ids are those indices shifted by ``ACTION_OFFSET``:
    ``act`` returns environment action ids, and ``replay`` expects the
    actions stored through ``remember`` to be environment action ids too.
    """

    # Offset between a Q-network output index and the environment action id.
    ACTION_OFFSET = 2

    def __init__(self, state_size, action_size):
        """Store hyper-parameters and build the Q-network.

        Args:
            state_size: Shape of one observation; its first two entries are
                used as the height and width of the network input.
            action_size: Number of Q-network outputs (selectable actions).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=10000) # replay memory; oldest transitions are dropped once full
        self.gamma = 0.99 # discount rate applied to the estimated future return
        self.epsilon = 1.0 # exploration rate: probability of taking a random action
        self.epsilon_decay = 0.99 # multiplicative decay applied to epsilon after every replay() call
        self.epsilon_min = 0.01 # minimum amount of random exploration permitted
        self.learning_rate = 0.001 # learning rate passed to the Adam optimizer
        self.model = self._build_model()
        self.history=[] # one entry per replay() call: the value returned by model.fit
        self.score=[]

    def _build_model(self):
        """Build and compile the convolutional network approximating Q(s, a).

        Returns:
            A compiled Keras ``Sequential`` model taking single-channel frames
            of shape ``(state_size[0], state_size[1], 1)`` and producing
            ``action_size`` linear outputs, trained with MSE loss and Adam.
        """
        # Derive the input shape from the agent's own state size instead of a
        # notebook-level global, so the class does not depend on cell order.
        input_shape = (self.state_size[0], self.state_size[1], 1)
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), padding='same',input_shape=input_shape))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='same'))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(self.action_size))
        adam = Adam(lr=self.learning_rate)
        model.compile(loss='mse',optimizer=adam)
        return model

    @staticmethod
    def _as_batch(state):
        """Reshape one frame to the ``(1, height, width, 1)`` network input layout."""
        return np.reshape(state, (1, state.shape[0], state.shape[1], 1))

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an environment action for ``state`` with an epsilon-greedy policy.

        Returns:
            An environment action id in
            ``[ACTION_OFFSET, ACTION_OFFSET + action_size)``.
        """
        if np.random.rand() <= self.epsilon: # explore: uniformly random action
            return random.randrange(self.action_size) + self.ACTION_OFFSET
        act_values = self.model.predict(self._as_batch(state)) # exploit: Q-values for this state
        return int(np.argmax(act_values[0])) + self.ACTION_OFFSET

    def replay(self, batch_size):
        """Train the network on a random minibatch drawn from the replay memory.

        For every sampled transition the regression target of the taken action
        is ``reward`` when ``done`` is true, otherwise
        ``reward + gamma * max_a' Q(next_state, a')``; the targets of all other
        actions are the network's current predictions. The result of
        ``model.fit`` is appended to ``self.history`` and epsilon is decayed
        once, never below ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: From ``random.sample`` if memory holds fewer than
                ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        state_batch = np.concatenate([self._as_batch(s) for s in states])
        next_state_batch = np.concatenate([self._as_batch(s) for s in next_states])
        # Stored actions are environment ids; shift them back to output indices.
        action_idx = np.array(actions) - self.ACTION_OFFSET
        rewards = np.array(rewards, dtype=float)
        dones = np.array(dones, dtype=bool)

        # Two batched forward passes instead of two predict() calls per sample.
        target_batch = self.model.predict(state_batch)
        best_next_q = np.amax(self.model.predict(next_state_batch), axis=1)
        # Terminal transitions get the bare reward; the rest add the discounted bootstrap.
        td_targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)
        # Overwrite only the taken action's target. The batch dimension comes from
        # the sample itself rather than a hard-coded 32.
        target_batch[np.arange(len(minibatch)), action_idx] = td_targets

        self.history.append(self.model.fit(state_batch, target_batch, epochs=1, verbose=0))
        if self.epsilon > self.epsilon_min:
            # Clamp so the decay step cannot undershoot the configured minimum.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [7]:
# Instantiate the agent for the observation shape and action count defined
# earlier in the notebook, then print the layer-by-layer network summary.
agent = DQNAgent(state_size=state_size, action_size=action_size)
agent.model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 21, 21, 32)        2080      
_________________________________________________________________
activation_1 (Activation)    (None, 21, 21, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 64)        32832     
_________________________________________________________________
activation_2 (Activation)    (None, 11, 11, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 11, 11, 64)        36928     
_________________________________________________________________
activation_3 (Activation)    (None, 11, 11, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 7744)             

In [10]:
# Train the agent: play n_episodes games, storing every transition in replay
# memory, and fit the network on one sampled minibatch after each game.
for e in range(n_episodes): # iterate over new episodes of the game
    state = env.reset() # start a new game
    # Divide by 255 (presumably mapping 8-bit pixels to [0, 1]) and force the
    # (height, width, 1) layout, exactly as is done for next_state below, so
    # every stored state has the same shape.
    state = state / 255
    state = np.reshape(state, (state.shape[0], state.shape[1], 1))
    sc = 0 # sum of the rewards received in this episode
    for t in range(5000): # upper bound on agent steps per episode
        action = agent.act(state) # environment action id chosen epsilon-greedily (2 or 3)
        next_state, reward, done, _ = env.step(action) # apply the action and observe the outcome
        sc += reward
        next_state = next_state / 255
        next_state = np.reshape(next_state, (next_state.shape[0], next_state.shape[1], 1))
        agent.remember(state, action, reward, next_state, done) # store the transition for later replay
        state = next_state # the observed frame becomes the current state of the next step
        if done: # the environment reported the end of the episode
            print("episode: {}/{}, score: {}, e: {:.2}" # print the episode's score and agent's epsilon
                  .format(e, n_episodes, sc, agent.epsilon))
            break # exit loop
    # NOTE(review): replay runs once per episode, i.e. a single minibatch update
    # (and a single epsilon decay) per game. DQN setups commonly train every
    # step -- confirm this frequency is intended.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size) # fit the network on a random minibatch from memory
    if e % 1000 == 0: # checkpoint the weights every 1000 episodes
        agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")

episode: 0/20000, score: -17.0, e: 1.0


KeyboardInterrupt: 

In [None]:
# Write the string form of agent.history (one entry per replay call) to the
# output directory. The context manager closes the file even if the write raises.
with open(output_dir + "history.txt", "w") as f:
    f.write(str(agent.history))

In [None]:
# Watch the agent play one rendered episode.
# Frames are preprocessed exactly as in the training loop: divided by 255 and
# reshaped to (height, width, 1). The earlier cv2.cvtColor(BGR2GRAY) call did
# not match training, where that conversion is commented out and frames are
# scaled by 1/255 before being fed to the network.
state = env.reset()
state = state / 255
state = np.reshape(state, (state.shape[0], state.shape[1], 1))
p = 40000 # upper bound on the number of steps to play

try:
    for t in range(p):
        env.render()
        # NOTE(review): act() still explores with probability agent.epsilon;
        # confirm whether a purely greedy policy is wanted for evaluation.
        action = agent.act(state)
        state, reward, done, info = env.step(action)
        state = state / 255
        state = np.reshape(state, (state.shape[0], state.shape[1], 1))
        if done:
            print(t)
            print("episode has finished")
            break
finally:
    # Release the environment even if a step or the render call fails.
    env.close()