# DQN + Transfer learning for Centipede -> SpaceInvaders

In [1]:
import gym
import cv2
import numpy as np
import random
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D
from keras.optimizers import Adam

from collections import deque

def to_grayscale(img):
    """Collapse the third axis of ``img`` by averaging and return uint8.

    The mean is taken over axis 2 (presumably the colour-channel axis of an
    H x W x C frame) and the result is cast to ``np.uint8``, which drops any
    fractional part.
    """
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

def downsample(img):
    """Return ``img`` with every second row and every second column kept.

    Only the first two axes are strided; any further axes are untouched.
    """
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Shrink a frame and convert it to a single-channel uint8 image.

    Frames whose shape is exactly (250, 160, 3) are halved along both axes
    with ``downsample``; any other frame is resized with
    ``cv2.resize(img, (80, 125))``. The shrunken frame is then passed through
    ``to_grayscale``. Callers in this file reshape the result to
    (1, 125, 80, 1).
    """
    is_native_size = img.shape == (250, 160, 3)
    shrunk = downsample(img) if is_native_size else cv2.resize(img, (80, 125))
    return to_grayscale(shrunk)


class DQN:
    """Deep Q-Network agent holding an online network and a target network.

    The online network (``self.model``) is fitted in ``replay``; the target
    network (``self.target_model``) supplies the values used to build the
    training targets and is blended towards the online network in
    ``target_train``. Both networks take (125, 80, 1) inputs and emit one
    value per action in ``env.action_space``.
    """

    def __init__(self, env, loaded, freeze_on, num_freeze):
        """Set hyper-parameters and build both networks.

        Args:
            env: Environment object; ``env.action_space.n`` and
                ``env.action_space.sample()`` are used.
            loaded: Path of a saved Keras model to transfer convolution
                weights from, or ``None`` to build fresh networks.
            freeze_on: When true (and ``loaded`` is given), the first
                ``num_freeze`` layers are marked non-trainable.
            num_freeze: Number of leading layers to freeze.
        """
        self.env = env
        # Bounded replay buffer: once full, each append drops the oldest entry.
        self.memory = deque(maxlen=500)

        self.gamma = 0.85
        self.freeze_on = freeze_on
        self.loaded = loaded
        self.num_freeze = num_freeze
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.02
        self.tau = .125

        # Both networks are built the same way, online network first.
        build = self.create_model if self.loaded is None else self.transfer_model
        self.model = build()
        self.target_model = build()

    def freeze_layer(self, num):
        """Mark the first ``num`` layers of both networks non-trainable.

        Each network is recompiled afterwards, with the same loss and
        optimizer settings used when it was built.
        """
        for net in (self.model, self.target_model):
            for layer in net.layers[:num]:
                layer.trainable = False
            net.compile(loss="mean_squared_error",
                        optimizer=Adam(lr=self.learning_rate))

    def create_model(self):
        """Build and compile a fresh, randomly initialised network."""
        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(125, 80, 1)))
        model.add(BatchNormalization())
        model.add(Conv2D(16, kernel_size=(3, 3), activation='relu'))
        model.add(Flatten())
        model.add(Dense(self.env.action_space.n))
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def transfer_model(self):
        """Build a network whose two Conv2D layers start from saved weights.

        The model stored at ``self.loaded`` is read and the weights of its
        layers 0 and 2 initialise the two convolution layers here. The
        BatchNormalization and Dense layers are created fresh. When
        ``self.freeze_on`` is true the first ``self.num_freeze`` layers are
        frozen before compiling.
        """
        trans_model = load_model(self.loaded)
        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(125, 80, 1),
                         weights=trans_model.layers[0].get_weights()))
        model.add(BatchNormalization())
        model.add(Conv2D(16, kernel_size=(3, 3), activation='relu',
                         weights=trans_model.layers[2].get_weights()))
        model.add(Flatten())
        model.add(Dense(self.env.action_space.n))
        if self.freeze_on:
            for layer in model.layers[:self.num_freeze]:
                layer.trainable = False
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action for the raw frame ``state`` (epsilon-greedy).

        Epsilon is decayed on every call and clamped at ``epsilon_min``.
        With probability epsilon a random action is sampled from the
        environment; otherwise the action with the highest online-network
        output is returned.
        """
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        state = preprocess(state).reshape((1, 125, 80, 1))
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit the online network on a random sample of stored transitions.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually: the
        target vector is the target network's output for the state, with the
        entry of the taken action replaced by the reward (terminal step) or
        by ``reward + gamma * max(target_model(new_state))``.

        Args:
            batch_size: Number of transitions to sample per call.
        """
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        for state, action, reward, new_state, done in samples:
            # Frames are stored raw, so they are preprocessed here.
            new_state = preprocess(new_state).reshape((1, 125, 80, 1))
            state = preprocess(state).reshape((1, 125, 80, 1))
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        """Blend the target network's weights towards the online network.

        Each weight array becomes ``tau * online + (1 - tau) * target``.
        """
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(self.model.get_weights(),
                                      self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online network to the path ``fn``."""
        self.model.save(fn)




Using TensorFlow backend.


In [2]:
#def main():
# Train a DQN agent on SpaceInvaders, starting from convolution weights
# saved by an earlier Centipede run ('dqn_centipede.model').
env = gym.make("SpaceInvaders-v0")
# NOTE: gamma and epsilon are not passed to DQN, which sets its own values in
# its constructor; they are kept only so other cells that read them still work.
gamma = 0.95
epsilon = .95
trials = 10000
num_freeze = 1
freeze = True

dqn_agent = DQN(env=env, loaded='dqn_centipede.model', freeze_on=freeze, num_freeze=num_freeze)
rewards = []  # total score of every finished episode
for trial in range(trials):
    # Raw frames are stored and passed around; the agent preprocesses them
    # itself inside act() and replay().
    cur_state = env.reset()
    reward_new = 0
    while True:
        # Pick an action from the current state (epsilon-greedy).
        action = dqn_agent.act(cur_state)
        # Advance the environment by one step.
        new_state, reward, done, info = env.step(action)
        reward_new += reward
        # Store the transition, fit the online network on a replay sample,
        # then move the target network towards the online network.
        dqn_agent.remember(cur_state, action, reward, new_state, done)
        dqn_agent.replay()
        dqn_agent.target_train()
        cur_state = new_state
        if done:
            break
    rewards.append(reward_new)
    print("Died with {} points".format(reward_new))
    # Checkpoint after every episode so an interrupted run keeps its progress.
    dqn_agent.save_model("dqn_centipede_transfer_spaceinvaders.model")


#main()

Died with 215.0 points
Died with 195.0 points
Died with 125.0 points
Died with 155.0 points
Died with 235.0 points
Died with 90.0 points
Died with 280.0 points
Died with 200.0 points
Died with 210.0 points
Died with 85.0 points
Died with 300.0 points
Died with 90.0 points
Died with 50.0 points
Died with 180.0 points
Died with 75.0 points
Died with 210.0 points
Died with 105.0 points
Died with 95.0 points
Died with 100.0 points
Died with 85.0 points
Died with 230.0 points
Died with 140.0 points
Died with 220.0 points
Died with 130.0 points
Died with 290.0 points
Died with 140.0 points
Died with 220.0 points
Died with 180.0 points
Died with 95.0 points
Died with 105.0 points
Died with 135.0 points
Died with 75.0 points
Died with 105.0 points
Died with 110.0 points
Died with 275.0 points
Died with 410.0 points
Died with 370.0 points
Died with 260.0 points
Died with 205.0 points
Died with 105.0 points
Died with 335.0 points
Died with 135.0 points
Died with 305.0 points
Died with 270.0 poin

KeyboardInterrupt: 

In [3]:
# Display the per-episode scores appended to `rewards` by the training loop.
rewards

[215.0,
 195.0,
 125.0,
 155.0,
 235.0,
 90.0,
 280.0,
 200.0,
 210.0,
 85.0,
 300.0,
 90.0,
 50.0,
 180.0,
 75.0,
 210.0,
 105.0,
 95.0,
 100.0,
 85.0,
 230.0,
 140.0,
 220.0,
 130.0,
 290.0,
 140.0,
 220.0,
 180.0,
 95.0,
 105.0,
 135.0,
 75.0,
 105.0,
 110.0,
 275.0,
 410.0,
 370.0,
 260.0,
 205.0,
 105.0,
 335.0,
 135.0,
 305.0,
 270.0,
 40.0]