In [1]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import time 
import random
from collections import deque
import gym
import retro
from skimage import transform # Help us to preprocess the frames
from skimage.color import rgb2gray # Help us to gray our frames
import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Activation, Dropout, MaxPooling2D, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import os
from PIL import Image
import cv2

Using TensorFlow backend.


In [2]:
# Build the Space Invaders emulator environment and report its spaces.
env = retro.make(game='SpaceInvaders-Atari2600')
n_actions = env.action_space.n
print("The size of our frame is: ", env.observation_space)
print("The action size is : ", n_actions)

# One-hot encoded actions: row i is the action vector with only entry i set,
# e.g. for 3 actions -> [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
possible_actions = np.identity(n_actions, dtype=int)

In [3]:
# Hyper-parameters and run settings read by DQNAgent and the training loop.
DISCOUNT = 0.99  # Discount factor applied to the max future Q-value in DQNAgent.train
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1_000  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Target network is synced when DQNAgent.target_update_counter (incremented on terminal-state train calls) reaches this value
MODEL_NAME = '2x256'  # Only referenced by the commented-out TensorBoard / model-save code
MIN_REWARD = -200  # For model save (the save code is currently commented out)
MEMORY_FRACTION = 0.20  # NOTE(review): not referenced anywhere in the visible notebook

# Environment settings
EPISODES = 20_000  # Number of episodes the training loop runs

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975  # epsilon is multiplied by this once per episode
MIN_EPSILON = 0.001  # Lower bound for epsilon after decay

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False  # When True, env.render() is called during every AGGREGATE_STATS_EVERY-th episode


In [5]:

# # Own Tensorboard class
# class ModifiedTensorBoard(TensorBoard):

#     # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
#     def __init__(self, **kwargs):
#         super().__init__(**kwargs)
#         self.step = 1
# #         self.writer = tf.summary.FileWriter(self.log_dir)
#         self.writer = tf.summary.create_file_writer(self.log_dir) 

#     # Overriding this method to stop creating default log writer
#     def set_model(self, model):
#         pass

#     # Overridden, saves logs with our step number
#     # (otherwise every .fit() will start writing from 0th step)
#     def on_epoch_end(self, epoch, logs=None):
#         self.update_stats(**logs)

#     # Overridden
#     # We train for one batch only, no need to save anything at epoch end
#     def on_batch_end(self, batch, logs=None):
#         pass

#     # Overridden, so won't close writer
#     def on_train_end(self, _):
#         pass

#     # Custom method for saving own metrics
#     # Creates writer, writes custom metrics and closes writer
#     def update_stats(self, **stats):
#         self._write_logs(stats, self.step)
        
#     def _write_logs(self, logs, index):
#         for name, value in logs.items():
#             if name in ['batch', 'size']:
#                 continue
#             with self.writer.as_default():
#                 tf.summary.scalar(name, value)
#                 self.writer.flush()
# #             self.writer.add_summary(summary, index)
# #         self.writer.flush()

In [3]:
class DQNAgent:
    """DQN agent with an online network, a target network and a replay memory.

    The online network (``self.model``) is fitted on minibatches sampled from
    the replay memory; the target network (``self.target_model``) supplies the
    bootstrap Q-values and is re-synced from the online network after
    ``UPDATE_TARGET_EVERY`` terminal-state training calls.

    Relies on the notebook-level names ``env``, ``DISCOUNT``,
    ``REPLAY_MEMORY_SIZE``, ``MIN_REPLAY_MEMORY_SIZE``, ``MINIBATCH_SIZE`` and
    ``UPDATE_TARGET_EVERY`` being defined before use.
    """

    def __init__(self):
        # DQN with a separate target network (this is not Double DQN: the
        # target network both selects and evaluates the next action).
        self.model = self.create_model()
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # Bounded buffer of (state, action, reward, new_state, done) transitions.
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Counts terminal-state training calls since the last target sync.
        self.target_update_counter = 0

        # self.tensorboard = ModifiedTensorBoard(log_dir = 'logs/{}-{}'.format(MODEL_NAME, int(time.time())))

    def create_model(self):
        """Build and compile the convolutional Q-network.

        Input is a (110, 84, 1) frame, matching the output of ``preprocess``;
        output is one linear Q-value per entry of ``env.action_space.n``.
        """
        model = Sequential()
        model.add(Conv2D(256, (3, 3), input_shape=(110, 84, 1)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Flatten())
        # NOTE(review): this Dense layer has no activation (the lines below are
        # disabled), so it is a purely linear projection before the output layer.
        model.add(Dense(64))
#         model.add(Activation('relu'))
#         model.add(Dropout(0.1))

        model.add(Dense(env.action_space.n, activation='linear'))

        # NOTE(review): 'accuracy' is not a meaningful metric for Q-value
        # regression; kept so the compiled metric names stay unchanged.
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])

        return model

    def update_replay_memory(self, transition):
        """Append one (state, action, reward, new_state, done) tuple to the replay memory."""
        self.replay_memory.append(transition)

    @staticmethod
    def _action_index(action):
        """Return the integer action index for an int or a one-hot action vector."""
        if np.ndim(action) == 0:
            return int(action)
        # One-hot vector: indexing a Q-row with it directly would be integer
        # array indexing and overwrite several entries, so reduce it to an index.
        return int(np.argmax(action))

    def train(self, terminal_state, step):
        """Run one training update on a random minibatch from the replay memory.

        Does nothing until the replay memory holds ``MIN_REPLAY_MEMORY_SIZE``
        transitions.

        Args:
            terminal_state: True when the step that triggered this call ended
                the episode; used to count towards the next target-network sync.
            step: Step number within the episode (currently unused).
        """
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Q-values of the sampled states from the online network.
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Q-values of the successor states from the target network.
        new_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_states)

        y = []
        for index, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                # No bootstrap term once the episode has ended.
                new_q = reward
            else:
                new_q = reward + DISCOUNT * np.max(future_qs_list[index])

            # Only the taken action's Q-value is moved towards the target; the
            # other outputs keep their predicted values (zero error).
            current_qs = current_qs_list[index]
            current_qs[self._action_index(action)] = new_q
            y.append(current_qs)

        # Fit exactly once per call on the complete minibatch.
        # self.model.fit(current_states, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks = [self.tensorboard] if terminal_state else None)
        self.model.fit(current_states, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False)

        if terminal_state:
            self.target_update_counter += 1

        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    def get_qs(self, state):
        """Return the online network's Q-values for a single preprocessed state."""
        # Add a leading batch axis of size 1, predict, and strip it again.
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]

    def preprocess(self, state):
        """Convert a raw RGB frame into a (110, 84, 1) grayscale array.

        Steps: grayscale, crop 8 rows from the top, 12 from the bottom,
        4 columns from the left and 12 from the right, resize to 110x84 and
        add a trailing channel axis.
        """
        # Grayscale frame
        gray = rgb2gray(state)
        # Crop the screen (remove the part below the player)
        # [Up: Down, Left: right]
        cropped_frame = gray[8:-12, 4:-12]
        # rgb2gray already rescales uint8 frames to [0, 1]; dividing by 255
        # again would squash the input to ~[0, 0.004]. Only rescale when the
        # values are still on a 0-255 scale (e.g. a float frame was passed in).
        if cropped_frame.max() > 1.0:
            cropped_frame = cropped_frame / 255.0
        # Resize
        preprocessed_frame = transform.resize(cropped_frame, [110, 84])
        # 110x84x1 frame
        return preprocessed_frame.reshape((110, 84, 1))
        
                

In [4]:
# Instantiate the agent; DQNAgent.__init__ builds the online and target
# networks via create_model, which reads the notebook-level `env`, so the
# environment cell must have been run first.
agent = DQNAgent()

In [None]:
# Main training loop: play EPISODES episodes with an epsilon-greedy policy,
# store every transition in the agent's replay memory and train after each step.

# uncomment and indent to use gpu
# with tf.device('/device:GPU:0'):

# Total reward of every finished episode. Defined here so the cell runs on a
# fresh kernel (the loop below appends to it).
ep_rewards = []

# tqdm already reports the episode counter, so no extra per-episode print.
for episode in tqdm(range(1, EPISODES+1), ascii = True, unit='episodes'):
    #updating tensorboard step
    # agent.tensorboard.step = episode

    episode_reward = 0
    step = 1

    current_state = agent.preprocess(env.reset())

    done = False

    while not done:

        # Epsilon-greedy: exploit the online network with probability
        # 1 - epsilon, otherwise pick a uniformly random action.
        if np.random.random() > epsilon:
            choice = int(np.argmax(agent.get_qs(current_state)))
        else:
            choice = random.randrange(len(possible_actions))

        # The environment is stepped with the one-hot action vector.
        action = possible_actions[choice]

        new_state, reward, done, info = env.step(action)
        new_state = agent.preprocess(new_state)

        episode_reward += reward
        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Store the integer action index (not the one-hot vector) so that the
        # Q-value row in DQNAgent.train is indexed at exactly one position.
        agent.update_replay_memory((current_state, choice, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    ep_rewards.append(episode_reward)

    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards)/len(recent_rewards)
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        # agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # To save models if reward greater than min reward
        # if min_reward >= MIN_REWARD:
        #     agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay exploration once per episode, never below MIN_EPSILON.
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)


  0%|          | 0/20000 [00:00<?, ?episodes/s]

1


In [None]:
# Verify that TensorFlow reports the first GPU; abort the cell otherwise.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')