In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import gym
import imageio
from collections import deque
from skimage.transform import resize

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.set_visible_devices([], 'GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4*1024)])
    except RuntimeError as e:
        print(e)
        
tf.compat.v1.enable_eager_execution()
tf.keras.backend.clear_session()
print('executing eagerly? ',tf.executing_eagerly())

executing eagerly?  True


In [2]:
def preprocessImage(img):
    img = np.mean(img,axis=2)
    img = img[::2,::2]
    return img

def make_DDQN():
    
    Input = keras.layers.Input(shape=(105,80,4))
    
    Conv1 = keras.layers.Conv2D(32,(8,8),strides=4,activation='relu',
                                kernel_initializer=keras.initializers.VarianceScaling(scale=2),
                               use_bias=False)(Input)
    
    Conv2 = keras.layers.Conv2D(64,(4,4),strides=2,activation='relu',
                                kernel_initializer=keras.initializers.VarianceScaling(scale=2),
                               use_bias=False)(Conv1)
    
    Conv3 = keras.layers.Conv2D(64,(3,3),strides=1,activation='relu',
                                kernel_initializer=keras.initializers.VarianceScaling(scale=2),
                               use_bias=False)(Conv2)
    
    Conv4 = keras.layers.Conv2D(1024,(2,2),strides=1,activation='relu',
                               kernel_initializer=keras.initializers.VarianceScaling(scale=2),
                               use_bias=False)(Conv3)
    
    
    split_state_values = keras.layers.Flatten()(Conv4)
    
    split_action_values = keras.layers.Flatten()(Conv4)
    
    state_values = keras.layers.Dense(1)(split_state_values)
    
    raw_advantages = keras.layers.Dense(6)(split_action_values)
    
    advantages = raw_advantages - keras.backend.max(raw_advantages, axis=1, keepdims=True)
    
    Q_values = state_values + advantages
    
    model = keras.Model(inputs=[Input],outputs=[Q_values])
    
    model.compile(optimizer=keras.optimizers.RMSprop(lr=0.00025,rho=0.95,epsilon=0.01),loss=keras.losses.Huber())
    
    return model

def get_Epsilon(current_frame,n_decay_frames,max_epsilon,min_epsilon,exploration_time=50000):
    current_frame = current_frame - exploration_time
    slope = (min_epsilon - max_epsilon) / n_decay_frames
    
    return max_epsilon - slope*current_frame


def epsilon_greedy_policy(state,model,eps,env):
    if np.random.uniform() < epsilon:
        action = env.action_space.sample()
    else:
        Q_vales = model.predict(state[np.newaxis].astype(np.float32))
        action = np.argmax(Q_vals[0])
    return action

def clip_rewards(reward):
    return np.sign(reward)

def sample_experiences(batch_size,replay_buffer):
    indices = np.random.randint(len(replay_buffer),size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)
    ]
    return states, actions, rewards, next_states, dones

def generate_gif(frame_number, frames_for_gif, reward, path):
    """
        Args:
            frame_number: Integer, determining the number of the current frame
            frames_for_gif: A sequence of (210, 160, 3) frames of an Atari game in RGB
            reward: Integer, Total reward of the episode that es ouputted as a gif
            path: String, path where gif is saved
    """
    for idx, frame_idx in enumerate(frames_for_gif): 
        frames_for_gif[idx] = resize(frame_idx, (420, 320, 3), 
                                     preserve_range=True, order=0).astype(np.uint8)
        
    imageio.mimsave(f'{path}{"ATARI_frame_{0}_reward_{1}.gif".format(frame_number, reward)}', 
                    frames_for_gif, duration=1/30)
    
def skip_3_frames(env,action,p_obs):
    obs1, r, done, info = env.step(action)
    obs2, r, done, info = env.step(action)
    obs3, r, done, info = env.step(action)

    p_obs1,p_obs2,p_obs3 = preprocessImage(obs1),preprocessImage(obs2),preprocessImage(obs3)
    #print(p_obs.shape,p_obs1.shape)
    p_obs_k = np.stack([p_obs,p_obs1,p_obs2,p_obs3],axis=2).astype(np.uint8)

    return p_obs_k

In [3]:
BATCH_SIZE = 32
GAMMA = 0.99
MAX_FRAMES = 10000000
EPS_DECAY_FRAMES = 1000000
FREE_EXPLORE_FRAMES = 50000
TARGET_UPDATE_FREQUENCY = 10000
ENV_NAME = "PongDeterministic-v4"
N_OUTPUTS = 6
REPLAY_LENGTH = 300000
MAX_EPISODE_FRAMES = 2000
EPS_MIN = 0.1

In [8]:
env = gym.make(ENV_NAME)
model = make_DDQN()
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())
current_frame = 0
best_score = -100000
exp_replay = deque(maxlen=int(REPLAY_LENGTH))
optimizer = optimizer=keras.optimizers.RMSprop(lr=0.00025,rho=0.95,epsilon=0.01)

In [None]:
for episode in range(50000):
    
    episode_reward = 0
    episode_start_frame = current_frame
    episode_frames = []
    episode_rewards = []
    if episode %100 == 0 and episode > 0:
        print('episode: ',episode)
        print('replay_buffer length ',len(exp_replay))
        print('current_frame: ',current_frame)
        print('last_reward: ',mean_episode_reward)
        print('last_action: ',action)
        
    state = env.reset()
    state = preprocessImage(state)
    state = skip_3_frames(env,0,state)
    
    episode_frames.append(state[:,:,-1])
    
    for step in range(1000,MAX_EPISODE_FRAMES):
        current_frame = current_frame + 1
        
        #Define Epsilon
        if current_frame < FREE_EXPLORE_FRAMES:
            epsilon=1
        elif current_frame < MAX_FRAMES + FREE_EXPLORE_FRAMES:
            epsilon = get_Epsilon(current_frame,EPS_DECAY_FRAMES,1,EPS_MIN)
        else:
            epsilon = max(get_Epsilon(current_frame,EPS_DECAY_FRAMES,EPS_MIN,0.01),0.01)
            
        
        #Get Action
        action = epsilon_greedy_policy(state,model,epsilon,env)
        next_state, reward, done, _ = env.step(action)
        next_state = preprocessImage(next_state)
        next_state = skip_3_frames(env,action,next_state)
        reward = clip_rewards(reward)
        
        
        #Store experience and frames for gif
        exp_replay.append((state,action,reward,next_state,done))
        episode_frames.append(next_state[:,:,-1])
        episode_rewards.append(reward)
        
        #Update state
        state = next_state
        
        #Training Step
        if current_frame > 100 and current_frame%4 == 0:
            experiences = sample_experiences(BATCH_SIZE,exp_replay)
            states, actions, rewards, next_states, dones = experiences
            
            next_Q_vals = model.predict(next_states.astype(np.float32))
            best_next_actions = np.argmax(next_Q_vals,axis=1)
            
            next_mask = tf.one_hot(best_next_actions,6)
            next_best_Q_values = keras.backend.sum(target.predict(next_states.astype(np.float32))*next_mask,axis=1)
            
            target_Q_values = (rewards + (1 - dones)*GAMMA*next_best_Q_values)
            
            #print('train_on_batch_start')
            model.train_on_batch(states,target_Q_values)
            #print('train_on_batch_end')
        if current_frame%TARGET_UPDATE_FREQUENCY == 0:
            target.set_weights(model.get_weights())

        if done:
            break
    mean_episode_reward = np.mean(episode_rewards)       
    if mean_episode_reward > best_score:
        best_weights = model.get_weights()
        model.save_weights('DDQN_Hail_Mary.hdf5')
        best_score = episode_reward
        generate_gif(current_frame,episode_frames,episode_reward,'Best_DDQN')
        
    if current_frame > MAX_FRAMES:
        break
    

episode:  100
replay_buffer length  21407
current_frame:  21407
last_reward:  -0.02336448598130841
last_action:  1
episode:  200
replay_buffer length  42932
current_frame:  42932
last_reward:  -0.017921146953405017
last_action:  3
episode:  300
replay_buffer length  64383
current_frame:  64383
last_reward:  -0.030303030303030304
last_action:  3
episode:  400
replay_buffer length  86118
current_frame:  86118
last_reward:  -0.024691358024691357
last_action:  4
episode:  500
replay_buffer length  107520
current_frame:  107520
last_reward:  -0.02912621359223301
last_action:  3
episode:  600
replay_buffer length  128803
current_frame:  128803
last_reward:  -0.02040816326530612
last_action:  1
episode:  700
replay_buffer length  150090
current_frame:  150090
last_reward:  -0.031413612565445025
last_action:  5
episode:  800
replay_buffer length  171730
current_frame:  171730
last_reward:  -0.025510204081632654
last_action:  3
episode:  900
replay_buffer length  193358
current_frame:  193358
l

In [None]:
episode_reward

In [None]:
episode