In [None]:
import gym 
import random
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber

In [None]:
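# One-time setup, intentionally left commented out: the three lines below write an
# empty training-history CSV (header row only) under the same file name that the
# training cell loads with pd.read_csv.
# WARNING: running them overwrites that file, discarding any history already recorded.
# NOTE: the training cell writes two additional columns ('weights', 'target_update_frequency').
#cols = ['episode','steps','reward','epsilon','gamma','alpha','learning_rate']
#df_train_DQN_agent_experience_replay_target_network_SpaceInvaders_ram = pd.DataFrame(data=None, columns=cols)
#df_train_DQN_agent_experience_replay_target_network_SpaceInvaders_ram.to_csv('df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram.csv',index=False)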

In [None]:
# Environment set-up. `input_shape` and `actions` are module-level globals that the
# agent class reads when it builds its network and samples random actions.
env_name = "ALE/SpaceInvaders-ram-v5"
env = gym.make(env_name)
input_shape, actions = env.observation_space.shape, env.action_space.n

In [None]:
class DQN_agent_experience_replay_target_network():
    """DQN agent with an experience-replay buffer and a separate target network.

    ``network1`` is the online Q-network: it selects actions and is the one that is
    fitted. ``network2`` is the target network: a distinct model object whose
    weights are copied from ``network1`` every ``target_update_frequency`` steps and
    which supplies the bootstrap values for the TD targets.

    The class relies on the module-level globals ``input_shape`` (observation shape)
    and ``actions`` (number of discrete actions).
    """

    def __init__(self):
        """Set the hyper-parameters and load (or build) the online and target networks."""
        self.ready = False # tells us if model is ready to train
        self.epsilon_max = 1 #can change these values depending on the game and the number of states, and scarcity/abundance of reward
        self.epsilon_min = 0.065
        self.gamma = 0.9   # discount factor applied to the bootstrapped value
        self.alpha = 0.2   # blend factor between the current Q estimate and the TD target
        self.mini_batch_size = 32*2
        self.counter = 0 # number of actions chosen so far; drives the target-network sync
        self.target_update_frequency = 50  # copy network1 -> network2 every this many steps
        self.replay_memory = [] # as this re-initialises every time the agent is initialised it means he can only learn from past episodes if we run a number of episodes consecutively in one cell execution
        self.lr = 0.001
        self.optimizer = Adam(learning_rate=self.lr)
        self.memory_limit = 20000

        try:
            self.network1 = tf.keras.models.load_model('model_DQN_agent_experience_replay_target_network_SpaceInvaders_ram',compile=True)
        except (OSError, ValueError):
            # Only "no usable checkpoint" errors fall back to a fresh model; anything
            # else (including KeyboardInterrupt) propagates instead of being swallowed.
            print('No previous history of training')
            self.network1 = self._build_network()

        # The target network must be a *separate* model. Assigning network1 directly
        # would alias the same object, so the "target" would move with every fit call.
        self.network2 = tf.keras.models.clone_model(self.network1)
        self._sync_target_network()

    def _build_network(self):
        """Build and compile a fresh Q-network mapping an observation to one value per action."""
        model = Sequential()
        # Declaring the input first builds the model immediately, so its weights
        # exist and can be copied into the target network right away.
        model.add(tf.keras.Input(shape=input_shape))
        model.add(BatchNormalization())
        model.add(Dense(512, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(64, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(actions, activation='linear'))
        # No 'accuracy' metric: the outputs are regressed Q-values, not class labels.
        model.compile(optimizer=self.optimizer, loss=Huber())
        return model

    def _sync_target_network(self):
        """Copy the online network's current weights into the target network."""
        self.network2.set_weights(self.network1.get_weights())

    def choose_action(self,state,epsilon_upper,decay_step):
        """Pick an action epsilon-greedily using the online network.

        Args:
            state: a single observation (without a batch dimension).
            epsilon_upper: exploration probability at the start of the episode.
            decay_step: step index within the episode; drives the in-episode decay.

        Returns:
            The index of the chosen action as a Python int.
        """
        self.counter += 1
        # Within an episode epsilon decays exponentially from epsilon_upper towards
        # (but never below) epsilon_upper - 0.02.
        epsilon = epsilon_upper - (0.02 *(1- np.exp(-decay_step/100)))
        if (np.random.uniform() < epsilon):
            action_index = np.random.randint(0,actions)
        else:
            batch_of_one = np.expand_dims(state,axis=0).astype('float32')
            action_index = np.argmax(self.network1.predict(batch_of_one, verbose=0))
        return int(action_index)

    def memorise(self,state,action,reward,new_state,done):
        """Store one transition, discarding the oldest one once ``memory_limit`` is exceeded."""
        self.replay_memory.append((state,action,reward,new_state,done))
        if len(self.replay_memory)>self.memory_limit:
            del self.replay_memory[0]

    def compute_TD_target_and_learn(self):
        """Sample a mini-batch, build TD targets with the target network and fit the online network.

        Does nothing (other than printing a notice) until the replay memory holds at
        least ``mini_batch_size`` transitions. The sampled arrays, targets and
        regression outputs are kept on ``self`` for inspection.
        """
        # Gate on the buffer size, which is what random.sample actually requires.
        if len(self.replay_memory) < self.mini_batch_size:
            print('Not ready to train yet')
            return

        mini_batch = random.sample(self.replay_memory,self.mini_batch_size)
        states, actions_taken, rewards, new_states, dones = zip(*mini_batch)

        self.states = np.asarray(states).astype('float32')
        self.actions = np.asarray(actions_taken).astype(int)
        self.rewards = np.asarray(rewards, dtype='float64')
        self.new_states = np.asarray(new_states).astype('float32')
        self.dones = np.asarray(dones).astype(int)

        # One bootstrap value per transition: max over the action axis, not over the whole batch.
        self.preds = np.max(self.network2.predict(self.new_states, verbose=0), axis=1)
        # Terminal transitions (done == 1) must not bootstrap from the next state.
        self.targets = self.rewards + (1 - self.dones) * (self.gamma * self.preds)

        # Regress only the taken action's Q-value; the other actions keep their current prediction.
        self.outputs = np.array(self.network1.predict(self.states, verbose=0), dtype='float32')
        rows = np.arange(len(self.actions))
        self.outputs[rows,self.actions] = (1-self.alpha)*self.outputs[rows,self.actions] + (self.alpha*self.targets)
        self.network1.fit(self.states,self.outputs,batch_size = self.mini_batch_size,verbose=0)

        if self.counter % self.target_update_frequency == 0:
            self._sync_target_network()

In [None]:
# Train the agent for `episodes` episodes, appending one stats row per episode to the
# history CSV and saving the online network after every episode.
history_csv = 'df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram.csv'
history_columns = ['episode','steps','reward','epsilon','gamma','alpha','learning_rate','weights','target_update_frequency']

try:
    df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram = pd.read_csv(history_csv)
    history = df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram.values.tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run: there is no history file yet, so start from an empty history.
    history = []

agent = DQN_agent_experience_replay_target_network()
episodes = 1
for episode in range(1, episodes+1):
    # Continue numbering from the last recorded episode; values read back from the
    # CSV come out as floats, hence the int() cast.
    episode_number = int(history[-1][0]) + 1 if len(history) > 0 else 1

    reset_result = env.reset()
    # gym >= 0.26 returns (observation, info); older versions return the observation alone.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    score = 0
    decay_step = 0
    # Episode-level exploration ceiling: drops by 0.02 every 10 episodes and is floored at
    # epsilon_min. For the first episodes the expression evaluates slightly above
    # epsilon_max, which just means "always explore".
    epsilon_upper = max(agent.epsilon_max - (((episode_number/10)-1)*0.02),agent.epsilon_min)
    while not done:
        action = agent.choose_action(state,epsilon_upper,decay_step)
        decay_step += 1
        step_result = env.step(action)
        if len(step_result) == 5:
            # gym >= 0.26 step API: (obs, reward, terminated, truncated, info)
            n_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            n_state, reward, done, info = step_result
        score+=reward
        agent.memorise(state,action,reward,n_state,done)
        agent.compute_TD_target_and_learn()
        state = n_state
    print('Episode:{} Steps:{} Score:{}'.format(episode_number,decay_step, score))

    # Same in-episode decay formula the agent applies, evaluated at the final step count.
    final_epsilon = epsilon_upper - (0.02 *(1- np.exp(-decay_step/100)))
    # Record the agent's actual target-sync period when it exposes one (falls back to 50).
    target_update_frequency = getattr(agent, 'target_update_frequency', 50)
    history.append((episode_number,decay_step,score,final_epsilon,agent.gamma,agent.alpha,agent.lr,1,target_update_frequency)) # to keep track of overall stats

    agent.network1.save('model_DQN_agent_experience_replay_target_network_SpaceInvaders_ram')
    df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram = pd.DataFrame(data=history, columns=history_columns)
    df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram.to_csv(history_csv,index=False)
env.close()

In [None]:
# Max raw score in single episode
# Series.max() is vectorised and skips NaN entries, unlike the builtin max(), whose
# result depends on element order when NaN is present.

df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram['reward'].max()

In [None]:
# Moving average of the per-episode reward
MOVING_AVERAGE_WINDOW = 100   # episodes averaged per plotted point
MAX_EPISODES_PLOTTED = 850    # only the first N recorded episodes are plotted

episode_rewards = df_DQN_agent_experience_replay_target_network_SpaceInvaders_ram['reward'].iloc[:MAX_EPISODES_PLOTTED].to_numpy(dtype=float)

# With fewer episodes than the window, np.convolve(mode='valid') silently swaps its
# operands and returns a meaningless curve, so shrink the window to the data length.
window = min(MOVING_AVERAGE_WINDOW, len(episode_rewards))
if window > 0:
    moving_average = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
else:
    moving_average = np.array([])  # no episodes recorded yet: draw an empty plot

plt.plot(range(len(moving_average)), moving_average)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('DQN with Target Network and Replay Experience Episodes vs Moving Average Reward')