In [239]:
import random
import math
from collections import deque

import numpy as np
import pandas as pd
import gymnasium as gym
import matplotlib.pyplot as plt 

import torch
from torch import nn
import torch.nn.functional as F

import utils

from testing_envs import *



In [240]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [241]:
# Make gym environment
env = gym.make("Stochastic-4x4-FrozenLake-v0", render_mode="ansi")

# Print action spaces and observation spaces
print("The numbers of action space are: ", env.action_space)
print("The numbers of observation space are: ", env.observation_space)



The numbers of action space are:  Discrete(4)
The numbers of observation space are:  Discrete(16)


In [242]:
# Required Hyperparameter
BUFFER_SIZE = 50
COEFFICIENT_CORRELATION_THRESHOLD = 0.5
MAX_DEPTH_HIERACHY = env.observation_space.n/2

What if state space is too large can never get reward of 1???


In [243]:
# Init buffer and record of one_episode
Buffer = []
one_episode = []
# Reset environment
ob, _ = env.reset()
# Collecting data through implementation sampling
while len(Buffer) < BUFFER_SIZE:
    action = torch.randint(0, env.action_space.n, (1,)).item()
    next_ob, rew, done, _, _ = env.step(action)
    one_episode.append((ob, action, next_ob))
    if done:
        if rew == 1:    # If complete successfully then last state will be the correct goal
            Buffer.append(one_episode)
            ob, _ = env.reset()
            one_episode = []
        else:           # If not complete successfully then reset one_episode and does not record that episode
            ob, _ = env.reset()
            one_episode = []
    else:
        ob = next_ob
print(f"This is the final buffer {Buffer}")



This is the final buffer [[(0, 1, 4), (4, 2, 8), (8, 3, 9), (9, 1, 13), (13, 1, 13), (13, 1, 14), (14, 0, 14), (14, 0, 14), (14, 2, 15)], [(0, 2, 4), (4, 1, 4), (4, 0, 8), (8, 1, 9), (9, 2, 10), (10, 1, 14), (14, 3, 15)], [(0, 1, 1), (1, 0, 1), (1, 1, 0), (0, 3, 0), (0, 1, 1), (1, 1, 2), (2, 3, 1), (1, 2, 2), (2, 0, 1), (1, 2, 2), (2, 2, 2), (2, 0, 6), (6, 0, 10), (10, 0, 14), (14, 1, 15)], [(0, 2, 1), (1, 1, 0), (0, 1, 1), (1, 3, 2), (2, 3, 2), (2, 1, 3), (3, 1, 2), (2, 3, 1), (1, 0, 0), (0, 1, 4), (4, 1, 8), (8, 3, 4), (4, 2, 0), (0, 2, 0), (0, 2, 0), (0, 2, 0), (0, 0, 4), (4, 0, 8), (8, 3, 8), (8, 2, 9), (9, 1, 10), (10, 1, 14), (14, 1, 13), (13, 2, 14), (14, 2, 10), (10, 0, 14), (14, 1, 14), (14, 2, 15)], [(0, 1, 1), (1, 3, 2), (2, 1, 3), (3, 0, 3), (3, 0, 2), (2, 0, 2), (2, 3, 2), (2, 2, 3), (3, 3, 3), (3, 3, 2), (2, 1, 6), (6, 1, 10), (10, 0, 14), (14, 3, 15)], [(0, 0, 0), (0, 2, 0), (0, 2, 0), (0, 1, 0), (0, 3, 0), (0, 0, 0), (0, 0, 0), (0, 0, 4), (4, 2, 8), (8, 3, 4), (4, 3, 4)

This step should be updated with calculation of attention instead of correlation coefficient

In [244]:
def calculate_coefficient_correlation(state, goal):
    correlation_coefficient = np.corrcoef(state, goal)[0, 1]
    return correlation_coefficient

In [245]:
updated_Buffer = []                 # New Buffer list
updated_one_episode = []            # List to record new episode without irrelevant step to goal
for episode in Buffer:
    goal_step = episode[-1]
    for step in episode[:-1]:
        coefficient_correlation = calculate_coefficient_correlation(step, goal_step)
        if coefficient_correlation > COEFFICIENT_CORRELATION_THRESHOLD:
            updated_one_episode.append(step)
    updated_one_episode.append(goal_step) 
    updated_Buffer.append(updated_one_episode)
    updated_one_episode = []          


print(f"This is the updated buffer {updated_Buffer}")


This is the updated buffer [[(4, 2, 8), (8, 3, 9), (9, 1, 13), (13, 1, 13), (13, 1, 14), (14, 0, 14), (14, 0, 14), (14, 2, 15)], [(4, 1, 4), (4, 0, 8), (8, 1, 9), (9, 2, 10), (10, 1, 14), (14, 3, 15)], [(1, 0, 1), (1, 1, 2), (2, 0, 1), (2, 0, 6), (6, 0, 10), (10, 0, 14), (14, 1, 15)], [(2, 1, 3), (3, 1, 2), (4, 1, 8), (8, 3, 4), (0, 0, 4), (4, 0, 8), (8, 3, 8), (8, 2, 9), (9, 1, 10), (10, 1, 14), (14, 1, 13), (13, 2, 14), (14, 2, 10), (10, 0, 14), (14, 1, 14), (14, 2, 15)], [(2, 1, 3), (3, 0, 3), (3, 0, 2), (2, 0, 2), (2, 2, 3), (2, 1, 6), (6, 1, 10), (10, 0, 14), (14, 3, 15)], [(0, 0, 4), (4, 2, 8), (8, 3, 4), (4, 3, 4), (4, 3, 4), (4, 0, 8), (8, 3, 9), (9, 3, 10), (10, 2, 14), (14, 0, 13), (13, 1, 14), (14, 0, 10), (10, 0, 9), (9, 0, 8), (8, 3, 9), (9, 1, 10), (10, 0, 9), (9, 0, 13), (13, 1, 14), (14, 2, 15)], [(1, 0, 1), (2, 1, 6), (2, 0, 6), (6, 1, 10), (10, 2, 14), (14, 2, 15)], [(2, 0, 6), (6, 2, 10), (10, 1, 14), (14, 1, 14), (14, 3, 15)], [(4, 3, 4), (4, 2, 8), (8, 3, 8), (8, 3

This step should be updated with calculation of causal discovery with value between two steps instead of coefficient correlation


In [246]:
sum_coefficient_correlations = []
for episode in updated_Buffer:
    coefficient_correlations = []
    for i in range(len(episode) - 1):
        coefficient_correlation = calculate_coefficient_correlation(episode[i], episode[i + 1])
        coefficient_correlations.append(coefficient_correlation)
    sum_coefficient_correlations.append(sum(coefficient_correlations))

# Find the index of the maximum coefficient correlations episode - intuition is that the maximum coefficient will provide the most information
max_index = sum_coefficient_correlations.index(max(sum_coefficient_correlations))

# Find the episode with the highest index 
highest_relevant_information_episode = updated_Buffer[max_index]

# Create subgoal_hierarchy
subgoal_hierarchy = []
full_hierachy = []
for item in highest_relevant_information_episode:
    full_hierachy.append(item[2])
    if item[2] not in subgoal_hierarchy:
        subgoal_hierarchy.append(item[2])
    else:
        subgoal_hierarchy.remove(item[2])
        subgoal_hierarchy.append(item[2])
print(full_hierachy)
print(subgoal_hierarchy)

[4, 8, 4, 4, 4, 8, 9, 10, 14, 13, 14, 10, 9, 8, 9, 10, 9, 13, 14, 15]
[4, 8, 10, 9, 13, 14, 15]


Train hindsight experience replay based on previous subgoal?

In [247]:
GAMMA=0.99
BATCH_SIZE = 32
BUFFER_SIZE = 1000
MIN_REPLAY_SIZE = 100
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 500
TARGET_UPDATE_FREQ = 1000

	
class Network(nn.Module):
    def __init__(self, env):
        super().__init__()
		# Retrieve the number of observations
        num_observations = 1
        self.n_actions = env.action_space.n
        self.layer1 = nn.Linear(num_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, self.n_actions)
	
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x
    
    def act(self, state, epsilon):
        if random.random() > epsilon:
            state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(device) # Adding another dimension to the torch array with unsqueeze
            Q = self.forward(state).cpu() 
            max_q_index = torch.argmax(Q)
            action = max_q_index.detach().item()
        else:
            action = random.randrange(self.n_actions)
        return action
    
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
            
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
    
    def __len__(self):
        return len(self.buffer)




The updated reward must be dependent on when the state was added to the hierarchy. For example first loop = 0.8, second loop 0.5.
Should be dependent on the position in the hierarchy as well.

In [254]:
class UpdatedDeepQRLAlgorithmTester:
    
	def __init__(self, env, train_seed = None, training_time = None):
		"""
		Parameters
		----------
			env: Testing env object
		Self
		----------
			self.replay_memory_D: A dataset to store experiences at each time step, e_t = (s_t, a_t, r_t, s_t+1), D_t = {e_1, ..., e_t}
			self.episode_reward: Current episode reward.
			self.online_net: Main training network.
			self.target_net: To generate y_i 
			self.optimizer: Optimization method
		"""
		if torch.cuda.is_available() and train_seed is None:
			self.num_episodes = 500
		elif torch.cuda.is_available() and train_seed is not None:
			self.num_episodes = len(train_seed)
		else:
			self.num_episodes = 50
		self.epsilon_by_frame = lambda frame_idx: EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp(-1. * frame_idx / EPSILON_DECAY)

		self.episode_duration_list = []
	
		for training in range(training_time):	

			self.replay_memory_D = ReplayBuffer(1000)
			self.episode_reward = 0
			self.episode_duration = []

			# Create the two networks
			self.online_net = Network(env).to(device)
			self.target_net = Network(env).to(device)
			self.target_net.load_state_dict(self.online_net.state_dict())	# Load saved state of online network
			
			self.optimizer = torch.optim.Adam(self.online_net.parameters(), lr=1e-4)

			self.main_iteration(env, training)
			self.episode_duration_list.append(self.episode_duration)
			self.episode_duration = []
		
			torch.save(self.online_net, f"Updated_DQN_model_{training}.pt")

		utils.show_result(change_in_training=self.episode_duration_list, algo_name = "updated_dqn")

	
	def __sample_from_replay_memory__(self):
		state, action, reward, done, next_state = self.replay_memory_D.sample(batch_size=BATCH_SIZE)

		state_t      = torch.FloatTensor(np.float32(state)).to(device)
		next_state_t = torch.FloatTensor(np.float32(next_state)).to(device)
		action_t     = torch.LongTensor(action).to(device)
		reward_t     = torch.FloatTensor(reward).to(device)
		done_t       = torch.FloatTensor(done).to(device)

		return state_t, action_t, reward_t, done_t, next_state_t


	def main_iteration(self, env, training):
		
		seed = 0
		state, _ = env.reset(seed = seed)
		for step in range(1, 10000):
			print(f"Training number {training}, Step number {step}")
			epsilon = self.epsilon_by_frame(step)
			# With prob epsilon select a action
			action = self.online_net.act(state, epsilon)
			
			# Execute action and observe result
			new_states, rew, done, _, _ = env.step(action)
			
			# Store transition in replay_memory
			if new_states in subgoal_hierarchy[:-1]:
				fake_rew = 0.15 * subgoal_hierarchy.index(new_states)	# the higher in the hierarchy the more chance it will reach actual goal
				self.replay_memory_D.push(state, action, fake_rew, done, new_states)
			else:
				self.replay_memory_D.push(state, action, rew, done, new_states)

			state = new_states
			self.episode_reward += rew
			self.episode_duration.append(self.episode_reward)		
			if done:
				# self.episode_duration.append(self.episode_reward+1)
				print(self.episode_reward)
				self.episode_reward = 0	
			if len(self.replay_memory_D) > BATCH_SIZE:

				# Sample random minibatch equals to BATCH_SIZE
				states_t, actions_t, rewards_t, dones_t, new_states_t = self.__sample_from_replay_memory__()
				print(rewards_t)
				loss = 0
				for i in range(len(states_t)):
					state_t = torch.as_tensor(states_t[i], dtype=torch.float32).unsqueeze(0).to(device) # Adding another dimension to the torch array with unsqueeze
					next_state_t = torch.as_tensor(new_states_t[i], dtype=torch.float32).unsqueeze(0).to(device) # Adding another dimension to the torch array with unsqueeze
	
					q_values      = self.online_net.forward(state_t)
					next_q_values = self.target_net.forward(next_state_t)

					q_value = q_values[actions_t[i].detach().item()]
					next_q_value     = next_q_values.max()
					
					expected_q_value = rewards_t[i] + GAMMA * next_q_value * (1 - dones_t[i])
					diff = q_value - expected_q_value
					# v_loss = nn.functional.smooth_l1_loss(q_value, expected_q_value) # Similar to MSE Loss
					# print(loss)
					# print(diff)
					loss += diff
				loss = loss.pow(2)/len(states_t)
				# Update target network
				if step % TARGET_UPDATE_FREQ == 0:
					self.target_net.load_state_dict(self.online_net.state_dict())

			if done:
				seed += 1
				state, _ = env.reset(seed = seed)

		


In [255]:
train_seed = np.arange(1, 20, 1).tolist()
training_time = 1
print("\n" + "-" * 25 + "\nBeginning UpdatedDQN\n" + "-" * 25)
updated_deep_q_rl_algorithm_tester = UpdatedDeepQRLAlgorithmTester(env, train_seed, training_time)


-------------------------
Beginning UpdatedDQN
-------------------------
Training number 0, Step number 1
Training number 0, Step number 2
Training number 0, Step number 3
Training number 0, Step number 4
Training number 0, Step number 5
0.0
Training number 0, Step number 6
Training number 0, Step number 7
Training number 0, Step number 8
Training number 0, Step number 9
Training number 0, Step number 10
Training number 0, Step number 11
0.0
Training number 0, Step number 12
Training number 0, Step number 13
Training number 0, Step number 14
Training number 0, Step number 15
Training number 0, Step number 16
Training number 0, Step number 17
Training number 0, Step number 18
Training number 0, Step number 19
Training number 0, Step number 20
Training number 0, Step number 21
Training number 0, Step number 22
Training number 0, Step number 23
Training number 0, Step number 24
Training number 0, Step number 25
Training number 0, Step number 26
0.0
Training number 0, Step number 27
Train

In [None]:
class DeepQRLAlgorithmTester:
    
	def __init__(self, env, train_seed = None, training_time = None):
		"""
		Parameters
		----------
			env: Testing env object
		Self
		----------
			self.replay_memory_D: A dataset to store experiences at each time step, e_t = (s_t, a_t, r_t, s_t+1), D_t = {e_1, ..., e_t}
			self.episode_reward: Current episode reward.
			self.online_net: Main training network.
			self.target_net: To generate y_i 
			self.optimizer: Optimization method
		"""
		if torch.cuda.is_available() and train_seed is None:
			self.num_episodes = 500
		elif torch.cuda.is_available() and train_seed is not None:
			self.num_episodes = len(train_seed)
		else:
			self.num_episodes = 50
		self.epsilon_by_frame = lambda frame_idx: EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp(-1. * frame_idx / EPSILON_DECAY)

		self.episode_duration_list = []
	
		for training in range(training_time):	

			self.replay_memory_D = ReplayBuffer(1000)
			self.episode_reward = 0
			self.episode_duration = []

			# Create the two networks
			self.online_net = Network(env).to(device)
			self.target_net = Network(env).to(device)
			self.target_net.load_state_dict(self.online_net.state_dict())	# Load saved state of online network
			
			self.optimizer = torch.optim.Adam(self.online_net.parameters(), lr=1e-4)

			self.main_iteration(env, training)
			self.episode_duration_list.append(self.episode_duration)
			self.episode_duration = []
		
			torch.save(self.online_net, f"DQN_model_{training}.pt")

		utils.show_result(change_in_training=self.episode_duration_list, algo_name = "dqn")

	
	def __sample_from_replay_memory__(self):
		state, action, reward, done, next_state = self.replay_memory_D.sample(batch_size=BATCH_SIZE)

		state_t      = torch.FloatTensor(np.float32(state)).to(device)
		next_state_t = torch.FloatTensor(np.float32(next_state)).to(device)
		action_t     = torch.LongTensor(action).to(device)
		reward_t     = torch.FloatTensor(reward).to(device)
		done_t       = torch.FloatTensor(done).to(device)

		return state_t, action_t, reward_t, done_t, next_state_t


	def main_iteration(self, env, training):
		
		seed = 0
		state, _ = env.reset(seed = seed)
		for step in range(1, 10000):
			print(f"Training number {training}, Step number {step}")
			epsilon = self.epsilon_by_frame(step)
			# With prob epsilon select a action
			action = self.online_net.act(state, epsilon)
			
			# Execute action and observe result
			new_states, rew, done, _, _ = env.step(action)
			
			# Store transition in replay_memory
			# if not isinstance(state, int):
			# 	self.replay_memory_D.push(state.cpu(), action, rew, done, new_states)
			# else:
			self.replay_memory_D.push(state, action, rew, done, new_states)
			
			state = new_states
			self.episode_reward += rew
			self.episode_duration.append(self.episode_reward)		
			if done:
				# print(self.episode_reward+1)
				# self.episode_duration.append(self.episode_reward+1)
				self.episode_reward = 0	
								
			if len(self.replay_memory_D) > BATCH_SIZE:

				# Sample random minibatch equals to BATCH_SIZE
				states_t, actions_t, rewards_t, dones_t, new_states_t = self.__sample_from_replay_memory__()

				loss = 0
				for i in range(len(states_t)):
					state_t = torch.as_tensor(states_t[i], dtype=torch.float32).unsqueeze(0).to(device) # Adding another dimension to the torch array with unsqueeze
					next_state_t = torch.as_tensor(new_states_t[i], dtype=torch.float32).unsqueeze(0).to(device) # Adding another dimension to the torch array with unsqueeze
	
					q_values      = self.online_net.forward(state_t)
					next_q_values = self.target_net.forward(next_state_t)


					q_value = q_values[actions_t[i].detach().item()]
					# q_value          = q_values.gather(1, actions_t.unsqueeze(1)).squeeze(1)
					next_q_value     = next_q_values.max()
					
					expected_q_value = rewards_t[i] + GAMMA * next_q_value * (1 - dones_t[i])
					# v_loss = nn.functional.smooth_l1_loss(q_value, expected_q_value) # Similar to MSE Loss
					diff = q_value - expected_q_value
					# v_loss = nn.functional.smooth_l1_loss(q_value, expected_q_value) # Similar to MSE Loss
					loss += diff
				loss = loss.pow(2).mean()
				# Gradient Descent
				self.optimizer.zero_grad()
				loss.backward()
				self.optimizer.step()

				# Update target network
				if step % TARGET_UPDATE_FREQ == 0:
					self.target_net.load_state_dict(self.online_net.state_dict())

			if done:
				seed += 1
				state, _ = env.reset(seed = seed)

In [None]:
train_seed = np.arange(1, 20, 1).tolist()
training_time = 1
print("\n" + "-" * 25 + "\nBeginning DQN\n" + "-" * 25)
deep_q_rl_algorithm_tester = DeepQRLAlgorithmTester(env, train_seed, training_time)


-------------------------
Beginning DQN
-------------------------
Training number 0, Step number 1
Training number 0, Step number 2
Training number 0, Step number 3
Training number 0, Step number 4
Training number 0, Step number 5
Training number 0, Step number 6
Training number 0, Step number 7
Training number 0, Step number 8
Training number 0, Step number 9
Training number 0, Step number 10
Training number 0, Step number 11
Training number 0, Step number 12
Training number 0, Step number 13
Training number 0, Step number 14
Training number 0, Step number 15
Training number 0, Step number 16
Training number 0, Step number 17
Training number 0, Step number 18
Training number 0, Step number 19
Training number 0, Step number 20
Training number 0, Step number 21
Training number 0, Step number 22
Training number 0, Step number 23
Training number 0, Step number 24
Training number 0, Step number 25
Training number 0, Step number 26
Training number 0, Step number 27
Training number 0, Step 

  v_loss = nn.functional.smooth_l1_loss(q_value, expected_q_value) # Similar to MSE Loss


Training number 0, Step number 37
Training number 0, Step number 38
Training number 0, Step number 39
Training number 0, Step number 40
Training number 0, Step number 41
Training number 0, Step number 42
Training number 0, Step number 43
Training number 0, Step number 44
Training number 0, Step number 45
Training number 0, Step number 46
Training number 0, Step number 47
Training number 0, Step number 48
Training number 0, Step number 49
Training number 0, Step number 50
Training number 0, Step number 51
Training number 0, Step number 52
Training number 0, Step number 53
Training number 0, Step number 54
Training number 0, Step number 55
Training number 0, Step number 56
Training number 0, Step number 57
Training number 0, Step number 58
Training number 0, Step number 59
Training number 0, Step number 60
Training number 0, Step number 61
Training number 0, Step number 62
Training number 0, Step number 63
Training number 0, Step number 64
Training number 0, Step number 65
Training numbe

KeyboardInterrupt: 