In [3]:
import os
import gym
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



In [19]:
class BitFlipEnv:
    """
    A simple bit flip environment
    Bit of the current state flips as an action
    Reward of -1 for each step
    """
    def __init__(self, n_bits):
        self.n_bits = n_bits
        self.state = np.random.randint(2, size=self.n_bits)
        self.goal = np.random.randint(2, size=self.n_bits)

    def reset_env(self):
        """
        Resets the environment with new state and goal
        """
        self.state = np.random.randint(2, size=self.n_bits)
        self.goal = np.random.randint(2, size=self.n_bits)

    def take_step(self, action):
        """
        Returns updated_state, reward, and done for the step taken
        """
        self.state[action] = self.state[action] ^ 1 # This change the value of current state according to bitwise XOR
        # THis just means we flip that particular state
        done = False
        if np.array_equal(self.state, self.goal):
            done = True
            reward = 0
        else:
            reward = -1
        return np.copy(self.state), reward, done

    def print_state(self):
        """
        Prints the current state
        """
        print('Current State:', self.state)

In [5]:
class HindsightExperienceReplayMemory(object):
    """
    Hindsight Experience replay - Takes size, input dimensions and number of actions as parameters
    """
    def __init__(self, memory_size, input_dims, n_actions):
        super(HindsightExperienceReplayMemory, self).__init__()
        self.max_mem_size = memory_size
        self.counter = 0

        # initializes the state, next_state, action, reward, and terminal experience memory
        self.state_memory = np.zeros((memory_size, input_dims), dtype=np.float32)
        self.next_state_memory = np.zeros((memory_size, input_dims), dtype=np.float32)
        self.reward_memory = np.zeros(memory_size, dtype=np.float32)
        self.action_memory = np.zeros(memory_size, dtype=np.int64)
        self.terminal_memory = np.zeros(memory_size, dtype=bool)
        self.goal_memory = np.zeros((memory_size, input_dims), dtype=np.float32)

    def add_experience(self, state, action, reward, next_state, done, goal):
        """
        Adds new experience to the memory
        """
        curr_index = self.counter % self.max_mem_size

        self.state_memory[curr_index] = state
        self.action_memory[curr_index] = action
        self.reward_memory[curr_index] = reward
        self.next_state_memory[curr_index] = next_state
        self.terminal_memory[curr_index] = done
        self.goal_memory[curr_index] = goal

        self.counter += 1

    def get_random_experience(self, batch_size):
        """
        Returns any random memory from the experience replay memory
        """
        rand_index = np.random.choice(min(self.counter, self.max_mem_size), batch_size, replace=False)

        rand_state = self.state_memory[rand_index]
        rand_action = self.action_memory[rand_index]
        rand_reward = self.reward_memory[rand_index]
        rand_next_state = self.next_state_memory[rand_index]
        rand_done = self.terminal_memory[rand_index]
        rand_goal = self.goal_memory[rand_index]

        return rand_state, rand_action, rand_reward, rand_next_state, rand_done, rand_goal

In [6]:
class DeepQNetwork(nn.Module):
    """
    Defines a deep Q network with a single hidden layer
    """
    def __init__(self, learning_rate, n_actions, input_dims, checkpoint_dir, name):
        super(DeepQNetwork, self).__init__()

        self.fc1 = nn.Linear(input_dims, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, n_actions)

        self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate)
        self.loss = nn.MSELoss()

        self.device_type = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(self.device_type)
        self.to(self.device)

        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_name = os.path.join(checkpoint_dir, name)

    def forward(self, data):
        fc_layer1 = F.relu(self.fc1(data))
        fc_layer2 = F.relu(self.fc2(fc_layer1))
        actions = self.fc3(fc_layer2)
        return actions

    def save_checkpoint(self):
        print('Saving checkpoint ...')
        torch.save(self.state_dict(), self.checkpoint_name)

    def load_checkpoint(self):
        print('Loading checkpoint ...')
        self.load_state_dict(torch.load(self.checkpoint_name))

In [17]:
class DQNAgentWithHER(object):
    def __init__(self, learning_rate, n_actions, input_dims, gamma,
                 epsilon, batch_size, memory_size, replace_network_count,
                 dec_epsilon=1e-5, min_epsilon=0.1, checkpoint_dir='/tmp/ddqn/'):
        self.learning_rate = learning_rate
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.replace_network_count = replace_network_count
        self.dec_epsilon = dec_epsilon
        self.min_epsilon = min_epsilon
        self.checkpoint_dir = checkpoint_dir
        self.action_indices = [i for i in range(n_actions)]
        self.learn_steps_count = 0

        self.q_eval = DeepQNetwork(learning_rate=learning_rate, n_actions=n_actions,
                                       input_dims=2*input_dims, name='q_eval',
                                       checkpoint_dir=checkpoint_dir)

        self.q_next = DeepQNetwork(learning_rate=learning_rate, n_actions=n_actions,
                                       input_dims=2*input_dims, name='q_next',
                                       checkpoint_dir=checkpoint_dir)

        self.experience_replay_memory = HindsightExperienceReplayMemory(memory_size=memory_size,
                                                                            input_dims=input_dims,
                                                                            n_actions=n_actions)

    def decrement_epsilon(self):
        """
        Decrements the epsilon after each step till it reaches minimum epsilon (0.1)
        epsilon = epsilon - decrement (default is 1e-5)
        """
        self.epsilon = self.epsilon - self.dec_epsilon if self.epsilon > self.min_epsilon \
            else self.min_epsilon

    def store_experience(self, state, action, reward, next_state, done, goal):
        """
        Saves the experience to the hindsight experience replay memory
        """
        self.experience_replay_memory.add_experience(state=state, action=action,
                                                     reward=reward, next_state=next_state,
                                                     done=done, goal=goal)

    def get_sample_experience(self):
        """
        Gives a sample experience from the hindsight experience replay memory
        """
        state, action, reward, next_state, done, goal = self.experience_replay_memory.get_random_experience(
            self.batch_size)

        t_state = torch.tensor(state).to(self.q_eval.device)
        t_action = torch.tensor(action).to(self.q_eval.device)
        t_reward = torch.tensor(reward).to(self.q_eval.device)
        t_next_state = torch.tensor(next_state).to(self.q_eval.device)
        t_done = torch.tensor(done).to(self.q_eval.device)
        t_goal = torch.tensor(goal).to(self.q_eval.device)

        return t_state, t_action, t_reward, t_next_state, t_done, t_goal

    def replace_target_network(self):
        """
        Updates the parameters after replace_network_count steps
        """
        if self.learn_steps_count % self.replace_network_count == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def choose_action(self, observation, goal):
        """
        Chooses an action with epsilon-greedy method
        """
        if np.random.random() > self.epsilon:
            concat_state_goal = np.concatenate([observation, goal])
            state = torch.tensor([concat_state_goal], dtype=torch.float).to(self.q_eval.device)
            # Include both current observation and goal in choosing the action
            actions = self.q_eval.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.n_actions)

        return action

    def learn(self):
        if self.experience_replay_memory.counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        state, action, reward, next_state, done, goal = self.get_sample_experience()
        # Gets the evenly spaced batches
        batches = np.arange(self.batch_size)

        concat_state_goal = torch.cat((state, goal), 1)
        concat_next_state_goal = torch.cat((next_state, goal), 1)

        q_value = self.q_eval.forward(concat_state_goal)[batches, action]
        next_q_value = self.q_next.forward(concat_next_state_goal).max(dim=1)[0]

        next_q_value[done] = 0.0
        expected_q_value = reward + self.gamma * next_q_value

        # Computes loss and performs backpropagation
        loss = self.q_eval.loss(expected_q_value, q_value).to(self.q_eval.device)
        loss.backward()

        self.q_eval.optimizer.step()
        self.decrement_epsilon()
        self.learn_steps_count += 1

    def save_model(self):
        """
        Saves the values of q_eval and q_next at the checkpoint
        """
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_model(self):
        """
        Loads the values of q_eval and q_next at the checkpoint
        """
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

In [20]:
n_bits = 8
# Create an environment with 2 list states and goals with value 0 and 1 and of n_bits number of elements
# Goal is to have the same states and goals list by changing the states value through actions
env = BitFlipEnv(n_bits)
n_episodes = 30000
epsilon_history = []
episodes = []
win_percent = []
success = 0

load_checkpoint = False
checkpoint_dir = os.path.join("D:\Hoang\Projects_and_Studies\Deakin Artificial Intelligent\Git_Repos\Reinforcement_Learning_Algorithms_Implementation\checkpoint")

# Initializes the DQN agent with Hindsight Experience Replay
agent = DQNAgentWithHER(learning_rate=0.0001, n_actions=n_bits,
                                input_dims=n_bits, gamma=0.99,
                                epsilon=0.9, batch_size=64, memory_size=10000,
                                replace_network_count=50,
                                checkpoint_dir=checkpoint_dir)

if load_checkpoint:
    agent.load_model()

# Iterate through the episodes
for episode in range(n_episodes): # line 3
    env.reset_env()
    # Sample a goal and initial state line 4
    state = env.state
    goal = env.goal
    done = False
    transitions = []

    for p in range(n_bits):
        # Number of training step per episode here given the number of bits line 5
        if not done:
            action = agent.choose_action(state, goal) # epsilon greedy
            next_state, reward, done = env.take_step(action)
            if not load_checkpoint:
                agent.store_experience(state, action, reward, next_state, done, goal)
                transitions.append((state, action, reward, next_state))
                agent.learn()
            state = next_state

            if done:
                success += 1

    if not done:
        # Sample a set of additional goals line 13
        new_goal = np.copy(state)
        if not np.array_equal(new_goal, goal):
            for p in range(n_bits):
                transition = transitions[p]
                if np.array_equal(transition[3], new_goal):
                    # If next state is goal
                    agent.store_experience(transition[0], transition[1], 0.0,
                                            transition[3], True, new_goal)
                    agent.learn()
                    break

                agent.store_experience(transition[0], transition[1], transition[2],
                                        transition[3], False, new_goal)
                agent.learn()

    # Average over last 500 episodes to avoid spikes
    if episode % 500 == 0:
        print('success rate for last 500 episodes after', episode, ':', success / 5)
        if len(win_percent) > 0 and (success / 500) > win_percent[len(win_percent) - 1]:
            agent.save_model()
        epsilon_history.append(agent.epsilon)
        episodes.append(episode)
        win_percent.append(success / 500.0)
        success = 0

print('Epsilon History:', epsilon_history)
print('Episodes:', episodes)
print('Win percentage:', win_percent)

figure = plt.figure()
plt.plot(episodes, win_percent)

plt.title('DQN with HER')
plt.ylabel('Win Percentage')
plt.xlabel('Number of Episodes')
plt.ylim([0, 1])

plt.savefig(os.path.join(os.getcwd(), '/plots/'))

this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
success rate for last 500 episodes after 0 : 0.0
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 0, and this is after 1
7
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this

this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 1, and this is after 0
7
this is before update 0, and this is after 1
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 0, and this is after 1
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 1, and this is after 0
this is before update 1, and this is after 0
0
this is before update 1, and this is after 0
0
this is before update 0, and this is after 1
0
this is before update 1, and this is after 0
this is before update 0, and this is after 1
1


KeyboardInterrupt: 