In [1]:
# import
import numpy as np
import gym
from gym import spaces
import random

import collections
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

### Environment

In [2]:
class MultiAgentActionSpace(list):
    """List of per-agent gym action spaces that can be sampled jointly."""

    def __init__(self, agents_action_space):
        # every entry has to be a gym space
        for space in agents_action_space:
            assert isinstance(space, gym.spaces.space.Space)

        super().__init__(agents_action_space)
        self._agents_action_space = agents_action_space

    def sample(self):
        """Return one sampled action per agent, in agent order."""
        joint_action = []
        for space in self._agents_action_space:
            joint_action.append(space.sample())
        return joint_action

In [3]:
class MultiAgentObservationSpace(list):
    """List of per-agent gym observation spaces with joint sample/contains helpers."""

    def __init__(self, agents_observation_space):
        # every entry has to be a gym space
        for space in agents_observation_space:
            assert isinstance(space, gym.spaces.space.Space)

        super().__init__(agents_observation_space)
        self._agents_observation_space = agents_observation_space

    def sample(self):
        """Return one sampled observation per agent, in agent order."""
        joint_obs = []
        for space in self._agents_observation_space:
            joint_obs.append(space.sample())
        return joint_obs

    def contains(self, obs):
        """Return True when every paired (space, observation) is valid.

        Spaces and observations are paired with ``zip``, so surplus entries on
        either side are ignored.
        """
        pairs = zip(self._agents_observation_space, obs)
        return all(space.contains(ob) for space, ob in pairs)

In [4]:
class Grid(gym.Env):
    """Grid-coverage environment in which agents map cells by visiting them.

    Cell status codes stored in ``grid_status``:
        -1: obstacle, 0: unmapped point of interest (POI), 1: mapped POI.
    Observations additionally use -2 for positions outside the grid.

    Each agent observes the status of its four axis-aligned neighbours in the
    order ``[x-1, x+1, y-1, y+1]``.  ``step`` moves a single agent and returns
    ``(observations, reward, done)`` -- a 3-tuple, not the 4-tuple of the
    standard gym API.
    """
    metadata = {'render.modes': ['console']}
    # action id
    XM = 0 # x minus
    XP = 1 # x plus
    YM = 2 # y minus
    YP = 3 # y plus

    def __init__(self, x_size=5, y_size=5, n_agents=2, fov_x=3, fov_y=3):
        """Build the grid, place the agents at random and declare the spaces.

        Args:
            x_size, y_size: grid dimensions.
            n_agents: number of agents.
            fov_x, fov_y: stored for interface compatibility; the observation
                is always the four direct neighbours regardless of these values.
        """
        super(Grid, self).__init__()

        # size of 2D grid
        self.x_size = x_size
        self.y_size = y_size

        # number of agents
        self.n_agents = n_agents
        self.idx_agents = list(range(n_agents)) # [0, 1, 2, ..., n_agents - 1]

        # initialize the mapping status
        self.init_grid()

        # initialize the position of the agent
        self.init_agent()

        # define action space
        n_actions = 4 # XM, XP, YM, YP
        self.action_space = MultiAgentActionSpace([spaces.Discrete(n_actions) for _ in range(self.n_agents)])

        # define observation space (field of view)
        self.fov_x = fov_x
        self.fov_y = fov_y

        self.obs_low = -np.ones(4) * 2 # low -2: out of the grid
        self.obs_high = np.ones(4) # high 1: visited
        self.observation_space = MultiAgentObservationSpace([spaces.Box(self.obs_low, self.obs_high) for _ in range(self.n_agents)])

    def init_agent(self, initial_pos=None):
        """Place the agents and mark their start cells as mapped.

        Args:
            initial_pos: optional list of ``[x, y]`` positions, one per agent.
                The positions are copied, so the caller's list is never
                modified by later calls to ``step``.  When omitted, every
                agent is placed uniformly at random inside the grid.
        """
        if initial_pos is not None:
            # copy: step() updates positions in place
            self.agent_pos = [list(pos) for pos in initial_pos]
        else:
            self.agent_pos = []
            for _ in range(self.n_agents):
                agent_pos_x = random.randrange(0, self.x_size)
                # the y coordinate must be bounded by y_size (was x_size)
                agent_pos_y = random.randrange(0, self.y_size)
                self.agent_pos.append([agent_pos_x, agent_pos_y])

        for i in range(self.n_agents):
            self.grid_status[self.agent_pos[i][0], self.agent_pos[i][1]] = 1

        # initialize the stuck count
        self.stuck_counts = [0] * self.n_agents

    def init_grid(self):
        """Reset the mapping status and the visit counters of every cell."""
        # initialize the mapping status
        ## -2: out of the grid
        ## -1: obstacle
        ## 0: POI that is not mapped
        ## 1: POI that is mapped
        self.grid_status = np.zeros([self.x_size, self.y_size])
        self.grid_counts = np.zeros([self.x_size, self.y_size])

        ## randomly set obstacles (currently disabled: n_obstacle is 0)
        n_obstacle = 0
        for _ in range(n_obstacle):
            x_obstacle = random.randrange(1, self.x_size - 1)
            y_obstacle = random.randrange(1, self.y_size - 1)
            self.grid_status[x_obstacle, y_obstacle] = - 1
            self.grid_counts[x_obstacle, y_obstacle] = - 1

        # number of POI in the environment (cells whose status is 0 right now)
        self.n_poi = self.x_size * self.y_size - np.count_nonzero(self.grid_status)

    def get_coverage(self):
        """Return the ratio of cells with status 1 to ``n_poi``."""
        mapped_poi = (self.grid_status == 1).sum()
        return mapped_poi / self.n_poi

    def _status_at(self, x, y):
        """Status of cell (x, y), or -2 when the position is outside the grid."""
        if 0 <= x < self.x_size and 0 <= y < self.y_size:
            return self.grid_status[x, y]
        return -2.0

    def get_agent_obs(self):
        """Return (and cache in ``agent_obs``) one observation per agent.

        Each observation is a float array ``[x-1, x+1, y-1, y+1]`` holding the
        status of the four neighbouring cells (-2 outside the grid).
        """
        self.agent_obs = []
        for agent in range(self.n_agents):
            x, y = self.agent_pos[agent][0], self.agent_pos[agent][1]
            neighbours = ((x - 1, y), (x + 1, y), (x, y - 1), (x, y + 1))
            single_obs = np.array([self._status_at(nx, ny) for nx, ny in neighbours], dtype=float)
            self.agent_obs.append(single_obs)
        return self.agent_obs

    def reset(self, initial_pos=None):
        """Start a new episode and return the initial observations.

        If any agent starts with no unmapped neighbour (all four observed
        values are non-zero) the grid is rebuilt and the agents are re-placed
        at random -- ``initial_pos`` is not reused for those retries.  The
        retry loop has no upper bound.
        """
        # initialize the mapping status
        self.init_grid()
        # initialize the position of the agent
        self.init_agent(initial_pos)

        while True:
            obs = self.get_agent_obs()
            enclosed = [bool((obs[i] != 0).all()) for i in range(self.n_agents)]
            if not any(enclosed):
                return obs
            self.init_grid()
            self.init_agent()

    def step(self, action, i): # i: index of the drone
        """Move agent ``i`` by one cell and return ``(observations, reward, done)``.

        A move that would leave the grid or enter an obstacle leaves the agent
        in place.  Entering an unmapped cell marks it as mapped and yields a
        reward of 10; every other outcome yields 0.  The visit counter of the
        cell the agent ends up on is incremented in all cases.

        Raises:
            ValueError: if ``action`` is not one of XM, XP, YM, YP.
        """
        # original position
        org_x, org_y = self.agent_pos[i][0], self.agent_pos[i][1]

        # candidate position
        new_x, new_y = org_x, org_y
        if action == self.XM:
            new_x -= 1
        elif action == self.XP:
            new_x += 1
        elif action == self.YM:
            new_y -= 1
        elif action == self.YP:
            new_y += 1
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        reward = 0
        inside = 0 <= new_x < self.x_size and 0 <= new_y < self.y_size
        if not inside or self.grid_status[new_x, new_y] == -1:
            # blocked by the border or by an obstacle: go back to the original position
            new_x, new_y = org_x, org_y
        elif self.grid_status[new_x, new_y] == 0:
            # first visit of this cell
            self.grid_status[new_x, new_y] = 1
            reward = 10

        self.agent_pos[i][0] = new_x
        self.agent_pos[i][1] = new_y
        self.grid_counts[new_x, new_y] += 1

        # update the stuck count
        if new_x == org_x and new_y == org_y: # stuck
            self.stuck_counts[i] += 1
        else:
            self.stuck_counts[i] = 0

        # did we map all cells?
        mapped_poi = (self.grid_status == 1).sum()
        done = bool(mapped_poi == self.n_poi)

        return self.get_agent_obs(), reward, done

    def close(self):
        """Nothing to release."""
        pass

### DQN

In [5]:
class Net(nn.Module):
    """Fully connected network: two ReLU hidden layers followed by a linear output layer."""

    def __init__(self, n_obs, n_mid, n_action):
        super().__init__()
        # layers are created in input -> output order
        self.fc1 = nn.Linear(in_features=n_obs, out_features=n_mid)
        self.fc2 = nn.Linear(in_features=n_mid, out_features=n_mid)
        self.fc3 = nn.Linear(in_features=n_mid, out_features=n_action)

    def forward(self, x):
        """Return the output of ``fc3`` applied to the ReLU-activated hidden layers."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [6]:
class ReplayBuffer():
    """Bounded FIFO store of ``(s, a, r, s_prime, done_mask)`` transitions."""

    def __init__(self, buffer_limit):
        self.buffer_limit = buffer_limit
        # a deque with maxlen drops the oldest transition once the limit is reached
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        """Append one transition tuple."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions at random and return them as tensors.

        Returns a tuple ``(s, a, r, s_prime, done_mask)``; the states are cast to
        float, and actions, rewards and masks are wrapped so that each row holds
        a single value.
        """
        batch = random.sample(self.buffer, n)

        states = [s for s, _, _, _, _ in batch]
        actions = [[a] for _, a, _, _, _ in batch]
        rewards = [[r] for _, _, r, _, _ in batch]
        next_states = [s_prime for _, _, _, s_prime, _ in batch]
        masks = [[done_mask] for _, _, _, _, done_mask in batch]

        return (
            torch.tensor(states, dtype=torch.float),
            torch.tensor(actions),
            torch.tensor(rewards),
            torch.tensor(next_states, dtype=torch.float),
            torch.tensor(masks),
        )

    def size(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [8]:
class QValues:
    """Independent DQN learners: one online/target network pair, replay buffer,
    loss function and optimizer per agent, plus a shared epsilon schedule."""

    def __init__(self, observation_space, action_space, is_gpu, eps_start=1, eps_end=0.1, r=0.99, gamma=0.9, lr=0.01, buffer_limit=100):
        """
        Args:
            observation_space: sequence of per-agent observation spaces; its
                length sets the number of agents and ``observation_space[0].shape[0]``
                sets the network input size.
            action_space: stored as-is.
            is_gpu: move networks and batches to CUDA when True.
            eps_start, eps_end: initial value and lower bound of epsilon.
            r: multiplicative decay applied to epsilon by ``update_eps``.
            gamma: discount rate.
            lr: Adam learning rate.
            buffer_limit: capacity of each agent's replay buffer.
        """
        self.num_agents = len(observation_space)

        self.observation_space = observation_space
        self.observation_values = [-2, -1, 0, 1]
        self.observation_num = len(self.observation_values) # 4
        self.observation_length = observation_space[0].shape[0] # field of view

        self.action_space = action_space
        self.action_values = [0, 1, 2, 3]
        self.action_num = len(self.action_values)

        self.buffer_limit = buffer_limit

        self.is_gpu = is_gpu

        self.eps = eps_start  # initial epsilon
        self.eps_end = eps_end  # lower bound of epsilon
        self.gamma = gamma  # discount rate
        self.r = r # decrement rate of epsilon
        self.lr = lr # learning rate

        self.net_q = [] # q network
        self.net_target = [] # target network
        self.memory = [] # experience replay
        self.loss_fnc = [] # loss function
        self.optimizer = [] # optimizer
        for i in range(self.num_agents):
            # network
            self.net_q.append(Net(n_obs=self.observation_length, n_mid=16, n_action=self.action_num)) # q network
            self.net_target.append(Net(n_obs=self.observation_length, n_mid=16, n_action=self.action_num)) # target network
            self.net_target[i].load_state_dict(self.net_q[i].state_dict()) # sync parameters
            if self.is_gpu:
                self.net_q[i].cuda()
                self.net_target[i].cuda()

            # memory, loss function, optimizer
            self.memory.append(ReplayBuffer(buffer_limit=self.buffer_limit)) # memory
            self.loss_fnc.append(nn.MSELoss()) # loss function
            self.optimizer.append(optim.Adam(self.net_q[i].parameters(), lr=self.lr)) # optimizer

    def train(self, i, batch_size):
        """Run one gradient step on agent ``i``'s online network.

        A batch of ``batch_size`` transitions is sampled from the agent's
        replay buffer; the regression target is
        ``reward + gamma * max_a Q_target(next_obs, a) * done_mask``.
        """
        obs, action, reward, next_obs, done_mask = self.memory[i].sample(batch_size)

        if self.is_gpu:
            obs, action, reward, next_obs, done_mask = obs.cuda(), action.cuda(), reward.cuda(), next_obs.cuda(), done_mask.cuda()

        # current q-value in the online network
        q_out = self.net_q[i](obs) # batch_size x number of actions
        q_a = q_out.gather(1, action) # batch_size x 1 (select the corresponding q-value of each experience)

        # The target is a constant for this update: compute it without autograd so
        # that backward() neither traverses the target network nor accumulates
        # gradients on its parameters.
        with torch.no_grad():
            next_q_max = self.net_target[i](next_obs).max(1)[0].unsqueeze(1) # batch_size x 1
            target = reward + self.gamma * next_q_max * done_mask # batch_size x 1

        # difference between the current q-value and the target
        loss = self.loss_fnc[i](q_a, target)
        self.optimizer[i].zero_grad()
        loss.backward()
        self.optimizer[i].step()

    def _q_values(self, obs, i):
        """Q-values of agent ``i``'s online network for ``obs`` as a numpy array."""
        with torch.no_grad():
            return self.net_q[i](obs).cpu().numpy()

    def get_action(self, obs, i, stuck_counts, max_stuck, e_greedy=True, softmax=False):
        """Choose an action for agent ``i``.

        Priority of the selection rules:
            1. ``stuck_counts[i] >= max_stuck``: uniformly random action.
            2. ``e_greedy``: random with probability ``eps``, otherwise greedy.
            3. ``softmax``: sample from the softmax of the Q-values.
            4. otherwise: greedy.

        Args:
            obs: numpy observation of the agent.

        Returns:
            ``(action, greedy)`` where ``greedy`` tells whether the action is
            the argmax of the online network's Q-values.
        """
        obs = torch.from_numpy(obs).float()
        if self.is_gpu:
            obs = obs.cuda()

        if stuck_counts[i] >= max_stuck: # random action to avoid stuck
            return random.choice(self.action_values), False

        if e_greedy:  # epsilon greedy for training (e_greedy=True)
            if np.random.rand() < self.eps:
                return random.choice(self.action_values), False
            return np.argmax(self._q_values(obs, i)), True

        if softmax:  # (e_greedy=False and softmax=True)
            p = self.softmax(self._q_values(obs, i))
            return np.random.choice(np.arange(self.action_num), p=p), False

        # all greedy choices for testing performance
        return np.argmax(self._q_values(obs, i)), True

    def softmax(self, a):
        """Softmax of ``a``; the maximum is subtracted first to avoid overflow."""
        exp_a = np.exp(a - np.max(a))
        return exp_a / np.sum(exp_a)

    def update_target(self):
        """Copy the online network's parameters into the target network, for every agent."""
        for i in range(self.num_agents):
            self.net_target[i].load_state_dict(self.net_q[i].state_dict())

    def update_eps(self):
        """Decay epsilon by the factor ``r`` without dropping below ``eps_end``."""
        if self.eps > self.eps_end:
            # clamp: a plain multiplication could undershoot the lower bound
            self.eps = max(self.eps_end, self.eps * self.r)

### Single: Fixed Initial Position

In [9]:
# ===================================================================================================
# Training: 1 drone
# ===================================================================================================

# records for each episode (exactly one entry per episode in every list)
time_steps = [] # number of time steps in total
epsilons = [] # epsilon used during each episode
greedy = [] # the ratio of greedy choices
coverage = [] # the ratio of visited cells at the end
speed = [] # number of time steps to cover decent amount of cells (nan if never reached)
sum_q_values = [] # sum of q-values
results_mapping = [] # mapping status
results_count = [] # count status
total_reward = []
total_action_values = []
total_greedy_action_values = []

q_class = [] # snapshots of the learner, one every 1000 episodes

coverage_threshold = 0.90
max_stuck = 100000

# parameters for training
train_episodes = 200000
max_steps = 10 * 10 * 2
batch_size = 1
interval = 1

# initialize the environment and the learner
env = Grid(x_size=10, y_size=10, n_agents=1, fov_x=3, fov_y=3)
q = QValues(observation_space=env.observation_space, action_space=env.action_space, is_gpu=False, eps_start=1, eps_end=0, gamma=0.5, r=0.999, lr=0.01, buffer_limit=1)

# training
for episode in range(train_episodes):
    state = env.reset([[0, 0]])
    eps_tmp = q.eps

    greedy_count = [0] * env.n_agents
    coverage_track = True
    epi_reward = [0] * env.n_agents
    epi_action_value = [0] * env.n_agents
    epi_greedy_action_value = [0] * env.n_agents

    for step in range(max_steps):
        action_order = random.sample(env.idx_agents, env.n_agents) # return a random order of the drone indices
        for agent_i in action_order:
            agent_obs = state[agent_i]
            action, greedy_tf = q.get_action(obs=agent_obs, i=agent_i, stuck_counts=env.stuck_counts, max_stuck=max_stuck, e_greedy=True, softmax=False)
            next_state, reward, done = env.step(action, agent_i)
            done_mask = 0.0 if done else 1.0
            agent_next_obs = next_state[agent_i]
            q.memory[agent_i].put((agent_obs, action, reward, agent_next_obs, done_mask))
            epi_reward[agent_i] += reward
            greedy_count[agent_i] += greedy_tf * 1

            # training: only once a full batch can be sampled.  This runs before the
            # `done` check so that the terminal transition is learned from as well
            # (with buffer_limit=1 it would otherwise be overwritten unseen).
            if q.memory[agent_i].size() >= batch_size:
                q.train(agent_i, batch_size)

            # update the target network
            if step % interval == 0:
                q.update_target()

            if done:
                break

            # update the observation
            state = next_state

        # check if decent amount of cells are visited
        current_coverage = env.get_coverage()
        if current_coverage >= coverage_threshold and coverage_track:
            speed.append(step)
            coverage_track = False

        # check if the task is completed
        if done:
            break
        if step == max_steps - 1 and coverage_track:
            # step budget exhausted without reaching the coverage threshold
            speed.append(np.nan)

    # record
    time_steps.append(step + 1)
    epsilons.append(eps_tmp)
    coverage.append(env.get_coverage())
    greedy.append(list(map(lambda x: x / (step + 1), greedy_count)))
    results_mapping.append(env.grid_status)
    results_count.append(env.grid_counts)
    total_reward.append(epi_reward)

    if episode % 1000 == 0:
        q_class.append(copy.deepcopy(q))

    # update epsilon
    q.update_eps()

    print('//Episode {0}//    Epsilon: {1:.3f},    Steps: {2},    Greedy Choices　(%): {3:.3f},    Coverage (%): {4:.3f},    Steps to Visit {5}% Cells: {6},    Total Reward: {7}'\
          .format(episode+1, q.eps, step+1, np.mean(greedy[episode]), coverage[episode], coverage_threshold * 100, speed[episode], np.mean(total_reward[episode])))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
//Episode 1723//    Epsilon: 0.178,    Steps: 200,    Greedy Choices　(%): 0.845,    Coverage (%): 0.980,    Steps to Visit 90.0% Cells: 148,    Total Reward: 970.0
//Episode 1724//    Epsilon: 0.178,    Steps: 200,    Greedy Choices　(%): 0.830,    Coverage (%): 0.800,    Steps to Visit 90.0% Cells: nan,    Total Reward: 790.0
//Episode 1725//    Epsilon: 0.178,    Steps: 200,    Greedy Choices　(%): 0.810,    Coverage (%): 0.930,    Steps to Visit 90.0% Cells: 151,    Total Reward: 920.0
//Episode 1726//    Epsilon: 0.178,    Steps: 200,    Greedy Choices　(%): 0.805,    Coverage (%): 0.870,    Steps to Visit 90.0% Cells: nan,    Total Reward: 860.0
//Episode 1727//    Epsilon: 0.178,    Steps: 200,    Greedy Choices　(%): 0.830,    Coverage (%): 0.710,    Steps to Visit 90.0% Cells: nan,    Total Reward: 700.0
//Episode 1728//    Epsilon: 0.177,    Steps: 200,    Greedy Choices　(%): 0.840,    Coverage (%): 0.790,    Steps t

KeyboardInterrupt: ignored

In [None]:
import joblib

In [None]:
# persist the learner snapshots (compressed) and display the written file name(s)
joblib.dump(value=q_class, filename="q_class_multi_random.txt", compress=3)

['q_class_multi_random.txt']

In [None]:
# Export the table at index 1 of the 40th snapshot in q_class to CSV.
# NOTE(review): the QValues class defined in this notebook has no `q_tables`
# attribute, so this cell presumably targets snapshots of a different
# (tabular) implementation -- confirm before re-running.
df = pd.DataFrame(q_class[40-1].q_tables[1])
df.to_csv('qtable.csv')

In [None]:
# Keep a deep copy of the current learner; the following cells evaluate q_last.
q_last = copy.deepcopy(q)

In [None]:
# Roll out q_last with two agents starting in opposite corners and record, per
# agent, the visited positions, the rewards and the chosen actions.
# NOTE(review): q_last.get_action is called with observations=/agent_i= and is
# expected to return three values, whereas QValues.get_action defined in this
# notebook takes (obs, i, ...) and returns (action, greedy).  This cell also
# uses agent index 1 although `env` was last built with n_agents=1.  It appears
# to target a different notebook state -- confirm before re-running.
state = env.reset([[0, 0], [9, 9]])
state = [arr.astype('int') for arr in state] # convert from float to integer
q_last.eps = 0 # with eps == 0 the e_greedy branch never takes the random action
trajectory_1 = [[0, 0]]
trajectory_2 = [[9, 9]]
reward_1 = []
reward_2 = []
action_1 = []
action_2 = []

for step in range(max_steps):
    action_order = [0, 1]
    for agent_i in action_order:
        action, greedy_tf, action_value = q_last.get_action(observations=state, agent_i=agent_i, stuck_counts=env.stuck_counts, max_stuck=max_stuck, e_greedy=True, softmax=False)
        next_state, reward, done = env.step(action, agent_i)
        next_state = [arr.astype('int') for arr in next_state] # convert from float to integer
        if agent_i == 0:
            trajectory_1.append(copy.deepcopy(env.agent_pos[0]))
            reward_1.append(reward)
            action_1.append(action)
        else:
            trajectory_2.append(copy.deepcopy(env.agent_pos[1]))
            reward_2.append(reward)
            action_2.append(action)

        # stop immediately: the remaining agent does not move once the grid is fully mapped
        if done:
            break
    
        # update the observation
        state = next_state

    # check if the task is completed
    if done:
        break

# index of the last executed step
print(step)

90


In [None]:
# lengths of the three per-agent records (trajectory holds the start position as well)
print(*map(len, (trajectory_1, reward_1, action_1)))

92 91 91


In [None]:
# pad rewards and actions with one NaN each (note: re-running this cell pads again)
reward_1.extend([np.nan])
action_1.extend([np.nan])

In [None]:
# lengths of the three per-agent records (trajectory holds the start position as well)
print(*map(len, (trajectory_2, reward_2, action_2)))

91 90 90


In [None]:
# pad rewards and actions with one NaN each (note: re-running this cell pads again)
reward_2.extend([np.nan])
action_2.extend([np.nan])

In [None]:
# Ratio of mapped cells to POI cells in the current state of `env`.
env.get_coverage()

1.0

In [None]:
# write agent 1's positions, rewards and actions as one CSV row per step
df = pd.DataFrame(dict(trajectory=trajectory_1, reward=reward_1, action=action_1))
df.to_csv(path_or_buf='trajectory.csv')

In [None]:
# Write agent 2's positions, rewards and actions as one CSV row per step.
# A dedicated file name is used so that the export of agent 1 ('trajectory.csv')
# is not overwritten.
df = pd.DataFrame({'trajectory': trajectory_2, 'reward': reward_2, 'action': action_2})
df.to_csv('trajectory_2.csv')

In [None]:
# ===================================================================================================
# Test
# ===================================================================================================
# NOTE(review): this cell calls q_last.get_action(observations=..., agent_i=...) expecting three
# return values and reads q.q_tables, whereas the QValues class defined in this notebook exposes
# get_action(obs, i, ...) -> (action, greedy) and has no q_tables attribute.  It appears to target
# a different (tabular) implementation -- confirm before re-running.  It also depends on q, q_last
# and eps_tmp left over from earlier cells.

# records for each episode (exactly one entry per episode in every list)
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
coverage = [] # the ratio of visited cells at the end
speed = [] # number of time steps to cover decent amount of cells (nan if never reached)
sum_q_values = [] # sum of q-values
results_mapping = [] # mapping status
results_count = [] # count status
total_reward = []
total_action_values = []
total_greedy_action_values = []

q_class = [] # NOTE(review): this discards the snapshots collected during training

coverage_threshold = 0.95
max_stuck = 100000

# parameters for testing
test_episodes = 100
size = 50
max_steps = size * size * 2

# initialize the environment
env = Grid(x_size=size, y_size=size,  n_agents=2, fov_x=3, fov_y=3)

# evaluation (no learning takes place in this loop)
for episode in range(test_episodes):
    state = env.reset()
    state = [arr.astype('int') for arr in state] # convert from float to integer

    greedy_count = [0] * env.n_agents
    coverage_track = True
    epi_reward = [0] * env.n_agents
    epi_action_value = [0] * env.n_agents
    epi_greedy_action_value = [0] * env.n_agents

    for step in range(max_steps):
        action_order = [0, 1]
        for agent_i in action_order:
            action, greedy_tf, action_value = q_last.get_action(observations=state, agent_i=agent_i, stuck_counts=env.stuck_counts, max_stuck=max_stuck, e_greedy=True, softmax=False)
            next_state, reward, done = env.step(action, agent_i)
            next_state = [arr.astype('int') for arr in next_state] # convert from float to integer

            epi_reward[agent_i] += reward
            greedy_count[agent_i] += greedy_tf * 1
            epi_action_value[agent_i] += action_value
            epi_greedy_action_value[agent_i] += action_value * greedy_tf

            if done:
                break

            # update the observation
            state = next_state

        # check if decent amount of cells are visited
        current_coverage = env.get_coverage()
        if current_coverage >= coverage_threshold and coverage_track:
            speed.append(step)
            coverage_track = False

        # check if the task is completed
        if done:
            break
        if step == max_steps - 1 and coverage_track:
            # step budget exhausted without reaching the coverage threshold
            speed.append(np.nan)

    # record
    time_steps.append(step + 1)
    epsilons.append(eps_tmp)
    coverage.append(env.get_coverage())
    greedy.append(list(map(lambda x: x / (step + 1), greedy_count)))
    sum_q_values.append([q.q_tables[0].sum()])
    results_mapping.append(env.grid_status)
    results_count.append(env.grid_counts)
    total_reward.append(epi_reward)
    total_action_values.append(epi_action_value)
    total_greedy_action_values.append(epi_greedy_action_value)

    print('//Episode {0}//    Epsilon: {1:.3f},    Steps: {2},    Greedy Choices　(%): {3:.3f},    Coverage (%): {4:.3f},    Steps to Visit {5}% Cells: {6},    Sum of Q-Values: {7:.1f},    Total Reward: {8}'\
          .format(episode+1, eps_tmp, step+1, np.mean(greedy[episode]), coverage[episode], coverage_threshold * 100, speed[episode], sum_q_values[episode][0], np.mean(total_reward[episode])))



//Episode 1//    Epsilon: 0.000,    Steps: 1284,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 1215,    Sum of Q-Values: 2953.1,    Total Reward: 12490.0
//Episode 2//    Epsilon: 0.000,    Steps: 5000,    Greedy Choices　(%): 1.000,    Coverage (%): 0.998,    Steps to Visit 95.0% Cells: 1571,    Sum of Q-Values: 2953.1,    Total Reward: 12470.0
//Episode 3//    Epsilon: 0.000,    Steps: 1330,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 1213,    Sum of Q-Values: 2953.1,    Total Reward: 12490.0
//Episode 4//    Epsilon: 0.000,    Steps: 1360,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 1252,    Sum of Q-Values: 2953.1,    Total Reward: 12490.0
//Episode 5//    Epsilon: 0.000,    Steps: 5000,    Greedy Choices　(%): 1.000,    Coverage (%): 0.997,    Steps to Visit 95.0% Cells: 1491,    Sum of Q-Values: 2953.1,    Total Reward: 12450.0
//Episode 6//    Epsilon: 0.000,   

In [None]:
# mean of the per-episode final coverage ratios recorded in `coverage`
np.mean(coverage)

0.998468

In [None]:
# mean of the per-episode entries in `speed`; the result is nan if any entry is np.nan
np.mean(speed)

1386.35

In [None]:
# ===================================================================================================
# Test
# ===================================================================================================
# NOTE(review): this cell calls q_last.get_action(observations=..., agent_i=...) expecting three
# return values and reads q.q_tables, whereas the QValues class defined in this notebook exposes
# get_action(obs, i, ...) -> (action, greedy) and has no q_tables attribute.  It appears to target
# a different (tabular) implementation -- confirm before re-running.  It also depends on q, q_last
# and eps_tmp left over from earlier cells.

# records for each episode (exactly one entry per episode in every list)
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
coverage = [] # the ratio of visited cells at the end
speed = [] # number of time steps to cover decent amount of cells (nan if never reached)
sum_q_values = [] # sum of q-values
results_mapping = [] # mapping status
results_count = [] # count status
total_reward = []
total_action_values = []
total_greedy_action_values = []

q_class = [] # NOTE(review): this discards the snapshots collected during training

coverage_threshold = 0.95
max_stuck = 100000

# parameters for testing
test_episodes = 100
size = 100
max_steps = size * size * 2

# initialize the environment
env = Grid(x_size=size, y_size=size,  n_agents=2, fov_x=3, fov_y=3)

# evaluation (no learning takes place in this loop)
for episode in range(test_episodes):
    state = env.reset()
    state = [arr.astype('int') for arr in state] # convert from float to integer

    greedy_count = [0] * env.n_agents
    coverage_track = True
    epi_reward = [0] * env.n_agents
    epi_action_value = [0] * env.n_agents
    epi_greedy_action_value = [0] * env.n_agents

    for step in range(max_steps):
        action_order = [0, 1]
        for agent_i in action_order:
            action, greedy_tf, action_value = q_last.get_action(observations=state, agent_i=agent_i, stuck_counts=env.stuck_counts, max_stuck=max_stuck, e_greedy=True, softmax=False)
            next_state, reward, done = env.step(action, agent_i)
            next_state = [arr.astype('int') for arr in next_state] # convert from float to integer

            epi_reward[agent_i] += reward
            greedy_count[agent_i] += greedy_tf * 1
            epi_action_value[agent_i] += action_value
            epi_greedy_action_value[agent_i] += action_value * greedy_tf

            if done:
                break

            # update the observation
            state = next_state

        # check if decent amount of cells are visited
        current_coverage = env.get_coverage()
        if current_coverage >= coverage_threshold and coverage_track:
            speed.append(step)
            coverage_track = False

        # check if the task is completed
        if done:
            break
        if step == max_steps - 1 and coverage_track:
            # step budget exhausted without reaching the coverage threshold
            speed.append(np.nan)

    # record
    time_steps.append(step + 1)
    epsilons.append(eps_tmp)
    coverage.append(env.get_coverage())
    greedy.append(list(map(lambda x: x / (step + 1), greedy_count)))
    sum_q_values.append([q.q_tables[0].sum()])
    results_mapping.append(env.grid_status)
    results_count.append(env.grid_counts)
    total_reward.append(epi_reward)
    total_action_values.append(epi_action_value)
    total_greedy_action_values.append(epi_greedy_action_value)

    print('//Episode {0}//    Epsilon: {1:.3f},    Steps: {2},    Greedy Choices　(%): {3:.3f},    Coverage (%): {4:.3f},    Steps to Visit {5}% Cells: {6},    Sum of Q-Values: {7:.1f},    Total Reward: {8}'\
          .format(episode+1, eps_tmp, step+1, np.mean(greedy[episode]), coverage[episode], coverage_threshold * 100, speed[episode], sum_q_values[episode][0], np.mean(total_reward[episode])))



//Episode 1//    Epsilon: 0.000,    Steps: 5203,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 4931,    Sum of Q-Values: 2953.1,    Total Reward: 49990.0
//Episode 2//    Epsilon: 0.000,    Steps: 5212,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 4880,    Sum of Q-Values: 2953.1,    Total Reward: 49990.0
//Episode 3//    Epsilon: 0.000,    Steps: 20000,    Greedy Choices　(%): 1.000,    Coverage (%): 0.999,    Steps to Visit 95.0% Cells: 5784,    Sum of Q-Values: 2953.1,    Total Reward: 49955.0
//Episode 4//    Epsilon: 0.000,    Steps: 5165,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 4867,    Sum of Q-Values: 2953.1,    Total Reward: 49990.0
//Episode 5//    Epsilon: 0.000,    Steps: 5155,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 4900,    Sum of Q-Values: 2953.1,    Total Reward: 49990.0
//Episode 6//    Epsilon: 0.000,  

In [None]:
# mean of the per-episode final coverage ratios recorded in `coverage`
np.mean(coverage)

0.99914

In [None]:
# mean of the per-episode entries in `speed`; the result is nan if any entry is np.nan
np.mean(speed)

5262.35

In [None]:
# ===================================================================================================
# Test
# ===================================================================================================
# NOTE(review): this cell calls q_last.get_action(observations=..., agent_i=...) expecting three
# return values and reads q.q_tables, whereas the QValues class defined in this notebook exposes
# get_action(obs, i, ...) -> (action, greedy) and has no q_tables attribute.  It appears to target
# a different (tabular) implementation -- confirm before re-running.  It also depends on q, q_last
# and eps_tmp left over from earlier cells.

# records for each episode (exactly one entry per episode in every list)
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
coverage = [] # the ratio of visited cells at the end
speed = [] # number of time steps to cover decent amount of cells (nan if never reached)
sum_q_values = [] # sum of q-values
results_mapping = [] # mapping status
results_count = [] # count status
total_reward = []
total_action_values = []
total_greedy_action_values = []

q_class = [] # NOTE(review): this discards the snapshots collected during training

coverage_threshold = 0.95
max_stuck = 100000

# parameters for testing
test_episodes = 100
size = 150
max_steps = size * size * 2

# initialize the environment
env = Grid(x_size=size, y_size=size,  n_agents=2, fov_x=3, fov_y=3)

# evaluation (no learning takes place in this loop)
for episode in range(test_episodes):
    state = env.reset()
    state = [arr.astype('int') for arr in state] # convert from float to integer

    greedy_count = [0] * env.n_agents
    coverage_track = True
    epi_reward = [0] * env.n_agents
    epi_action_value = [0] * env.n_agents
    epi_greedy_action_value = [0] * env.n_agents

    for step in range(max_steps):
        action_order = [0, 1]
        for agent_i in action_order:
            action, greedy_tf, action_value = q_last.get_action(observations=state, agent_i=agent_i, stuck_counts=env.stuck_counts, max_stuck=max_stuck, e_greedy=True, softmax=False)
            next_state, reward, done = env.step(action, agent_i)
            next_state = [arr.astype('int') for arr in next_state] # convert from float to integer

            epi_reward[agent_i] += reward
            greedy_count[agent_i] += greedy_tf * 1
            epi_action_value[agent_i] += action_value
            epi_greedy_action_value[agent_i] += action_value * greedy_tf

            if done:
                break

            # update the observation
            state = next_state

        # check if decent amount of cells are visited
        current_coverage = env.get_coverage()
        if current_coverage >= coverage_threshold and coverage_track:
            speed.append(step)
            coverage_track = False

        # check if the task is completed
        if done:
            break
        if step == max_steps - 1 and coverage_track:
            # step budget exhausted without reaching the coverage threshold
            speed.append(np.nan)

    # record
    time_steps.append(step + 1)
    epsilons.append(eps_tmp)
    coverage.append(env.get_coverage())
    greedy.append(list(map(lambda x: x / (step + 1), greedy_count)))
    sum_q_values.append([q.q_tables[0].sum()])
    results_mapping.append(env.grid_status)
    results_count.append(env.grid_counts)
    total_reward.append(epi_reward)
    total_action_values.append(epi_action_value)
    total_greedy_action_values.append(epi_greedy_action_value)

    print('//Episode {0}//    Epsilon: {1:.3f},    Steps: {2},    Greedy Choices　(%): {3:.3f},    Coverage (%): {4:.3f},    Steps to Visit {5}% Cells: {6},    Sum of Q-Values: {7:.1f},    Total Reward: {8}'\
          .format(episode+1, eps_tmp, step+1, np.mean(greedy[episode]), coverage[episode], coverage_threshold * 100, speed[episode], sum_q_values[episode][0], np.mean(total_reward[episode])))



//Episode 1//    Epsilon: 0.000,    Steps: 14682,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 13556,    Sum of Q-Values: 2953.1,    Total Reward: 112490.0
//Episode 2//    Epsilon: 0.000,    Steps: 11671,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 11040,    Sum of Q-Values: 2953.1,    Total Reward: 112490.0
//Episode 3//    Epsilon: 0.000,    Steps: 45000,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 10959,    Sum of Q-Values: 2953.1,    Total Reward: 112465.0
//Episode 4//    Epsilon: 0.000,    Steps: 11473,    Greedy Choices　(%): 1.000,    Coverage (%): 1.000,    Steps to Visit 95.0% Cells: 10853,    Sum of Q-Values: 2953.1,    Total Reward: 112490.0
//Episode 5//    Epsilon: 0.000,    Steps: 45000,    Greedy Choices　(%): 1.000,    Coverage (%): 0.998,    Steps to Visit 95.0% Cells: 10960,    Sum of Q-Values: 2953.1,    Total Reward: 112320.0
//Episode 6//    Eps

In [None]:
# mean of the per-episode final coverage ratios recorded in `coverage`
np.mean(coverage)

0.9992702222222222

In [None]:
# mean of the per-episode entries in `speed`; the result is nan if any entry is np.nan
np.mean(speed)

12408.78