In [1]:
# import
import numpy as np
import gym
from gym import spaces
import random

import collections
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
class Grid(gym.Env):
    metadata = {'render.modes': ['console']}
    # action id
    XM = 0 # x minus
    XP = 1 # x plus
    YM = 2 # y minus
    YP = 3 # y plus
    
    def __init__(self, x_size=5):
        super(Grid, self).__init__()
        
        # size of 2D grid
        self.x_size = x_size

        # initialize the mapping status
        self.init_grid()
        
        # initialize the position of the agent
        self.init_agent()
        
        # define action space
        n_actions = 4 # LEFT, RIGHT, TOP, BOTTOM
        self.action_space = spaces.Discrete(n_actions)
        
        # define observation space (x and y coordinates)
        self.obs_low = np.zeros(2)
        self.obs_high = np.ones(2) * (self.x_size - 1)
        self.observation_space = spaces.Box(self.obs_low, self.obs_high)
    
    def init_agent(self, initial_pos=None):
        if initial_pos is not None:
            self.agent_pos = initial_pos
        else:
            self.agent_pos = [0, 0]
            self.agent_pos[0] = random.randrange(0, self.x_size)
            self.agent_pos[1] = random.randrange(0, self.x_size)

        self.grid_status[self.agent_pos[0], self.agent_pos[1]] = 1
        self.n_poi = self.x_size ** 2 - np.count_nonzero(self.grid_status)
    
    def init_grid(self):
        self.grid_status = np.zeros([self.x_size, self.x_size])
    
    def get_coverage(self):
        mapped_poi = (self.grid_status == 1).sum() - 1 # exclude initial pos
        return mapped_poi / self.n_poi

    def get_agent_obs(self):
        pos_x  = copy.deepcopy(self.agent_pos[0])
        pos_y  = copy.deepcopy(self.agent_pos[1])

        return [pos_x, pos_y]

    def reset(self, initial_pos=None):
        self.init_grid()
        self.init_agent(initial_pos)

        return self.get_agent_obs()
        
    def step(self, action): # i: index of the drone
        # original position
        org_x  = copy.deepcopy(self.agent_pos[0])
        org_y  = copy.deepcopy(self.agent_pos[1])

        # move the agent
        if action == self.XM:
            self.agent_pos[0] -= 1
        elif action == self.XP:
            self.agent_pos[0] += 1
        elif action == self.YM:
            self.agent_pos[1] -= 1
        elif action == self.YP:
            self.agent_pos[1] += 1
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
        
        # account for the boundaries of the grid (-2: out of the grid)
        if self.agent_pos[0] > self.x_size - 1 or self.agent_pos[0] < 0 or self.agent_pos[1] > self.x_size - 1 or self.agent_pos[1] < 0:
            self.agent_pos[0] = org_x
            self.agent_pos[1] = org_y

        # reward
        prev_status = self.grid_status[self.agent_pos[0], self.agent_pos[1]]
        if prev_status == 0:
            reward = 10
            self.grid_status[self.agent_pos[0], self.agent_pos[1]] = 1
        else:
            reward = 0
        
        # done
        mapped_poi = (self.grid_status == 1).sum() - 1 # exclude initial pos
        if mapped_poi == self.n_poi:
            done = True
        else:
            done = False
        
        return self.get_agent_obs(), reward, done

    def close(self):
        pass

In [3]:
class QTables():
    def __init__(self, observation_space, action_space, eps_start=1, eps_end=0.1, gamma=0.9, r=0.99, lr=0.1):
        self.observation_space = observation_space
        self.observation_length = observation_space.shape[0]
        self.size = int(self.observation_space.high[0] - self.observation_space.low[0]) + 1

        self.action_space = action_space
        self.action_values = [0, 1, 2, 3] # corresponding to the column numbers in q table
        self.action_num = len(self.action_values) # 4

        self.eps = eps_start  # current epsilon
        self.eps_end = eps_end # epsilon lower bound
        self.r = r  # decrement rate of epsilon
        self.gamma = gamma  # discount rate
        self.lr = lr  # learning rate

        self.q_table = np.zeros([self.size**2, self.action_num])

    # support function: convert the fov to the unique row number in the q table
    def obs_to_row(self, obs_array):
        return obs_array[0] * self.size + obs_array[1]
    
    def get_action(self, obs):
        if np.random.rand() < self.eps:
            action = random.choice(self.action_values)
            greedy = False
        else:
            obs_row = self.obs_to_row(obs)
            action = np.argmax(self.q_table[obs_row])
            greedy = True
        
        return action, greedy
    
    def update_eps(self):
        # update the epsilon
        if self.eps > self.eps_end: # lower bound
            self.eps *= self.r

    def train(self, obs, obs_next, action, reward, done):
        obs_row = self.obs_to_row(obs)
        obs_next_row = self.obs_to_row(obs_next)

        q_current = self.q_table[obs_row][action] # current q value
        q_next_max = np.max(self.q_table[obs_next_row]) # the maximum q value in the next state

        # update the q value
        if done:
            self.q_table[obs_row][action] = q_current + self.lr * reward
        else:
            self.q_table[obs_row][action] = q_current + self.lr * (reward + self.gamma * q_next_max - q_current)

In [5]:
idx3 = []
for i in range(3):
    for j in range(3):
        idx3.append((i, j))

idx4 = []
for i in range(4):
    for j in range(4):
        idx4.append((i, j))

idx5 = []
for i in range(5):
    for j in range(5):
        idx5.append((i, j))

### Fixed Initial Position

#### 3 x 3

In [207]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 500
size = 3
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0.5, r=0.99, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 15 0.36972963764972655 0.7333333333333333 0.875
200 15 0.13533300490703207 0.8666666666666667 0.875
300 15 0.04953625663766238 1.0 0.875
400 15 0.018131871994995084 1.0 0.875
500 15 0.006636851557994551 1.0 0.875


In [214]:
df = pd.DataFrame(q_class[300].q_table, index=idx3)
df

Unnamed: 0,0,1,2,3
"(0, 0)",0.740357,13.71116,0.710506,3.344385
"(0, 1)",0.203848,1.257231,0.872949,1.676737
"(0, 2)",0.427991,2.045826,3.082307,0.516007
"(1, 0)",0.484401,3.465679,0.666198,13.102768
"(1, 1)",2.191111,12.428453,0.660545,2.279602
"(1, 2)",9.362383,1.272298,1.170999,0.521455
"(2, 0)",0.134436,0.012558,0.049171,3.206805
"(2, 1)",0.940918,0.362254,1.556763,12.312653
"(2, 2)",11.432798,0.468206,0.754691,0.456002


In [18]:
[epsilons[i] for i in [0, 10, 20, 50, 100, 200, 300]]

[1,
 0.9043820750088043,
 0.8179069375972307,
 0.6050060671375365,
 0.36603234127322926,
 0.13397967485796175,
 0.04904089407128576]

#### 4 x 4

In [112]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 5000
size = 4
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0.5, r=0.999, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 20 0.9056978449586682 0.1 0.4
200 20 0.8194682977764125 0.2 0.4
300 20 0.7414484806367364 0.25 0.5333333333333333
400 20 0.6708567627695098 0.35 0.8666666666666667
500 20 0.6069859307919768 0.45 0.8
600 20 0.5491961035890855 0.5 0.6666666666666666
700 20 0.49690832175285177 0.65 0.9333333333333333
800 20 0.44959874735743227 0.5 0.7333333333333333
900 20 0.4067934159611651 0.65 0.6666666666666666
1000 20 0.36806348825922275 0.75 0.6666666666666666
1100 20 0.3330209538162239 0.65 0.7333333333333333
1200 20 0.3013147438372364 0.7 0.6666666666666666
1300 20 0.2726272140335106 0.8 0.7333333333333333
1400 20 0.24667096234700878 0.8 0.8
1500 20 0.22318594965255484 0.6 0.8
1600 19 0.2019368945917472 0.8421052631578947 1.0
1700 19 0.1827109164391416 0.8421052631578947 1.0
1800 20 0.1653154023860845 0.85 0.9333333333333333
1900 17 0.14957607787587202 0.9411764705882353 1.0
2000 15 0.13533526065815754 0.9333333333333333 1.0
2100 20 0.12245028107108785 0.85 0.8
2200 16 0.11079205272498677 0.87

In [53]:
df = pd.DataFrame(q_class[4999].q_table, index=idx4)
df

Unnamed: 0,0,1,2,3
"(0, 0)",0.0,9.988674,0.0,9.101908
"(0, 1)",0.0,4.034256,0.0,9.776448
"(0, 2)",0.0,2.920761,0.720744,9.817284
"(0, 3)",0.0,9.836436,0.406802,0.0
"(1, 0)",0.0,9.964992,0.0,8.818477
"(1, 1)",9.777599,2.186246,1.872113,2.509033
"(1, 2)",5.753234,1.435947,9.726242,5.128965
"(1, 3)",2.003868,9.879461,1.910628,0.0
"(2, 0)",0.242541,9.86264,0.0,7.94948
"(2, 1)",6.724733,1.390414,2.05028,9.697131


In [54]:
[epsilons[i] for i in [0, 100, 200, 500, 1000, 2000, 3000, 4000, 4999]]

[1,
 0.9047921471137096,
 0.818648829478636,
 0.6063789448611848,
 0.3676954247709635,
 0.1351999253974994,
 0.04971239399803625,
 0.018279019827489446,
 0.006727839799665273]

#### 5 x 5

In [38]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 50000
size = 5
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0, r=0.9995, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 25 0.9516933769307994 0.0 0.4166666666666667
200 25 0.9052674235521029 0.16 0.4583333333333333
300 25 0.8611062428400729 0.12 0.4166666666666667
400 25 0.8190993535905904 0.2 0.4583333333333333
500 25 0.7791416641455342 0.28 0.5416666666666666
600 25 0.7411332094774175 0.2 0.4583333333333333
700 25 0.7049789010996835 0.12 0.2916666666666667
800 25 0.6705882891769959 0.28 0.5416666666666666
900 25 0.6378753362403742 0.32 0.4583333333333333
1000 25 0.6067582019410674 0.28 0.5833333333333334
1100 25 0.5771590383046616 0.44 0.5833333333333334
1200 25 0.5490037949732016 0.52 0.6666666666666666
1300 25 0.5222220339480774 0.44 0.5416666666666666
1400 25 0.49674675337021873 0.48 0.5416666666666666
1500 25 0.47251421989671744 0.52 0.5833333333333334
1600 25 0.44946380925453877 0.44 0.4583333333333333
1700 25 0.4275378545724137 0.6 0.5
1800 25 0.4066815021114777 0.56 0.5416666666666666
1900 25 0.38684257403372235 0.8 0.5833333333333334
2000 25 0.3679714378649446 0.56 0.4583333333333333
2100 

In [40]:
df = pd.DataFrame(q_class[40000].q_table, index=idx5)
df

Unnamed: 0,0,1,2,3
"(0, 0)",0.0,9.583563,0.0,10.0
"(0, 1)",0.0,9.505724,0.0,10.0
"(0, 2)",0.0,9.021095,0.100476,10.0
"(0, 3)",0.0,8.814045,0.190995,10.0
"(0, 4)",0.0,10.0,0.071303,0.0
"(1, 0)",0.0,4.899126,0.0,3018.397704
"(1, 1)",2.676531,2.671102,2.668606,2.866517
"(1, 2)",1.588885,10.0,7.792079,1.215192
"(1, 3)",1.311297,1.095568,10.0,1.495115
"(1, 4)",0.547981,10.0,8.393251,0.0


### Random Initial Position

#### 3 x 3

In [200]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 500
size = 3
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0.5, r=0.99, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset()
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 15 0.36972963764972655 0.6 0.875
200 15 0.13533300490703207 0.7333333333333333 0.875
300 15 0.04953625663766238 1.0 0.875
400 15 0.018131871994995084 1.0 0.875
500 8 0.006636851557994551 1.0 1.0


In [185]:
coverage[495]

0.875

In [142]:
df = pd.DataFrame(q_class[300].q_table, index=idx3)
df

Unnamed: 0,0,1,2,3
"(0, 0)",0.63909,9.67775,0.533565,1.146069
"(0, 1)",0.587423,2.001888,9.81835,1.880736
"(0, 2)",0.460892,1.557945,9.057324,0.521032
"(1, 0)",1.211043,9.686061,0.749546,2.320437
"(1, 1)",5.909444,1.215902,1.413557,1.26667
"(1, 2)",9.752809,1.777749,2.51699,0.413084
"(2, 0)",1.220285,0.81145,0.78268,9.545217
"(2, 1)",3.55164,0.37825,1.653853,9.665989
"(2, 2)",9.595659,0.42167,1.116842,0.674388


In [66]:
trajectory[499]

[[1, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 1], [2, 0], [1, 0], [0, 0]]

#### 4 x 4

gamma = 0

In [67]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 5000
size = 4
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0, r=0.999, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 20 0.9056978449586682 0.15 0.9333333333333333
200 20 0.8194682977764125 0.1 0.6
300 20 0.7414484806367364 0.2 0.5333333333333333
400 20 0.6708567627695098 0.3 0.4
500 20 0.6069859307919768 0.5 0.8
600 20 0.5491961035890855 0.6 0.7333333333333333
700 20 0.49690832175285177 0.35 0.26666666666666666
800 20 0.44959874735743227 0.7 0.6
900 20 0.4067934159611651 0.3 0.6666666666666666
1000 20 0.36806348825922275 0.5 0.6
1100 20 0.3330209538162239 0.7 0.6
1200 20 0.3013147438372364 0.85 0.8
1300 20 0.2726272140335106 0.85 0.7333333333333333
1400 20 0.24667096234700878 0.8 0.8666666666666667
1500 20 0.22318594965255484 0.85 0.5333333333333333
1600 20 0.2019368945917472 0.85 0.6
1700 19 0.1827109164391416 0.6842105263157895 1.0
1800 15 0.1653154023860845 0.9333333333333333 1.0
1900 20 0.14957607787587202 0.75 0.6
2000 18 0.13533526065815754 0.8888888888888888 1.0
2100 20 0.12245028107108785 0.95 0.8
2200 15 0.11079205272498677 1.0 1.0
2300 17 0.100243779268176 0.9411764705882353 1.0
2400 20

In [83]:
df = pd.DataFrame(q_class[0].q_table, index=idx4)
df

Unnamed: 0,0,1,2,3
"(0, 0)",0.0,3e-06,0.000498,0.099005
"(0, 1)",0.0,0.1,0.000498,0.1
"(0, 2)",0.0,0.1,0.0,0.0
"(0, 3)",0.0,0.0,0.0,0.0
"(1, 0)",0.000992,0.0,0.0,0.0
"(1, 1)",0.0,0.0,0.1,0.0
"(1, 2)",0.0,0.0,0.0,0.099003
"(1, 3)",0.0,0.1,0.0005,7e-06
"(2, 0)",0.0,0.0,0.0,0.0
"(2, 1)",0.0,0.0,0.0,0.0


gamma = 0.5

In [82]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 5000
size = 4
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0.5, r=0.999, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 20 0.9056978449586682 0.2 0.6
200 20 0.8194682977764125 0.4 0.5333333333333333
300 20 0.7414484806367364 0.1 0.5333333333333333
400 20 0.6708567627695098 0.25 0.6666666666666666
500 20 0.6069859307919768 0.45 0.6
600 20 0.5491961035890855 0.35 0.6666666666666666
700 20 0.49690832175285177 0.6 0.6
800 20 0.44959874735743227 0.45 0.6666666666666666
900 20 0.4067934159611651 0.7 0.7333333333333333
1000 20 0.36806348825922275 0.35 0.6
1100 20 0.3330209538162239 0.45 0.6
1200 20 0.3013147438372364 0.7 0.7333333333333333
1300 20 0.2726272140335106 0.75 0.7333333333333333
1400 20 0.24667096234700878 0.6 0.8666666666666667
1500 20 0.22318594965255484 0.7 0.6666666666666666
1600 20 0.2019368945917472 0.65 0.4666666666666667
1700 20 0.1827109164391416 0.8 0.8666666666666667
1800 20 0.1653154023860845 0.8 0.9333333333333333
1900 20 0.14957607787587202 0.8 0.8
2000 15 0.13533526065815754 0.8666666666666667 1.0
2100 20 0.12245028107108785 0.9 0.9333333333333333
2200 20 0.11079205272498677 0.75 

In [95]:
df = pd.DataFrame(q_class[4999].q_table, index=idx4)
df

Unnamed: 0,0,1,2,3
"(0, 0)",8.015614,15.115757,8.132656,19.908258
"(0, 1)",7.173383,19.845656,7.615535,15.03861
"(0, 2)",5.696976,6.536189,7.319452,20.198058
"(0, 3)",5.453128,20.733212,6.104643,5.863416
"(1, 0)",4.732103,7.552231,3.574362,10.736241
"(1, 1)",7.590859,19.773639,13.292753,14.275139
"(1, 2)",19.922598,7.001018,7.733043,13.458734
"(1, 3)",8.380377,21.842014,6.908649,6.240311
"(2, 0)",192.80642,9.801808,8.266588,7.581053
"(2, 1)",7.364051,13.221371,18.699997,19.762423


suboptimal case with gamma = 0.5

In [97]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 5000
size = 4
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0.5, r=0.999, lr=0.01)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 20 0.9056978449586682 0.1 0.6
200 20 0.8194682977764125 0.1 0.5333333333333333
300 20 0.7414484806367364 0.3 0.6666666666666666
400 20 0.6708567627695098 0.4 0.5333333333333333
500 20 0.6069859307919768 0.55 0.8
600 20 0.5491961035890855 0.3 0.5333333333333333
700 20 0.49690832175285177 0.7 0.5333333333333333
800 20 0.44959874735743227 0.55 0.6
900 20 0.4067934159611651 0.7 0.7333333333333333
1000 20 0.36806348825922275 0.6 0.7333333333333333
1100 20 0.3330209538162239 0.7 0.6666666666666666
1200 20 0.3013147438372364 0.7 0.8
1300 20 0.2726272140335106 0.75 0.8666666666666667
1400 20 0.24667096234700878 0.75 0.9333333333333333
1500 20 0.22318594965255484 0.75 0.8
1600 20 0.2019368945917472 0.85 0.8
1700 20 0.1827109164391416 0.95 0.8
1800 20 0.1653154023860845 0.95 0.8666666666666667
1900 20 0.14957607787587202 1.0 0.8
2000 20 0.13533526065815754 0.8 0.8
2100 20 0.12245028107108785 0.8 0.8
2200 20 0.11079205272498677 0.95 0.8
2300 20 0.100243779268176 0.9 0.8666666666666667
2400 20

In [107]:
df = pd.DataFrame(q_class[4999].q_table, index=idx4)
df

Unnamed: 0,0,1,2,3
"(0, 0)",8.287745,19.619628,8.16496,14.921176
"(0, 1)",3.257082,7.978426,4.865852,11.294043
"(0, 2)",5.192016,5.836082,9.99558,11.49651
"(0, 3)",4.821443,11.17575,5.550641,4.532648
"(1, 0)",7.875048,19.308831,7.893921,14.055334
"(1, 1)",11.111976,7.665947,6.512304,7.776624
"(1, 2)",10.762378,5.883128,10.800396,10.354148
"(1, 3)",8.060432,10.467257,5.757808,5.306232
"(2, 0)",7.275031,13.033647,6.611039,18.687773
"(2, 1)",12.795718,17.43751,7.115525,12.316711


In [109]:
# records for each episode
time_steps = [] # number of time steps in total
epsilons = [] # epsilon at the end of each episode
greedy = [] # the ratio of greedy choices
trajectory = []
coverage = []

q_class = []

# parameters for training
train_episodes = 50000
size = 4
max_steps = size * 5

# initialize the environment and the q tables
env = Grid(x_size=size)
q = QTables(observation_space=env.observation_space, action_space=env.action_space, eps_start=1, eps_end=0, gamma=0, r=0.9999, lr=0.001)

# training
for episode in range(train_episodes):
    env.reset([0, 0])
    state = env.get_agent_obs()
    eps_tmp = q.eps

    greedy_count = 0
    epi_trajectory = []
    epi_trajectory.append(env.get_agent_obs())

    for step in range(max_steps):
        action, greedy_tf = q.get_action(obs=state)
        next_state, reward, done = env.step(action)
        q.train(state, next_state, action, reward, done)

        greedy_count += greedy_tf * 1
        epi_trajectory.append(env.get_agent_obs())

        if done:
            break
    
        # update the observation
        state = next_state

    # record
    time_steps.append(len(epi_trajectory)-1)
    epsilons.append(eps_tmp)
    greedy.append(greedy_count / (step + 1))
    q_class.append(copy.deepcopy(q))
    trajectory.append(epi_trajectory)
    coverage.append(env.get_coverage())

    # update epsilon
    q.update_eps()

    if (episode + 1) % 100 == 0:
        print(episode + 1, time_steps[episode], epsilons[episode], greedy[episode], coverage[episode])



100 20 0.9901483535267248 0.0 0.4
200 20 0.9802957226154846 0.0 0.4666666666666667
300 20 0.9705411318974407 0.0 0.4
400 20 0.9608836058078369 0.0 0.4666666666666667
500 20 0.95132217848943 0.0 0.6666666666666666
600 20 0.9418558936958947 0.0 0.2
700 20 0.9324838046961919 0.05 0.7333333333333333
800 20 0.9232049741798795 0.05 0.5333333333333333
900 20 0.9140184741633747 0.2 0.4666666666666667
1000 20 0.9049233858971459 0.1 0.4666666666666667
1100 20 0.8959187997738268 0.25 0.6666666666666666
1200 20 0.8870038152372448 0.15 0.4666666666666667
1300 20 0.8781775406923579 0.15 0.5333333333333333
1400 20 0.869439093416084 0.05 0.4666666666666667
1500 20 0.8607875994690198 0.05 0.6
1600 20 0.8522221936080363 0.15 0.5333333333333333
1700 20 0.843742019199747 0.1 0.4666666666666667
1800 20 0.8353462281348323 0.2 0.6
1900 20 0.8270339807432225 0.15 0.5333333333333333
2000 20 0.8188044457101201 0.2 0.5333333333333333
2100 20 0.8106567999928598 0.2 0.6666666666666666
2200 20 0.8025902287385944 0.