# Halite challenge - basics

In [21]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from Environment import halite_env as Env

In [22]:
from importlib import reload
reload(Env)

<module 'Environment.halite_env' from '/home/nicola/Nicola_unipd/QuartoAnno/TODO/Baiesi/RL/haliteRL/Environment/halite_env.py'>

## Setup of the environment

In [23]:
num_players = 1
n_actions = 5 # no dropoffs, 1 action for staying still, 4 for moving in the cardinal directions
map_size = 7 # 7 x 7 map

In [24]:
env = Env.HaliteEnv(num_players, map_size)

Initializing Halite Environment


In [25]:
# halite in the map, min = 0, max = 1000
env.map[:,:,0]

array([[ 49, 338, 222, 605, 366, 544, 335],
       [473, 863, 981, 465, 797, 118, 441],
       [386, 763, 257, 514, 280, 199, 554],
       [985, 718, 155,   0, 273, 348, 562],
       [364, 444, 456, 728, 700,  70, 936],
       [838, 153,  63, 344, 209, 247, 660],
       [980, 664, 962, 572, 693, 389, 152]])

In [26]:
# ship position
env.map[:,:,1] 

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [27]:
# shows the halite carried from each ship in the position corresponding to the ship
# initially there is no ship, hence no halite carried either

env.map[:,:,2] 

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [28]:
# shipyard position
env.map[:,:,3] 

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [29]:
print("Initial halite: ", env.player_halite[0])

Initial halite:  [5000.]


In [30]:
# actions are represented as a matrix whose entries are -1 if no ship is in that position, 
#'a_i' if ship i is present in that position 
action_matrix = np.full((map_size,map_size), -1) # no ship, no action

In [31]:
# the environment already has in memory the last state, thus we don't need to resubmit it
# the only things that we submit are the action matrix and the shipyard action (1 or True to spawn a ship, 0 otherwise)
shipyard_action = 1 # initially always choose to create a ship
# returns the state, i.e. env.map
s, h, finish, _ = env.step(action_matrix, makeship = shipyard_action)
# channels of s (same layout as env.map shown in the cells above):
# s[:,:,0] -> halite in the map, s[:,:,1] -> ship position,
# s[:,:,2] -> halite carried by the ship (not extracted here), s[:,:,3] -> shipyard position
map_halite = s[:,:,0]
ship_pos_matrix = s[:,:,1]
shipy_pos_matrix = s[:,:,3]

In [32]:
def encode(v_dec, L):
    """Encode a 2D index into a single scalar index on an L x L grid.

    Parameters
    ----------
    v_dec : sequence of two indices ``[v1, v2]``.
    L : side length of the square lookup table.

    Returns
    -------
    The entry ``[v1, v2]`` of ``np.arange(L**2)`` reshaped to ``(L, L)``,
    i.e. the row-major flat index of the pair.
    """
    first, second = v_dec[0], v_dec[1]
    lookup = np.arange(L * L).reshape(L, L)
    return lookup[first, second]

def decode(v_enc, L):
    """Invert ``encode``: recover the 2D index of a scalar index on an L x L grid.

    Parameters
    ----------
    v_enc : encoded (flat) index; a scalar or a one-element array.
    L : side length of the square lookup table.

    Returns
    -------
    ``np.array([v1, v2])`` with the position of the first table entry equal
    to ``v_enc``. Indexing fails with ``IndexError`` when no entry matches.
    """
    lookup = np.arange(L * L).reshape(L, L)
    # One search is enough: np.where yields the row and column hits together.
    rows, cols = np.where(v_enc == lookup)
    return np.array([rows[0], cols[0]])

def one_to_index(V,L):
    """Return the flat (row-major) indices of the non-zero entries of ``V``.

    ``V`` is meant to be an L x L matrix with a single entry equal to 1 and
    zeros elsewhere; the result is a 1D array holding one index per entry
    that is truthy after the boolean cast.
    """
    flat_ids = np.arange(L * L).reshape(L, L)
    occupied = V.astype(bool)
    return flat_ids[occupied]

# ship positions x (n_cells + 1) x halite levels -> tensor
# we will need a 3D encoding, like the 2D seen above
def encode3D(v_dec, L1, L2, L3):
    """Encode a 3D index ``[v1, v2, v3]`` into a single scalar index.

    The result is the entry ``[v1, v2, v3]`` of ``np.arange(L1*L2*L3)``
    reshaped to ``(L1, L2, L3)``, i.e. the row-major flat index of the triple.
    """
    i, j, k = v_dec[0], v_dec[1], v_dec[2]
    lookup = np.arange(L1 * L2 * L3).reshape(L1, L2, L3)
    return lookup[i, j, k]

def decode3D(v_enc, L1, L2, L3):
    """Invert ``encode3D``: recover ``[v1, v2, v3]`` from a scalar index.

    The lookup table is ``np.arange(L1*L2*L3)`` reshaped to ``(L1, L2, L3)``;
    the returned array holds the position of the first entry equal to
    ``v_enc``. Indexing fails with ``IndexError`` when no entry matches.
    """
    lookup = np.arange(L1 * L2 * L3).reshape(L1, L2, L3)
    # A single search returns the hits along all three axes at once.
    hits = np.where(v_enc == lookup)
    return np.array([hits[0][0], hits[1][0], hits[2][0]])

In [33]:
# encoded (flat) position of the ship, read from the ship-position channel
pos_enc = one_to_index(ship_pos_matrix, map_size)
print("Encoded position of the ship: ", pos_enc)
# decoded [row, column] position of the ship
pos_dec = decode(pos_enc, map_size)
print("Decoded position of the ship: ", pos_dec)

# encoded (flat) position of the shipyard, read from the shipyard-position channel
shipy_enc = one_to_index(shipy_pos_matrix, map_size)
print("Encoded position of the shipyard: ", shipy_enc)
# decoded [row, column] position of the shipyard
shipy_dec = decode(shipy_enc, map_size)
print("Decoded position of the shipyard: ", shipy_dec)

Encoded position of the ship:  [24]
Decoded position of the ship:  [3 3]
Encoded position of the shipyard:  [24]
Decoded position of the shipyard:  [3 3]


In [34]:
ship_cargo = s[pos_dec[0],pos_dec[1],2]
print("Initial ship cargo: ", ship_cargo)

Initial ship cargo:  0


In [63]:
# NOTE(review): search_range is not referenced anywhere else in the visible cells.
search_range = 3
# now suppose that the ship is in [2,2], whereas the shipyard is at the center of the map, i.e. [3,3]
example = np.roll(map_halite, shift = (1,1) , axis =  (0,1)) #in this way we simulate the ship to be in (2,2)
print("This is what we should get: \n", example)
# NOTE: this overwrites the pos_dec decoded from the environment with a hand-picked example position
pos_dec = [2,2]
# roll the map by (shipyard - ship) along both axes, so that the ship's cell lands on the shipyard's (central) cell
shift = (shipy_dec[0]-pos_dec[0],shipy_dec[1]-pos_dec[1])
centered_h = np.roll(map_halite, shift = shift, axis = (0,1))
print("Result: \n",centered_h)

def roll_and_cut(M, shift, axis, border = 1, center = (3,3)):
    """Cyclically roll ``M`` and crop a square window around ``center``.

    Parameters
    ----------
    M : 2D array to roll and crop.
    shift, axis : forwarded unchanged to ``np.roll``.
    border : half-width of the window; the crop has side ``2*border + 1``.
    center : ``(row, col)`` of the window centre in the rolled matrix.

    Returns
    -------
    The slice of the rolled matrix covering rows
    ``center[0]-border .. center[0]+border`` and the matching columns.
    """
    rolled = np.roll(M, shift = shift, axis = axis)
    row_c, col_c = center[0], center[1]
    row_window = slice(row_c - border, row_c + border + 1)
    col_window = slice(col_c - border, col_c + border + 1)
    return rolled[row_window, col_window]

# try to return just the 3x3 area around the ship
# 3x3 area around the ship: the map is already centered on the ship, so no extra roll is needed
around_ship = roll_and_cut(centered_h, shift = 0, axis = 0)
print("Neighborhood of the ship: \n", around_ship, '\n')
# we actually need to do this shifting by two in all cardinal directions w.r.t. the map centered around the ship
mean_cardinal_h = []
# (axis, shift) pairs, in the order: axis 0 with -2/+2, then axis 1 with -2/+2
perm = [(roll_axis, roll_shift) for roll_axis in [0,1] for roll_shift in [-2,2]]
# The loop variables are deliberately NOT called `a` and `s`: loop variables leak into the
# notebook namespace, and `s` is the environment state returned by env.step.
for roll_axis, roll_shift in perm:
    # compute the shifted window once and reuse it for both the print and the mean
    window = roll_and_cut(centered_h, shift = roll_shift, axis = roll_axis)
    print("Map shifted in direction: (%d,%d)\n"%(roll_shift,roll_axis), window)
    mean_h = np.mean(window, axis = (0,1))
    print("Mean halite in direction: (%d,%d)"%(roll_shift,roll_axis), mean_h, '\n')
    mean_cardinal_h.append(mean_h)

mean_cardinal_h = np.array(mean_cardinal_h)
# +1 turns the 0-based position in `perm` into a 1-based direction code
halite_direction = np.argmax(mean_cardinal_h) + 1
print("Action suggested to reach the nearest and richest halite deposit: ", halite_direction)

This is what we should get: 
 [[152 980 664 962 572 693 389]
 [335  49 338 222 605 366 544]
 [441 473 863 981 465 797 118]
 [554 386 763 257 514 280 199]
 [562 985 718 155   0 273 348]
 [936 364 444 456 728 700  70]
 [660 838 153  63 344 209 247]]
Result: 
 [[152 980 664 962 572 693 389]
 [335  49 338 222 605 366 544]
 [441 473 863 981 465 797 118]
 [554 386 763 257 514 280 199]
 [562 985 718 155   0 273 348]
 [936 364 444 456 728 700  70]
 [660 838 153  63 344 209 247]]
Neighborhood of the ship: 
 [[863 981 465]
 [763 257 514]
 [718 155   0]] 

Map shifted in direction: (-2,0)
 [[718 155   0]
 [444 456 728]
 [153  63 344]]
Mean halite in direction: (-2,0) 340.1111111111111 

Map shifted in direction: (2,0)
 [[664 962 572]
 [338 222 605]
 [863 981 465]]
Mean halite in direction: (2,0) 630.2222222222222 

Map shifted in direction: (-2,1)
 [[465 797 118]
 [514 280 199]
 [  0 273 348]]
Mean halite in direction: (-2,1) 332.6666666666667 

Map shifted in direction: (2,1)
 [[441 473 863]
 [5

In [None]:
# functions for the agent
def greedy_policy(s, q_values):
    """Return the index of the largest Q-value in row ``s`` of ``q_values``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal entry wins.
    """
    action_values = q_values[s]
    return np.argmax(action_values)

def e_greedy_policy(s, q_values, eps = 0.01):
    """Epsilon-greedy action selection on row ``s`` of ``q_values``.

    A uniform sample in [0, 1) is drawn first. If it is strictly greater than
    ``eps`` the greedy (argmax) action is returned, otherwise an action index
    is drawn uniformly from ``0 .. len(q_values[s]) - 1``.

    Both the state ``s`` and the returned action are encoded (scalar) indices.
    """
    exploit = np.random.rand() > eps
    if not exploit:
        return np.random.randint(0, len(q_values[s]))
    return np.argmax(q_values[s])
    
def update_q(s, a, r, sp, ap, q_values, gamma = 1, alpha = 1):
    """Temporal-difference update of a tabular Q-function (in place).

    Parameters
    ----------
    s, a : encoded state and action whose value is updated.
    r : reward received for the transition.
    sp, ap : encoded next state and the next action used for bootstrapping.
    q_values : 2D array indexed as ``q_values[state, action]``; modified in place.
    gamma : discount factor applied to the bootstrapped value.
    alpha : learning rate. The default ``1`` reproduces the original behaviour,
        where the entry is overwritten with the TD target.

    Returns
    -------
    The same ``q_values`` array, after the update.
    """
    target = r + gamma*q_values[sp,ap]
    if alpha == 1:
        # plain assignment, so the default path is bit-for-bit the original update
        q_values[s,a] = target
    else:
        # move the old estimate towards the target by a fraction alpha
        q_values[s,a] = (1 - alpha)*q_values[s,a] + alpha*target
    return q_values

## State complexity and state approximation

We have:
- $(map\_size)^2$ positions ($49$ in this case, up to $64^2 = 4096$ for the largest map);
- $1000$ values of halite for each position ($h\_lev$);
- $1000$ values of carried halite ($h\_lev$).

The state of the system is defined by the position of the ship + the halite in EACH cell + the halite carried by the ship. Almost all combinations of the values of these variables are possible, thus we have the combinations of $1000$ values of halite for $50$ cells ($49$ of the map + the one carried by the ship), all multiplied by the $49$ possible positions of the ship, for a total of $49\times 1000^{50} = 49\times 10^{150}$ possible states. If instead we consider the largest map of $64 \times 64$ we arrive at $4096\times 1000^{4097} = 4096\times 10^{12291}$ possible states.

The general formula can be written as: 
$$\# states = (map\_size)^2 \times (h\_lev)^{map\_size^2+1}$$

where we consider the map always centered on the shipyard, hence in this framework its position is fixed and not considered as a variable.
Of course this state representation is totally uncontrollable, because it scales exponentially in the number of cells of the map.

To tackle this issue we choose to drastically reduce the amount of information that is observed by the ship through two processes: partial observation and state abstraction.



### Partial observation: depth of field

The most expensive dependence in the formula for the number of states of the system is the exponent to which $h\_lev$ is raised. This exponent comes from considering all possible combinations of halite for all the cells of the map plus the halite carried by the ship. A different approach is to consider only the halite inside the field of view of the agent and to restrict the depth of field to the minimum possible, i.e. the nearest neighbors. In this way, independently of the $map\_size$, we get an exponent that in 2D is equal to 6 (4 for the neighbors, 1 for the cell in which the ship stands and 1 for the halite it carries).

In other words, we get: 

$$\# states = (map\_size)^2 \times (h\_lev)^{6}$$

which yields $4.9 \times 10^{19}$ states for the $7 \times 7$ map: still not manageable, but considerably smaller (by roughly $130$ orders of magnitude).

### State abstraction: halite quantization

Unlike the restriction of the observation space, which is fairly straightforward, state abstraction requires some hypotheses about the environment, i.e. some knowledge of its model. For example, if we did not know how the halite is collected, we would probably have made a different choice.
Since we know that the ship collects an amount of halite proportional to the halite in the cell (25% of it, to be more exact) and pays a fee of 10% of the halite contained in a cell to leave it, we are more interested in encoding the notions of "low" and "high" halite levels than in sampling with precision the middle-high half of the halite scale.
To be more specific, we choose to approximate the information about the halite using $h\_lev = 3$ halite levels and the following encoding:
- $h = 0$ if $halite \le 10$; 
- $h = 1$ if $10 < halite \le 100$; 
- $h = 2$ if $100 < halite \le 1000$.

The important part is that the halite is quantized on a logarithmic scale, with thresholds one decade apart; we could also test adding a fourth level.
In this way the number of states of the system becomes:

$$\# states = (map\_size)^2 \times 3^{6}$$

yielding, for a $7 \times 7$ map, $35\,721$ states, a number that is manageable with our resources.

### State abstraction: meta-information

The problem with these manipulations is that the ship now has access only to local information: it lacks knowledge of the position of the shipyard (but again, the latter being fixed, only the ship position is needed) and of the position of distant halite deposits. In order to enhance the ability of the ship to find those deposits, we encode in an additional 4-state variable the direction that the ship should take to go towards the nearest and richest deposit.
In this final formulation, the total number of states that needs to be experienced by the ship is:

$$\# states = (map\_size)^2 \times 3^{6} \times 4$$

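yielding for a $7 \times 7$ map the final result of $142\,884$ states. Considering that each stored value requires 64 bits, i.e. 8 bytes, storing one value per state takes $1\,143\,072$ bytes $\approx 1.143$ MB. Since the Q-value table holds one value per state-action pair, with $5$ actions it takes five times as much, i.e. $5\,715\,360$ bytes $\approx 5.7$ MB.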



In [64]:
# NOTE(review): search_range is not referenced anywhere else in the visible cells.
search_range = 3
# now suppose that the ship is in [2,2], whereas the shipyard is at the center of the map, i.e. [3,3]
example = np.roll(map_halite, shift = (1,1) , axis =  (0,1)) #in this way we simulate the ship to be in (2,2)
print("This is what we should get: \n", example)
# NOTE: this overwrites the pos_dec decoded from the environment with a hand-picked example position
pos_dec = [2,2]
# roll the map by (shipyard - ship) along both axes, so that the ship's cell lands on the shipyard's (central) cell
shift = (shipy_dec[0]-pos_dec[0],shipy_dec[1]-pos_dec[1])
centered_h = np.roll(map_halite, shift = shift, axis = (0,1))
print("Result: \n",centered_h)

def roll_and_cut(M, shift, axis, border = 1, center = (3,3)):
    """Cyclically roll ``M`` and crop a square window around ``center``.

    Parameters
    ----------
    M : 2D array to roll and crop.
    shift, axis : forwarded unchanged to ``np.roll``.
    border : half-width of the window; the crop has side ``2*border + 1``.
    center : ``(row, col)`` of the window centre in the rolled matrix.

    Returns
    -------
    The slice of the rolled matrix covering rows
    ``center[0]-border .. center[0]+border`` and the matching columns.
    """
    rolled = np.roll(M, shift = shift, axis = axis)
    row_c, col_c = center[0], center[1]
    row_window = slice(row_c - border, row_c + border + 1)
    col_window = slice(col_c - border, col_c + border + 1)
    return rolled[row_window, col_window]

# try to return just the 3x3 area around the ship
# 3x3 area around the ship: the map is already centered on the ship, so no extra roll is needed
around_ship = roll_and_cut(centered_h, shift = 0, axis = 0)
print("Neighborhood of the ship: \n", around_ship, '\n')
# we actually need to do this shifting by two in all cardinal directions w.r.t. the map centered around the ship
mean_cardinal_h = []
# (axis, shift) pairs, in the order: axis 0 with -2/+2, then axis 1 with -2/+2
perm = [(roll_axis, roll_shift) for roll_axis in [0,1] for roll_shift in [-2,2]]
# The loop variables are deliberately NOT called `a` and `s`: loop variables leak into the
# notebook namespace, and `s` is the environment state returned by env.step.
for roll_axis, roll_shift in perm:
    # compute the shifted window once and reuse it for both the print and the mean
    window = roll_and_cut(centered_h, shift = roll_shift, axis = roll_axis)
    print("Map shifted in direction: (%d,%d)\n"%(roll_shift,roll_axis), window)
    mean_h = np.mean(window, axis = (0,1))
    print("Mean halite in direction: (%d,%d)"%(roll_shift,roll_axis), mean_h, '\n')
    mean_cardinal_h.append(mean_h)

mean_cardinal_h = np.array(mean_cardinal_h)
# +1 turns the 0-based position in `perm` into a 1-based direction code
halite_direction = np.argmax(mean_cardinal_h) + 1
print("Action suggested to reach the nearest and richest halite deposit: ", halite_direction)

This is what we should get: 
 [[152 980 664 962 572 693 389]
 [335  49 338 222 605 366 544]
 [441 473 863 981 465 797 118]
 [554 386 763 257 514 280 199]
 [562 985 718 155   0 273 348]
 [936 364 444 456 728 700  70]
 [660 838 153  63 344 209 247]]
Result: 
 [[152 980 664 962 572 693 389]
 [335  49 338 222 605 366 544]
 [441 473 863 981 465 797 118]
 [554 386 763 257 514 280 199]
 [562 985 718 155   0 273 348]
 [936 364 444 456 728 700  70]
 [660 838 153  63 344 209 247]]
Neighborhood of the ship: 
 [[863 981 465]
 [763 257 514]
 [718 155   0]] 

Map shifted in direction: (-2,0)
 [[718 155   0]
 [444 456 728]
 [153  63 344]]
Mean halite in direction: (-2,0) 340.1111111111111 

Map shifted in direction: (2,0)
 [[664 962 572]
 [338 222 605]
 [863 981 465]]
Mean halite in direction: (2,0) 630.2222222222222 

Map shifted in direction: (-2,1)
 [[465 797 118]
 [514 280 199]
 [  0 273 348]]
Mean halite in direction: (-2,1) 332.6666666666667 

Map shifted in direction: (2,1)
 [[441 473 863]
 [5

In [None]:
def halite_quantization(h, q_number = 4):
    """Quantize halite amounts into ``q_number`` logarithmic levels.

    The thresholds are the powers of ten ``10**0, 10**1, ..., 10**(q_number-1)``
    (``[1, 10, 100, 1000]`` for the default). The level of a value is the
    index of the first threshold strictly greater than it; values at or above
    the last threshold are assigned the top level ``q_number - 1``.

    Parameters
    ----------
    h : scalar (Python or NumPy) or array of halite amounts.
    q_number : number of quantization levels, at least 1.

    Returns
    -------
    Integer array with the same shape as ``h`` (0-d for a scalar input).

    Raises
    ------
    ValueError : if ``q_number`` is smaller than 1.
    """
    if q_number < 1:
        raise ValueError("q_number must be at least 1")
    # asarray lets plain Python scalars through, as the "scalar or matrix" contract promises
    h_arr = np.asarray(h)
    thresholds = 10 ** np.arange(q_number, dtype = np.int64)
    # number of thresholds <= value == index of the first threshold > value
    level = np.searchsorted(thresholds, h_arr.reshape(-1), side = 'right')
    # values >= the last threshold (e.g. the maximum halite, 1000) belong to the top level;
    # an argmax over an all-False mask used to send them to level 0
    level = np.minimum(level, q_number - 1)
    return level.reshape(h_arr.shape)

In [None]:
# how to use it
q_h = halite_quantization(map_halite)
c_h = halite_quantization(ship_cargo)

In [None]:
# Now we need to write a function that is able to get the decoded state from the output of the environment
# ship positions x (n_cells + 1) -> ship positions x (n_cells + 1) x halite levels

pos_enc = one_to_index(ship_pos_matrix, map_size) #this is the first entry

# Now we compose an array of the halite values of the n cells and the ship 
# and encode it through the matrix [(n_cells + 1) x halite levels]


In [None]:
# --- state / action space ---
n_actions = 5
h_lev = 4 # halite levels
n_cells = map_size * map_size
n_states = n_cells * (n_cells + 1) * h_lev
print("Total number of states to be experienced: ", n_states)

# --- training schedule ---
tot_turns = 400
# the ships can't know how many turns have elapsed, so we train them as if the game
# could end at every turn with uniform probability 1/tot_turns
discount_factor = 1 - 1/tot_turns
max_epochs = 1000
n_batch = 100 # number of episodes in an epoch
epochs = 0 # epochs completed so far

# --- learning buffers ---
q_values = np.zeros(shape = (n_states, n_actions)) # initialize to zero
reward_score = np.zeros(max_epochs) # one entry per epoch

In [None]:
# Round-trip check: encoding a 3D state and decoding it must give back the original state
s_dec = np.array([0,0,1])
print("Original decoded state: ", s_dec)
s_enc = encode3D(s_dec, L1 = n_cells, L2 = n_cells+1, L3 = h_lev)
print("Encoded state: ", s_enc)
s_dec_2 = decode3D(s_enc, L1 = n_cells, L2 = n_cells+1, L3 = h_lev)
# print the freshly decoded state (s_dec_2), not the input, otherwise the check shows nothing
print("New decoded state: ", s_dec_2)
assert np.array_equal(s_dec, s_dec_2), "encode3D/decode3D round trip failed"

In [None]:
# Training loop: runs epochs of n_batch episodes until max_epochs is reached, updating
# q_values after every step and storing the mean episode reward of each epoch in reward_score.
#
# NOTE(review): this cell calls e_greedy_pol_v1, greedy_pol_v0, update_q_v0 and turn_v2, and
# writes to game_progress and terminal_states; none of these names is defined in the visible
# cells (which define e_greedy_policy, greedy_policy and update_q instead). As written the
# cell presumably raises NameError on a fresh kernel - TODO confirm and port to this notebook.
# NOTE(review): the sizes 25 (states) and 4 (actions) passed to encode/decode do not match
# map_size / n_actions defined above - presumably left over from another environment.
while True:
    #@@@@@@@@@@@@@@@@@@@@@@
    # here starts an epoch
    #@@@@@@@@@@@@@@@@@@@@@@
    epochs = epochs + 1
    reward_progress = np.zeros(n_batch) # bunch of 100 episodes
    eps = 0.5 # starting value of epsilon
    # generate an adaptive epsilon greedy algorithm, has to be calibrated
    # epsilon decays as eps*exp(-i/100) with the epoch index i; the array does not depend on
    # the current epoch, yet it is rebuilt at every epoch
    epsilons = np.array(list(map(lambda i : eps*np.exp(-i/100), np.arange(0,max_epochs+1))))
    
    for i in range(n_batch):
        #@@@@@@@@@@@@@@@@@@@@@@@@
        # here starts an episode
        #@@@@@@@@@@@@@@@@@@@@@@@@
        s_dec = [0,4] # init state: top corners
        s_enc = encode(s_dec, 25) # encoded state used to have a scalar index to access the states; 25 singular states
        steps = 0
        reward = 0 # cumulative reward of the episode
        
        while True:
            steps = steps + 1
            
            # choose the action with the epsilon of the current epoch
            a_enc = e_greedy_pol_v1(s_enc, q_values, eps = epsilons[epochs])
            a_dec = decode(a_enc, 4) # 4 singular actions
            
            sp_dec, r = turn_v2(s_dec, a_dec) # interacts with the environment 
            sp_enc = encode(sp_dec, 25) # encode to scalar state
            reward = reward + r
            
            # greedy action in the next state, used as bootstrap target in the update
            a_temp_enc = greedy_pol_v0(sp_enc, q_values)
            a_temp_dec = decode(a_temp_enc, 4) # not used afterwards in this cell

            q_values = update_q_v0(s_enc, a_enc, r, sp_enc, a_temp_enc, q_values, gamma = discount_factor)
            
            # update states
            s_enc = sp_enc
            s_dec = sp_dec
            
            
            # terminal states are 2 and 24
            # the episode ends when the two components of s_dec are 2 and 24, in either order
            if (s_dec[0] == 2 and s_dec[1] == 24) or (s_dec[1] == 2 and s_dec[0] == 24):
                #print("Terminal state reached at step %d."%steps)
                reward_progress[i] = reward
                game_progress[i] = steps
                terminal_states[:,i] = s_dec
                break
            # the string literal below is a disabled block of code kept as a no-op expression
            """if (s_dec[0] == s_dec[1]):
                #print("Agent crushed one on each other.") 
                #game_progress[i] = 100 # assign 100 equals to failure (and the -100 reward accounts for the accident)
                #break"""
            # hard cap of 100 steps per episode
            if steps >= 100:
                #print("Too much time has passed. Game Over.")
                reward_progress[i] = reward
                game_progress[i] = steps
                terminal_states[:,i] = s_dec
                break
    
    print("Average reward per episode in epoch %d: "%epochs, reward_progress.mean())
    reward_score[epochs-1] = reward_progress.mean() # epochs is 1-based here, hence the -1

    if epochs >= max_epochs:
        print("Hey, I think you've had enough! Let's stop here.")
        break