# TODO LIST

1. Implement the new state encoding for multiple agents;
2. Implement shipyard policy from `predict_reward`;
3. Try to run the multiple agents training for a random policy of the shipyard;
4. Implement the framework to train the shipyard (memory buffer + regression);
5. Use some higher level interface for the learning of the two classes, that allows more clarity and concision and also set up things so that the training can be done with continuity through multiple sessions (because it is likely that it will take several days to get a good result);
6. Tune the learning parameters (epochs, batches, learning rate and so on...);
7. Do a time profiling of the learning procedure to understand where is the bottleneck;
8. Optimize all the implementations by tackling the bottlenecks.

# Shipyard implementation

In [149]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, "../Environment/")
import halite_env as Env
from sklearn.preprocessing import PolynomialFeatures
import encode as cod

In [245]:
from importlib import reload
# Re-import the local `encode` module (aliased as `cod`) so that edits made to
# it on disk are picked up without restarting the kernel.
reload(cod)

<module 'encode' from '/home/nicola/Nicola_unipd/QuartoAnno/TODO/Baiesi/RL/haliteRL/Tabular_methods/encode.py'>

In [142]:
def predict_reward(weights, state, h_tot, steps, tot_turns = 400, map_size = 7):
    """Score the current shipyard situation with a degree-2 polynomial model.

    Four scalar variables are extracted from the situation:
    the number of non-zero cells in the ship layer `state[:,:,1]`, the number
    of turns left (`tot_turns - steps`), the available halite `h_tot`, and the
    sum of the ship layer over the shipyard cell and its four neighbours.
    They are expanded with `PolynomialFeatures(2)` (bias, linear and quadratic
    terms) and the dot product with `weights` is returned.

    `weights` must therefore hold one entry per expanded feature.
    The shipyard position is read from `state[:,:,3]` through the `encode`
    helpers.
    """
    from sklearn.preprocessing import PolynomialFeatures # check if needed
    import encode as cod # check if needed

    ship_layer = state[:,:,1]
    n_ships = np.count_nonzero(ship_layer)
    turns_left = tot_turns - steps

    # Shipyard coordinates, followed by the four cells one step away from it.
    yard_pos = cod.decode(cod.one_to_index(state[:,:,3], map_size), map_size)
    cells = [yard_pos] + [yard_pos + step for step in ([1,0], [-1,0], [0,1], [0,-1])]

    near_mask = np.zeros((map_size, map_size), dtype = bool)
    for cell in cells:
        near_mask[tuple(cell)] = True

    # Ships sitting on the shipyard or on a cell adjacent to it.
    n_near = ship_layer[near_mask].sum()

    # [a,b] gets mapped to [1,a,b,a^2,ab,b^2]; here the same is done with four variables.
    features = np.array([n_ships, turns_left, h_tot, n_near])
    expanded = PolynomialFeatures(2).fit_transform(features.reshape(1,-1))[0]

    return np.dot(weights, expanded)

In [290]:
def shipy_policy(weights, state, h_tot, steps, epsilon, tot_turns = 400, map_size = 7):
    """Decide whether the shipyard should spawn a ship this turn.

    Parameters
    ----------
    weights : weights forwarded to `predict_reward`.
    state : map state; layer 1 holds the ships and layer 3 the shipyard.
    h_tot : halite currently available to the player.
    steps : number of the current turn.
    epsilon : probability of taking a uniformly random decision.
    tot_turns, map_size : forwarded to `predict_reward`.

    Returns
    -------
    False when `h_tot` is below 1000 or a ship already sits on the shipyard.
    Otherwise a random choice with probability `epsilon`, else True exactly
    when the predicted reward is positive.
    """
    # Ship layer restricted to the shipyard cell(s). `.any()` keeps the test
    # well defined even if the shipyard layer marks zero or several cells.
    ship_on_shipyard = state[:,:,1][state[:,:,3].astype(bool)].astype(bool).any()
    if (h_tot < 1000) or ship_on_shipyard:
        return False

    u = np.random.rand()
    if u < epsilon:
        return np.random.choice([True,False])

    # The prediction is only needed for the greedy decision, so it is computed
    # here, and with the caller's tot_turns / map_size rather than fixed values.
    r = predict_reward(weights, state, h_tot, steps, tot_turns = tot_turns, map_size = map_size)
    return bool(r > 0)

## Memorizing trajectories

In order to learn, we must save, for each episode, all the (N,t_left,h_tot,near_ships) tuples, one for each time a ship was spawned. Of course this number can change from episode to episode, up to a maximum of 400 tuples. At the end of the episode, the difference between the initial and the final amount of halite must also be saved, associated with the episode trajectory. Since we want to learn from at least 100 episodes (we could also consider training on all the data from all the epochs, up to $n\_epochs\times100$ episodes), the simplest container for all this experience is a list of tuples, each one containing a list (or a numpy array) of all the states (N,t_left,h_tot,near_ships) of the episode in one entry and the reward obtained in the other entry.

From that we will build the associated polynomial states $\underline{\tilde{s}_i}$ and also sum, feature by feature, all those states in an episode in order to form $\underline{\tilde{S}_j}$, which will be used together with $R_j$ to estimate $\underline{w}$.

In [180]:
from importlib import reload
# Re-import the local `encode` module (aliased as `cod`) so that edits made to
# it on disk are picked up without restarting the kernel.
reload(cod)

<module 'encode' from '/home/nicola/Nicola_unipd/QuartoAnno/TODO/Baiesi/RL/haliteRL/Tabular_methods/encode.py'>

In [181]:
# Build a single-player environment on a square map for the manual checks that follow.
num_players = 1
map_size = 7 # 7 x 7 map
env = Env.HaliteEnv(num_players, map_size)

In [182]:
# First step: every cell gets the code -1 and the shipyard is asked to make a ship.
action_matrix = np.full((map_size,map_size), -1) 
shipyard_action = 1 
state, h, finish, _ = env.step(action_matrix, makeship = shipyard_action)
# s_0 -> map_halite, s_1 -> ship_position, s_2 -> cargo_halite, s_3 -> shipyard_position (not used)
# The last layer is printed as the ship labels (presumably the ship IDs - confirm against the environment).
print("Ships label: \n", state[:,:,-1])
print("Halite:", h[0][0])

Ships label: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Halite: 4000.0


In [183]:
# One scalar action per ship, turned into a map-shaped action matrix.
a = [1]
action_matrix = cod.multi_scalar_to_matrix_action(a, state)
print("Action matrix: \n", action_matrix)
# NOTE(review): the recorded output below shows a second ship label and 1000
# less halite after this step, so the remark on the next line may be stale.
shipyard_action = 1 # here it does not create the ship
state, h, finish, _ = env.step(action_matrix, makeship = shipyard_action)
print("Ships label: \n", state[:,:,-1])
print("Halite:", h[0][0])

pos_dec:  [3 3]
Action matrix: 
 [[-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1  1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]]
Ships label: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 2 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Halite: 3000.0


In [184]:
# Same check with two ships on the map: one scalar action per ship.
a = [1,1] #[2]
action_matrix = cod.multi_scalar_to_matrix_action(a, state)
print("Action matrix: \n", action_matrix)
shipyard_action = 1
state, h, finish, _ = env.step(action_matrix, makeship = shipyard_action)
print("Ships label: \n", state[:,:,-1])
print("Halite:", h[0][0])

pos_dec:  [3 3]
pos_dec:  [4 3]
Action matrix: 
 [[-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1  1 -1 -1 -1]
 [-1 -1 -1  1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1]]
Ships label: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 3 0 0 0]
 [0 0 0 2 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]]
Halite: 2000.0


In [185]:
# Show the ship layer and convert it to one encoded position per ship.
print(state[:,:,1])
#position_encoded of the ship
ships_pos_enc = cod.one_to_index(state[:,:,1], map_size) # works for multiple ships
print("Encoded position of the ship: ", ships_pos_enc)

[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]]
Encoded position of the ship:  [24 31 38]


In [186]:
# Print the result of safest_dir for every ship, in the order of the encoded positions.
# NOTE(review): safest_dir is not defined in the visible part of this notebook.
for pos_enc in ships_pos_enc:
    print(safest_dir(pos_enc, state))

3
3
4


In [187]:
# Read layer 4 at the cells where the ship layer is non-zero; boolean indexing
# returns the values in row-major order of the map.
ship_ids = state[:,:,4][state[:,:,1].astype(bool)]
print("IDs in order of position from top-left to bottom-right: ", ship_ids)

IDs in order of position from top-left to bottom-right:  [3 2 1]


In [188]:
# Display the ship layer of the current state.
state[:,:,1]

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [192]:
# the order is the one of the position, not of the IDs
# One encoded state per ship is returned (see the list printed by the same call further down).
new_encoded_states = cod.encode_multi_state(state, map_size = 7, h_lev = 3, n_actions = 5, debug = False)

In [234]:
# Scratch check: given values `s` labelled by the IDs in `index`, find for
# every ID its position inside the sorted list of IDs.
s = [1,2,3]
index = [2,4,1]
index_sort = sorted(index)
print(index_sort)

index = np.asarray(index)
index_sort = np.asarray(index_sort)

# I[i,j] = index[i] and Is[i,j] = index_sort[j], so mask[i,j] is True exactly
# where the i-th ID equals the j-th sorted ID.
I = np.repeat(index.reshape(-1,1), len(index), axis = 1)
Is = np.repeat(index_sort.reshape(1,-1), len(index_sort), axis = 0)

mask = np.equal(I, Is)
# column of the True entry in each row, read row by row
_, jj = np.nonzero(mask)
print(jj)
for col in jj:
    print("s[%d] : "%col, s[col])
    print("s[%d] ID : "%col, index[col])

# the order is shuffled, but it's good

[1, 2, 4]
[1 2 0]
s[1] :  2
s[1] ID :  4
s[2] :  3
s[2] ID :  1
s[0] :  1
s[0] ID :  2


In [161]:
# getting the position given the ID
import copy
ID = ship_ids[0]
# True only on the cells whose value in layer 4 equals the chosen ID
mask = (state[:,:,4] == ID)
# work on a deep copy so that `state` itself is left untouched
one_state = copy.deepcopy(state)
one_state[:,:,1][~mask] = 0 # map it to a one-ship state
one_state[:,:,1]

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [168]:
# Display the encoded states computed from the full multi-ship state.
cod.encode_multi_state(state, map_size = 7, h_lev = 3, n_actions = 5, debug = False)

[350992, 454854, 558718]

# Implementation

In [281]:
#@@@@@@@@@@@@@@@@@@@
# RL agent functions
#@@@@@@@@@@@@@@@@@@@

def greedy_policy(s, q_values):
    """Return the index of the largest Q-value stored for the encoded state `s`."""
    action_values = q_values[s]
    return np.argmax(action_values)

def e_greedy_policy(s, q_values, eps = 0.01):
    """Epsilon-greedy action for the encoded state `s`.

    A uniform number in [0, 1) is drawn first. If it is larger than `eps`
    the index of the largest Q-value of `s` is returned; otherwise an action
    index is drawn uniformly among the entries of `q_values[s]`.
    """
    draw = np.random.rand()
    if draw > eps:
        # exploit
        return np.argmax(q_values[s])
    # explore
    n_choices = len(q_values[s])
    return np.random.randint(0, n_choices)

    
def update_q_v0(s, a, r, sp, ap, q_values, gamma = 1):
    """Overwrite Q(s, a) with the one-step target r + gamma*Q(sp, ap).

    The table is modified in place and also returned.
    """
    bootstrap = q_values[sp,ap]
    q_values[s,a] = r + gamma*bootstrap
    return q_values

def update_q_v1(s, a, r, sp, ap, q_values, gamma = 1, n_cells = 49, h_lev = 3, n_actions = 5, alpha = 0.1):
    """Soft Q-value update (step size `alpha`) for the three-component state encoding.

    Both `s` and `sp` are decoded; when the first decoded component of `sp`
    equals the central cell index while that of `s` does not, `sp` is treated
    as terminal and the target is just `r`. Otherwise the target is
    r + gamma*Q(sp, ap). The table is modified in place and returned.
    """
    # NOTE(review): decode3D is called unqualified and is not defined in the
    # visible part of this file (update_q_v2 uses cod.decode4D) - confirm it is in scope.
    sizes = dict(L1 = n_cells, L2 = h_lev**6, L3 = n_actions-1)
    s_dec = decode3D(s, **sizes)
    sp_dec = decode3D(sp, **sizes)
    shipy_pos = (n_cells-1)/2 # shipyard is at the center of the map
    entered_shipyard = (sp_dec[0] == shipy_pos and s_dec[0] != shipy_pos)
    if entered_shipyard:
        # sp is terminal -> its Q-values count as 0 for every action ap
        target = r
    else:
        target = r + gamma*q_values[sp,ap]
    q_values[s,a] = (1-alpha)*q_values[s,a] + alpha*target
    return q_values

def update_q_v2(s, a, r, sp, ap, q_values, gamma = 1, n_cells = 49, h_lev = 3, n_actions = 5, alpha = 0.1):
    """Soft Q-value update (step size `alpha`) for the four-component state encoding.

    Both `s` and `sp` are decoded with `cod.decode4D`; when the first decoded
    component of `sp` equals the central cell index while that of `s` does
    not, `sp` is treated as terminal and the target is just `r`. Otherwise the
    target is r + gamma*Q(sp, ap). The table is modified in place and returned.
    """
    sizes = dict(L1 = n_cells, L2 = h_lev**6, L3 = n_actions-1, L4 = n_actions)
    s_dec = cod.decode4D(s, **sizes)
    sp_dec = cod.decode4D(sp, **sizes)
    shipy_pos = (n_cells-1)/2 # shipyard is at the center of the map
    entered_shipyard = (sp_dec[0] == shipy_pos and s_dec[0] != shipy_pos)
    if entered_shipyard:
        # sp is terminal -> its Q-values count as 0 for every action ap
        target = r
    else:
        target = r + gamma*q_values[sp,ap]
    q_values[s,a] = (1-alpha)*q_values[s,a] + alpha*target
    return q_values

In [173]:
#@@@@@@@@@@@@@@@@@@@@@@
# Environment variables
#@@@@@@@@@@@@@@@@@@@@@@
NUM_PLAYERS = 1
MAP_SIZE = 7 # 7 x 7 map
TOT_TURNS = 400 # number of turns for each episode

#@@@@@@@@@@@@@@@@
# State variables
#@@@@@@@@@@@@@@@@
H_LEV = 3 # halite levels
N_CELLS = MAP_SIZE**2 # number of cells in a square map
N_ACTIONS = 5 # no dropoffs, 1 action for staying still, 4 for moving in the cardinal directions
# Size of the encoded state space: the product of the four sizes that
# update_q_v2 passes to cod.decode4D (N_CELLS, H_LEV**6, N_ACTIONS-1, N_ACTIONS).
N_STATES = N_CELLS*H_LEV**6*(N_ACTIONS-1)*N_ACTIONS
print("Total number of states to be experienced: ", N_STATES)

#@@@@@@@@@@@@@@@@@@@@
# Learning parameters
#@@@@@@@@@@@@@@@@@@@@
#N_BATCH = 50 #100 # number of episodes in an epoch
#MAX_EPOCHS = 300 # max number of epochs played before stopping (500 ~ 7.3 hours of training)
DISCOUNT_FACTOR = 1 - 1/TOT_TURNS #train ships as if each turn has a probability of 1/tot_turns of ending the game 
# Per-step reward of a ship; the training loop adds the other reward terms to it.
STD_REWARD = -0.01
# Step size of the Q-value updates (passed as `alpha` in the training loop).
LEARNING_RATE = 0.1

Total number of states to be experienced:  714420


In [230]:
# One row per encoded state and one column per action.
q_values = np.zeros((N_STATES,N_ACTIONS)) #initialize to zero
#q_values = np.load("Q_values.npy") # or re-use the one already trained
from sklearn.preprocessing import PolynomialFeatures
SHIPY_FEATURES = 4
# Number of monomials of degree <= 2 in SHIPY_FEATURES variables (bias included).
w_len = int((SHIPY_FEATURES+1)*(SHIPY_FEATURES+2)/2)
print("Number of parameters for the shipyard policy: ", w_len)
sigmas = np.array([1/N_CELLS, 1/TOT_TURNS, 1/5000, 1/5]) # inverse of the maximum/initial values of each feature
# Expanding the scales with the same polynomial map gives one scale per weight,
# used below as the standard deviation of that weight's initial value.
poly = PolynomialFeatures(2)
poly_sigmas = poly.fit_transform(sigmas.reshape(1,-1))[0]
weights = np.random.normal(scale = poly_sigmas) # random initial weights
print("Initial random weights: \n", weights)

Number of parameters for the shipyard policy:  15
Initial random weights: 
 [-5.47746770e-01  3.90146919e-02  2.18939524e-03  2.71884991e-04
  6.33589112e-01 -8.87592595e-04 -3.42105971e-05 -3.95667779e-06
  7.55953505e-03  1.18358633e-05  1.10878800e-07 -2.93533450e-04
  2.46145262e-08  5.40365166e-06 -6.11596916e-03]


In [291]:
def multi_ship_agent_training(q_values, weights, shipy_eps, show = True, N_BATCH = 20, MAX_EPOCHS = 300):
    
    halite_score = np.zeros(MAX_EPOCHS)
    epochs = 0
    
    if show:
        # visualize online the results
        %matplotlib notebook
        fig = plt.figure(figsize = (6,8))
        plt.ion()

        ax1 = fig.add_subplot(211)
        ax1.set_xlim(0,MAX_EPOCHS+1)
        #ax1.set_ylim(0,8000)
        ax1.set_xlabel("Epochs", fontsize = 14)
        ax1.set_ylabel("Halite collected", fontsize = 14)


        ax2 = fig.add_subplot(212)
        ax2.set_xlim(0,MAX_EPOCHS+1)
        ax2.set_ylim(0,0.5)
        ax2.set_xlabel("Epochs", fontsize = 14)
        ax2.set_ylabel("Epsilon", fontsize = 14)

        plt.tight_layout()
        fig.show()
        fig.canvas.draw()

    from tqdm import tnrange

    for k in tnrange(MAX_EPOCHS):
        #@@@@@@@@@@@@@@@@@@@@@@
        # here starts an epoch
        #@@@@@@@@@@@@@@@@@@@@@@
        epochs = epochs + 1
        halite_progress = np.zeros(N_BATCH) # bunch of 100 episodes
        eps = 0.5 # starting value of epsilon
        # generate an adaptive epsilon greedy algorithm, calibrated in order to have epsilon = 10^-4 at the last epoch
        epsilons = np.array(list(map(lambda i : eps*np.exp(-i*2*np.log(10)/MAX_EPOCHS), np.arange(0,MAX_EPOCHS+1))))

        for i in range(N_BATCH):
            #@@@@@@@@@@@@@@@@@@@@@@@@
            # here starts an episode
            #@@@@@@@@@@@@@@@@@@@@@@@@
            env = Env.HaliteEnv(NUM_PLAYERS, MAP_SIZE, episode_lenght = TOT_TURNS) # init environment
            steps = 0

            # first mandatory step
            steps = steps + 1
            print("\nStep number %d:"%steps)
            action_matrix = np.full((MAP_SIZE,MAP_SIZE), -1) # no ship, no action
            shipyard_action = 1 # initially always choose to create a ship
            # returns the matricial state, the array of players halite and a flag that is true if it's the final turn
            state, players_halite, finish, _ = env.step(action_matrix, makeship = shipyard_action) 
            print("Ship labels: \n", state[:,:,4])
            current_halite = players_halite[0][0]
            encoded_states = cod.encode_multi_state(state, map_size = MAP_SIZE, h_lev = H_LEV, n_actions = N_ACTIONS, debug=False)
            encoded_states = np.array(encoded_states)
            current_ship_ids = state[:,:,4][state[:,:,1].astype(bool)]
            print("Initial ship IDs: ", current_ship_ids)
            
            while True:
                steps = steps + 1
                print("\nStep number %d:"%steps)
                print("Current halite: ", current_halite)
                actions = []
                for j in range(len(encoded_states)):
                    a_enc = e_greedy_policy(encoded_states[j], q_values, eps = epsilons[epochs])
                    actions.append(a_enc)
                a_mat = cod.multi_scalar_to_matrix_action(actions, state, map_size = MAP_SIZE)

                shipyard_action = shipy_policy(weights, state, current_halite, steps, epsilon = shipy_eps,
                                               tot_turns = TOT_TURNS, map_size = MAP_SIZE)

                # submit the action and get the new state
                state, players_halite, finish, _ = env.step(a_mat, makeship = shipyard_action) 
                print("Ship labels: \n", state[:,:,4])
                new_halite = players_halite[0][0]
                print("New halite: ", new_halite)
                new_ship_ids = state[:,:,4][state[:,:,1].astype(bool)]
                print("New ships IDs: ", new_ship_ids)
                new_encoded_states = cod.encode_multi_state(state, map_size = 7, h_lev = 3,
                                                            n_actions = 5, debug = False)
                new_encoded_states = np.array(new_encoded_states)
                
                rewards = [] # rewards are stored following the position order
                for j in range(len(current_ship_ids)):
                    if current_ship_ids[j] not in new_ship_ids:
                        r = -1 + STD_REWARD*(TOT_TURNS-steps)
                    else:
                        mask = (new_ship_ids == current_ship_ids[j])
                        sp_dec = cod.decode4D(new_encoded_states[mask], 
                                              L1 = N_CELLS, L2 = H_LEV**6, L3 = N_ACTIONS-1, L4 = N_ACTIONS)
                        shipy_pos = (N_CELLS-1)/2 #shipyard is at the center of the map
                        if sp_dec[0] == shipy_pos:
                            r = (new_halite - current_halite)/1000 + STD_REWARD
                        else:
                            r = STD_REWARD
                    rewards.append(r)

                for j in range(len(encoded_states)):
                    s_enc = encoded_states[j]
                    a_enc = actions[j]
                    r = rewards[j]
                    if current_ship_ids[j] in new_ship_ids:
                        # simulate the best action in the new state (before update)
                        mask = (new_ship_ids == current_ship_ids[j])
                        print("Mask: ", mask)
                        print("new_encoded_states: ", new_encoded_states)
                        sp_enc = new_encoded_states[mask]
                        a_temp_enc = greedy_policy(sp_enc, q_values) 
                        # update Q-values
                        q_values = update_q_v2(s_enc, a_enc, r, sp_enc, a_temp_enc, q_values, 
                                               gamma = DISCOUNT_FACTOR, n_cells = N_CELLS, h_lev = H_LEV, 
                                               n_actions = N_ACTIONS, alpha = LEARNING_RATE)
                    else:
                        q_values[s_enc,a_enc] = (1-LEARNING_RATE)*q_values[s_enc,a_enc] + LEARNING_RATE*r 
                    
                # update states and halite
                encoded_states = new_encoded_states
                current_halite = new_halite
                current_ship_ids = new_ship_ids
                # TO DO: case in which 2 or more ships crash

                if (finish == True) or (steps >= 400):
                    #print("End episode.")
                    halite_progress[i] = current_halite - 4000
                    break
            #break # play just 1 episode

        #break # play just 1 epoch

        halite_score[epochs-1] = halite_progress.mean()

        if show:
            ax1.scatter(epochs, halite_score[epochs-1], s = 10, color='blue')
            #ax2.scatter(epochs, reward_score[epochs-1], s = 10,color='blue')
            ax2.scatter(epochs, epsilons[epochs], s = 10, color='blue')
            #ax4.scatter(epochs, shipy_pass[epochs-1], s = 10, color='blue')
            fig.canvas.draw()
        else:
            #print("Average reward per episode in epoch %d: %.3f"%(epochs, reward_progress.mean()))
            print("Average halite collected per episode in epoch %d: %.3f"%(epochs,halite_progress.mean()))
            print("Espilon value: %.4f \n"%epsilons[epochs])

        if epochs >= MAX_EPOCHS:
            print("Hey, I think you've had enough! Let's stop here.")
            break

In [292]:
from importlib import reload
# Re-import the local `encode` module (aliased as `cod`) so that edits made to
# it on disk are picked up without restarting the kernel.
reload(cod)

<module 'encode' from '/home/nicola/Nicola_unipd/QuartoAnno/TODO/Baiesi/RL/haliteRL/Tabular_methods/encode.py'>

In [293]:
# Run the training without live plots; the shipyard takes a random decision half of the time it is free to choose.
multi_ship_agent_training(q_values, weights, shipy_eps = 0.5, show = False, N_BATCH = 20, MAX_EPOCHS = 300)

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


Step number 1:
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
Initial ship IDs:  [1]

Step number 2:
Current halite:  4000.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  4000.0
New ships IDs:  [1]
Mask:  [ True]
new_encoded_states:  [351521]

Step number 3:
Current halite:  4000.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  4000.0
New ships IDs:  [1]
Mask:  [ True]
new_encoded_states:  [251609]

Step number 4:
Current halite:  4000.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  3000.0
New ships IDs:  [1 2]
Mask:  [ True False]
new_encoded_states:  [261329 351521]

Step number 5:
Current halite:


Step number 25:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 5 0 0 0]
 [0 0 4 0 2 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [5 4 2]
Mask:  [ True False False]
new_encoded_states:  [351529 451964 481124]
Mask:  [False  True False]
new_encoded_states:  [351529 451964 481124]
Mask:  [False False  True]
new_encoded_states:  [351529 451964 481124]

Step number 26:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0]
 [0 4 0 5 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 4 5]
Mask:  [False False  True]
new_encoded_states:  [379026 437362 456463]
Mask:  [False  True False]
new_encoded_states:  [379026 437362 456463]
Mask:  [ True False False]
new_encoded_states:  [379026 437362 456463]

Step number 27:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0]
 [4 0 0 5 0 0 0


Step number 45:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 2 0 0 0]
 [0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 5 4]
Mask:  [ True False False]
new_encoded_states:  [150649 246229 646361]
Mask:  [False  True False]
new_encoded_states:  [150649 246229 646361]
Mask:  [False False  True]
new_encoded_states:  [150649 246229 646361]

Step number 46:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0]
 [0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 5 4]
Mask:  [ True False False]
new_encoded_states:  [165236 244607 544304]
Mask:  [False  True False]
new_encoded_states:  [165236 244607 544304]
Mask:  [False False  True]
new_encoded_states:  [165236 244607 544304]

Step number 47:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0]
 [0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0

Mask:  [ True False False]
new_encoded_states:  [121494 529722 573464]
Mask:  [False  True False]
new_encoded_states:  [121494 529722 573464]
Mask:  [False False  True]
new_encoded_states:  [121494 529722 573464]

Step number 65:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 5 0 0 0 2 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 5 2]
Mask:  [ True False False]
new_encoded_states:  [121492 529724 588043]
Mask:  [False  True False]
new_encoded_states:  [121492 529724 588043]
Mask:  [False False  True]
new_encoded_states:  [121492 529724 588043]

Step number 66:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [4 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 5 0 0 0 2 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 5 2]
Mask:  [ True False False]
new_encoded_states:  [105654 529721 588041]
Mask:  [False  True False]
new_encoded_states:  [105654 529721 5880

Mask:  [ True False False]
new_encoded_states:  [223387 281867 411476]
Mask:  [False  True False]
new_encoded_states:  [223387 281867 411476]
Mask:  [False False  True]
new_encoded_states:  [223387 281867 411476]

Step number 85:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 2 0]
 [0 5 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [4 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 5 4]
Mask:  [False  True False]
new_encoded_states:  [179808 223387 411479]
Mask:  [ True False False]
new_encoded_states:  [179808 223387 411479]
Mask:  [False False  True]
new_encoded_states:  [179808 223387 411479]

Step number 86:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 5 0 0 0 0 0]
 [4 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 5 4]
Mask:  [ True False False]
new_encoded_states:  [ 77749 325549 411479]
Mask:  [False  True False]
new_encoded_states:  [ 77749 325549 4114

Mask:  [False  True False]
new_encoded_states:  [234349 427656 514976]
Mask:  [False False  True]
new_encoded_states:  [234349 427656 514976]

Step number 104:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0]
 [0 5 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 5 4]
Mask:  [ True False False]
new_encoded_states:  [234349 325546 529721]
Mask:  [False  True False]
new_encoded_states:  [234349 325546 529721]
Mask:  [False False  True]
new_encoded_states:  [234349 325546 529721]

Step number 105:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [5 0 2 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [5 2 4]
Mask:  [False  True False]
new_encoded_states:  [310138 338089 631794]
Mask:  [ True False False]
new_encoded_states:  [310138 338089 631794]
Mask:  [False False  True]
new_encoded_states:  [310138 338089 63

Mask:  [False  True False]
new_encoded_states:  [ 19413 194397 234349]
Mask:  [False False  True]
new_encoded_states:  [ 19413 194397 234349]

Step number 124:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 5 0 0 0 2]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [5 2 4]
Mask:  [False False  True]
new_encoded_states:  [134994 194399 631793]
Mask:  [False  True False]
new_encoded_states:  [134994 194399 631793]
Mask:  [ True False False]
new_encoded_states:  [134994 194399 631793]

Step number 125:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 5 0 0 0]
 [0 0 0 0 0 0 2]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [5 2 4]
Mask:  [ True False False]
new_encoded_states:  [150646 296339 631792]
Mask:  [False  True False]
new_encoded_states:  [150646 296339 631792]
Mask:  [False False  True]
new_encoded_states:  [150646 296339 63

Mask:  [False False  True]
new_encoded_states:  [121492 338089 675522]

Step number 143:
Current halite:  467.0
Ship labels: 
 [[0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0]]
New halite:  467.0
New ships IDs:  [4 5 2]
Mask:  [ True False False]
new_encoded_states:  [ 19414 338088 675523]
Mask:  [False  True False]
new_encoded_states:  [ 19414 338088 675523]
Mask:  [False False  True]
new_encoded_states:  [ 19414 338088 675523]

Step number 144:
Current halite:  467.0
Ship labels: 
 [[4 0 0 0 2 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 2 5]
Mask:  [ True False False]
new_encoded_states:  [  3233  63176 338087]
Mask:  [False False  True]
new_encoded_states:  [  3233  63176 338087]
Mask:  [False  True False]
new_encoded_states:  [  3233  63176 338087]

Step number 145:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0


Step number 162:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [4 0 0 6 0 0 0]
 [0 0 0 0 0 5 2]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 6 5 2]
Mask:  [ True False False False]
new_encoded_states:  [310138 350968 485984 500501]
Mask:  [False  True False False]
new_encoded_states:  [310138 350968 485984 500501]
Mask:  [False False  True False]
new_encoded_states:  [310138 350968 485984 500501]
Mask:  [False False False  True]
new_encoded_states:  [310138 350968 485984 500501]

Step number 163:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 4 0 6 0 0 0]
 [0 0 0 0 0 2 5]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 6 2 5]
Mask:  [ True False False False]
new_encoded_states:  [325549 350969 485984 500502]
Mask:  [False  True False False]
new_encoded_states:  [325549 350969 485984 500502]
Mask:  [False False False  True]
new_encoded_states:  

Mask:  [ True False False False]
new_encoded_states:  [ 19413 383927 539444 616678]
Mask:  [False False  True False]
new_encoded_states:  [ 19413 383927 539444 616678]
Mask:  [False  True False False]
new_encoded_states:  [ 19413 383927 539444 616678]
Mask:  [False False False  True]
new_encoded_states:  [ 19413 383927 539444 616678]

Step number 178:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0]
 [0 0 6 0 0 0 0]
 [4 5 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [2 6 4 5]
Mask:  [False False False  True]
new_encoded_states:  [383928 553843 616677 631792]
Mask:  [ True False False False]
new_encoded_states:  [383928 553843 616677 631792]
Mask:  [False  True False False]
new_encoded_states:  [383928 553843 616677 631792]
Mask:  [False False  True False]
new_encoded_states:  [383928 553843 616677 631792]

Step number 179:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]


Mask:  [False  True]
new_encoded_states:  [165228 602639]

Step number 196:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 6]]
New halite:  467.0
New ships IDs:  [4 6]
Mask:  [ True False]
new_encoded_states:  [179806 704696]
Mask:  [False  True]
new_encoded_states:  [179806 704696]

Step number 197:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 6]]
New halite:  467.0
New ships IDs:  [4 6]
Mask:  [ True False]
new_encoded_states:  [281863 704696]
Mask:  [False  True]
new_encoded_states:  [281863 704696]

Step number 198:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 6]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 4 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [6 4]
Mask:  [False  True]
new_encoded_states:  [ 92216 281864]
M


Step number 221:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 6 0 0]
 [4 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [6 4]
Mask:  [ True False]
new_encoded_states:  [470304 514977]
Mask:  [False  True]
new_encoded_states:  [470304 514977]

Step number 222:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [4 0 0 0 0 6 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [4 6]
Mask:  [False  True]
new_encoded_states:  [411479 485984]
Mask:  [ True False]
new_encoded_states:  [411479 485984]

Step number 223:
Current halite:  467.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 6 0]
 [0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  467.0
New ships IDs:  [6 4]
Mask:  [False  True]
new_encoded_states:  [383922 427598]
Mask:  [ True False]
new_encoded_states:  [383922 427598]



Mask:  [ True False False]
new_encoded_states:  [223389 350966 393641]
Mask:  [False  True False]
new_encoded_states:  [223389 350966 393641]
Mask:  [False False  True]
new_encoded_states:  [223389 350966 393641]

Step number 246:
Current halite:  486.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [6 0 0 0 0 0 0]
 [0 0 0 0 0 0 7]
 [0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  486.0
New ships IDs:  [6 7 4]
Mask:  [ True False False]
new_encoded_states:  [205737 408221 454824]
Mask:  [False False  True]
new_encoded_states:  [205737 408221 454824]
Mask:  [False  True False]
new_encoded_states:  [205737 408221 454824]

Step number 247:
Current halite:  486.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [6 0 0 0 0 0 0]
 [0 0 0 4 0 0 7]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  486.0
New ships IDs:  [6 4 7]
Mask:  [ True False False]
new_encoded_states:  [205738 350968 408224]
Mask:  [False False  True]
new_encoded_states:  [205738 350968 40


Step number 265:
Current halite:  486.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 6 0 0 0 0 0]
 [0 0 0 0 0 0 7]
 [0 0 0 0 0 0 0]]
New halite:  486.0
New ships IDs:  [4 6 7]
Mask:  [ True False False]
new_encoded_states:  [160368 427598 612359]
Mask:  [False False  True]
new_encoded_states:  [160368 427598 612359]
Mask:  [False  True False]
new_encoded_states:  [160368 427598 612359]

Step number 266:
Current halite:  486.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [7 6 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  486.0
New ships IDs:  [4 7 6]
Mask:  [ True False False]
new_encoded_states:  [160367 524157 529733]
Mask:  [False False  True]
new_encoded_states:  [160367 524157 529733]
Mask:  [False  True False]
new_encoded_states:  [160367 524157 529733]

Step number 267:
Current halite:  486.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 

Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  492.0
New ships IDs:  []

Step number 354:
Current halite:  492.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  492.0
New ships IDs:  []

Step number 355:
Current halite:  492.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  492.0
New ships IDs:  []

Step number 356:
Current halite:  492.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halite:  492.0
New ships IDs:  []

Step number 357:
Current halite:  492.0
Ship labels: 
 [[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
New halit

In [294]:
def get_shipy_state(state, steps, tot_turns = 400, map_size = 7, h_tot = None):
    """Build the 4-feature state vector used by the shipyard.

    Parameters
    ----------
    state : np.ndarray
        Matricial game state indexed as ``state[row, col, layer]``.
        Assumes layer 1 marks ship presence and layer 3 is a one-hot map of
        the shipyard position (this is how the layers are used below) --
        TODO confirm against ``halite_env``.
    steps : int
        Number of turns already played in the episode.
    tot_turns : int, optional
        Total number of turns of an episode.
    map_size : int, optional
        Side length of the square map.
    h_tot : float, optional
        Halite currently available to the player. When omitted, the value is
        looked up in the module/notebook globals under the name ``h_tot``,
        which is what the function implicitly did before this parameter
        existed. Prefer passing it explicitly.

    Returns
    -------
    np.ndarray
        ``[n_ships, t_left, h_tot, near_ships]`` where ``near_ships`` is the
        sum of the ship layer over the shipyard cell and its 4 neighbours.

    Raises
    ------
    NameError
        If ``h_tot`` is not passed and no global ``h_tot`` is defined.
    """
    import encode as cod # check if needed

    if h_tot is None:
        # Legacy call sites do not pass h_tot: keep resolving it from the
        # globals, but fail with an explicit message if it is missing.
        try:
            h_tot = globals()["h_tot"]
        except KeyError:
            raise NameError(
                "name 'h_tot' is not defined: pass h_tot=<available halite> "
                "to get_shipy_state"
            ) from None

    ship_layer = state[:, :, 1]
    n_ships = np.count_nonzero(ship_layer) # number of ships in the map
    t_left = tot_turns - steps # number of turns left until the end of the episode

    shipy_enc = cod.one_to_index(state[:, :, 3], map_size)
    shipy_pos = np.asarray(cod.decode(shipy_enc, map_size))

    # Shipyard cell plus its four neighbours. The map wraps around (ships
    # cross the border in the game logs), so coordinates are taken modulo
    # map_size; without it a shipyard on the last row/column would index
    # out of bounds.
    offsets = ([0, 0], [1, 0], [-1, 0], [0, 1], [0, -1])
    mask = np.zeros((map_size, map_size), dtype=bool)
    for offset in offsets:
        cell = (shipy_pos + offset) % map_size
        mask[tuple(cell)] = True

    # number of ships that in one move can go to the shipyard
    near_ships = ship_layer[mask].sum()

    return np.array([n_ships, t_left, h_tot, near_ships])

In [None]:
def _init_training_figure(max_epochs):
    """Create the live figure (halite and epsilon vs. epochs) and return (fig, ax1, ax2)."""
    # Equivalent of the `%matplotlib notebook` line magic, written as plain
    # Python so that the cell is also valid outside the IPython parser.
    get_ipython().run_line_magic("matplotlib", "notebook")
    fig = plt.figure(figsize = (6,8))
    plt.ion()

    ax1 = fig.add_subplot(211)
    ax1.set_xlim(0, max_epochs+1)
    #ax1.set_ylim(0,8000)
    ax1.set_xlabel("Epochs", fontsize = 14)
    ax1.set_ylabel("Halite collected", fontsize = 14)

    ax2 = fig.add_subplot(212)
    ax2.set_xlim(0, max_epochs+1)
    ax2.set_ylim(0,0.5)
    ax2.set_xlabel("Epochs", fontsize = 14)
    ax2.set_ylabel("Epsilon", fontsize = 14)

    plt.tight_layout()
    fig.show()
    fig.canvas.draw()
    return fig, ax1, ax2


def shipyard_training(q_values, weights, ship_eps, show = True, N_BATCH = 20, MAX_EPOCHS = 300):
    """Play MAX_EPOCHS x N_BATCH episodes and record the shipyard decisions.

    Ships act through ``greedy_policy(state, q_values)``; the shipyard acts
    through ``shipy_policy`` with an exponentially decaying epsilon. Every
    time the shipyard decides to build a ship, the shipyard state returned by
    ``get_shipy_state`` is appended to the trajectory of the episode.

    Relies on the notebook-level names ``NUM_PLAYERS``, ``MAP_SIZE``,
    ``TOT_TURNS``, ``H_LEV``, ``N_ACTIONS``, ``Env``, ``cod``,
    ``greedy_policy``, ``shipy_policy`` and ``get_shipy_state``.

    Parameters
    ----------
    q_values : passed unchanged to ``greedy_policy`` for every ship.
    weights : passed unchanged to ``shipy_policy``.
    ship_eps : not used by this function; kept in the signature for callers.
    show : bool, optional
        If True, update a live matplotlib figure after every epoch; otherwise
        print the per-epoch averages.
    N_BATCH : int, optional
        Number of episodes played in each epoch.
    MAX_EPOCHS : int, optional
        Number of epochs.

    Returns
    -------
    halite_score : np.ndarray of shape (MAX_EPOCHS,)
        Mean of ``current_halite - 4000`` over the episodes of each epoch.
    shipy_progress : list
        One ``(episode_trajectory, current_halite - 4000)`` tuple per episode.
    """
    from tqdm import tnrange

    halite_score = np.zeros(MAX_EPOCHS)
    # Collected here (it used to be an undefined name) and returned to the caller.
    shipy_progress = []

    # Exponentially decaying epsilon: epsilons[i] = eps * 10**(-2*i/MAX_EPOCHS),
    # i.e. from eps at i = 0 down to eps * 1e-2 at i = MAX_EPOCHS.
    # The schedule does not depend on the epoch, so it is built only once.
    eps = 0.5 # starting value of epsilon
    epsilons = eps*np.exp(-np.arange(0, MAX_EPOCHS+1)*2*np.log(10)/MAX_EPOCHS)

    if show:
        # visualize online the results
        fig, ax1, ax2 = _init_training_figure(MAX_EPOCHS)

    for k in tnrange(MAX_EPOCHS):
        #@@@@@@@@@@@@@@@@@@@@@@
        # here starts an epoch
        #@@@@@@@@@@@@@@@@@@@@@@
        epochs = k + 1 # 1-based: the first epoch uses epsilons[1]
        halite_progress = np.zeros(N_BATCH) # one entry per episode of the epoch

        for i in range(N_BATCH):
            #@@@@@@@@@@@@@@@@@@@@@@@@
            # here starts an episode
            #@@@@@@@@@@@@@@@@@@@@@@@@
            env = Env.HaliteEnv(NUM_PLAYERS, MAP_SIZE, episode_lenght = TOT_TURNS) # init environment

            # first mandatory step
            steps = 1
            action_matrix = np.full((MAP_SIZE,MAP_SIZE), -1) # no ship, no action
            shipyard_action = 1 # initially always choose to create a ship

            # shipyard state before the first ship is created
            n_ships = 0
            t_left = TOT_TURNS - steps
            h_tot = 5000 # presumably the initial halite of the player -- TODO confirm
            near_ships = 0
            episode_trajectory = [[n_ships, t_left, h_tot, near_ships]]

            # returns the matricial state, the array of players halite and a flag that is true if it's the final turn
            state, players_halite, finish, _ = env.step(action_matrix, makeship = shipyard_action)
            current_halite = players_halite[0][0]
            encoded_states = np.array(cod.encode_multi_state(state, map_size = MAP_SIZE, h_lev = H_LEV,
                                                             n_actions = N_ACTIONS, debug = False))

            while True:
                steps = steps + 1
                actions = [greedy_policy(s_enc, q_values) for s_enc in encoded_states]
                a_mat = cod.multi_scalar_to_matrix_action(actions, state, map_size = MAP_SIZE)

                shipyard_action = shipy_policy(weights, state, current_halite, steps, epsilon = epsilons[epochs],
                                               tot_turns = TOT_TURNS, map_size = MAP_SIZE)
                if shipyard_action:
                    # NOTE(review): the current halite is not passed here; as
                    # originally written get_shipy_state reads `h_tot` from the
                    # notebook globals. Confirm that the recorded halite
                    # feature is really `current_halite`.
                    episode_trajectory.append(get_shipy_state(state, steps, tot_turns = TOT_TURNS, map_size = MAP_SIZE))

                state, players_halite, finish, _ = env.step(a_mat, makeship = shipyard_action)
                current_halite = players_halite[0][0]
                # same encoding parameters as for the first step (they were hard-coded to 7, 3, 5)
                encoded_states = np.array(cod.encode_multi_state(state, map_size = MAP_SIZE, h_lev = H_LEV,
                                                                 n_actions = N_ACTIONS, debug = False))

                if finish or (steps >= TOT_TURNS):
                    #print("End episode.")
                    # 4000 is presumably the halite left after the first
                    # mandatory ship (5000 - 1000) -- TODO confirm
                    halite_progress[i] = current_halite - 4000
                    shipy_progress.append((episode_trajectory, current_halite - 4000))
                    break
            #break # play just 1 episode

        #break # play just 1 epoch

        halite_score[epochs-1] = halite_progress.mean()

        if show:
            ax1.scatter(epochs, halite_score[epochs-1], s = 10, color='blue')
            ax2.scatter(epochs, epsilons[epochs], s = 10, color='blue')
            fig.canvas.draw()
        else:
            print("Average halite collected per episode in epoch %d: %.3f"%(epochs,halite_progress.mean()))
            print("Espilon value: %.4f \n"%epsilons[epochs])

        if epochs >= MAX_EPOCHS:
            print("Hey, I think you've had enough! Let's stop here.")
            break

    return halite_score, shipy_progress