In [1]:
import random
from collections import defaultdict

import numpy as np
import torch

In [440]:
class env:
    """Two-agent grid world in which agents walk onto cells to collect resources.

    Encoding of ``self.map`` (a ``size`` x ``size`` integer array):
    ``0`` is an empty cell, ``-1`` is agent 0, ``-2`` is agent 1 and ``k + 1``
    is a resource of type ``k``.

    Both agents start in the two corners of row 0 (which agent gets which
    corner is random).  Each agent is assigned a distinct goal resource type.

    Args:
        size: side length of the square grid.
        step_limit: total number of ``step`` calls allowed before the episode
            is exhausted (shared by both agents).
        debug: truthy value enables diagnostic prints in ``step``.
    """

    def __init__(self, size, step_limit, debug=0):
        self.number_of_resources = 3  # distinct resource types
        self.amount_of_resource = 4   # cells placed per resource type
        self.debug = debug
        self.size = size
        self.step_limit = step_limit  # remaining budget, decremented by step()
        # Remembered so reset() can restore the budget without relying on a
        # global variable.
        self.initial_step_limit = step_limit
        # Move codes: 0 = up, 1 = down, 2 = left, 3 = right, 4 = pass.
        self.valid_moves = [0, 1, 2, 3, 4]
        # Each move is a [row delta, column delta] pair.
        self.move_map = {0: [-1, 0],
                         1: [1, 0],
                         2: [0, -1],
                         3: [0, 1],
                         4: [0, 0]}

        self.initialize_map()
        self.initialize_resources()

    def initialize_map(self):
        """Create an empty grid and place the two agents in the top corners."""
        self.map = np.zeros((self.size, self.size), dtype=int)
        left_corner = [0, 0]
        right_corner = [0, self.size - 1]
        # A coin flip decides which agent starts in which corner.
        if random.randint(0, 1):
            agent_1_coords, agent_2_coords = right_corner, left_corner
        else:
            agent_1_coords, agent_2_coords = left_corner, right_corner
        self.map[agent_1_coords[0], agent_1_coords[1]] = -1
        self.map[agent_2_coords[0], agent_2_coords[1]] = -2
        self.agents_coords = [agent_1_coords, agent_2_coords]

    def initialize_resources(self):
        """Scatter resources on free cells, pick agent goals, zero the counters.

        Raises:
            ValueError: if the grid has too few free cells for all resources.
        """
        self.resources = []
        # Flattened indexes of every cell except the two agent start corners.
        free_cells = [i for i in range(self.size ** 2)
                      if (i != 0) and (i != self.size - 1)]
        needed = self.number_of_resources * self.amount_of_resource
        if len(free_cells) < needed:
            raise ValueError(
                'grid of size %d has %d free cells but %d resources are required'
                % (self.size, len(free_cells), needed))
        all_indexes = random.sample(free_cells, needed)
        for res_type in range(self.number_of_resources):
            chosen = random.sample(all_indexes, self.amount_of_resource)
            all_indexes = [r for r in all_indexes if r not in chosen]
            # One binary mask per resource type; 1 marks an uncollected resource.
            mask = np.zeros((self.size, self.size), dtype=int)
            for flat_index in chosen:
                row, col = divmod(flat_index, self.size)
                self.map[row, col] = res_type + 1
                mask[row, col] = 1
            self.resources.append(mask)
        # Two distinct resource types: goals[0] for agent 0, goals[1] for agent 1.
        self.agents_goals = random.sample(list(range(self.number_of_resources)), 2)
        self.agents_res_collected = [[], []]
        self.agents_self_goal_collected = [0, 0]
        self.agents_othr_goal_collected = [0, 0]
        self.agents_nthr_goal_collected = [0, 0]

    def step(self, agent_numb, move_code):
        """Apply one move for one agent and collect any resource it lands on.

        Args:
            agent_numb: index of the acting agent (0 or 1).
            move_code: one of ``self.valid_moves``.

        A call made after the step budget is exhausted changes nothing.  An
        unknown move code, a move off the grid, or a move onto the other agent
        still consumes one step but leaves the agent where it is.
        """
        if self.step_limit <= 0:
            if self.debug:
                print('step limit reached, error')
            # Previously the move was still executed after the limit was hit.
            return
        self.step_limit -= 1
        if self.debug:
            print(self.step_limit, 'steps left')

        if move_code not in self.valid_moves:
            if self.debug:
                print('icorrect move')
            return

        agent_old_coords = self.agents_coords[agent_numb]
        move = self.move_map[move_code]
        agent_new_coords = [pos + delta for pos, delta in zip(agent_old_coords, move)]

        in_bounds = all(0 <= coord < self.size for coord in agent_new_coords)
        blocked = agent_new_coords == self.agents_coords[1 - agent_numb]
        if (not in_bounds) or blocked:
            if self.debug:
                print('icorrect move')
            return

        row, col = agent_new_coords
        self.map[agent_old_coords[0], agent_old_coords[1]] = 0
        self.map[row, col] = -1 - agent_numb
        self.agents_coords[agent_numb] = agent_new_coords

        for res in range(self.number_of_resources):
            if self.resources[res][row, col] == 0:
                continue
            self.agents_res_collected[agent_numb].append(res)
            self.resources[res][row, col] = 0  # resource is consumed
            if res == self.agents_goals[agent_numb]:
                self.agents_self_goal_collected[agent_numb] += 1
                if self.debug:
                    print('collected self goal resource:', res)
            elif res == self.agents_goals[1 - agent_numb]:
                self.agents_othr_goal_collected[agent_numb] += 1
                if self.debug:
                    print('collected other goal resource:', res)
            else:
                self.agents_nthr_goal_collected[agent_numb] += 1
                if self.debug:
                    print('collected neither goal resource:', res)

    def observation(self, agent_numb):
        """Return a flat feature list describing the grid from one agent's view.

        The list concatenates, in order: a one-hot grid of the observing
        agent's position, a one-hot grid of the other agent's position, and
        one flattened mask per resource type.  Its length is
        ``(2 + number_of_resources) * size * size``.
        """
        feature_vec = []
        self_other_coords = [self.agents_coords[agent_numb],
                             self.agents_coords[1 - agent_numb]]
        for agnt_coords in self_other_coords:
            agent_map = np.zeros((self.size, self.size), dtype=int)
            agent_map[agnt_coords[0], agnt_coords[1]] = 1
            feature_vec.extend(np.reshape(agent_map, self.size * self.size))

        for resource_mask in self.resources:
            feature_vec.extend(np.reshape(resource_mask, self.size * self.size))

        return feature_vec

    def reward(self):
        """Return the joint reward computed from the collection counters.

        The reward is the squared total of agent 0's goal resources collected
        by either agent, plus the squared total of agent 1's goal resources
        collected by either agent, minus the squared total of resources that
        are nobody's goal.
        """
        # Agent 0's goal type: collected by agent 0 itself or by agent 1.
        goal_0_total = (self.agents_self_goal_collected[0]
                        + self.agents_othr_goal_collected[1])
        # Agent 1's goal type: collected by agent 0 or by agent 1 itself.
        goal_1_total = (self.agents_othr_goal_collected[0]
                        + self.agents_self_goal_collected[1])
        neither_total = (self.agents_nthr_goal_collected[0]
                         + self.agents_nthr_goal_collected[1])

        return goal_0_total ** 2 + goal_1_total ** 2 - neither_total ** 2

    def reset(self):
        """Restore the step budget and re-randomise the map, resources and goals."""
        # Use the value remembered at construction; the old code read an
        # undefined global named ``step_limit``.
        self.step_limit = self.initial_step_limit
        self.initialize_map()
        self.initialize_resources()

In [441]:
# One initially empty buffer per agent.
# NOTE(review): presumably these are filled with (input, action, reward)
# entries and passed to sample_from_buffer — confirm against the training loop.
buffer_ag_1 = []
buffer_ag_2 = []

def sample_from_buffer(buffer, batch_size=1):
    """Draw a random batch from ``buffer`` and return it as float32 tensors.

    Entries are read positionally: ``entry[0]`` is the input, ``entry[1]`` the
    action and ``entry[2]`` the reward.

    Args:
        buffer: sequence of entries to sample from (without replacement).
        batch_size: number of entries to draw.

    Returns:
        Tuple ``(inputs, actions, revards)`` of ``torch.float32`` tensors, each
        with ``batch_size`` rows.

    Raises:
        ValueError: from ``random.sample`` if ``batch_size`` exceeds
            ``len(buffer)``.
    """
    batch = random.sample(buffer, batch_size)
    inputs = [entry[0] for entry in batch]
    actions = [entry[1] for entry in batch]
    revards = [entry[2] for entry in batch]

    # torch.tensor with an explicit dtype replaces the legacy FloatTensor
    # constructor and yields the same float32 result.
    inputs = torch.tensor(inputs, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.float32)
    revards = torch.tensor(revards, dtype=torch.float32)

    return inputs, actions, revards

In [442]:
# Build an 8x8 environment with a budget of 20 steps and show the initial grid
# (negative values are agents, positive values are resource types).
env_1 = env(8, 20)
env_1.map

array([[-2,  0,  0,  3,  0,  0,  0, -1],
       [ 0,  1,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0,  3,  0],
       [ 0,  0,  2,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  0],
       [ 0,  0,  2,  1,  0,  0,  0,  3]])

In [443]:
# Goal resource type index for [agent 0, agent 1].
env_1.agents_goals

[1, 0]

In [444]:
# Current [row, col] position of [agent 0, agent 1].
env_1.agents_coords

[[0, 7], [0, 0]]

In [445]:
# Move codes accepted by env.step (self.valid_moves = [0, 1, 2, 3, 4]):
# 0 = 'up', 1 = 'down', 2 = 'left', 3 = 'right', 4 = 'pass'

In [457]:
# Move agent 0 with move code 1 (row + 1 in move_map, i.e. down), then show the
# updated grid.
env_1.step(agent_numb = 0, move_code = 1)
env_1.map

11 steps left
icorrect move


array([[-2,  0,  0,  3,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0,  3,  0],
       [ 0,  0,  2,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  0],
       [ 0,  0,  2,  1,  0,  0,  0, -1]])

In [449]:
# Turn on the diagnostic prints emitted by env.step.
env_1.debug = 1

In [405]:
# Per-agent count of collected resources matching the agent's own goal.
env_1.agents_self_goal_collected

[0, 0]

In [406]:
# Per-agent count of collected resources matching the other agent's goal.
env_1.agents_othr_goal_collected

[0, 0]

In [407]:
# Per-agent count of collected resources that are neither agent's goal.
env_1.agents_nthr_goal_collected

[0, 0]

In [426]:
# Joint reward computed from the three collection counters.
env_1.reward()

0

In [377]:
# Current grid: 0 = empty, -1 / -2 = agents, 1..3 = resource types.
env_1.map

array([[-2,  0,  0,  0,  0,  0,  0, -1],
       [ 0,  0,  0,  3,  0,  0,  1,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0,  0,  0,  0,  0,  0,  3],
       [ 0,  0,  1,  2,  3,  2,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0]])

In [389]:
# Re-initialise the map and the resources, then show the freshly generated grid.
env_1.reset()
env_1.map

array([[-1,  0,  0,  3,  1,  1,  0, -2],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  3,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  2,  2,  3,  0,  0,  3],
       [ 0,  0,  0,  0,  1,  0,  0,  0]])

In [None]:
# Display the environment object itself (env defines no custom __repr__).
env_1