In [0]:
import torch
import torch.nn as nn

In [0]:
import numpy as np
import random
from collections import defaultdict

In [0]:
class env:
    def __init__(self, size, step_limit, debug=0):
        self.number_of_resources = 3
        self.amount_of_resource = 4
        self.debug = debug
        self.size = size
        self.step_limit = step_limit
        #self.valid_moves = ['up', 'down', 'left', 'right', 'pass']
        self.valid_moves = [0, 1, 2, 3, 4]
        self.move_map = {0:[-1,0], 
                         1:[1,0], 
                         2:[0,-1], 
                         3:[0,1], 
                         4:[0,0]}
        
        self.initialize_map()
        self.initialize_resources()
        

    
    def initialize_map(self):
        self.map = np.zeros((self.size, self.size), dtype=int)
        agent_rand_loc = random.randint(0,1)
        if agent_rand_loc:
            agent_1_coords = [0,self.size -1]
            self.map[0,self.size -1 ] = -1
            agent_2_coords = [0,0]
            self.map[0, 0] = -2 
            
        else:
            agent_1_coords = [0,0]
            self.map[0,0] = -1
            agent_2_coords = [0,self.size-1]
            self.map[0,self.size-1] = -2
        self.agents_coords = [agent_1_coords, agent_2_coords]
        
    def initialize_resources(self):
        self.resources = []
        index = [i for i in range(self.size**2) if (i != 0) and (i != self.size-1)]
        all_indexes = random.sample(index, self.number_of_resources*self.amount_of_resource)
        res_ind = []
        for i in range(self.number_of_resources):
            resource = random.sample(all_indexes, self.amount_of_resource)
            all_indexes = [r for r in all_indexes if r not in resource]
            resource = [(int(r/self.size),r%self.size) for r in resource]
            empty_map = np.zeros((self.size, self.size), dtype=int)
            for ind in range(len(resource)):
                x, y = resource[ind][0], resource[ind][1]                
                self.map[x, y] = i+1
                empty_map[x, y] = 1
            self.resources.append(empty_map)
        goal_index = [i for i in range(self.number_of_resources)]
        goals = random.sample(goal_index, 2)
        self.agents_goals = goals
        self.agents_res_collected = [[], []]
        self.agents_self_goal_collected = [0, 0]
        self.agents_othr_goal_collected = [0, 0]
        self.agents_nthr_goal_collected = [0, 0]
                
    def step(self, agent_numb, move_code):
        if self.step_limit == 0:
            if self.debug:
                print('step limit reached, error')
        else:
            self.step_limit -= 1
            if self.debug:
                print(self.step_limit, 'steps left')
        if move_code not in self.valid_moves:
            if self.debug:
                print('icorrect move')
        else:
            agent_old_coords = self.agents_coords[agent_numb]
            move = self.move_map[move_code]
            agent_new_coords = l3 = [l+r for l,r in zip(agent_old_coords, move)]
            correct_move = True
            for coord in agent_new_coords:
                if (coord < 0) or (coord >= self.size):
                    if self.debug:
                        print('icorrect move')
                    correct_move = False
            other_coords = self.agents_coords[1-agent_numb]
            if agent_new_coords == other_coords:
                if self.debug:
                    print('icorrect move')
                correct_move = False
            if correct_move:
                self.map[agent_old_coords[0],agent_old_coords[1]] = 0
                self.map[agent_new_coords[0],agent_new_coords[1]] = -1 - agent_numb
                self.agents_coords[agent_numb] = agent_new_coords
                for res in range(self.number_of_resources):
                    if self.resources[res][agent_new_coords[0],agent_new_coords[1]] != 0:
                        self.agents_res_collected[agent_numb].append(res)
                        self.resources[res][agent_new_coords[0],agent_new_coords[1]] = 0
                        if res == self.agents_goals[agent_numb]:
                            self.agents_self_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected self goal resource:', res)
                        elif res == self.agents_goals[1 - agent_numb]:
                            self.agents_othr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected other goal resource:', res)
                        else:
                            self.agents_nthr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected neither goal resource:', res)
                            
    def observation(self, agent_numb):
        feature_vec = []
        self_other_coords = [self.agents_coords[agent_numb], 
                             self.agents_coords[1 - agent_numb] 
                            ]
        for agnt_coords in self_other_coords:
            agent_map = np.zeros((self.size, self.size), dtype=int)
            agent_map[agnt_coords[0],agnt_coords[1]] = 1
            agent_map = np.reshape(agent_map, self.size*self.size)
            feature_vec.extend(agent_map)
            
        for res in range(len(self.resources)):
            feature_vec.extend(np.reshape(self.resources[res], self.size*self.size))
        
        return feature_vec
    
    def reward(self):
        
        n_self_c_self = self.agents_self_goal_collected[0]
        n_othr_c_self = self.agents_othr_goal_collected[1]
        
        n_self_c_othr = self.agents_othr_goal_collected[0]
        n_othr_c_othr = self.agents_self_goal_collected[1]
        
        n_self_c_nthr = self.agents_nthr_goal_collected[0]
        n_othr_c_nthr = self.agents_nthr_goal_collected[1]
        
        reward = (n_self_c_self + n_othr_c_self)**2 + \
                 (n_self_c_othr + n_othr_c_othr)**2 - \
                 (n_self_c_nthr + n_othr_c_nthr)**2 
        
        return reward

    def reset(self):
        self.step_limit = step_limit
        self.initialize_map()
        self.initialize_resources()

In [0]:
class Ffunction(nn.Module):
  def __init__(self, input_size):
    super(Ffunction, self).__init__()
    self.input_size = input_size
    self.lstm = nn.LSTM(input_size, 64)
    self.fullycon1 = nn.Linear(64, 64)
    nn.init.orthogonal_(self.fullycon1.weight)
    self.fullycon2 = nn.Linear(64, 64)
    nn.init.orthogonal_(self.fullycon2.weight)
    self.policy = nn.Linear(64, 5)
    nn.init.orthogonal_(self.policy.weight)
    self.value = nn.Linear(64, 1)
    nn.init.orthogonal_(self.value.weight)

    self.beta = 0.01

  def forward(self, x): #x = input
    result, _ = self.lstm(x.view(-1, 1, self.input_size))
    result = nn.functional.elu(self.fullycon1(result.view(-1, 64)))
    result = nn.functional.elu(self.fullycon2(result))
    return nn.functional.softmax(self.policy(result)), self.value(result)

  def get_a2c_loss(self, actions, states, rewards):
    probs, values = self.forward(states)
    advantage = rewards - values.view(batch_size)
    actions_one_hot = torch.zeros(batch_size, 5)
    actions_one_hot[torch.arange(batch_size), actions.view(batch_size)] = 1
    logs = torch.log(torch.sum(probs.view(batch_size, 5) * actions_one_hot.view(batch_size, 5), (1)))
    #print(values)
    logloss = logs * advantage.view(batch_size)
    entropyloss = -torch.sum(probs * torch.log(probs), (1))
    print(logloss, entropyloss)

    return logloss + self.beta * entropyloss

In [0]:
agent1 = Ffunction(326)
opt1 = torch.optim.Adam(agent1.parameters(), betas=(0.9, 0.999), eps=1 * 10 ** -8)
batch_size = 10

In [0]:
agent2 = Ffunction(326)
opt2 = torch.optim.Adam(agent2.parameters(), betas=(0.9, 0.999), eps=1 * 10 ** -8)

In [0]:
# buffer_ag_N = [[input, action, revard]*number_of_samples]
buffer_ag_1 = []
buffer_ag_2 = []

def sample_from_buffer(buffer, batch_size=10):
    batch = random.sample(buffer, batch_size)
    inputs = [i[0] for i in batch]
    actions = [i[1] for i in batch]
    revards = [i[2] for i in batch]
    
    inputs = torch.FloatTensor(inputs)
    actions = torch.LongTensor(actions)
    revards = torch.FloatTensor(revards)
    
    return inputs, actions, revards

In [130]:
for episodes in range(100):
  x_other1 = torch.FloatTensor([1] * 3) / 3
  x_other1.requires_grad_(True)
  opt_other1 = torch.optim.SGD([x_other1], lr=0.1)
  x_self1 = [0] * 3
  x_self1[environment.agents_goals[0]] = 1
  x_self1 = torch.FloatTensor(x_self1)
  x_self1.requires_grad_(False)

  x_other2 = torch.FloatTensor([1] * 3) / 3
  x_other2.requires_grad_(True)
  opt_other2 = torch.optim.SGD([x_other2], lr=0.1)
  x_self2 = [0] * 3
  x_self2[environment.agents_goals[1]] = 1
  x_self2 = torch.FloatTensor(x_self2)
  x_self2.requires_grad_(False)

  environment = env(8, 20)

  action1, action2 = 0, 0

  seq1 = []
  seq2 = []

  for step in range(10):
    #Make action
    state1 = environment.observation(0)
    input1 = torch.cat([torch.Tensor(state1), x_self1, x_other1])
    probs, _ = agent1.forward(input1)
    action1 = torch.argmax(probs)

    
    state2 = environment.observation(1)
    input2 = torch.cat([torch.Tensor(state2), x_self2, x_other2])
    probs, _ = agent2.forward(input2)
    action2 = torch.argmax(probs)

    #Update SOM
    opt_other1.zero_grad()
    input_other1 = torch.cat([torch.Tensor(state2), x_other1, x_self1])
    probs1, _ = agent1.forward(input_other1)
    loss = torch.nn.functional.cross_entropy(probs1.view(1, 5), torch.LongTensor([action2.item()]))
    loss.backward()
    opt_other1.step()


    opt_other2.zero_grad()
    input_other2 = torch.cat([torch.Tensor(state1), x_other2, x_self2])
    probs2, _ = agent2.forward(input_other2)
    loss = torch.nn.functional.cross_entropy(probs2.view(1, 5), torch.LongTensor([action1.item()]))
    loss.backward()
    opt_other2.step()


    #Add SA to sequences
    environment.step(0, action1.item())
    seq1.append([input1.tolist(), action1.tolist()])
    
    environment.step(1, action2.item())
    seq2.append([input2.tolist(), action2.tolist()])
  
  #Add sequences to buffer
  reward = environment.reward()
  for i in range(10):
    buffer_ag_1.append(seq1[-i - 1] + [reward])
    buffer_ag_2.append(seq2[-i - 1] + [reward])
    reward *= 0.99

  #A2C update
  if(len(buffer_ag_1) >= batch_size and len(buffer_ag_2) >= batch_size):
    inputs, actions, rewards = sample_from_buffer(buffer_ag_1, batch_size)
    opt1.zero_grad()
    loss = torch.sum(agent1.get_a2c_loss(actions, inputs, rewards))
    #print(loss, ' ', end='')
    loss.backward()
    opt1.step()

    inputs, actions, rewards = sample_from_buffer(buffer_ag_2, batch_size)
    opt2.zero_grad()
    loss = torch.sum(agent2.get_a2c_loss(actions, inputs, rewards))
    #print(loss, end='\n')
    loss.backward()
    opt2.step()





tensor([1.2416, 1.1135, 1.0020, 1.0600, 1.0458, 1.0389, 1.0361, 1.1222, 1.1070,
        1.0951], grad_fn=<MulBackward0>) tensor([1.6059, 1.6045, 1.6036, 1.6045, 1.6039, 1.6046, 1.6046, 1.6047, 1.6040,
        1.6043], grad_fn=<NegBackward>)
tensor([1.1919, 1.3139, 1.2683, 1.3049, 1.3683, 1.3552, 1.3204, 1.2922, 1.3131,
        1.3911], grad_fn=<MulBackward0>) tensor([1.6023, 1.6035, 1.6038, 1.6039, 1.6040, 1.6045, 1.6045, 1.6044, 1.6044,
        1.6041], grad_fn=<NegBackward>)
tensor([-0.1523,  1.0071, -0.2350,  0.9618, -0.3244,  0.9636,  0.8933, -0.2990,
         0.8727, -0.2969], grad_fn=<MulBackward0>) tensor([1.6070, 1.6045, 1.6036, 1.6013, 1.6004, 1.5996, 1.5992, 1.6013, 1.6008,
        1.6031], grad_fn=<NegBackward>)
tensor([-0.2193,  1.0812, -0.2901,  0.9822,  1.1121, -0.2615, -0.1806, -0.1531,
         1.1058, -0.1894], grad_fn=<MulBackward0>) tensor([1.5926, 1.5992, 1.5936, 1.5986, 1.6001, 1.5961, 1.5900, 1.5891, 1.5978,
        1.5929], grad_fn=<NegBackward>)
tensor([-11.6890

In [84]:
x = torch.FloatTensor([0.5, 0.3, 0.2])
y = torch.FloatTensor([0.5, 0.3, 0.2])
x.tolist()

[0.5, 0.30000001192092896, 0.20000000298023224]