In [3]:
import torch
import torch.nn as nn

import numpy as np
import random
from collections import defaultdict

import wandb


batch_size = 1000

cuda = torch.device('cpu')


wandb.init(entity = "dexfrost89", project="coin game", name=str(batch_size))

session_saver = []

class env:
    def __init__(self, size, step_limit, debug=0):
        self.number_of_resources = 3
        self.amount_of_resource = 4
        self.debug = debug
        self.size = size
        self.step_limit = step_limit
        #self.valid_moves = ['up', 'down', 'left', 'right', 'pass']
        self.valid_moves = [0, 1, 2, 3, 4]
        self.move_map = {0:[-1,0], 
                         1:[1,0], 
                         2:[0,-1], 
                         3:[0,1], 
                         4:[0,0]}
        
        self.initialize_map()
        self.initialize_resources()
        

    
    def initialize_map(self):
        self.map = np.zeros((self.size, self.size), dtype=int)
        agent_rand_loc = random.randint(0,1)
        if agent_rand_loc:
            agent_1_coords = [0,self.size -1]
            self.map[0,self.size -1 ] = -1
            agent_2_coords = [0,0]
            self.map[0, 0] = -2 
            
        else:
            agent_1_coords = [0,0]
            self.map[0,0] = -1
            agent_2_coords = [0,self.size-1]
            self.map[0,self.size-1] = -2
        self.agents_coords = [agent_1_coords, agent_2_coords]
        
    def initialize_resources(self):
        self.resources = []
        index = [i for i in range(self.size**2) if (i != 0) and (i != self.size-1)]
        all_indexes = random.sample(index, self.number_of_resources*self.amount_of_resource)
        res_ind = []
        for i in range(self.number_of_resources):
            resource = random.sample(all_indexes, self.amount_of_resource)
            all_indexes = [r for r in all_indexes if r not in resource]
            resource = [(int(r/self.size),r%self.size) for r in resource]
            empty_map = np.zeros((self.size, self.size), dtype=int)
            for ind in range(len(resource)):
                x, y = resource[ind][0], resource[ind][1]                
                self.map[x, y] = i+1
                empty_map[x, y] = 1
            self.resources.append(empty_map)
        goal_index = [i for i in range(self.number_of_resources)]
        goals = random.sample(goal_index, 2)
        self.agents_goals = goals
        self.agents_res_collected = [[], []]
        self.agents_self_goal_collected = [0, 0]
        self.agents_othr_goal_collected = [0, 0]
        self.agents_nthr_goal_collected = [0, 0]
                
    def step(self, agent_numb, move_code):
        if self.step_limit == 0:
            if self.debug:
                print('step limit reached, error')
        else:
            self.step_limit -= 1
            if self.debug:
                print(self.step_limit, 'steps left')
        if move_code not in self.valid_moves:
            if self.debug:
                print('icorrect move')
        else:
            agent_old_coords = self.agents_coords[agent_numb]
            move = self.move_map[move_code]
            agent_new_coords = l3 = [l+r for l,r in zip(agent_old_coords, move)]
            correct_move = True
            for coord in agent_new_coords:
                if (coord < 0) or (coord >= self.size):
                    if self.debug:
                        print('icorrect move')
                    correct_move = False
            other_coords = self.agents_coords[1-agent_numb]
            if agent_new_coords == other_coords:
                if self.debug:
                    print('icorrect move')
                correct_move = False
            if correct_move:
                self.map[agent_old_coords[0],agent_old_coords[1]] = 0
                self.map[agent_new_coords[0],agent_new_coords[1]] = -1 - agent_numb
                self.agents_coords[agent_numb] = agent_new_coords
                for res in range(self.number_of_resources):
                    if self.resources[res][agent_new_coords[0],agent_new_coords[1]] != 0:
                        self.agents_res_collected[agent_numb].append(res)
                        self.resources[res][agent_new_coords[0],agent_new_coords[1]] = 0
                        if res == self.agents_goals[agent_numb]:
                            self.agents_self_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected self goal resource:', res)
                        elif res == self.agents_goals[1 - agent_numb]:
                            self.agents_othr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected other goal resource:', res)
                        else:
                            self.agents_nthr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected neither goal resource:', res)
                            
    def observation(self, agent_numb):
        feature_vec = []
        self_other_coords = [self.agents_coords[agent_numb], 
                             self.agents_coords[1 - agent_numb] 
                            ]
        for agnt_coords in self_other_coords:
            agent_map = np.zeros((self.size, self.size), dtype=int)
            agent_map[agnt_coords[0],agnt_coords[1]] = 1
            agent_map = np.reshape(agent_map, self.size*self.size)
            feature_vec.extend(agent_map)
            
        for res in range(len(self.resources)):
            feature_vec.extend(np.reshape(self.resources[res], self.size*self.size))
        
        return feature_vec
    
    def reward(self):
        
        n_self_c_self = self.agents_self_goal_collected[0]
        n_othr_c_self = self.agents_othr_goal_collected[1]
        
        n_self_c_othr = self.agents_othr_goal_collected[0]
        n_othr_c_othr = self.agents_self_goal_collected[1]
        
        n_self_c_nthr = self.agents_nthr_goal_collected[0]
        n_othr_c_nthr = self.agents_nthr_goal_collected[1]
        
        reward = (n_self_c_self + n_othr_c_self)**2 + \
                 (n_self_c_othr + n_othr_c_othr)**2 - \
                 (n_self_c_nthr + n_othr_c_nthr)**2 
        
        return reward

    def reset(self):
        self.step_limit = step_limit
        self.initialize_map()
        self.initialize_resources()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [0]:
!pip install wandb

In [0]:
class Ffunction(nn.Module):
    def __init__(self, input_size):
        super(Ffunction, self).__init__()
        self.input_size = input_size
        self.fullycon1 = nn.Linear(self.input_size, 64)
        nn.init.orthogonal_(self.fullycon1.weight)
        self.fullycon1.bias.data.fill_(0)

        self.fullycon2 = nn.Linear(64, 64)
        nn.init.orthogonal_(self.fullycon2.weight)
        self.fullycon2.bias.data.fill_(0)
        
        self.lstm = nn.LSTM(64, 64)
        nn.init.orthogonal_(self.lstm.weight_hh_l0)
        nn.init.orthogonal_(self.lstm.weight_ih_l0)
        self.lstm.bias_ih_l0.data.fill_(0)
        self.lstm.bias_hh_l0.data.fill_(0)

        self.policy = nn.Linear(64, 5)
        nn.init.orthogonal_(self.policy.weight)
        self.policy.bias.data.fill_(0)
        self.value = nn.Linear(64, 1)
        nn.init.orthogonal_(self.value.weight)
        self.value.bias.data.fill_(0)

        self.entropy_loss_coef = 0.01
        self.value_loss_coef = 0.5

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        
        x = nn.functional.elu(self.fullycon1(inputs))
        x = nn.functional.elu(self.fullycon2(x))

        x = x.view(-1, 1, 64)

        x, (hx, cx) = self.lstm(x, (hx, cx))
        x = x.view(-1, 64)
        return nn.functional.softmax(self.policy(x)), self.value(x), (hx, cx)

    def get_a2c_loss(self, probs, values, rewards):

        advantages = rewards - values
        value_loss = (advantages.pow(2)).mean()

        entropy_loss = (-torch.sum(probs * torch.log(probs), (1))).mean()
        
        logprobs, _ = torch.max(torch.log(probs), (1))
        policy_loss = -(logprobs * advantages.detach()).mean()

        #print(policy_loss.item(), entropy_loss.item(), value_loss.item())

        return policy_loss - self.entropy_loss_coef * entropy_loss + self.value_loss_coef * value_loss

In [0]:
agent1 = Ffunction(326)
agent1.to(device=cuda)
opt1 = torch.optim.Adam(agent1.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1 * 10 ** -8)

agent2 = Ffunction(326)
agent2.to(device=cuda)
opt2 = torch.optim.Adam(agent2.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1 * 10 ** -8)

# buffer_ag_N = [[input, action, revard]*number_of_samples]
buffer_ag_1 = []
buffer_ag_2 = []
reward_buf = []
self_coins_collected1 = []
self_coins_collected2 = []
other_coins_collected1 = []
other_coins_collected2 = []
neither_coins_collected1 = []
neither_coins_collected2 = []

def sample_from_buffer(buffer, batch_size=10):
    batch = random.sample(buffer, batch_size)
    inputs = [i[0] for i in batch]
    actions = [i[1] for i in batch]
    revards = [i[2] for i in batch]
    
    inputs = torch.FloatTensor(inputs)
    actions = torch.LongTensor(actions)
    revards = torch.FloatTensor(revards)
    
    return inputs, actions, revards

def sample_buffer(buffer):
    probs = [i[0] for i in buffer]
    values = [i[1] for i in buffer]
    rewards = [i[2] for i in buffer]

    probs = torch.cat(probs)
    values = torch.LongTensor(values)
    rewards = torch.FloatTensor(rewards)

    return probs, values, rewards

for episode in range(1, 100001):

    hx11, cx11 = torch.zeros(1, 1, 64), torch.zeros(1, 1, 64)

    hx21, cx21 = torch.zeros(1, 1, 64), torch.zeros(1, 1, 64)
  
    for _ in range(batch_size // 10):
        hx1 = hx11.detach()
        cx1 = cx11.detach()

        hx2 = hx21.detach()
        cx2 = cx21.detach()

        session_saver.append([])
        environment = env(8, 20)
        session_saver[-1].append(environment.map.tolist())

        x_other1 = torch.FloatTensor([1] * 3).to(device=cuda) / 3
        x_other1.requires_grad_(True)
        opt_other1 = torch.optim.SGD([x_other1], lr=0.1)
        x_self1 = [0] * 3
        x_self1[environment.agents_goals[0]] = 1
        x_self1 = torch.FloatTensor(x_self1).to(device=cuda)
        x_self1.requires_grad_(False)

        x_other2 = torch.FloatTensor([1] * 3).to(device=cuda) / 3
        x_other2.requires_grad_(True)
        opt_other2 = torch.optim.SGD([x_other2], lr=0.1)
        x_self2 = [0] * 3
        x_self2[environment.agents_goals[1]] = 1
        x_self2 = torch.FloatTensor(x_self2).to(device=cuda)
        x_self2.requires_grad_(False)


        action1, action2 = 0, 0

        seq1 = []
        seq2 = []

        for step in range(10):
            #Make action
            state1 = environment.observation(0)
            input1 = torch.cat([torch.Tensor(state1).to(device=cuda), x_self1, x_other1]).to(device=cuda)
            probs1, values1, (hx11, cx11) = agent1.forward((input1, (hx1, cx1)))
            action1 = torch.argmax(probs1)
            seq1.append([probs1, values1])


            state2 = environment.observation(1)
            input2 = torch.cat([torch.Tensor(state2).to(device=cuda), x_self2, x_other2]).to(device=cuda)
            probs2, values2, (hx21, cx21) = agent2.forward((input2, (hx2, cx2)))
            action2 = torch.argmax(probs2)
            seq2.append([probs2, values2])

            #Update SOM
            opt_other1.zero_grad()
            input_other1 = torch.cat([torch.Tensor(state2).to(device=cuda), x_other1, x_self1]).to(device=cuda)
            probs1, _, _ = agent1.forward((input_other1, (hx2, cx2)))
            loss = torch.nn.functional.cross_entropy(probs1.view(1, 5), torch.LongTensor([action2.item()]).to(device=cuda))
            loss.backward()
            opt_other1.step()

            #print(action1.item() == torch.argmax(probs1).item())

            opt_other2.zero_grad()
            input_other2 = torch.cat([torch.Tensor(state1).to(device=cuda), x_other2, x_self2]).to(device=cuda)
            probs2, _, _ = agent2.forward((input_other2, (hx1, cx1)))
            loss = torch.nn.functional.cross_entropy(probs2.view(1, 5), torch.LongTensor([action1.item()]).to(device=cuda))
            loss.backward()
            opt_other2.step()


            #Add SA to sequences
            environment.step(0, action1.item())
            session_saver[-1].append(environment.map.tolist())

            environment.step(1, action2.item())
            session_saver[-1].append(environment.map.tolist())
    
    #Add sequences to buffer
        reward = environment.reward()
        reward_buf.append(reward)
        self_coins_collected1.append(environment.agents_self_goal_collected[0])
        self_coins_collected2.append(environment.agents_self_goal_collected[1])
        other_coins_collected1.append(environment.agents_othr_goal_collected[0])
        other_coins_collected2.append(environment.agents_othr_goal_collected[1])
        neither_coins_collected1.append(environment.agents_nthr_goal_collected[0])
        neither_coins_collected2.append(environment.agents_nthr_goal_collected[1])

        for i in range(10):
            buffer_ag_1.append(seq1[-i - 1] + [torch.FloatTensor([[reward]])])
            buffer_ag_2.append(seq2[-i - 1] + [torch.FloatTensor([[reward]])])
            reward *= 0.99

    #A2C update
    if(len(buffer_ag_1) >= batch_size and len(buffer_ag_2) >= batch_size):

        print(np.mean(reward_buf))

        inputs, actions, rewards = sample_buffer(buffer_ag_1)
        opt1.zero_grad()
        loss1 = torch.sum(agent1.get_a2c_loss(inputs.to(device=cuda), actions.to(device=cuda), rewards.to(device=cuda))) / batch_size
        print(loss1.item(), ' ', end='')
        loss1.backward()
        opt1.step()

        inputs, actions, rewards = sample_buffer(buffer_ag_2)
        opt2.zero_grad()
        loss2 = torch.sum(agent2.get_a2c_loss(inputs.to(device=cuda), actions.to(device=cuda), rewards.to(device=cuda))) / batch_size
        print(loss2.item(), end='\n')
        loss2.backward()
        opt2.step()


        wandb.log({"loss1": loss1.item(), "loss2": loss2.item(), "reward": np.mean(reward_buf), "episode": episode, "games": episode * batch_size // 10, \
                   "self_coins1": np.mean(self_coins_collected1), "self_coins2": np.mean(self_coins_collected2), "other_coins1": np.mean(other_coins_collected1), \
                   "other_coins2": np.mean(other_coins_collected2), "neither_coins1": np.mean(neither_coins_collected1), "neither_coins2": np.mean(neither_coins_collected2)})

        buffer_ag_1 = []
        buffer_ag_2 = []
        reward_buf = []
        self_coins_collected1 = []
        self_coins_collected2 = []
        other_coins_collected1 = []
        other_coins_collected2 = []
        neither_coins_collected1 = []
        neither_coins_collected2 = []

import pickle

ftw = open('save' + str(batch_size), 'wb')

pickle.dump(session_saver, file=ftw)
ftw.close()



0.55
0.0026675343979150057  0.002664212603121996
0.42
0.002639257814735174  0.002640261547639966
0.99
0.004864643793553114  0.0048658642917871475
0.89
0.007156666833907366  0.007151665631681681
0.7
0.0034937458112835884  0.0035018501803278923
0.87
0.00555397430434823  0.00554636400192976
0.73
0.003878618124872446  0.0038714371621608734
1.2
0.0053109838627278805  0.005296173505485058
1.23
0.00501578813418746  0.005000838078558445
0.55
0.002908026799559593  0.0028940278571099043
1.04
0.005117109511047602  0.0050837756134569645
1.02
0.005049831699579954  0.005033153109252453
1.03
0.004983439575880766  0.00495892483741045
1.2
0.006738920230418444  0.006719629745930433
0.95
0.006808947306126356  0.006790853571146727
0.82
0.0052961185574531555  0.005285393912345171
1.31
0.008494737558066845  0.00845377892255783
0.99
0.00653299642726779  0.006508359685540199
0.59
0.005439794156700373  0.005420708563178778
0.8
0.006639121100306511  0.006618714891374111
0.57
0.003766696434468031  0.003752577351

In [0]:
envi = env(8, 20)

In [8]:
#envi.agents_self_goal_collected
#envi.agents_othr_goal_collected
envi.agents_nthr_goal_collected

[0, 0]