In [20]:
import torch
import torch.nn as nn

import numpy as np
import random
from collections import defaultdict

import wandb

wandb.init(entity = "dexfrost89", project="coin game", name="10")

session_saver = []

class env:
    def __init__(self, size, step_limit, debug=0):
        self.number_of_resources = 3
        self.amount_of_resource = 4
        self.debug = debug
        self.size = size
        self.step_limit = step_limit
        #self.valid_moves = ['up', 'down', 'left', 'right', 'pass']
        self.valid_moves = [0, 1, 2, 3, 4]
        self.move_map = {0:[-1,0], 
                         1:[1,0], 
                         2:[0,-1], 
                         3:[0,1], 
                         4:[0,0]}
        
        self.initialize_map()
        self.initialize_resources()
        

    
    def initialize_map(self):
        self.map = np.zeros((self.size, self.size), dtype=int)
        agent_rand_loc = random.randint(0,1)
        if agent_rand_loc:
            agent_1_coords = [0,self.size -1]
            self.map[0,self.size -1 ] = -1
            agent_2_coords = [0,0]
            self.map[0, 0] = -2 
            
        else:
            agent_1_coords = [0,0]
            self.map[0,0] = -1
            agent_2_coords = [0,self.size-1]
            self.map[0,self.size-1] = -2
        self.agents_coords = [agent_1_coords, agent_2_coords]
        
    def initialize_resources(self):
        self.resources = []
        index = [i for i in range(self.size**2) if (i != 0) and (i != self.size-1)]
        all_indexes = random.sample(index, self.number_of_resources*self.amount_of_resource)
        res_ind = []
        for i in range(self.number_of_resources):
            resource = random.sample(all_indexes, self.amount_of_resource)
            all_indexes = [r for r in all_indexes if r not in resource]
            resource = [(int(r/self.size),r%self.size) for r in resource]
            empty_map = np.zeros((self.size, self.size), dtype=int)
            for ind in range(len(resource)):
                x, y = resource[ind][0], resource[ind][1]                
                self.map[x, y] = i+1
                empty_map[x, y] = 1
            self.resources.append(empty_map)
        goal_index = [i for i in range(self.number_of_resources)]
        goals = random.sample(goal_index, 2)
        self.agents_goals = goals
        self.agents_res_collected = [[], []]
        self.agents_self_goal_collected = [0, 0]
        self.agents_othr_goal_collected = [0, 0]
        self.agents_nthr_goal_collected = [0, 0]
                
    def step(self, agent_numb, move_code):
        if self.step_limit == 0:
            if self.debug:
                print('step limit reached, error')
        else:
            self.step_limit -= 1
            if self.debug:
                print(self.step_limit, 'steps left')
        if move_code not in self.valid_moves:
            if self.debug:
                print('icorrect move')
        else:
            agent_old_coords = self.agents_coords[agent_numb]
            move = self.move_map[move_code]
            agent_new_coords = l3 = [l+r for l,r in zip(agent_old_coords, move)]
            correct_move = True
            for coord in agent_new_coords:
                if (coord < 0) or (coord >= self.size):
                    if self.debug:
                        print('icorrect move')
                    correct_move = False
            other_coords = self.agents_coords[1-agent_numb]
            if agent_new_coords == other_coords:
                if self.debug:
                    print('icorrect move')
                correct_move = False
            if correct_move:
                self.map[agent_old_coords[0],agent_old_coords[1]] = 0
                self.map[agent_new_coords[0],agent_new_coords[1]] = -1 - agent_numb
                self.agents_coords[agent_numb] = agent_new_coords
                for res in range(self.number_of_resources):
                    if self.resources[res][agent_new_coords[0],agent_new_coords[1]] != 0:
                        self.agents_res_collected[agent_numb].append(res)
                        self.resources[res][agent_new_coords[0],agent_new_coords[1]] = 0
                        if res == self.agents_goals[agent_numb]:
                            self.agents_self_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected self goal resource:', res)
                        elif res == self.agents_goals[1 - agent_numb]:
                            self.agents_othr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected other goal resource:', res)
                        else:
                            self.agents_nthr_goal_collected[agent_numb]+=1
                            if self.debug:
                                print('collected neither goal resource:', res)
                            
    def observation(self, agent_numb):
        feature_vec = []
        self_other_coords = [self.agents_coords[agent_numb], 
                             self.agents_coords[1 - agent_numb] 
                            ]
        for agnt_coords in self_other_coords:
            agent_map = np.zeros((self.size, self.size), dtype=int)
            agent_map[agnt_coords[0],agnt_coords[1]] = 1
            agent_map = np.reshape(agent_map, self.size*self.size)
            feature_vec.extend(agent_map)
            
        for res in range(len(self.resources)):
            feature_vec.extend(np.reshape(self.resources[res], self.size*self.size))
        
        return feature_vec
    
    def reward(self):
        
        n_self_c_self = self.agents_self_goal_collected[0]
        n_othr_c_self = self.agents_othr_goal_collected[1]
        
        n_self_c_othr = self.agents_othr_goal_collected[0]
        n_othr_c_othr = self.agents_self_goal_collected[1]
        
        n_self_c_nthr = self.agents_nthr_goal_collected[0]
        n_othr_c_nthr = self.agents_nthr_goal_collected[1]
        
        reward = (n_self_c_self + n_othr_c_self)**2 + \
                 (n_self_c_othr + n_othr_c_othr)**2 - \
                 (n_self_c_nthr + n_othr_c_nthr)**2 
        
        return reward

    def reset(self):
        self.step_limit = step_limit
        self.initialize_map()
        self.initialize_resources()


batch_size = 10

class Ffunction(nn.Module):
    def __init__(self, input_size):
        super(Ffunction, self).__init__()
        self.input_size = input_size
        self.lstm = nn.LSTM(input_size, 64)
        self.fullycon1 = nn.Linear(64, 64)
        nn.init.orthogonal_(self.fullycon1.weight)
        self.fullycon2 = nn.Linear(64, 64)
        nn.init.orthogonal_(self.fullycon2.weight)
        self.policy = nn.Linear(64, 5)
        nn.init.orthogonal_(self.policy.weight)
        self.value = nn.Linear(64, 1)
        nn.init.orthogonal_(self.value.weight)

        self.beta = 0.01

    def forward(self, x): #x = input
        result, _ = self.lstm(x.view(-1, 1, self.input_size))
        result = nn.functional.elu(self.fullycon1(result.view(-1, 64)))
        result = nn.functional.elu(self.fullycon2(result))
        return nn.functional.softmax(self.policy(result)), self.value(result)

    def get_a2c_loss(self, states, actions, rewards):
        probs, values = self.forward(states)

        #print("rewards/values")
        #print(rewards)
        #print()
        #print(values)

        #print(torch.argmax(probs, dim=1))
        #print(actions)

        advantage = rewards - values.view(batch_size)
        actions_one_hot = torch.zeros(batch_size, 5)
        actions_one_hot[torch.arange(batch_size), actions.view(batch_size)] = 1
        logs = torch.log(torch.sum(probs.view(batch_size, 5) * actions_one_hot.view(batch_size, 5), (1)))
        #print(values)
        logloss = -logs * (advantage.view(batch_size)).detach()
        entropyloss = -torch.sum(probs * torch.log(probs), (1))
        #print("\nlogs", logs)

        return logloss + self.beta * entropyloss + 0.5 * advantage * advantage

agent1 = Ffunction(326)
opt1 = torch.optim.Adam(agent1.parameters(), betas=(0.9, 0.999), eps=1 * 10 ** -8)

agent2 = Ffunction(326)
opt2 = torch.optim.Adam(agent2.parameters(), betas=(0.9, 0.999), eps=1 * 10 ** -8)

# buffer_ag_N = [[input, action, revard]*number_of_samples]
buffer_ag_1 = []
buffer_ag_2 = []
reward_buf = []

def sample_from_buffer(buffer, batch_size=10):
    batch = random.sample(buffer, batch_size)
    inputs = [i[0] for i in batch]
    actions = [i[1] for i in batch]
    revards = [i[2] for i in batch]
    
    inputs = torch.FloatTensor(inputs)
    actions = torch.LongTensor(actions)
    revards = torch.FloatTensor(revards)
    
    return inputs, actions, revards

def sample_buffer(buffer):
    inputs = [i[0] for i in buffer]
    actions = [i[1] for i in buffer]
    revards = [i[2] for i in buffer]

    inputs = torch.FloatTensor(inputs)
    actions = torch.LongTensor(actions)
    revards = torch.FloatTensor(revards)

    return inputs, actions, revards

for episode in range(1, 11):

  
    for _ in range(batch_size // 10):
        session_saver.append([])
        environment = env(8, 20)
        session_saver[-1].append(environment.map.tolist())

        x_other1 = torch.FloatTensor([1] * 3) / 3
        x_other1.requires_grad_(True)
        opt_other1 = torch.optim.SGD([x_other1], lr=0.1)
        x_self1 = [0] * 3
        x_self1[environment.agents_goals[0]] = 1
        x_self1 = torch.FloatTensor(x_self1)
        x_self1.requires_grad_(False)

        x_other2 = torch.FloatTensor([1] * 3) / 3
        x_other2.requires_grad_(True)
        opt_other2 = torch.optim.SGD([x_other2], lr=0.1)
        x_self2 = [0] * 3
        x_self2[environment.agents_goals[1]] = 1
        x_self2 = torch.FloatTensor(x_self2)
        x_self2.requires_grad_(False)


        action1, action2 = 0, 0

        seq1 = []
        seq2 = []

        for step in range(10):
            #Make action
            state1 = environment.observation(0)
            input1 = torch.cat([torch.Tensor(state1), x_self1, x_other1])
            probs1, values1 = agent1.forward(input1)
            action1 = torch.argmax(probs1)
            seq1.append([input1.tolist(), action1.tolist()])


            state2 = environment.observation(1)
            input2 = torch.cat([torch.Tensor(state2), x_self2, x_other2])
            probs2, values2 = agent2.forward(input2)
            action2 = torch.argmax(probs2)
            seq2.append([input2.tolist(), action2.tolist()])

            #Update SOM
            opt_other1.zero_grad()
            input_other1 = torch.cat([torch.Tensor(state2), x_other1, x_self1])
            probs1, _ = agent1.forward(input_other1)
            loss = torch.nn.functional.cross_entropy(probs1.view(1, 5), torch.LongTensor([action2.item()]))
            loss.backward()
            opt_other1.step()

            probs1, values1 = agent1.forward(input1)
            #print(action1.item() == torch.argmax(probs1).item())

            opt_other2.zero_grad()
            input_other2 = torch.cat([torch.Tensor(state1), x_other2, x_self2])
            probs2, _ = agent2.forward(input_other2)
            loss = torch.nn.functional.cross_entropy(probs2.view(1, 5), torch.LongTensor([action1.item()]))
            loss.backward()
            opt_other2.step()


            #Add SA to sequences
            environment.step(0, action1.item())
            session_saver[-1].append(environment.map.tolist())

            environment.step(1, action2.item())
            session_saver[-1].append(environment.map.tolist())
    
    #Add sequences to buffer
        reward = environment.reward()
        reward_buf.append(reward)
        for i in range(10):
            buffer_ag_1.append(seq1[-i - 1] + [reward])
            buffer_ag_2.append(seq2[-i - 1] + [reward])
            reward *= 0.99

    #A2C update
    if(len(buffer_ag_1) >= batch_size and len(buffer_ag_2) >= batch_size):
        inputs, actions, rewards = sample_buffer(buffer_ag_1)
        opt1.zero_grad()
        loss1 = torch.sum(agent1.get_a2c_loss(inputs, actions, rewards)) / batch_size
        print(loss1.item(), ' ', end='')
        loss1.backward()
        opt1.step()

        inputs, actions, rewards = sample_buffer(buffer_ag_2)
        opt2.zero_grad()
        loss2 = torch.sum(agent2.get_a2c_loss(inputs, actions, rewards)) / batch_size
        print(loss2.item(), end='\n')
        loss2.backward()
        opt2.step()

        print(np.mean(reward_buf))

        wandb.log({"loss1": loss1.item(), "loss2": loss2.item(), "reward": np.mean(reward_buf), "episode": episode})

        buffer_ag_1 = []
        buffer_ag_2 = []
        reward_buf = []

import pickle

ftw = open('save10', 'wb')

pickle.dump(session_saver, file=ftw)
ftw.close()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Wandb version 0.8.31 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


-0.04942328855395317  -0.16911320388317108
0.0
0.04358840733766556  0.2628720998764038
0.0
1.9291702508926392  1.8190314769744873
1.0
30.68630027770996  31.078454971313477
7.0
2.0655932426452637  1.5969560146331787
1.0
1.5292800664901733  1.69171142578125
1.0
3.856539249420166  3.6289143562316895
2.0
-0.1896585375070572  -0.349037230014801
0.0
-0.10577650368213654  -0.29012686014175415
0.0
-0.45210400223731995  -0.4686129093170166
0.0
