In [113]:



import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt

import GridWorld_env
import gymnasium as gym
import SharedGridWorld_env
import random
import math

import torch.multiprocessing as mp

In [30]:

# shared environment of [E1, E2, ... En] where Ei belongs to agent i
class sharedEnvironment:
    """Collection of per-agent GridWorld environments ``[E1, E2, ... En]``.

    Environment ``Ei`` belongs to agent ``i``. Every environment is created with the
    same ``dimension_size`` and is reset once at construction time.
    """

    def __init__(self, state_dim, num_agents):
        """Create and reset one GridWorld environment per agent.

        Args:
            state_dim: Passed to each environment as ``dimension_size``; also the
                edge length of the occupancy grid built by ``getOtherAgentsPos``.
            num_agents: Number of environments (one per agent) to create.
        """
        self.state_dim = state_dim
        self.num_agents = num_agents
        # initialize the shared environment = [E1, E2, ... En]
        self.env = [
            gym.make('GridWorld_env/GridWorld', dimension_size=state_dim)
            for _ in range(num_agents)
        ]

        for agent_env in self.env:
            agent_env.reset()

    def getEnv(self, agentID):
        """Return the environment owned by ``agentID``.

        Raises:
            ValueError: If ``agentID`` is outside ``[0, num_agents)``.
        """
        if (agentID < 0 or agentID >= self.num_agents):
            raise ValueError("Invalid agent ID")
        return self.env[agentID]

    def getOtherAgentsPos(self, agentID):
        """Build an occupancy grid of every agent except ``agentID``.

        Args:
            agentID: Index of the agent to leave out. It is not validated here;
                an index that matches no agent simply excludes nobody.

        Returns:
            A float32 tensor of shape ``(state_dim, state_dim, state_dim)`` holding
            1 in every cell occupied by at least one other agent and 0 elsewhere
            (a union of positions, so overlapping agents still yield 1).
        """
        occupancy = np.zeros(
            (self.state_dim, self.state_dim, self.state_dim), dtype=np.float32
        )

        for i in range(self.num_agents):
            if i == agentID:
                continue
            # Read the position from the base environment: gym.make() returns a
            # wrapper stack, and the rest of this file also goes through
            # `.unwrapped` to reach environment-specific members.
            pos = self.env[i].unwrapped.agent_pos
            occupancy[pos[0], pos[1], pos[2]] = 1

        return torch.from_numpy(occupancy)

    

    


class ActorCritic(nn.Module):
    """Convolutional actor-critic network with an LSTM cell and three heads.

    The observation is channels-first: ``(4, N, N, N)`` or, batched,
    ``(B, 4, N, N, N)``. According to the original author's note the four
    channels are (building zone, target, agent position, other agents' positions).

    Pipeline: 3D conv stack -> flatten -> two fully connected layers -> LSTMCell
    -> policy / value / is-built heads.
    """

    HIDDEN_SIZE = 512

    def __init__(self, state_dim, action_dim):
        """Build the network.

        Args:
            state_dim: Edge length N of the cubic grid. It is used to size the
                first fully connected layer, so inputs must have this edge length.
            action_dim: Number of outputs of the policy head.
        """
        super(ActorCritic, self).__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim

        # just trying to replicate the paper
        # The strides are 2, 1, 2, 2, e.g. a (4, 10, 10, 10) input becomes
        # (64, 5, 5, 5) after the first layer.
        # NOTE(review): there is no non-linearity between the convolutions, so the
        # stack is close to a single linear map -- confirm against the paper.
        self.leftConv = nn.Sequential(
            nn.Conv3d(in_channels=4, out_channels=64, kernel_size=3, stride=2, padding=1),
            nn.Conv3d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.Conv3d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
            nn.Conv3d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
        )

        # Placeholder kept from the original; it is empty and unused in forward().
        self.rightConv = nn.Sequential(

        )

        # Size the first FC layer from the real conv output instead of assuming the
        # spatial extent collapses to 1x1x1 (which only holds for state_dim <= 8).
        out_edge = self._conv_output_edge(self.leftConv, state_dim)
        flat_features = 512 * out_edge ** 3

        self.fc1 = nn.Linear(flat_features, self.HIDDEN_SIZE)
        # The original applied fc1 twice; a second, independent layer is used instead.
        self.fc2 = nn.Linear(self.HIDDEN_SIZE, self.HIDDEN_SIZE)

        # we can experiment with h0 and c0 later
        self.lstm = nn.LSTMCell(self.HIDDEN_SIZE, self.HIDDEN_SIZE)

        self.policy = nn.Linear(self.HIDDEN_SIZE, action_dim)
        self.value = nn.Linear(self.HIDDEN_SIZE, 1)

        # Raw (un-squashed) output; intended meaning per the original note:
        # 1 if the building is built, 0 otherwise.
        self.isBuilt = nn.Linear(self.HIDDEN_SIZE, 1)

    @staticmethod
    def _conv_output_edge(conv_stack, edge):
        """Return the edge length of a cube of size ``edge`` after ``conv_stack``."""
        for layer in conv_stack:
            if isinstance(layer, nn.Conv3d):
                kernel, stride = layer.kernel_size[0], layer.stride[0]
                padding, dilation = layer.padding[0], layer.dilation[0]
                edge = (edge + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1
        return edge

    def forward(self, state):
        """Run one step of the network.

        Args:
            state: Tuple ``(x, (hx, cx))`` where ``x`` is the observation of shape
                ``(4, N, N, N)`` or ``(B, 4, N, N, N)`` and ``hx``/``cx`` are the
                LSTM hidden and cell states of shape ``(B, 512)`` (``B = 1`` for
                an unbatched observation).

        Returns:
            ``(policy_logits, value, is_built, (hx, cx))`` with shapes
            ``(B, action_dim)``, ``(B, 1)``, ``(B, 1)`` and ``(B, 512)`` twice.
            The policy output is raw logits; no softmax is applied here.
        """
        x, (hx, cx) = state

        # Promote an unbatched observation to a batch of one so the flatten below
        # is well defined for both input layouts.
        if x.dim() == 4:
            x = x.unsqueeze(0)

        x = F.relu(self.leftConv(x))

        # (B, C, D, H, W) -> (B, C*D*H*W)
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        hx, cx = self.lstm(x, (hx, cx))

        return self.policy(hx), self.value(hx), self.isBuilt(hx), (hx, cx)





In [91]:

# Smoke test for the shared environment and the actor-critic network (disabled: uncomment to run)
#sharedE = sharedEnvironment(state_dim=4, num_agents=2)
#testA3c = ActorCritic(state_dim=4, action_dim=7)
#
#env1 = sharedE.getEnv(0)
#state, _, _, _, _ = env1.unwrapped.step(0)
#state = torch.tensor(state).float()
#otherAgentsPost = sharedE.getOtherAgentsPos(0)
## make otherAgentsPost [4, 4, 4]-> [1, 4, 4, 4]
#otherAgentsPost = otherAgentsPost.unsqueeze(0)
#
#
## NN TEST
#fourChannelState = torch.cat((state, otherAgentsPost), dim=0)
#
#hx = torch.zeros(1, 512)
#cx = torch.zeros(1, 512)
#input = (fourChannelState, (hx, cx))
#policy, value, isBuilt, (hx, cx) = testA3c(input)
#print("policy shape ", policy.shape)
#print("value shape ", value.shape)
#print("isBuilt shape ", isBuilt.shape)
#
#
#pytorch_total_params = sum(p.numel() for p in testA3c.parameters() if p.requires_grad)
#print("Total number of parameters: ", pytorch_total_params)

In [131]:
# Hyperparameters and run configuration
num_agents = 1  # number of agents; passed to the environment and to globalTrain (one worker process each)
num_episodes = 1  # not referenced in the cells shown here
state_dim = 4  # environment dimension_size and ActorCritic state_dim
action_dim = 7  # number of outputs of the ActorCritic policy head
STEP_SIZE = 0.0001  # learning rate handed to Adam in globalTrain
BATCH_SIZE = 128  # not referenced in the cells shown here
GAMMA = 0.99  # not referenced in the cells shown here; presumably the discount factor -- TODO confirm
EPS_START = 0.9  # exploration threshold used by select_action at steps_done == 0
EPS_END = 0.05  # value the exploration threshold decays towards
EPS_DECAY = 1000  # decay constant of the exponential threshold schedule, in steps
TAU = 0.005  # not referenced in the cells shown here
# create gym shared environment (module-level; read by select_action and agentTrain)
env = gym.make('SharedGridWorld_env/SharedGridWorld', dimension_size=state_dim, num_agents=num_agents)


In [132]:

def select_action( policy, device, steps_done):
    """Pick an action with an exponentially decaying exploration rate.

    With probability ``eps_threshold`` a uniformly random action is drawn from the
    environment's action space; otherwise an action is sampled from the softmax of
    the policy logits. Both branches return the same kind of value.

    Args:
        policy: Policy logits of shape ``(1, action_dim)`` (softmax is taken over
            dim 1).
        device: Device on which the returned tensor is placed.
        steps_done: Number of selection steps taken so far. It only drives the
            decay schedule; the caller owns this counter and must increment it
            (an in-function increment of an int argument would be lost).

    Returns:
        A ``torch.long`` tensor of shape ``(1, 1)`` holding the chosen action index.
    """
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)

    if random.random() > eps_threshold:
        with torch.no_grad():
            distribution = F.softmax(policy, dim=1)
            # Sample from the policy so this branch returns an action index with
            # the same shape/dtype as the random branch, not the distribution.
            action = torch.multinomial(distribution, num_samples=1)
            return action.to(device=device, dtype=torch.long)

    return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

In [135]:
# thread function
def agentTrain(index: int , globalNet, optimizer, device):
    """Worker entry point for one agent (run as a separate process).

    Currently a scaffold: it builds a local network, copies the global weights
    into it, runs a single forward pass on the agent's current observation and
    prints the policy logits and their softmax. No training loop exists yet.

    Args:
        index: Agent index; selects the observation via ``env.unwrapped.get_obs``.
        globalNet: Global ActorCritic network whose weights seed the local copy.
        optimizer: Optimizer over the global parameters (not used yet).
        device: Device for the local network and its input tensors.
    """
    # create local network
    localNet = ActorCritic(state_dim, action_dim)
    localNet.to(device)

    # load weight from global network
    localNet.load_state_dict(globalNet.state_dict())
    localNet.train()

    # exploration step counter for select_action (not used yet)
    steps_done = 0

    # Initial LSTM state, sized from the network instead of a hard-coded 512.
    hidden_size = localNet.lstm.hidden_size
    h = torch.zeros(1, hidden_size, device=device)
    c = torch.zeros(1, hidden_size, device=device)

    # get the state
    state = env.unwrapped.get_obs(index)
    state = torch.tensor(state, device=device, dtype=torch.float32)  # dim (4, N, N, N)

    policy, value, isBuilt, (h, c) = localNet((state, (h, c)))
    print("policy ", policy)
    distribution = F.softmax(policy, dim=1)
    print("distribution ", distribution)

    # select action
    return



In [136]:
def globalTrain(numAgents):
    """Master routine: build the global network and run one worker per agent.

    Creates the global ActorCritic, moves it to CUDA when available (CPU
    otherwise), marks its parameters as shared, builds an Adam optimizer over
    them, then starts ``numAgents`` processes running ``agentTrain`` and waits
    for all of them to finish.

    Args:
        numAgents: Number of worker processes to start; worker ``i`` receives
            ``i`` as its agent index.
    """
    globalNet = ActorCritic(state_dim=state_dim, action_dim=action_dim)
    torch.set_num_threads(1)  # THIS FIXES ALOT OF ISSUES

    # Prefer the GPU when one is present.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    globalNet.to(device)
    globalNet.share_memory()
    optimizer = optim.Adam(globalNet.parameters(), lr=STEP_SIZE)

    # Start every worker right after it is created ...
    workers = []
    for rank in range(numAgents):
        worker = mp.Process(target=agentTrain, args=(rank, globalNet, optimizer, device))
        workers.append(worker)
        worker.start()

    # ... then block until all of them have exited.
    for worker in workers:
        worker.join()
    return

# Run the master routine: starts num_agents worker processes and waits for them.
globalTrain(num_agents)

policy  tensor([[-0.0306, -0.0094,  0.0196,  0.0252,  0.0314,  0.0511,  0.0324]],
       grad_fn=<AddmmBackward0>)
distribution  tensor([[1., 1., 1., 1., 1., 1., 1.]], grad_fn=<SoftmaxBackward0>)
