# How to run the benchmarks and visualize
The only parts of the code you must change are the number of agents (2, 3, or 4) and the grid size (6 or 10).
After changing those, set the path to the corresponding model in the line containing
"mu_target.load_state_dict". Other settings can also be changed.
The three models are, in order, Distance observations, No observations, and All observations.
Rendering is on by default, but can be turned off by changing the "test(" call to pass render=False.


In [None]:
import random
import gym
import collections
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import time
import math
from ma_gym.wrappers import Monitor
from datetime import datetime

class MuNet(nn.Module):
    """Actor network: one independent MLP per agent that outputs action logits.

    Each agent's MLP is stored as the attribute ``agent_<i>``. Its input width
    is the agent's own observation size plus ``num_agents - 1`` extra features
    (one distance value per other agent).
    """

    def __init__(self, observation_space, action_space, weight_multiplier = 1):
        """Build the per-agent MLPs.

        Args:
            observation_space: indexable per-agent spaces exposing ``.shape``.
            action_space: indexable per-agent spaces exposing ``.n``.
            weight_multiplier: scales the two hidden widths (128 and 64).
        """
        super().__init__()
        self.num_agents = len(observation_space)
        self.action_space = action_space
        for idx in range(self.num_agents):
            # Own observation plus one distance feature per other agent.
            in_features = observation_space[idx].shape[0] + (self.num_agents - 1)
            print('N_obs in MuNet',in_features)
            out_features = action_space[idx].n
            wide = int(128*weight_multiplier)
            narrow = int(64*weight_multiplier)
            policy = nn.Sequential(
                nn.Linear(in_features, wide),
                nn.ReLU(),
                nn.Linear(wide, narrow),
                nn.ReLU(),
                nn.Linear(narrow, out_features),
            )
            setattr(self, 'agent_{}'.format(idx), policy)

    def forward(self, obs):
        """Return the logits of every agent stacked along dim 1.

        ``obs`` is indexed as ``obs[:, agent, :]``; each agent's slice is fed
        to its own MLP and the results are concatenated on a new dim 1.
        """
        per_agent = []
        for idx in range(self.num_agents):
            policy = getattr(self, 'agent_{}'.format(idx))
            per_agent.append(policy(obs[:, idx, :]).unsqueeze(1))
        return torch.cat(per_agent, dim=1)

class QNet(nn.Module):
    """Critic network: one MLP per agent over the joint observation and action.

    Every agent's MLP (attribute ``agent_<i>``) receives the same flattened
    concatenation of all observations and all actions and outputs one value.
    """

    def __init__(self, observation_space, action_space,weight_multiplier = 1):
        """Build the per-agent critics.

        Args:
            observation_space: iterable of per-agent spaces exposing ``.shape``.
            action_space: iterable of per-agent spaces exposing ``.n``.
            weight_multiplier: scales the two hidden widths (128 and 64).
        """
        super().__init__()
        self.num_agents = len(observation_space)
        total_action = sum(space.n for space in action_space)
        # Each agent's observation carries (num_agents - 1) extra distance features.
        extra_features = self.num_agents*(self.num_agents-1)
        total_obs = sum(space.shape[0] for space in observation_space) + extra_features
        print("Total_obs in QNet",total_obs)
        for idx in range(self.num_agents):
            wide = int(128*weight_multiplier)
            narrow = int(64*weight_multiplier)
            critic = nn.Sequential(
                nn.Linear(total_obs + total_action, wide),
                nn.ReLU(),
                nn.Linear(wide, narrow),
                nn.ReLU(),
                nn.Linear(narrow, 1),
            )
            setattr(self, 'agent_{}'.format(idx), critic)

    def forward(self, obs, action):
        """Return one value per agent, concatenated along dim 1.

        ``obs`` and ``action`` are each flattened from three dims to
        ``(batch, dim1 * dim2)`` and joined before being fed to every critic.
        """
        batch = obs.shape[0]
        flat_obs = obs.view(batch, obs.shape[1] * obs.shape[2])
        flat_action = action.view(action.shape[0], action.shape[1] * action.shape[2])
        joint = torch.cat((flat_obs, flat_action), dim=1)
        values = [getattr(self, 'agent_{}'.format(idx))(joint)
                  for idx in range(self.num_agents)]
        return torch.cat(values, dim=1)

def add_distance_obs(state,n_agents):
    """Append scaled pairwise agent distances to each agent's observation.

    For every agent ``i`` among the first ``n_agents`` entries of ``state``,
    the distance to every other agent ``j`` is appended to ``state[i]`` in
    ascending order of ``j``. A distance is the Euclidean distance between
    ``state[i][1:3]`` and ``state[j][1:3]`` divided by ``sqrt(2)``.
    (Presumably entries 1:3 hold the agent's normalised grid position, so the
    result lies in [0, 1] -- confirm against the environment.)

    Args:
        state: sequence of per-agent observation lists; mutated in place.
        n_agents: number of agents to process (any value >= 1; a single agent
            has no neighbours and receives no extra features).

    Returns:
        The same ``state`` object, with ``n_agents - 1`` values appended to
        each of the first ``n_agents`` observations.

    Raises:
        ValueError: if ``n_agents`` is smaller than 1.
    """
    if n_agents < 1:
        raise ValueError("n_agents must be at least 1, got {}".format(n_agents))

    scale = math.sqrt(2)
    # Snapshot the coordinates first so appending cannot affect later reads.
    positions = [state[i][1:3] for i in range(n_agents)]
    for i in range(n_agents):
        # Always measure from the lower-indexed agent so that both members of
        # a pair receive the exact same floating-point value.
        state[i].extend(
            math.dist(positions[min(i, j)], positions[max(i, j)]) / scale
            for j in range(n_agents) if j != i
        )
    return state

def test(env, num_episodes, mu, n_agents,render=True): # Does not use the critic network, just uses the actor networks
    """Roll out the actor for ``num_episodes`` episodes and return the mean score.

    Actions are the argmax of the actor's logits, except that with probability
    0.01 per step a random joint action is sampled from ``env.action_space``.

    Args:
        env: environment providing ``n_agents``, ``reset``, ``step``,
            ``render`` and ``action_space.sample``.
        num_episodes: number of episodes to play.
        mu: actor called on a batch of one joint observation.
        n_agents: forwarded to ``add_distance_obs``.
        render: when equal to ``True``, draw every step with a short pause.

    Returns:
        Sum over agents of each agent's accumulated reward divided by
        ``num_episodes``.
    """
    exploration_rate = 0.01
    show_frames = render == True
    totals = np.zeros(env.n_agents)
    with torch.no_grad():
        for _ in range(num_episodes):
            obs = add_distance_obs(env.reset(), n_agents)
            finished = [False] * env.n_agents

            while not all(finished):
                if show_frames:
                    env.render()
                    time.sleep(0.0005)

                explore = np.random.rand() < exploration_rate
                if explore:
                    action = env.action_space.sample()
                else:
                    logits = mu(torch.Tensor(obs).unsqueeze(0))
                    greedy = logits.argmax(dim=2).squeeze(0)
                    action = greedy.data.cpu().numpy().tolist()

                raw_obs, reward, finished, _info = env.step(action)
                obs = add_distance_obs(raw_obs, n_agents)
                totals += np.array(reward)
    return sum(totals / num_episodes)

def main(env_name, lr_mu, lr_q, tau, gamma, batch_size, buffer_limit, max_episodes, log_interval, test_episodes,
         warm_up_steps, update_iter, gumbel_max_temp, gumbel_min_temp, grid_size, n_agents, n_trees, max_steps, agent_view,n_obstacles,step_cost
         ,max_steps_without_reward,tree_strength,weight_multiplier,render_interval,tree_cutdown_reward):
    """Evaluate the saved distance-observation actor on a Lumberjacks environment.

    Registers the environment id ``my_Lumberjacks-v1`` with the given settings,
    builds a ``MuNet`` from the environment's spaces, loads its weights from
    the hard-coded checkpoint path, plays ``test_episodes`` rendered episodes
    and prints the resulting score.

    Only the environment settings, ``n_agents``, ``weight_multiplier`` and
    ``test_episodes`` are used; the remaining training hyper-parameters are
    accepted so the full settings dictionary can be passed with ``**kwargs``.
    """
    env_id = 'my_Lumberjacks-v1'
    env_kwargs = {
        'tree_cutdown_reward': tree_cutdown_reward,
        'tree_strength': tree_strength,
        'max_steps_without_reward': max_steps_without_reward,
        'n_obstacles': n_obstacles,
        'n_agents': n_agents,
        'n_trees': n_trees,
        'full_observable': False,
        'step_cost': step_cost,
        'grid_shape': (grid_size, grid_size),
        'agent_view': agent_view,
        'max_steps': max_steps,
    }
    # NOTE(review): registering the same id again in one interpreter session
    # may fail on some gym versions -- confirm for the installed release.
    gym.envs.register(
        id=env_id,
        entry_point='ma_gym.envs.lumberjacks:Lumberjacks', # Points to the lumberjack class object
        kwargs=env_kwargs,
    )

    env = gym.make(env_id)
    test_env = gym.make(env_id)

    mu_target = MuNet(env.observation_space, env.action_space,weight_multiplier)
    # Set this path to the checkpoint matching the chosen agent count and grid size.
    checkpoint_path = '/Users/mingliu/Documents/R Learning/Final Project Code/emilygrid10/G_mu_target_DIST_OBS_agents4_grid10.pt'
    # Map to CPU so a checkpoint saved on a GPU also loads on CPU-only
    # machines; evaluation feeds the actor CPU tensors.
    weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    mu_target.load_state_dict(weights)

    test_score = test(test_env, test_episodes, mu_target, n_agents,render=True)
    print(test_score)
if __name__ == '__main__':
    # Only edit these parts

    n_agent = 4# 2,3,4 # Change this
    mapsize = 10 # Pair change to 6, Emily change to 10

    # DON'T EDIT BELOW THIS LINE
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    # DON'T EDIT BELOW THIS LINE

    # Environment presets per supported grid size.
    if mapsize == 6:
        num_trees = 12
        max_steps = 150
        num_obstacles = 5
        max_episodes = 30000
        multiplier = 1.25
    elif mapsize == 10:
        num_trees = 36
        max_steps = 250
        num_obstacles = 12
        max_episodes = 30000
        multiplier = 1.25
    else:
        # Fail fast: without a preset the settings below would be undefined.
        raise ValueError("mapsize must be 6 or 10, got {}".format(mapsize))

    # Even spread of tree strengths 1..n_agent over the trees.
    trees_per_strength = num_trees // n_agent
    tree_strength = [strength
                     for strength in range(1, n_agent + 1)
                     for _ in range(trees_per_strength)]

    print(tree_strength)
    print(multiplier)
    print(max_steps)
    print(max_episodes)
    print(num_trees)
    print(num_obstacles)

    kwargs = {'env_name': 'ma_gym:Lumberjacks-v0',
              'lr_mu': 0.0005,                      # Learning rate for Actors
              'lr_q': 0.001,                        # Learning rate for Critic
              'batch_size': 32,
              'tau': 0.005,
              'gamma': 0.99,
              'buffer_limit': 50000,
              'log_interval': 20,
              'render_interval': 500,              # Every this many games, show an example of the game
              'max_episodes': max_episodes,                # 10000 default
              'test_episodes': 3000,
              'warm_up_steps': 2000,
              'update_iter': 10,
              'gumbel_max_temp': 10,
              'gumbel_min_temp': 0.1,
              'weight_multiplier':multiplier,

              'grid_size' : mapsize,
              'n_agents' : n_agent, #3,4 change this
              'n_trees' : num_trees,
              'tree_strength' : tree_strength,     # Even spread based on the number of agents
              'max_steps' : max_steps,
              'agent_view' : (2,2),
              'n_obstacles' : num_obstacles,
              'step_cost' : -0.1,
              'tree_cutdown_reward': 10,
              'max_steps_without_reward' : 50000 # Don't really use this
    }

    main(**kwargs)

In [None]:
import random
import gym
import collections
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import time
import math
from ma_gym.wrappers import Monitor
from datetime import datetime


class MuNet(nn.Module):
    """Actor network: one independent MLP per agent that outputs action logits.

    Each agent's MLP is stored as the attribute ``agent_<i>`` and takes that
    agent's own observation (no extra features) as input.
    """

    def __init__(self, observation_space, action_space, weight_multiplier = 1):
        """Build the per-agent MLPs.

        Args:
            observation_space: indexable per-agent spaces exposing ``.shape``.
            action_space: indexable per-agent spaces exposing ``.n``.
            weight_multiplier: scales the two hidden widths (128 and 64).
        """
        super().__init__()
        self.num_agents = len(observation_space)
        self.action_space = action_space
        for idx in range(self.num_agents):
            in_features = observation_space[idx].shape[0]
            print('N_obs in MuNet',in_features)
            out_features = action_space[idx].n
            wide = int(128*weight_multiplier)
            narrow = int(64*weight_multiplier)
            policy = nn.Sequential(
                nn.Linear(in_features, wide),
                nn.ReLU(),
                nn.Linear(wide, narrow),
                nn.ReLU(),
                nn.Linear(narrow, out_features),
            )
            setattr(self, 'agent_{}'.format(idx), policy)

    def forward(self, obs):
        """Return the logits of every agent stacked along dim 1.

        ``obs`` is indexed as ``obs[:, agent, :]``; each agent's slice is fed
        to its own MLP and the results are concatenated on a new dim 1.
        """
        per_agent = []
        for idx in range(self.num_agents):
            policy = getattr(self, 'agent_{}'.format(idx))
            per_agent.append(policy(obs[:, idx, :]).unsqueeze(1))
        return torch.cat(per_agent, dim=1)

class QNet(nn.Module):
    """Critic network: one MLP per agent over the joint observation and action.

    Every agent's MLP (attribute ``agent_<i>``) receives the same flattened
    concatenation of all observations and all actions and outputs one value.
    """

    def __init__(self, observation_space, action_space,weight_multiplier = 1):
        """Build the per-agent critics.

        Args:
            observation_space: iterable of per-agent spaces exposing ``.shape``.
            action_space: iterable of per-agent spaces exposing ``.n``.
            weight_multiplier: scales the two hidden widths (128 and 64).
        """
        super().__init__()
        self.num_agents = len(observation_space)
        total_action = sum(space.n for space in action_space)
        # NOTE(review): the input width reserves num_agents*(num_agents-1)
        # extra features, while add_distance_obs in this cell returns the
        # observations unchanged -- confirm this width is intended.
        extra_features = self.num_agents*(self.num_agents-1)
        total_obs = sum(space.shape[0] for space in observation_space) + extra_features
        print("Total_obs in QNet",total_obs)
        for idx in range(self.num_agents):
            wide = int(128*weight_multiplier)
            narrow = int(64*weight_multiplier)
            critic = nn.Sequential(
                nn.Linear(total_obs + total_action, wide),
                nn.ReLU(),
                nn.Linear(wide, narrow),
                nn.ReLU(),
                nn.Linear(narrow, 1),
            )
            setattr(self, 'agent_{}'.format(idx), critic)

    def forward(self, obs, action):
        """Return one value per agent, concatenated along dim 1.

        ``obs`` and ``action`` are each flattened from three dims to
        ``(batch, dim1 * dim2)`` and joined before being fed to every critic.
        """
        batch = obs.shape[0]
        flat_obs = obs.view(batch, obs.shape[1] * obs.shape[2])
        flat_action = action.view(action.shape[0], action.shape[1] * action.shape[2])
        joint = torch.cat((flat_obs, flat_action), dim=1)
        values = [getattr(self, 'agent_{}'.format(idx))(joint)
                  for idx in range(self.num_agents)]
        return torch.cat(values, dim=1)

def add_distance_obs(state,n_agents):
    """Return ``state`` unchanged; no distance features are added here.

    ``n_agents`` is accepted but not used.
    """
    return state

def test(env, num_episodes, mu, n_agents,render=False): # Does not use the critic network, just uses the actor networks
    """Roll out the actor for ``num_episodes`` episodes and return the mean score.

    Actions are the argmax of the actor's logits, except that with probability
    0.01 per step a random joint action is sampled from ``env.action_space``.

    Args:
        env: environment providing ``n_agents``, ``reset``, ``step``,
            ``render`` and ``action_space.sample``.
        num_episodes: number of episodes to play.
        mu: actor called on a batch of one joint observation.
        n_agents: forwarded to ``add_distance_obs``.
        render: when equal to ``True``, draw every step with a short pause.

    Returns:
        Sum over agents of each agent's accumulated reward divided by
        ``num_episodes``.
    """
    exploration_rate = 0.01
    show_frames = render == True
    totals = np.zeros(env.n_agents)
    with torch.no_grad():
        for _ in range(num_episodes):
            obs = add_distance_obs(env.reset(), n_agents)
            finished = [False] * env.n_agents

            while not all(finished):
                if show_frames:
                    env.render()
                    time.sleep(0.0005)

                explore = np.random.rand() < exploration_rate
                if explore:
                    action = env.action_space.sample()
                else:
                    logits = mu(torch.Tensor(obs).unsqueeze(0))
                    greedy = logits.argmax(dim=2).squeeze(0)
                    action = greedy.data.cpu().numpy().tolist()

                raw_obs, reward, finished, _info = env.step(action)
                obs = add_distance_obs(raw_obs, n_agents)
                totals += np.array(reward)
    return sum(totals / num_episodes)


def main(env_name, lr_mu, lr_q, tau, gamma, batch_size, buffer_limit, max_episodes, log_interval, test_episodes,
         warm_up_steps, update_iter, gumbel_max_temp, gumbel_min_temp, grid_size, n_agents, n_trees, max_steps, agent_view,n_obstacles,step_cost
         ,max_steps_without_reward,tree_strength,weight_multiplier,render_interval,tree_cutdown_reward):
    """Evaluate the saved no-extra-observation actor on a Lumberjacks environment.

    Registers the environment id ``my_Lumberjacks-v1`` with the given settings,
    builds a ``MuNet`` from the environment's spaces, loads its weights from
    the hard-coded checkpoint path, plays ``test_episodes`` rendered episodes
    and prints the resulting score.

    Only the environment settings, ``n_agents``, ``weight_multiplier`` and
    ``test_episodes`` are used; the remaining training hyper-parameters are
    accepted so the full settings dictionary can be passed with ``**kwargs``.
    """
    env_id = 'my_Lumberjacks-v1'
    env_kwargs = {
        'tree_cutdown_reward': tree_cutdown_reward,
        'tree_strength': tree_strength,
        'max_steps_without_reward': max_steps_without_reward,
        'n_obstacles': n_obstacles,
        'n_agents': n_agents,
        'n_trees': n_trees,
        'full_observable': False,
        'step_cost': step_cost,
        'grid_shape': (grid_size, grid_size),
        'agent_view': agent_view,
        'max_steps': max_steps,
    }
    # NOTE(review): registering the same id again in one interpreter session
    # may fail on some gym versions -- confirm for the installed release.
    gym.envs.register(
        id=env_id,
        entry_point='ma_gym.envs.lumberjacks:Lumberjacks', # Points to the lumberjack class object
        kwargs=env_kwargs,
    )

    env = gym.make(env_id)
    test_env = gym.make(env_id)

    mu_target = MuNet(env.observation_space, env.action_space,weight_multiplier)
    # Set this path to the checkpoint matching the chosen agent count and grid size.
    checkpoint_path = '/Users/mingliu/Documents/R Learning/Final Project Code/emilygrid10/G_mu_target_NO_OBS_agents4_grid10.pt'
    # Map to CPU so a checkpoint saved on a GPU also loads on CPU-only
    # machines; evaluation feeds the actor CPU tensors.
    weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    mu_target.load_state_dict(weights)

    test_score = test(test_env, test_episodes, mu_target, n_agents,render=True)
    print(test_score)
if __name__ == '__main__':
    # Only edit these parts

    n_agent = 4# 2,3,4 # Change this
    mapsize = 10 # Pair change to 6, Emily change to 10

    # DON'T EDIT BELOW THIS LINE
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    # DON'T EDIT BELOW THIS LINE

    # Environment presets per supported grid size.
    if mapsize == 6:
        num_trees = 12
        max_steps = 150
        num_obstacles = 5
        max_episodes = 30000
        multiplier = 1.25
    elif mapsize == 10:
        num_trees = 36
        max_steps = 250
        num_obstacles = 12
        max_episodes = 30000
        multiplier = 1.25
    else:
        # Fail fast: without a preset the settings below would be undefined.
        raise ValueError("mapsize must be 6 or 10, got {}".format(mapsize))

    # Even spread of tree strengths 1..n_agent over the trees.
    trees_per_strength = num_trees // n_agent
    tree_strength = [strength
                     for strength in range(1, n_agent + 1)
                     for _ in range(trees_per_strength)]

    print(tree_strength)
    print(multiplier)
    print(max_steps)
    print(max_episodes)
    print(num_trees)
    print(num_obstacles)

    kwargs = {'env_name': 'ma_gym:Lumberjacks-v0',
              'lr_mu': 0.0005,                      # Learning rate for Actors
              'lr_q': 0.001,                        # Learning rate for Critic
              'batch_size': 32,
              'tau': 0.005,
              'gamma': 0.99,
              'buffer_limit': 50000,
              'log_interval': 20,
              'render_interval': 500,              # Every this many games, show an example of the game
              'max_episodes': max_episodes,                # 10000 default
              'test_episodes': 3000,
              'warm_up_steps': 2000,
              'update_iter': 10,
              'gumbel_max_temp': 10,
              'gumbel_min_temp': 0.1,
              'weight_multiplier':multiplier,

              'grid_size' : mapsize,
              'n_agents' : n_agent, #3,4 change this
              'n_trees' : num_trees,
              'tree_strength' : tree_strength,     # Even spread based on the number of agents
              'max_steps' : max_steps,
              'agent_view' : (2,2),
              'n_obstacles' : num_obstacles,
              'step_cost' : -0.1,
              'tree_cutdown_reward': 10,
              'max_steps_without_reward' : 50000 # Don't really use this
    }

    main(**kwargs)

In [None]:

import random
import gym
import collections
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import time
import math
from ma_gym.wrappers import Monitor
from datetime import datetime


class MuNet(nn.Module):
    """Actor network: one independent MLP per agent that outputs action logits.

    Each agent's MLP is stored as the attribute ``agent_<i>`` and takes that
    agent's own observation (no extra features) as input.
    """

    def __init__(self, observation_space, action_space, weight_multiplier = 1):
        """Build the per-agent MLPs.

        Args:
            observation_space: indexable per-agent spaces exposing ``.shape``.
            action_space: indexable per-agent spaces exposing ``.n``.
            weight_multiplier: scales the two hidden widths (128 and 64).
        """
        super().__init__()
        self.num_agents = len(observation_space)
        self.action_space = action_space
        for idx in range(self.num_agents):
            in_features = observation_space[idx].shape[0]
            print('N_obs in MuNet',in_features)
            out_features = action_space[idx].n
            wide = int(128*weight_multiplier)
            narrow = int(64*weight_multiplier)
            policy = nn.Sequential(
                nn.Linear(in_features, wide),
                nn.ReLU(),
                nn.Linear(wide, narrow),
                nn.ReLU(),
                nn.Linear(narrow, out_features),
            )
            setattr(self, 'agent_{}'.format(idx), policy)

    def forward(self, obs):
        """Return the logits of every agent stacked along dim 1.

        ``obs`` is indexed as ``obs[:, agent, :]``; each agent's slice is fed
        to its own MLP and the results are concatenated on a new dim 1.
        """
        per_agent = []
        for idx in range(self.num_agents):
            policy = getattr(self, 'agent_{}'.format(idx))
            per_agent.append(policy(obs[:, idx, :]).unsqueeze(1))
        return torch.cat(per_agent, dim=1)

def add_distance_obs(state,n_agents):
    """Return ``state`` unchanged; no distance features are added here.

    ``n_agents`` is accepted but not used.
    """
    return state

def test(env, num_episodes, mu, n_agents,render=False): # Does not use the critic network, just uses the actor networks
    """Roll out the actor for ``num_episodes`` episodes and return the mean score.

    Actions are the argmax of the actor's logits, except that with probability
    0.01 per step a random joint action is sampled from ``env.action_space``.

    Args:
        env: environment providing ``n_agents``, ``reset``, ``step``,
            ``render`` and ``action_space.sample``.
        num_episodes: number of episodes to play.
        mu: actor called on a batch of one joint observation.
        n_agents: forwarded to ``add_distance_obs``.
        render: when equal to ``True``, draw every step with a short pause.

    Returns:
        Sum over agents of each agent's accumulated reward divided by
        ``num_episodes``.
    """
    exploration_rate = 0.01
    show_frames = render == True
    totals = np.zeros(env.n_agents)
    with torch.no_grad():
        for _ in range(num_episodes):
            obs = add_distance_obs(env.reset(), n_agents)
            finished = [False] * env.n_agents

            while not all(finished):
                if show_frames:
                    env.render()
                    time.sleep(0.0005)

                explore = np.random.rand() < exploration_rate
                if explore:
                    action = env.action_space.sample()
                else:
                    logits = mu(torch.Tensor(obs).unsqueeze(0))
                    greedy = logits.argmax(dim=2).squeeze(0)
                    action = greedy.data.cpu().numpy().tolist()

                raw_obs, reward, finished, _info = env.step(action)
                obs = add_distance_obs(raw_obs, n_agents)
                totals += np.array(reward)
    return sum(totals / num_episodes)


def main(env_name, lr_mu, lr_q, tau, gamma, batch_size, buffer_limit, max_episodes, log_interval, test_episodes,
         warm_up_steps, update_iter, gumbel_max_temp, gumbel_min_temp, grid_size, n_agents, n_trees, max_steps, agent_view,n_obstacles,step_cost
         ,max_steps_without_reward,tree_strength,weight_multiplier,render_interval,tree_cutdown_reward):
    """Evaluate the saved all-observation actor on a fully observable Lumberjacks environment.

    Registers the environment id ``my_Lumberjacks-v1`` with the given settings
    and ``full_observable`` enabled, builds a ``MuNet`` from the environment's
    spaces, loads its weights from the hard-coded checkpoint path, plays
    ``test_episodes`` rendered episodes and prints the resulting score.

    Only the environment settings, ``n_agents``, ``weight_multiplier`` and
    ``test_episodes`` are used; the remaining training hyper-parameters are
    accepted so the full settings dictionary can be passed with ``**kwargs``.
    """
    env_id = 'my_Lumberjacks-v1'
    env_kwargs = {
        'tree_cutdown_reward': tree_cutdown_reward,
        'tree_strength': tree_strength,
        'max_steps_without_reward': max_steps_without_reward,
        'n_obstacles': n_obstacles,
        'n_agents': n_agents,
        'n_trees': n_trees,
        'full_observable': True,
        'step_cost': step_cost,
        'grid_shape': (grid_size, grid_size),
        'agent_view': agent_view,
        'max_steps': max_steps,
    }
    # NOTE(review): registering the same id again in one interpreter session
    # may fail on some gym versions -- confirm for the installed release.
    gym.envs.register(
        id=env_id,
        entry_point='ma_gym.envs.lumberjacks:Lumberjacks', # Points to the lumberjack class object
        kwargs=env_kwargs,
    )

    env = gym.make(env_id)
    test_env = gym.make(env_id)

    mu_target = MuNet(env.observation_space, env.action_space,weight_multiplier)
    # Set this path to the checkpoint matching the chosen agent count and grid size.
    checkpoint_path = '/Users/mingliu/Documents/R Learning/Final Project Code/emilygrid10/G_mu_target_ALL_OBS_agents4_grid10.pt'
    # Map to CPU so a checkpoint saved on a GPU also loads on CPU-only
    # machines; evaluation feeds the actor CPU tensors.
    weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    mu_target.load_state_dict(weights)

    test_score = test(test_env, test_episodes, mu_target, n_agents,render=True)
    print(test_score)
if __name__ == '__main__':
    # Only edit these parts

    n_agent = 4# 2,3,4 # Change this
    mapsize = 10 # Pair change to 6, Emily change to 10

    # DON'T EDIT BELOW THIS LINE
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    #----------------------------------------------------------------------
    # DON'T EDIT BELOW THIS LINE

    # Environment presets per supported grid size.
    if mapsize == 6:
        num_trees = 12
        max_steps = 150
        num_obstacles = 5
        max_episodes = 30000
        multiplier = 1.25
    elif mapsize == 10:
        num_trees = 36
        max_steps = 250
        num_obstacles = 12
        max_episodes = 30000
        multiplier = 1.25
    else:
        # Fail fast: without a preset the settings below would be undefined.
        raise ValueError("mapsize must be 6 or 10, got {}".format(mapsize))

    # Even spread of tree strengths 1..n_agent over the trees.
    trees_per_strength = num_trees // n_agent
    tree_strength = [strength
                     for strength in range(1, n_agent + 1)
                     for _ in range(trees_per_strength)]

    print(tree_strength)
    print(multiplier)
    print(max_steps)
    print(max_episodes)
    print(num_trees)
    print(num_obstacles)

    kwargs = {'env_name': 'ma_gym:Lumberjacks-v0',
              'lr_mu': 0.0005,                      # Learning rate for Actors
              'lr_q': 0.001,                        # Learning rate for Critic
              'batch_size': 32,
              'tau': 0.005,
              'gamma': 0.99,
              'buffer_limit': 50000,
              'log_interval': 20,
              'render_interval': 500,              # Every this many games, show an example of the game
              'max_episodes': max_episodes,                # 10000 default
              'test_episodes': 3000,
              'warm_up_steps': 2000,
              'update_iter': 10,
              'gumbel_max_temp': 10,
              'gumbel_min_temp': 0.1,
              'weight_multiplier':multiplier,

              'grid_size' : mapsize,
              'n_agents' : n_agent, #3,4 change this
              'n_trees' : num_trees,
              'tree_strength' : tree_strength,     # Even spread based on the number of agents
              'max_steps' : max_steps,
              'agent_view' : (2,2),
              'n_obstacles' : num_obstacles,
              'step_cost' : -0.1,
              'tree_cutdown_reward': 10,
              'max_steps_without_reward' : 50000 # Don't really use this
    }

    main(**kwargs)
