cuda


In [31]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from pettingzoo import ParallelEnv
from sklearn.metrics.pairwise import cosine_similarity
from agilerl.utils.utils import create_population as Population
from agilerl.algorithms.core.registry import HyperparameterConfig, RLParameter
from agilerl.training.train_offline import train_offline
from agilerl.training.train_on_policy import train_on_policy
from agilerl.training.train_multi_agent import train_multi_agent
from agilerl.hpo.tournament import TournamentSelection
from agilerl.hpo.mutation import Mutations
#set device to the dual t4s. 
from types import SimpleNamespace

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
# -------------------------------
# Environment Definition
# -------------------------------
class CichyEnv(ParallelEnv):
    metadata = {"name": "cichyenv"}

    def __init__(self, images, y1, y2):
        self.images = images.reshape(92, 175, 175)  # Ensure correct shape
        self.y1 = y1  # Rewards for agent IT
        self.y2 = y2  # Rewards for agent EVC
        self.agents = ["IT", "EVC"]
        self.agent_ids = ["IT", "EVC"]
        self.current_step = 0  # Initialize current_step
        
        # Merging observation spaces into a single space
        self.observation_space = spaces.Dict({
            "image": spaces.Box(low=0, high=255, shape=(175, 175), dtype=np.uint8),  # Expecting image shape of (175, 175)
            "other_action": spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32),
        })

        self.action_space = spaces.Box(low=-1, high=1, shape=(93,), dtype=np.float32)

    def reset(self):
        self.current_step = 0  # Ensure it's set to 0 at the start of each episode
        self.actions = {agent: [] for agent in self.agent_ids}
        
        # Creating observation dictionary
        obs = {}
        for agent in self.agent_ids:
            image = self.images[self.current_step]  # Get the image for the current step
            if image.shape != (175, 175):
                raise ValueError(f"Unexpected image shape: {image.shape}, expected (175, 175)")

            # Expand the image dimensions to make it 5D for Conv3D (batch, channels, depth, height, width)
            image_expanded = np.expand_dims(image, axis=0)  # Adds a batch dimension, (1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds a channel dimension, (1, 1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds depth dimension, (1, 1, 1, 175, 175)

            obs[agent] = {
                "image": image_expanded,  # Now shape (1, 1, 1, 175, 175)
                "other_action": np.array([0.0]),  # Dummy value for the action of the other agent
            }
                    
        return obs, {}

    def step(self, action_dict):
        if self.current_step >= len(self.images):
            self.current_step = len(self.images) - 1

        obs, rewards, dones, infos = {}, {}, {}, {}

        for agent_id in self.agent_ids:
            other_agent_id = "EVC" if agent_id == "IT" else "IT"

            full_action = action_dict.get(agent_id, np.zeros((93,)))
            agent_actions = full_action[:92].reshape((92, 1))  
            other_action = full_action[92:]  

            self.actions[agent_id].append(agent_actions)  

            other_actions = action_dict.get(other_agent_id, np.zeros((93,)))[92:]  

            obs[agent_id] = {
                "image": torch.tensor(self.images[self.current_step], dtype=torch.float32),  # Convert to tensor
                "other_action": torch.tensor(other_actions, dtype=torch.float32),  # Convert to tensor
            }

            rewards[agent_id] = self._calculate_reward(agent_id)  
            dones[agent_id] = self.current_step >= len(self.images) - 1
            infos[agent_id] = {}

        dones["__all__"] = all(dones.values())
        self.current_step += 1  
        return obs, rewards, dones, infos
    def _calculate_reward(self, agent_id):
        """Computes the reward based on similarity to the expert RDM using all 92 actions per step."""
        actions = np.array(self.actions[agent_id])  # Shape: (num_steps, 92, 1)
        
        num_steps = actions.shape[0]  
        num_images = actions.shape[1]  

        if num_steps < 2:  
            return 0  

        simulated_rdm = np.zeros((num_images, num_images))

        for i in range(num_images):
            for j in range(num_images):
                if i != j:
                    sim = cosine_similarity(actions[:, i].reshape(-1, 1), actions[:, j].reshape(-1, 1))[0][0]
                    simulated_rdm[i, j] = 1 - sim

        expert_rdm = self.y1 if agent_id == "IT" else self.y2

        min_size = min(simulated_rdm.shape[0], expert_rdm.shape[0])
        reward = -np.sum((simulated_rdm[:min_size, :min_size] - expert_rdm[:min_size, :min_size]) ** 2)
        return reward


# -------------------------------
# Training Setup with AgileRL
# -------------------------------8-=
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC

env = CichyEnv(x_train, y_train1, y_train2)

# Initial Hyperparameters
INIT_HP = {
    "DOUBLE": True,
    "CHANNELS_LAST": False,
    "POPULATION_SIZE": 2,
    "O_U_NOISE": 0.2,
    "EXPL_NOISE": 0.1,
    "BATCH_SIZE": 64,
    "LR": 0.001,
    "LR_ACTOR": 0.002,
    "LR_CRITIC": 0.002,
    "TAU": 0.5,
    "GAMMA": 1.0,
    "LAMBDA": 1.0,
    "REG": 0.0625,
    "LEARN_STEP": 2,
    "MEAN_NOISE": 1,
    "THETA": 1,
    "DT": 1,
    "POLICY_FREQ": 2,
    "AGENT_IDS": ["IT","EVC"],
    "MEMORY_SIZE": 100000
        
}

hp_config = HyperparameterConfig(
    #lr=RLParameter(min=6.25e-5, max=1e-2),
    batch_size=RLParameter(min=8, max=512, dtype=int),
    learn_step=RLParameter(min=1, max=10, dtype=int, grow_factor=1.5, shrink_factor=0.75)
)

NET_CONFIG = {"head_config": {"hidden_size": [128]}}

# Create populations for each agent
pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device='cpu'
)


# Tournament selection
tournament = TournamentSelection(
    tournament_size=2,
    elitism=True,
    population_size=INIT_HP["POPULATION_SIZE"],
    eval_loop=1,
)

# Mutation settings
mutations = Mutations(
    no_mutation=0.4,
    architecture=0.2,
    new_layer_prob=0.2,
    parameters=0.2,
    activation=0,
    rl_hp=0.2,
    mutation_sd=0.1,
    rand_seed=1,
    device="cuda",
)
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer

field_names = ["state", "action", "reward", "next_state", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=device,
)


"""
# Offline training
trained_pop, pop_fitnesses = train_multi_agent(#train_on_policy(#train_offline(
    pop=[pop_agent1, pop_agent2],
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    #dataset=[x_train, y_train1, y_train2],
    memory=memory,  # Replay buffer if needed
    swap_channels=False,  # Ensure channel order is correct
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    #mutation=mutations,
    wb=False,  # Weights & Biases logging
    #accelerator=device
)
"""
"""
trained_pop, fitnesses = train_multi_agent(
    pop=pop_agent,
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    memory=memory,
    swap_channels=False,
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    wb=False,
)
"""

cuda


'\ntrained_pop, fitnesses = train_multi_agent(\n    pop=pop_agent,\n    env=env,\n    algo="MATD3",\n    env_name="cichyenv",\n    memory=memory,\n    swap_channels=False,\n    max_steps=500000,\n    evo_steps=10000,\n    eval_steps=None,\n    eval_loop=1,\n    target=200.0,\n    tournament=tournament,\n    wb=False,\n)\n'

In [42]:
from tqdm import trange

max_steps = 20000 
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC
training_steps = 6
env = CichyEnv(x_train, y_train1, y_train2)#.parallel_env()
action_dict = {
    "IT": np.random.uniform(-1, 1, 93),  # Random action for agent IT (size 93)
    "EVC": np.random.uniform(-1, 1, 93)  # Random action for agent EVC (size 93)
}
#env.step(action_dict)
NET_CONFIG = {"head_config": {"hidden_size": [128]}}
obs = env.reset()  # This ensures self.actions is initialized
obs, rewards, dones, infos = env.step(action_dict)
print(type(obs), obs)


pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device='cpu'
)

#print(type(obs), obs)
num_envs = 1
agent_ids = ["IT", "EVC"]
agent = pop_agent[0]
#print(agent)
pbar = trange(max_steps, unit="step")
while np.less(agent.steps[-1], max_steps):
    obs, info = env.reset()  
    #print(type(obs), obs)  # Debugging line
    
    # Convert observations to a format AgileRL expects
    processed_obs = {
        agent_id: spaces.flatten(env.observation_space, obs[agent_id]) 
        for agent_id in agent_ids
    }
    
    scores = np.zeros((num_envs, len(agent_ids)))
    completed_episode_scores = []
    steps = 0

    for idx_step in range(training_steps // num_envs):
        # Ensure obs is flattened before passing to agent
        cont_actions, discrete_action = agent.get_action(obs=processed_obs, training=True, infos=info)
        print("actions calculated")

<class 'dict'> {'IT': {'image': tensor([[ 44., 114., 140.,  ...,  42., 141., 210.],
        [ 81.,  94., 106.,  ...,  87., 151., 251.],
        [ 17.,  58.,  54.,  ..., 124., 109., 217.],
        ...,
        [157., 102., 106.,  ..., 172., 176.,  55.],
        [125., 207., 172.,  ..., 103., 193.,  31.],
        [152., 189., 109.,  ...,  79.,   9., 148.]]), 'other_action': tensor([0.5137])}, 'EVC': {'image': tensor([[ 44., 114., 140.,  ...,  42., 141., 210.],
        [ 81.,  94., 106.,  ...,  87., 151., 251.],
        [ 17.,  58.,  54.,  ..., 124., 109., 217.],
        ...,
        [157., 102., 106.,  ..., 172., 176.,  55.],
        [125., 207., 172.,  ..., 103., 193.,  31.],
        [152., 189., 109.,  ...,  79.,   9., 148.]]), 'other_action': tensor([0.8837])}}


  0%|          | 0/20000 [02:00<?, ?step/s]


actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calculated
actions calc

KeyboardInterrupt: 

In [None]:
import random
from copy import copy
prisoner_y = 0
possible_agents = ["prisoner", "guard"]

agents = copy(possible_agents)
guard_x = 6
guard_y = 6
prisoner_x = 0
escape_x = random.randint(2, 5)
escape_y = random.randint(2, 5)
pobs = {
    a: (
            prisoner_x + 7 * prisoner_y,
            guard_x + 7 * guard_y,
            escape_x + 7 * escape_y,
            )
            for a in agents
}
print(p)

{'prisoner': (0, 48, 24), 'guard': (0, 48, 24)}


In [None]:
trained_pop, fitnesses = train_multi_agent(
    pop=pop_agent,
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    memory=memory,
    swap_channels=False,
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    wb=False,
)