In [None]:
import wandb

cuda


In [67]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from pettingzoo import ParallelEnv
from sklearn.metrics.pairwise import cosine_similarity
from agilerl.utils.utils import create_population as Population
from agilerl.algorithms.core.registry import HyperparameterConfig, RLParameter
from agilerl.training.train_offline import train_offline
from agilerl.training.train_on_policy import train_on_policy
from agilerl.training.train_multi_agent import train_multi_agent
from agilerl.hpo.tournament import TournamentSelection
from agilerl.hpo.mutation import Mutations
#set device to the dual t4s. 
from types import SimpleNamespace

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
# -------------------------------
# Environment Definition
# -------------------------------
class CichyEnv(ParallelEnv):
    metadata = {"name": "cichyenv"}

    def __init__(self, images, y1, y2):
        self.images = images.reshape(92, 175, 175)  # Ensure correct shape
        self.y1 = y1  # Rewards for agent IT
        self.y2 = y2  # Rewards for agent EVC
        self.agents = ["IT", "EVC"]
        self.agent_ids = ["IT", "EVC"]
        self.current_step = 0  # Initialize current_step
        
        # Merging observation spaces into a single space
        self.observation_space = spaces.Dict({
            "image": spaces.Box(low=0, high=255, shape=(175, 175), dtype=np.uint8),  # Expecting image shape of (175, 175)
            "other_action": spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32),
        })

        self.action_space = spaces.Box(low=-1, high=1, shape=(93,), dtype=np.float32)

    def reset(self):
        self.current_step = 0  # Ensure it's set to 0 at the start of each episode
        self.actions = {agent: [] for agent in self.agent_ids}
        
        # Creating observation dictionary
        obs = {}
        for agent in self.agent_ids:
            image = self.images[self.current_step]  # Get the image for the current step
            if image.shape != (175, 175):
                raise ValueError(f"Unexpected image shape: {image.shape}, expected (175, 175)")

            # Expand the image dimensions to make it 5D for Conv3D (batch, channels, depth, height, width)
            image_expanded = np.expand_dims(image, axis=0)  # Adds a batch dimension, (1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds a channel dimension, (1, 1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds depth dimension, (1, 1, 1, 175, 175)

            obs[agent] = {
                "image": image_expanded,  # Now shape (1, 1, 1, 175, 175)
                "other_action": np.array([0.0]),  # Dummy value for the action of the other agent
            }
            """
            uncomment out soon 

            obs = {
                agent_id: spaces.flatten(self.observation_space, obs[agent_id]) 
                for agent_id in self.agent_ids
                }
            """
                    
        return obs, {}

    def step(self, action_dict):
        if self.current_step >= len(self.images):
            self.current_step = len(self.images) - 1

        obs, rewards, dones, infos = {}, {}, {}, {}

        for agent_id in self.agent_ids:
            other_agent_id = "EVC" if agent_id == "IT" else "IT"

            full_action = action_dict.get(agent_id, np.zeros((93,)))
            agent_actions = full_action[:92].reshape(-1, 1)#(full_action[:92]).reshape((92, 1))  
            other_action = full_action[92:]  

            self.actions[agent_id].append(agent_actions)  

            other_actions = action_dict.get(other_agent_id, np.zeros((93,)))[92:]  

            obs[agent_id] = {
                "image": torch.tensor(self.images[self.current_step], dtype=torch.float32),  # Convert to tensor
                "other_action": torch.tensor(other_actions, dtype=torch.float32),  # Convert to tensor
            }

            #rewards[agent_id] = self._calculate_reward(agent_id)  

            rewards[agent_id] = np.array([self._calculate_reward(agent_id)], dtype=np.float32)
            dones[agent_id] = self.current_step >= len(self.images) - 1
            infos[agent_id] = {}

        dones["__all__"] = all(dones.values())
        self.current_step += 1  
        return obs, rewards, dones, {}, infos
    def _calculate_reward(self, agent_id):
        """Computes the reward based on similarity to the expert RDM using all 92 actions per step."""
        actions = np.array(self.actions[agent_id])  # Shape: (num_steps, 92, 1)
        
        num_steps = actions.shape[0]  
        num_images = actions.shape[1]  

        if num_steps < 2:  
            return 0  

        simulated_rdm = np.zeros((num_images, num_images))

        for i in range(num_images):
            for j in range(num_images):
                if i != j:
                    sim = cosine_similarity(actions[:, i].reshape(-1, 1), actions[:, j].reshape(-1, 1))[0][0]
                    simulated_rdm[i, j] = 1 - sim

        expert_rdm = self.y1 if agent_id == "IT" else self.y2

        min_size = min(simulated_rdm.shape[0], expert_rdm.shape[0])
        reward = -np.sum((simulated_rdm[:min_size, :min_size] - expert_rdm[:min_size, :min_size]) ** 2)
        return reward


# -------------------------------
# Training Setup with AgileRL
# -------------------------------8-=
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC

env = CichyEnv(x_train, y_train1, y_train2)

# Initial Hyperparameters
INIT_HP = {
    "DOUBLE": True,
    "CHANNELS_LAST": False,
    "POPULATION_SIZE": 2,
    "O_U_NOISE": 0.2,
    "EXPL_NOISE": 0.1,
    "BATCH_SIZE": 64,
    "LR": 0.001,
    "LR_ACTOR": 0.002,
    "LR_CRITIC": 0.002,
    "TAU": 0.5,
    "GAMMA": 1.0,
    "LAMBDA": 1.0,
    "REG": 0.0625,
    "LEARN_STEP": 2,
    "MEAN_NOISE": 1,
    "THETA": 1,
    "DT": 1,
    "POLICY_FREQ": 2,
    "AGENT_IDS": ["IT","EVC"],
    "MEMORY_SIZE": 100000
        
}

hp_config = HyperparameterConfig(
    #lr=RLParameter(min=6.25e-5, max=1e-2),
    batch_size=RLParameter(min=8, max=512, dtype=int),
    learn_step=RLParameter(min=1, max=10, dtype=int, grow_factor=1.5, shrink_factor=0.75)
)

NET_CONFIG = {"head_config": {"hidden_size": [128]}}

# Create populations for each agent
pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device='cpu'
)


# Tournament selection
tournament = TournamentSelection(
    tournament_size=2,
    elitism=True,
    population_size=INIT_HP["POPULATION_SIZE"],
    eval_loop=1,
)

# Mutation settings
mutations = Mutations(
    no_mutation=0.4,
    architecture=0.2,
    new_layer_prob=0.2,
    parameters=0.2,
    activation=0,
    rl_hp=0.2,
    mutation_sd=0.1,
    rand_seed=1,
    device="cuda",
)
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer

field_names = ["state", "action", "reward", "next_state", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=device,
)


"""
# Offline training
trained_pop, pop_fitnesses = train_multi_agent(#train_on_policy(#train_offline(
    pop=[pop_agent1, pop_agent2],
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    #dataset=[x_train, y_train1, y_train2],
    memory=memory,  # Replay buffer if needed
    swap_channels=False,  # Ensure channel order is correct
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    #mutation=mutations,
    wb=False,  # Weights & Biases logging
    #accelerator=device
)
"""
"""
trained_pop, fitnesses = train_multi_agent(
    pop=pop_agent,
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    memory=memory,
    swap_channels=False,
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    wb=False,
)
"""

cuda


'\ntrained_pop, fitnesses = train_multi_agent(\n    pop=pop_agent,\n    env=env,\n    algo="MATD3",\n    env_name="cichyenv",\n    memory=memory,\n    swap_channels=False,\n    max_steps=500000,\n    evo_steps=10000,\n    eval_steps=None,\n    eval_loop=1,\n    target=200.0,\n    tournament=tournament,\n    wb=False,\n)\n'

The research team suggests that the number of representative agents that are needed are 5 agents for EVC and 3 agents for IT. Therefore we will use 9 agents total since 1 needs to represent the occipital lobe 

Other TODOs

TODO: figure out visualization and figure out how to increase number of enviroments

other TODO: switch over from RDMs to MEG 

In order to do this switch over the research team needs to figure out which channels correspond to which parts of the brain so that irrelavent channels can be removed. 

how to do groupings? the answer is to use hierarchical learning 

other TODO: swap out classical neural network for UODE

In [None]:
from tqdm import trange

max_steps = 20000 
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC
training_steps = 6
env = CichyEnv(x_train, y_train1, y_train2)#.parallel_env()
action_dict = {
    "IT": np.random.uniform(-1, 1, 93),  # Random action for agent IT (size 93)
    "EVC": np.random.uniform(-1, 1, 93)  # Random action for agent EVC (size 93)
}
#env.step(action_dict)
NET_CONFIG = {"head_config": {"hidden_size": [128]}}
obs = env.reset()  # This ensures self.actions is initialized
obs, rewards, dones,bb, infos = env.step(action_dict)
print(type(obs), obs)


pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device=device
)

#print(type(obs), obs)
num_envs = 4
learning_delay = 0  # Steps before starting learning
evo_steps = 10000  # Evolution frequency
eval_steps = None  # Evaluation steps per episode - go until done
eval_loop = 1  

total_steps = 0

agent_ids = ["IT", "EVC"]
#agent = pop_agent[0]
#print(agent)
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop_agent], max_steps).all():
    pop_episode_scores = []
    for agent in pop_agent:  # Loop through population
        state, info = env.reset()  # Reset environment at start of episode
        scores = np.zeros(num_envs)
        completed_episode_scores = []
        steps = 0
        processed_obs = {
            agent_id: spaces.flatten(env.observation_space, obs[agent_id]) 
            for agent_id in agent_ids
            }

    for idx_step in range(training_steps // num_envs):
        # Ensure obs is flattened before passing to agent
        cont_actions, discrete_action = agent.get_action(obs=processed_obs, training=True, infos=info)
        if agent.discrete_actions:
            action = discrete_action
        else:
            action = cont_actions

        # Act in environment
        next_state, reward, termination, truncation, info = env.step(action)

        scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
        total_steps += num_envs
        steps += num_envs
        """
        # Save experiences to replay buffer

        #erroring out must fix

        memory.save_to_memory(
            state,
            cont_actions,
            reward,
            next_state,
            termination,
            is_vectorised=True,
        )
        """

        # Learn according to learning frequency
        # Handle learn steps > num_envs
        if agent.learn_step > num_envs:
            learn_step = agent.learn_step // num_envs
            if (
                idx_step % learn_step == 0
                and len(memory) >= agent.batch_size
                and memory.counter > learning_delay
            ):
                # Sample replay buffer
                experiences = memory.sample(agent.batch_size)
                # Learn according to agent's RL algorithm
                agent.learn(experiences)
        # Handle num_envs > learn step; learn multiple times per step in env
        elif (
            len(memory) >= agent.batch_size and memory.counter > learning_delay
        ):
            for _ in range(num_envs // agent.learn_step):
                # Sample replay buffer
                experiences = memory.sample(agent.batch_size)
                # Learn according to agent's RL algorithm
                agent.learn(experiences)

        state = next_state

        # Calculate scores and reset noise for finished episodes
        reset_noise_indices = []
        term_array = np.array(list(termination.values())).transpose()
        trunc_array = np.array(list(truncation.values())).transpose()
        for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
            if np.any(d) or np.any(t):
                completed_episode_scores.append(scores[idx])
                agent.scores.append(scores[idx])
                scores[idx] = 0
                reset_noise_indices.append(idx)
        agent.reset_action_noise(reset_noise_indices)

    pbar.update(evo_steps // len(pop_agent))

    agent.steps[-1] += steps
    pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
    agent.test(
        env,
        swap_channels=INIT_HP["CHANNELS_LAST"],
        max_steps=eval_steps,
        loop=eval_loop,
    )
    for agent in pop_agent
]
mean_scores = [
    (
        np.mean(episode_scores)
        if len(episode_scores) > 0
        else "0 completed episodes"
    )
    for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop_agent]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
    f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop_agent]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop_agent)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
    agent.steps.append(agent.steps[-1])

pbar.close()
env.close()

<class 'dict'> {'IT': {'image': tensor([[ 37., 244., 193.,  ..., 205.,  63., 145.],
        [ 60.,  80.,  61.,  ...,  56.,  26.,   5.],
        [109.,  26., 153.,  ...,  11.,  52., 114.],
        ...,
        [252.,  93.,  67.,  ..., 226.,  78.,  35.],
        [246.,  24.,  80.,  ..., 183., 222., 165.],
        [ 34., 136., 152.,  ..., 175., 230., 136.]]), 'other_action': tensor([0.7162])}, 'EVC': {'image': tensor([[ 37., 244., 193.,  ..., 205.,  63., 145.],
        [ 60.,  80.,  61.,  ...,  56.,  26.,   5.],
        [109.,  26., 153.,  ...,  11.,  52., 114.],
        ...,
        [252.,  93.,  67.,  ..., 226.,  78.,  35.],
        [246.,  24.,  80.,  ..., 183., 222., 165.],
        [ 34., 136., 152.,  ..., 175., 230., 136.]]), 'other_action': tensor([0.6542])}}


25000000step [01:01, 405757.86step/s] ep/s]
24985000step [00:14, 1484579.37step/s]     

AssertionError: Expected torch.Tensor, got <class 'dict'>

25000000step [00:26, 1484579.37step/s]

In [61]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from pettingzoo import ParallelEnv
from sklearn.metrics.pairwise import cosine_similarity
from agilerl.utils.utils import create_population as Population
from agilerl.algorithms.core.registry import HyperparameterConfig, RLParameter
from agilerl.training.train_offline import train_offline
from agilerl.training.train_on_policy import train_on_policy
from agilerl.training.train_multi_agent import train_multi_agent
from agilerl.hpo.tournament import TournamentSelection
from agilerl.hpo.mutation import Mutations
#set device to the dual t4s. 
from types import SimpleNamespace

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
# -------------------------------
# Environment Definition
# -------------------------------
class CichyEnv(ParallelEnv):
    metadata = {"name": "cichyenv"}

    def __init__(self, images, y1, y2):
        self.images = images.reshape(92, 175, 175)  # Ensure correct shape
        self.y1 = y1  # Rewards for agent IT
        self.y2 = y2  # Rewards for agent EVC
        self.agents = ["IT1", "IT2", "IT3", "EVC1", "EVC2", "EVC3", "EVC4", "EVC5"]
        self.agent_ids = ["IT1", "IT2", "IT3", "EVC1", "EVC2", "EVC3", "EVC4", "EVC5"]#["IT", "EVC"]
        self.current_step = 0  # Initialize current_step
        
        # Merging observation spaces into a single space
        self.observation_space = spaces.Dict({
            "image": spaces.Box(low=0, high=255, shape=(175, 175), dtype=np.uint8),  # Expecting image shape of (175, 175)
            "other_action": spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32),
        })

        self.action_space = spaces.Box(low=-1, high=1, shape=(93,), dtype=np.float32)

    def reset(self):
        self.current_step = 0  # Ensure it's set to 0 at the start of each episode
        self.actions = {agent: [] for agent in self.agent_ids}
        
        # Creating observation dictionary
        obs = {}
        for agent in self.agent_ids:
            image = self.images[self.current_step]  # Get the image for the current step
            if image.shape != (175, 175):
                raise ValueError(f"Unexpected image shape: {image.shape}, expected (175, 175)")

            # Expand the image dimensions to make it 5D for Conv3D (batch, channels, depth, height, width)
            image_expanded = np.expand_dims(image, axis=0)  # Adds a batch dimension, (1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds a channel dimension, (1, 1, 175, 175)
            image_expanded = np.expand_dims(image_expanded, axis=0)  # Adds depth dimension, (1, 1, 1, 175, 175)

            obs[agent] = {
                "image": image_expanded,  # Now shape (1, 1, 1, 175, 175)
                "other_action": np.array([0.0]),  # Dummy value for the action of the other agent
            }
            """
            uncomment out soon 

            obs = {
                agent_id: spaces.flatten(self.observation_space, obs[agent_id]) 
                for agent_id in self.agent_ids
                }
            """
                    
        return obs, {}

    def step(self, action_dict):
        if self.current_step >= len(self.images):
            self.current_step = len(self.images) - 1

        obs, rewards, dones, infos = {}, {}, {}, {}

        for agent_id in self.agent_ids:
            other_agent_id = "EVC" if agent_id == "IT" else "IT"

            full_action = action_dict.get(agent_id, np.zeros((93,)))
            agent_actions = full_action[:92].reshape(-1, 1)#(full_action[:92]).reshape((92, 1))  
            other_action = full_action[92:]  

            self.actions[agent_id].append(agent_actions)  

            other_actions = action_dict.get(other_agent_id, np.zeros((93,)))[92:]  

            obs[agent_id] = {
                "image": torch.tensor(self.images[self.current_step], dtype=torch.float32),  # Convert to tensor
                "other_action": torch.tensor(other_actions, dtype=torch.float32),  # Convert to tensor
            }

            #rewards[agent_id] = self._calculate_reward(agent_id)  

            rewards[agent_id] = np.array([self._calculate_reward(agent_id)], dtype=np.float32)
            dones[agent_id] = self.current_step >= len(self.images) - 1
            infos[agent_id] = {}

        dones["__all__"] = all(dones.values())
        self.current_step += 1  
        return obs, rewards, dones, {}, infos
    def _calculate_reward(self, agent_id):
        """Computes the reward based on similarity to the expert RDM using all 92 actions per step."""
        actions = np.array(self.actions[agent_id])  # Shape: (num_steps, 92, 1)
        
        num_steps = actions.shape[0]  
        num_images = actions.shape[1]  

        if num_steps < 2:  
            return 0  

        simulated_rdm = np.zeros((num_images, num_images))

        for i in range(num_images):
            for j in range(num_images):
                if i != j:
                    sim = cosine_similarity(actions[:, i].reshape(-1, 1), actions[:, j].reshape(-1, 1))[0][0]
                    simulated_rdm[i, j] = 1 - sim

        expert_rdm = self.y1 if agent_id == "IT" else self.y2

        min_size = min(simulated_rdm.shape[0], expert_rdm.shape[0])
        reward = -np.sum((simulated_rdm[:min_size, :min_size] - expert_rdm[:min_size, :min_size]) ** 2)
        return reward


# -------------------------------
# Training Setup with AgileRL
# -------------------------------8-=
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC

env = CichyEnv(x_train, y_train1, y_train2)

# Initial Hyperparameters
INIT_HP = {
    "DOUBLE": True,
    "CHANNELS_LAST": False,
    "POPULATION_SIZE": 8,
    "O_U_NOISE": 0.2,
    "EXPL_NOISE": 0.1,
    "BATCH_SIZE": 64,
    "LR": 0.001,
    "LR_ACTOR": 0.002,
    "LR_CRITIC": 0.002,
    "TAU": 0.5,
    "GAMMA": 1.0,
    "LAMBDA": 1.0,
    "REG": 0.0625,
    "LEARN_STEP": 2,
    "MEAN_NOISE": 1,
    "THETA": 1,
    "DT": 1,
    "POLICY_FREQ": 2,
    "AGENT_IDS": ["IT1", "IT2", "IT3", "EVC1", "EVC2", "EVC3", "EVC4", "EVC5"],
    "MEMORY_SIZE": 100000
        
}

hp_config = HyperparameterConfig(
    #lr=RLParameter(min=6.25e-5, max=1e-2),
    batch_size=RLParameter(min=8, max=512, dtype=int),
    learn_step=RLParameter(min=1, max=10, dtype=int, grow_factor=1.5, shrink_factor=0.75)
)

NET_CONFIG = {"head_config": {"hidden_size": [128]}}

# Create populations for each agent
pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device='cpu'
)


# Tournament selection
tournament = TournamentSelection(
    tournament_size=2,
    elitism=True,
    population_size=INIT_HP["POPULATION_SIZE"],
    eval_loop=1,
)

# Mutation settings
mutations = Mutations(
    no_mutation=0.4,
    architecture=0.2,
    new_layer_prob=0.2,
    parameters=0.2,
    activation=0,
    rl_hp=0.2,
    mutation_sd=0.1,
    rand_seed=1,
    device="cuda",
)
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer

field_names = ["state", "action", "reward", "next_state", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=device,
)

cuda


In [None]:
from tqdm import trange

max_steps = 20000 
x_train = np.random.randint(0, 255, (175,175, 92), dtype=np.uint8)  # Dummy dataset
y_train1 = np.random.randn(92, 92)  # Dummy RDM for agent IT
y_train2 = np.random.randn(92, 92)  # Dummy RDM for agent EVC
training_steps = 6
env = CichyEnv(x_train, y_train1, y_train2)#.parallel_env()
action_dict = {
    "IT": np.random.uniform(-1, 1, 93),  # Random action for agent IT (size 93)
    "EVC": np.random.uniform(-1, 1, 93)  # Random action for agent EVC (size 93)
}
#env.step(action_dict)
NET_CONFIG = {"head_config": {"hidden_size": [128]}}
obs = env.reset()  # This ensures self.actions is initialized
obs, rewards, dones,bb, infos = env.step(action_dict)
print(type(obs), obs)


pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space),spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device='cpu'
)

#print(type(obs), obs)
num_envs = 4
learning_delay = 0  # Steps before starting learning
evo_steps = 10000  # Evolution frequency
eval_steps = None  # Evaluation steps per episode - go until done
eval_loop = 1  

total_steps = 0

agent_ids = ["IT1", "IT2", "IT3", "EVC1", "EVC2", "EVC3", "EVC4", "EVC5"]
#agent = pop_agent[0]
#print(agent)
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop_agent], max_steps).all():
    pop_episode_scores = []
    for agent in pop_agent:  # Loop through population
        state, info = env.reset()  # Reset environment at start of episode
        scores = np.zeros(num_envs)
        completed_episode_scores = []
        steps = 0
        processed_obs = {
            agent_id: spaces.flatten(env.observation_space, obs[agent_id]) 
            for agent_id in agent_ids
            }

    for idx_step in range(training_steps // num_envs):
        # Ensure obs is flattened before passing to agent
        cont_actions, discrete_action = agent.get_action(obs=processed_obs, training=True, infos=info)
        if agent.discrete_actions:
            action = discrete_action
        else:
            action = cont_actions

        # Act in environment
        next_state, reward, termination, truncation, info = env.step(action)

        scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
        total_steps += num_envs
        steps += num_envs
        """
        # Save experiences to replay buffer

        #erroring out must fix

        memory.save_to_memory(
            state,
            cont_actions,
            reward,
            next_state,
            termination,
            is_vectorised=True,
        )
        """

        # Learn according to learning frequency
        # Handle learn steps > num_envs
        if agent.learn_step > num_envs:
            learn_step = agent.learn_step // num_envs
            if (
                idx_step % learn_step == 0
                and len(memory) >= agent.batch_size
                and memory.counter > learning_delay
            ):
                # Sample replay buffer
                experiences = memory.sample(agent.batch_size)
                # Learn according to agent's RL algorithm
                agent.learn(experiences)
        # Handle num_envs > learn step; learn multiple times per step in env
        elif (
            len(memory) >= agent.batch_size and memory.counter > learning_delay
        ):
            for _ in range(num_envs // agent.learn_step):
                # Sample replay buffer
                experiences = memory.sample(agent.batch_size)
                # Learn according to agent's RL algorithm
                agent.learn(experiences)

        state = next_state

        # Calculate scores and reset noise for finished episodes
        reset_noise_indices = []
        term_array = np.array(list(termination.values())).transpose()
        trunc_array = np.array(list(truncation.values())).transpose()
        for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
            if np.any(d) or np.any(t):
                completed_episode_scores.append(scores[idx])
                agent.scores.append(scores[idx])
                scores[idx] = 0
                reset_noise_indices.append(idx)
        agent.reset_action_noise(reset_noise_indices)

    pbar.update(evo_steps // len(pop_agent))

    agent.steps[-1] += steps
    pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
    agent.test(
        env,
        swap_channels=INIT_HP["CHANNELS_LAST"],
        max_steps=eval_steps,
        loop=eval_loop,
    )
    for agent in pop_agent
]
mean_scores = [
    (
        np.mean(episode_scores)
        if len(episode_scores) > 0
        else "0 completed episodes"
    )
    for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop_agent]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
    f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop_agent]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop_agent)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
    agent.steps.append(agent.steps[-1])

pbar.close()
env.close()

<class 'dict'> {'IT1': {'image': tensor([[105., 105.,  42.,  ..., 243., 246., 226.],
        [ 35.,  32., 211.,  ...,  69., 152.,  22.],
        [160., 106.,  60.,  ..., 233.,   2.,  88.],
        ...,
        [143.,   8., 218.,  ..., 159., 206.,  47.],
        [ 43.,  52.,  43.,  ...,  32., 136., 207.],
        [128.,  81., 162.,  ..., 209., 136., 111.]]), 'other_action': tensor([-0.5256])}, 'IT2': {'image': tensor([[105., 105.,  42.,  ..., 243., 246., 226.],
        [ 35.,  32., 211.,  ...,  69., 152.,  22.],
        [160., 106.,  60.,  ..., 233.,   2.,  88.],
        ...,
        [143.,   8., 218.,  ..., 159., 206.,  47.],
        [ 43.,  52.,  43.,  ...,  32., 136., 207.],
        [128.,  81., 162.,  ..., 209., 136., 111.]]), 'other_action': tensor([-0.5256])}, 'IT3': {'image': tensor([[105., 105.,  42.,  ..., 243., 246., 226.],
        [ 35.,  32., 211.,  ...,  69., 152.,  22.],
        [160., 106.,  60.,  ..., 233.,   2.,  88.],
        ...,
        [143.,   8., 218.,  ..., 159.,

 19%|█▉        | 3750/20000 [05:51<25:24, 10.66step/s]
6247500step [00:37, 166485.00step/s]                       

AssertionError: Expected torch.Tensor, got <class 'dict'>

6250000step [00:54, 166485.00step/s]

In [None]:
"""
import random
from copy import copy
prisoner_y = 0
possible_agents = ["prisoner", "guard"]

agents = copy(possible_agents)
guard_x = 6
guard_y = 6
prisoner_x = 0
escape_x = random.randint(2, 5)
escape_y = random.randint(2, 5)
pobs = {
    a: (
            prisoner_x + 7 * prisoner_y,
            guard_x + 7 * guard_y,
            escape_x + 7 * escape_y,
            )
            for a in agents
}
print(p)"
""""

{'prisoner': (0, 48, 24), 'guard': (0, 48, 24)}


In [None]:
trained_pop, fitnesses = train_multi_agent(
    pop=pop_agent,
    env=env,
    algo="MATD3",
    env_name="cichyenv",
    memory=memory,
    swap_channels=False,
    max_steps=500000,
    evo_steps=10000,
    eval_steps=None,
    eval_loop=1,
    target=200.0,
    tournament=tournament,
    wb=False,
)

In [None]:
env = CichyEnv(x_train, y_train1, y_train2)

# Initial Hyperparameters
INIT_HP = {
    "DOUBLE": True,
    "CHANNELS_LAST": False,
    "POPULATION_SIZE": 9,
    "O_U_NOISE": 0.2,
    "EXPL_NOISE": 0.1,
    "BATCH_SIZE": 64,
    "LR": 0.001,
    "LR_ACTOR": 0.002,
    "LR_CRITIC": 0.002,
    "TAU": 0.5,
    "GAMMA": 1.0,
    "LAMBDA": 1.0,
    "REG": 0.0625,
    "LEARN_STEP": 2,
    "MEAN_NOISE": 1,
    "THETA": 1,
    "DT": 1,
    "POLICY_FREQ": 2,
    "AGENT_IDS": ["IT","EVC"],
    "MEMORY_SIZE": 100000
        
}

hp_config = HyperparameterConfig(
    #lr=RLParameter(min=6.25e-5, max=1e-2),
    batch_size=RLParameter(min=8, max=512, dtype=int),
    learn_step=RLParameter(min=1, max=10, dtype=int, grow_factor=1.5, shrink_factor=0.75)
)

NET_CONFIG = {"head_config": {"hidden_size": [128]}}

# Create populations for each agent
pop_agent = Population(
    algo="MATD3",#NeuralTS",
    observation_space=[spaces.flatten_space(env.observation_space), spaces.flatten_space(env.observation_space)],
    action_space=[env.action_space,env.action_space],
    net_config=NET_CONFIG,
    INIT_HP=INIT_HP,
    hp_config=hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    device=device
)


# Tournament selection
tournament = TournamentSelection(
    tournament_size=2,
    elitism=True,
    population_size=INIT_HP["POPULATION_SIZE"],
    eval_loop=1,
)

# Mutation settings
mutations = Mutations(
    no_mutation=0.4,
    architecture=0.2,
    new_layer_prob=0.2,
    parameters=0.2,
    activation=0,
    rl_hp=0.2,
    mutation_sd=0.1,
    rand_seed=1,
    device="cuda",
)
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer

field_names = ["state", "action", "reward", "next_state", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=device,
)

code below trains skills

note: skills are not full envs. They are classes that connect to an env and only consist of a single reward function. 

In [None]:
from agilerl.wrappers.learning import Skill
import os
from agilerl.algorithms.ppo import PPO
from agilerl.training.train_on_policy import train_on_policy
from agilerl.wrappers.learning import Skill
from agilerl.utils.algo_utils import obs_channels_to_first
from agilerl.utils.utils import (
   create_population,
   make_skill_vect_envs,
   make_vect_envs,
   observation_space_channels_to_first
)
NET_CONFIG = {
   "encoder_config": {"hidden_size": [64, 64]}  # Actor encoder hidden size
}

INIT_HP = {
   "ENV_NAME": "LunarLander-v2",
   "ALGO": "PPO",
   "POPULATION_SIZE": 1,  # Population size
   "BATCH_SIZE": 128,  # Batch size
   "LR": 1e-3,  # Learning rate
   "LEARN_STEP": 128,  # Learning frequency
   "GAMMA": 0.99,  # Discount factor
   "GAE_LAMBDA": 0.95,  # Lambda for general advantage estimation
   "ACTION_STD_INIT": 0.6,  # Initial action standard deviation
   "CLIP_COEF": 0.2,  # Surrogate clipping coefficient
   "ENT_COEF": 0.01,  # Entropy coefficient
   "VF_COEF": 0.5,  # Value function coefficient
   "MAX_GRAD_NORM": 0.5,  # Maximum norm for gradient clipping
   "TARGET_KL": None,  # Target KL divergence threshold
   "TARGET_SCORE": 2000,
   "MAX_STEPS": 1_000_000,
   "EVO_STEPS": 10_000,
   "UPDATE_EPOCHS": 4,  # Number of policy update epochs
   # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
   "CHANNELS_LAST": False,
   "WANDB": True,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory to save trained agents and skills
save_dir = "./models/PPO"
os.makedirs(save_dir, exist_ok=True)

skills = {
   "stabilize": StabilizeSkill,
   "center": CenterSkill,
   "landing": LandingSkill,
}

In [None]:
for skill in skills.keys():
   env = make_skill_vect_envs(
         INIT_HP["ENV_NAME"], skills[skill], num_envs=1
   )  # Create environment

   observation_space = env.single_observation_space
   action_space = env.single_action_space
   if INIT_HP["CHANNELS_LAST"]:
         observation_space = observation_space_channels_to_first(observation_space)

   pop = create_population(
         algo="PPO",  # Algorithm
         observation_space=observation_space,  # Observation space
         action_space=action_space,  # Action space
         net_config=NET_CONFIG,  # Network configuration
         INIT_HP=INIT_HP,  # Initial hyperparameters
         population_size=INIT_HP["POPULATION_SIZE"],  # Population size
         device=device,
   )

   trained_pop, pop_fitnesses = train_on_policy(
         env=env,  # Gym-style environment
         env_name=f"{INIT_HP['ENV_NAME']}-{skill}",  # Environment name
         algo=INIT_HP["ALGO"],  # Algorithm
         pop=pop,  # Population of agents
         swap_channels=INIT_HP[
            "CHANNELS_LAST"
         ],  # Swap image channel from last to first
         max_steps=INIT_HP["MAX_STEPS"],  # Max number of training episodes
         evo_steps=INIT_HP["EVO_STEPS"],  # Evolution frequency
         evo_loop=3,  # Number of evaluation episodes per agent
         target=INIT_HP["TARGET_SCORE"],  # Target score for early stopping
         tournament=None,  # Tournament selection object
         mutation=None,  # Mutations object
         wb=INIT_HP["WANDB"],  # Weights and Biases tracking
   )

   # Save the trained algorithm
   filename = f"PPO_trained_agent_{skill}.pt"
   save_path = os.path.join(save_dir, filename)
   trained_pop[0].save_checkpoint(save_path)

   env.close()

Code below is for the meta selector agent

In [None]:
stabilize_agent = PPO.load(os.path.join(save_dir, "PPO_trained_agent_stabilize.pt"))
center_agent = PPO.load(os.path.join(save_dir, "PPO_trained_agent_center.pt"))
landing_agent = PPO.load(os.path.join(save_dir, "PPO_trained_agent_landing.pt"))

trained_skills = {
   0: {"skill": "stabilize", "agent": stabilize_agent, "skill_duration": 40},
   1: {"skill": "center", "agent": center_agent, "skill_duration": 40},
   2: {"skill": "landing", "agent": landing_agent, "skill_duration": 40},
}


In [None]:
env = make_vect_envs(INIT_HP["ENV_NAME"], num_envs=1)  # Create environment

observation_space = env.single_observation_space

action_dim = len(
   trained_skills
)  # Selector will be trained to choose which trained skill to use

action_space = spaces.Discrete(action_dim)

if INIT_HP["CHANNELS_LAST"]:
   observation_space = observation_space_channels_to_first(observation_space)

pop = create_population(
   algo="PPO",  # Algorithm
   observation_space=observation_space,  # Observation space
   action_space=action_space,  # Action space
   net_config=NET_CONFIG,  # Network configuration
   INIT_HP=INIT_HP,  # Initial hyperparameters
   population_size=INIT_HP["POPULATION_SIZE"],  # Population size
   device=device,
)

if INIT_HP["WANDB"]:
   wandb.init(
         # set the wandb project where this run will be logged
         project="EvoWrappers",
         name="{}-EvoHPO-{}-{}".format(
            INIT_HP["ENV_NAME"],
            INIT_HP["ALGO"],
            datetime.now().strftime("%m%d%Y%H%M%S"),
         ),
         # track hyperparameters and run metadata
         config={
            "algo": f"Evo HPO {INIT_HP['ALGO']}",
            "env": INIT_HP["ENV_NAME"],
            "INIT_HP": INIT_HP,
         },
   )

bar_format = "{l_bar}{bar:10}| {n:4}/{total_fmt} [{elapsed:>7}<{remaining:>7}, {rate_fmt}{postfix}]"
pbar = trange(
  INIT_HP["MAX_STEPS"],
  unit="step",
  bar_format=bar_format,
  ascii=True)

total_steps = 0

In [None]:
while np.less([agent.steps[-1] for agent in pop], INIT_HP["MAX_STEPS"]).all():
   for agent in pop:  # Loop through population
         state = env.reset()[0]  # Reset environment at start of episode
         score = 0

         states = []
         actions = []
         log_probs = []
         rewards = []
         terminations = []
         values = []

         for idx_step in range(500):
            # Get next action from agent
            action, log_prob, _, value = agent.get_action(state)

            # Internal loop to execute trained skill
            skill_agent = trained_skills[action[0]]["agent"]
            skill_duration = trained_skills[action[0]]["skill_duration"]
            reward = 0
            for skill_step in range(skill_duration):
               # If landed, do nothing
               if state[0][6] or state[0][7]:
                     next_state, skill_reward, termination, truncation, _ = env.step(
                        [0]
                     )
               else:
                     skill_action, _, _, _ = skill_agent.get_action(state)
                     next_state, skill_reward, termination, truncation, _ = env.step(
                        skill_action
                     )  # Act in environment
               reward += skill_reward
               if np.any(termination) or np.any(truncation):
                     break
               state = next_state
            score += reward

            states.append(state)
            actions.append(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            terminations.append(termination)
            values.append(value)

         agent.scores.append(score)

         # Learn according to agent's RL algorithm
         agent.learn(
            (
               states,
               actions,
               log_probs,
               rewards,
               terminations,
               values,
               next_state,
            )
         )

         agent.steps[-1] += idx_step + 1
         total_steps += idx_step + 1

   if (agent.steps[-1]) % INIT_HP["EVO_STEPS"] == 0:
      mean_scores = np.mean([agent.scores[-20:] for agent in pop], axis=1)
      if INIT_HP["WANDB"]:
          wandb.log(
              {
                  "global_step": total_steps,
                  "train/mean_score": np.mean(mean_scores),
              }
          )
      print(
          f"""
          --- Global Steps {total_steps} ---
          Score:\t\t{mean_scores}
          """,
          end="\r",
      )

if INIT_HP["WANDB"]:
   wandb.finish()
env.close()

# Save the trained selector
filename = "PPO_trained_agent_selector.pt"
save_path = os.path.join(save_dir, filename)
pop[0].save_checkpoint(save_path)