In [None]:
# Import libraries
import gym
import gym_d2d
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Hyperparameters shared by every DQN agent in this cell
GAMMA = 0.99            # discount applied to bootstrapped future value
EPSILON = 1.0           # initial probability of taking a random action
EPSILON_DECAY = 0.995   # multiplicative decay applied to epsilon
EPSILON_MIN = 0.01      # floor below which epsilon is never decayed
LEARNING_RATE = 1e-3    # Adam step size
BATCH_SIZE = 32         # transitions sampled per learning step
MEMORY_SIZE = 10_000    # capacity of each replay memory
UPDATE_TARGET = 1_000   # learning steps between target-network syncs

# Define the Q-network
class QNetwork(nn.Module):
    """Three-layer fully connected network mapping an observation to Q-values.

    Args:
        input_size: Number of input features.
        output_size: Number of outputs (one per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Two hidden layers of width 64 followed by a linear output layer.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Define the agent
class DQNAgent:
    """Epsilon-greedy DQN learner owning a replay memory, an online and a target network."""

    def __init__(self, input_size, output_size):
        """Create both networks, the Adam optimizer and an empty replay memory."""
        self.input_size, self.output_size = input_size, output_size
        # Bounded FIFO buffer: the oldest transitions are dropped once full.
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.model = QNetwork(input_size, output_size)
        self.target = QNetwork(input_size, output_size)
        # The target network starts as an exact copy of the online network.
        self.target.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.epsilon = EPSILON
        self.step = 0

    def act(self, state):
        """Return an action index: random with probability epsilon, else greedy.

        ``state`` is passed to ``torch.from_numpy`` on the greedy path.
        """
        if random.random() >= self.epsilon:
            # Greedy branch: evaluate the online network on a batch of one.
            state_batch = torch.from_numpy(state).float().unsqueeze(0)
            with torch.no_grad():
                q_values = self.model(state_batch)
            return q_values.max(1)[1].item()
        # Exploration branch.
        return random.randrange(self.output_size)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def learn(self):
        """Run one optimisation step on a random minibatch.

        Returns:
            The scalar loss as a float, or ``None`` while the memory holds
            fewer than ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return None

        minibatch = random.sample(self.memory, BATCH_SIZE)
        state_b, action_b, reward_b, next_state_b, done_b = zip(*minibatch)

        def as_float(values):
            return torch.from_numpy(np.array(values)).float()

        states = as_float(state_b)
        actions = torch.from_numpy(np.array(action_b)).long()
        rewards = as_float(reward_b)
        next_states = as_float(next_state_b)
        # Done flags are coerced to bool first, then to 0.0 / 1.0.
        terminal = as_float(np.array(done_b).astype(bool))

        # Q-value of the action that was actually taken.
        chosen_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Bootstrapped value from the target network, masked where terminal == 1.
        with torch.no_grad():
            best_next_q = self.target(next_states).max(1)[0]
        td_target = rewards + GAMMA * best_next_q * (1 - terminal)

        loss = nn.functional.mse_loss(chosen_q, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Epsilon decays once per learning step, never below EPSILON_MIN.
        self.epsilon = max(EPSILON_MIN, EPSILON_DECAY * self.epsilon)

        # Sync the target network every UPDATE_TARGET learning steps.
        self.step += 1
        if self.step % UPDATE_TARGET == 0:
            self.target.load_state_dict(self.model.state_dict())

        return loss.item()

# Create the environment
env = gym.make('D2DEnv-v0')

# Observation and action sizes shared by every agent
obs_size = env.observation_space.shape[0]
act_size = env.action_space['due'].n
# NOTE(review): the agent count is taken from the size of the 'due' action
# space (the same value as act_size) — confirm this is the intended count.
num_agents = env.action_space['due'].n

# One independent DQN learner per agent index
agents = [DQNAgent(obs_size, act_size) for _ in range(num_agents)]

# Train the agents
num_episodes = 1000  # number of episodes
for i in range(num_episodes):
    # Reset the environment and get the initial observations (keyed by agent id)
    obses = env.reset()
    total_rewards = [0 for _ in range(num_agents)]
    done = False
    losses = []                        # every loss produced during this episode
    last_losses = [None] * num_agents  # most recent loss per agent, for the summary

    while not done:
        # The last two characters of an agent id select its learner.
        actions = {
            agent_id: agents[int(agent_id[-2:])].act(obs)
            for agent_id, obs in obses.items()
        }
        next_obses, rewards, done, info = env.step(actions)

        # Store the transitions and accumulate rewards per agent
        for agent_id, obs in obses.items():
            agent_index = int(agent_id[-2:])
            # Agents without a reward entry this step are credited 0.
            reward = rewards.get(agent_id, 0)
            agents[agent_index].remember(
                obs, actions[agent_id], reward, next_obses[agent_id], done
            )
            total_rewards[agent_index] += reward

        # Exactly one learning step per agent per environment step. Calling
        # learn() twice would double both the gradient updates and the
        # epsilon decay while discarding the first loss.
        last_losses = [agent.learn() for agent in agents]
        losses.extend(last_losses)

        obses = next_obses

    # Print the episode summary, looking rewards and losses up by agent index
    # so each id is reported with its own values.
    print(f"Episode {i+1}:")
    for agent_id in obses:
        agent_index = int(agent_id[-2:])
        print(f"{agent_id}: {total_rewards[agent_index]:.2f}: {last_losses[agent_index]}")
    # learn() returns None until a replay memory holds a full batch.
    valid_losses = [loss for loss in losses if loss is not None]
    if valid_losses:
        avg_loss = np.mean(valid_losses)
        print(f"Average Loss: {avg_loss:.6f}")
    else:
        print("No valid losses recorded.")



Streaming output truncated to the last 5000 lines.
due36: 0.00: None
due38: 2.70: 0.11459862440824509
due40: 0.00: None
due42: 2.70: 0.2654601037502289
due44: 0.00: None
due46: 2.70: 0.35519129037857056
due48: 0.00: None
Average Loss: 0.326467
Episode 905:
cue00: 2.65: 1.4899603128433228
cue01: 0.00: 0.001793286413885653
cue02: 2.65: 0.933486819267273
cue03: 0.00: 0.0008665244677104056
cue04: 2.65: 0.5132357478141785
cue05: 0.00: 0.0016682668356224895
cue06: 2.65: 0.2867247760295868
cue07: 0.00: 0.0014608294004574418
cue08: 2.65: 0.8765869140625
cue09: 0.00: 0.0007011070847511292
cue10: 2.65: 0.6320237517356873
cue11: 0.00: 0.0011011518072336912
cue12: 2.65: 0.7609531879425049
cue13: 0.00: 0.05127914622426033
cue14: 2.65: 0.46408429741859436
cue15: 0.00: 0.005318840965628624
cue16: 2.65: 0.6258088946342468
cue17: 0.00: 0.0014596699038520455
cue18: 2.65: 0.5288353562355042
cue19: 0.00: 0.002831588266417384
cue20: 2.65: 1.0126125812530518
cue21: 0.00: 0.0011184802278876305


In [None]:
# `losses` holds None for steps where no learning happened; drop those
# entries so the mean is taken over numeric values only.
np.mean([loss for loss in losses if loss is not None])

  and should_run_async(code)


TypeError: ignored

In [None]:
import gym
import gym_d2d
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

class AttentionTransformer(nn.Module):
    """Linear embedding -> ``nn.Transformer`` -> self-attention -> linear head.

    Args:
        input_size: Number of features in one observation.
        output_size: Number of outputs (one per action).
        num_heads: Attention heads used by the transformer and the extra
            attention layer.
        hidden_size: Embedding width (the transformer's ``d_model``).
        num_layers: Number of transformer encoder layers.
    """

    def __init__(self, input_size, output_size, num_heads, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        # Same arguments as the positional form (d_model, nhead,
        # num_encoder_layers), spelled out; decoder depth keeps its default.
        self.transformer = nn.Transformer(
            d_model=hidden_size, nhead=num_heads, num_encoder_layers=num_layers
        )
        self.attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one output vector per input position.

        Args:
            x: ``(batch, input_size)`` or ``(batch, seq, input_size)`` tensor.

        Returns:
            ``(batch, output_size)`` for 2-D input, otherwise
            ``(batch, seq, output_size)``.
        """
        # 2-D input is treated as a length-1 sequence per sample, so samples
        # in a batch never attend to each other.
        single_step = x.dim() == 2
        x = self.embedding(x)
        if single_step:
            x = x.unsqueeze(1)
        # Both modules use the default sequence-first layout: (seq, batch, hidden).
        x = x.permute(1, 0, 2)
        # nn.Transformer needs both a source and a target sequence and returns
        # a single tensor; the embedded input serves as both.
        x = self.transformer(x, x)
        x, _ = self.attention(x, x, x)
        x = x.permute(1, 0, 2)
        x = self.fc(x)
        return x.squeeze(1) if single_step else x

# Set hyperparameters
GAMMA = 0.99 # discount factor
LEARNING_RATE = 0.001 # learning rate
BATCH_SIZE = 32 # batch size
MEMORY_SIZE = 10000 # replay memory size
UPDATE_TARGET = 1000 # update target network frequency
NUM_EPISODES = 1000 # number of episodes

# Create the environment
env = gym.make('D2DEnv-v0')
# NOTE(review): the agent count is taken from the size of the 'due' action
# space (the same value as act_size) — confirm this is the intended count.
num_agents = env.action_space['due'].n
# Get the observation and action sizes
obs_size = env.observation_space.shape[0]
act_size = env.action_space['due'].n
# Create the agents
agents = [AttentionTransformer(obs_size, act_size, 8, 64, 2) for _ in range(num_agents)]
# Only agents[0] is optimised; all transitions go into one shared memory.
optimizer = optim.Adam(agents[0].parameters(), lr=LEARNING_RATE)
memory = deque(maxlen=MEMORY_SIZE)
# Counts optimisation steps across all episodes
step = 0

# Train the agents
for i in range(NUM_EPISODES):
    # Reset the environment and get the initial observations (keyed by agent id)
    obses = env.reset()
    total_rewards = [0 for _ in range(num_agents)]
    done = False
    while not done:
        # Choose a greedy action for each agent
        actions = {}
        for agent_id, obs in obses.items():
            # The last two characters of the id select the network.
            agent_index = int(agent_id[-2:])
            obs_tensor = torch.from_numpy(obs).float().unsqueeze(0)
            with torch.no_grad():
                action = agents[agent_index](obs_tensor).max(1)[1].item()
            actions[agent_id] = action

        # Take actions and get the next observations, rewards, done flag and info
        next_obses, rewards, done, info = env.step(actions)

        # Store the transitions in the replay memory
        for agent_id, obs in obses.items():
            agent_index = int(agent_id[-2:])
            # Agents without a reward entry this step are credited 0.
            reward = rewards.get(agent_id, 0)
            memory.append((obs, actions[agent_id], reward, next_obses[agent_id], done))
            total_rewards[agent_index] += reward

        # Train on a random minibatch once enough transitions are stored.
        # Batch tensors use their own names so the per-step dictionaries
        # (obses, actions, rewards, next_obses) are not overwritten.
        if len(memory) >= BATCH_SIZE:
            batch = random.sample(memory, BATCH_SIZE)
            b_obses, b_actions, b_rewards, b_next_obses, b_dones = zip(*batch)
            b_obses = torch.from_numpy(np.array(b_obses)).float()
            b_actions = torch.from_numpy(np.array(b_actions)).long()
            b_rewards = torch.from_numpy(np.array(b_rewards)).float()
            b_next_obses = torch.from_numpy(np.array(b_next_obses)).float()
            b_dones = torch.from_numpy(np.array(b_dones)).float()

            # Q-values of the actions that were taken
            q_values = agents[0](b_obses).gather(1, b_actions.unsqueeze(1)).squeeze(1)
            # NOTE(review): bootstrap values come from agents[0] itself, while
            # the periodic sync below copies weights into agents[1]; confirm
            # which network is meant to act as the target.
            with torch.no_grad():
                next_q_values = agents[0](b_next_obses).max(1)[0]
                target_q_values = b_rewards + GAMMA * next_q_values * (1 - b_dones)

            # nn.functional is used because this cell does not import F.
            loss = nn.functional.mse_loss(q_values, target_q_values)
            print(f"Episode {i + 1}, Loss: {loss.item()}")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step += 1
            if step % UPDATE_TARGET == 0:
                agents[1].load_state_dict(agents[0].state_dict())

        # Advance to the next observations
        obses = next_obses



  deprecation(
  deprecation(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space with exception: {e}")


NameError: ignored

In [None]:
import torch.nn.functional as F

def compute_loss(model, target_model, states, actions, rewards, next_states, dones, gamma):
    """Return the one-step TD mean-squared-error loss for a batch of transitions.

    Args:
        model: Callable mapping a float state batch to per-action values.
        target_model: Callable used to evaluate ``next_states``.
        states: Batch of states, convertible to a float32 tensor.
        actions: Batch of action indices, convertible to an int64 tensor.
        rewards: Batch of rewards, convertible to a float32 tensor.
        next_states: Batch of successor states, convertible to a float32 tensor.
        dones: Batch of termination flags; truthy entries disable bootstrapping.
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        Scalar loss tensor attached to ``model``'s graph.
    """
    # as_tensor accepts lists, arrays and existing tensors alike.
    states = torch.as_tensor(states, dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    # Must be float: `1 - dones` is not supported for bool tensors.
    dones = torch.as_tensor(dones, dtype=torch.float32)

    # Value predicted for the action that was actually taken
    q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrapped target; no gradient flows through the target model
    with torch.no_grad():
        next_q_values = target_model(next_states).max(1)[0]
        target_q_values = rewards + gamma * next_q_values * (1 - dones)

    return F.mse_loss(q_values, target_q_values)


In [None]:
# Install gym_d2d, which the training cells in this notebook import.
pip install gym_d2d

Collecting gym_d2d
  Downloading gym_d2d-0.0.3-py3-none-any.whl (18 kB)
Installing collected packages: gym_d2d
Successfully installed gym_d2d-0.0.3


In [None]:
# Inspect the first dimension of the observation space (used as obs_size).
env.observation_space.shape[0]


300

In [None]:
# Debug check of the dtype of `dones`; the recorded run raised NameError,
# presumably because no cell had assigned `dones` at notebook scope yet.
print(dones.dtype)


NameError: ignored

In [None]:
import gym
import gym_d2d
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class AttentionTransformer(nn.Module):
    """Linear embedding -> ``nn.Transformer`` -> self-attention -> linear head.

    Args:
        input_size: Number of features in one observation.
        output_size: Number of outputs (one per action).
        num_heads: Attention heads used by the transformer and the extra
            attention layer.
        hidden_size: Embedding width (the transformer's ``d_model``).
        num_layers: Number of transformer encoder layers.
    """

    def __init__(self, input_size, output_size, num_heads, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        # Same arguments as the positional form (d_model, nhead,
        # num_encoder_layers), spelled out; decoder depth keeps its default.
        self.transformer = nn.Transformer(
            d_model=hidden_size, nhead=num_heads, num_encoder_layers=num_layers
        )
        self.attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one output vector per input position.

        Args:
            x: ``(batch, input_size)`` or ``(batch, seq, input_size)`` tensor.

        Returns:
            ``(batch, output_size)`` for 2-D input, otherwise
            ``(batch, seq, output_size)``.
        """
        # 2-D input is treated as a length-1 sequence per sample, so samples
        # in a batch never attend to each other.
        single_step = x.dim() == 2
        x = self.embedding(x)
        if single_step:
            x = x.unsqueeze(1)
        # Both modules use the default sequence-first layout: (seq, batch, hidden).
        x = x.permute(1, 0, 2)
        # nn.Transformer needs both a source and a target sequence and returns
        # a single tensor; the embedded input serves as both.
        x = self.transformer(x, x)
        x, _ = self.attention(x, x, x)
        x = x.permute(1, 0, 2)
        x = self.fc(x)
        return x.squeeze(1) if single_step else x

# Set hyperparameters
GAMMA = 0.99
LEARNING_RATE = 0.001
BATCH_SIZE = 32
MEMORY_SIZE = 10000
UPDATE_TARGET = 1000
NUM_EPISODES = 1000

# Create the environment
env = gym.make('D2DEnv-v0')
# NOTE(review): the agent count is taken from the size of the 'due' action
# space (the same value as act_size) — confirm this is the intended count.
num_agents = env.action_space['due'].n
# Get the observation and action sizes
obs_size = env.observation_space.shape[0]
act_size = env.action_space['due'].n
# Create the agents
agents = [AttentionTransformer(obs_size, act_size, 8, 64, 2) for _ in range(num_agents)]
# Define the optimizer
optimizer = optim.Adam(agents[0].parameters(), lr=LEARNING_RATE)
# Define the replay memory
memory = deque(maxlen=MEMORY_SIZE)
# Define the step counter
step = 0

# Roll out the agents
for i in range(NUM_EPISODES):
    # Reset the environment and get the initial observations (keyed by agent id)
    obses = env.reset()
    total_rewards = [0 for _ in range(num_agents)]
    done = False
    while not done:
        # Choose a greedy action for each agent
        actions = {}
        for agent_id, obs in obses.items():
            # The last two characters of the id select the network.
            agent_index = int(agent_id[-2:])
            obs_tensor = torch.from_numpy(obs).float().unsqueeze(0)
            with torch.no_grad():
                action = agents[agent_index](obs_tensor).max(1)[1].item()
            actions[agent_id] = action

        # The original cell stopped mid-expression above. The environment
        # must be stepped here so `done` can change and the loop can end.
        # No learning step is performed in this cell.
        next_obses, rewards, done, info = env.step(actions)
        for agent_id in obses:
            # Agents without a reward entry this step are credited 0.
            total_rewards[int(agent_id[-2:])] += rewards.get(agent_id, 0)
        obses = next_obses


SyntaxError: ignored

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import mse_loss

# Define the AttentionTransformer model
class AttentionTransformer(nn.Module):
    """Linear embedding -> ``nn.Transformer`` -> self-attention -> linear head.

    Args:
        input_size: Number of features in one observation.
        output_size: Number of outputs (one per action).
        num_heads: Attention heads used by the transformer and the extra
            attention layer.
        hidden_size: Embedding width (the transformer's ``d_model``).
        num_layers: Number of transformer encoder layers.
    """

    def __init__(self, input_size, output_size, num_heads, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        # Same arguments as the positional form (d_model, nhead,
        # num_encoder_layers), spelled out; decoder depth keeps its default.
        self.transformer = nn.Transformer(
            d_model=hidden_size, nhead=num_heads, num_encoder_layers=num_layers
        )
        self.attention = nn.MultiheadAttention(hidden_size, num_heads)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one output vector per input position.

        Args:
            x: ``(batch, input_size)`` or ``(batch, seq, input_size)`` tensor.

        Returns:
            ``(batch, output_size)`` for 2-D input, otherwise
            ``(batch, seq, output_size)``.
        """
        # 2-D input is treated as a length-1 sequence per sample. Putting the
        # batch on the sequence axis instead would let samples attend to each
        # other and leave a stray singleton dimension for batches larger
        # than one.
        single_step = x.dim() == 2
        x = self.embedding(x)
        if single_step:
            x = x.unsqueeze(1)
        # Both modules use the default sequence-first layout: (seq, batch, hidden).
        x = x.permute(1, 0, 2)
        # The embedded input serves as both source and target sequence.
        x = self.transformer(x, x)
        x, _ = self.attention(x, x, x)
        x = x.permute(1, 0, 2)
        x = self.fc(x)
        return x.squeeze(1) if single_step else x




# Hyperparameters
NUM_EPISODES = 1000
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_HEADS = 8
HIDDEN_SIZE = 64
NUM_LAYERS = 2

# Create the environment
env = gym.make('D2DEnv-v0')
obs_size = env.observation_space.shape[0]
act_size = env.action_space['due'].n

# One model shared by all agents
model = AttentionTransformer(obs_size, act_size, NUM_HEADS, HIDDEN_SIZE, NUM_LAYERS)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# Training loop: collect one episode, then fit the model on that episode
for episode in range(NUM_EPISODES):
    obses = env.reset()
    done = False
    total_reward = 0

    # Observations seen and actions chosen during this episode
    data = {'obs': [], 'action': []}
    while not done:
        actions = {}
        for agent_id, obs in obses.items():
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            with torch.no_grad():
                action_values = model(obs)
            action = torch.argmax(action_values, dim=-1).item()
            actions[agent_id] = action

            data['obs'].append(obs)
            data['action'].append(action)

        next_obses, rewards, done, _ = env.step(actions)
        obses = next_obses
        total_reward += sum(rewards.values())

    # Batch this episode's data
    dataset = TensorDataset(torch.cat(data['obs']), torch.tensor(data['action']))
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # The optimisation loop runs inside the episode loop so every episode's
    # data is trained on, not just the data of the final episode.
    loss = None
    for batch_obs, batch_action in dataloader:
        optimizer.zero_grad()
        # Flatten any singleton sequence dimension so logits are (batch, act_size).
        output = model(batch_obs).reshape(len(batch_obs), -1)
        # CrossEntropyLoss takes class indices directly; no one-hot encoding needed.
        loss = criterion(output, batch_action)
        loss.backward()
        optimizer.step()

    # Report once per episode, using the loss of the last batch
    loss_text = loss.item() if loss is not None else "n/a"
    print(f"Episode {episode + 1}, Total Reward: {total_reward}, Loss: {loss_text}")


RuntimeError: ignored