In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym

In [13]:
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def __init__(self, input_size, output_size, num_hidden_layers, hidden_layer_nodes, activation, dropout_prob):
        super(MC_Dropout_Net, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_layer_nodes = hidden_layer_nodes
        self.activation = activation
        
        # Define the layers
        self.input_layer = nn.Linear(input_size, hidden_layer_nodes)
        self.hidden_layers = nn.ModuleList()
        for _ in range(num_hidden_layers):
            self.hidden_layers.append(nn.Linear(hidden_layer_nodes, hidden_layer_nodes))
            self.hidden_layers.append(nn.Dropout(p=dropout_prob))
        self.output_layer = nn.Linear(hidden_layer_nodes, output_size)

    def forward(self, x):
        x = self.activation(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation(hidden_layer(x))
        output = self.output_layer(x)
        return output

In [14]:
class Model(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [15]:
class DynaQ:
    def __init__(self, state_dim, action_dim, learning_rate=0.001, gamma=0.99, epsilon=0.1, model_learning_rate=0.001, model_epochs=5):
        self.q_network = QNetwork(state_dim, action_dim)
        self.target_q_network = QNetwork(state_dim, action_dim)
        self.model = Model(state_dim + action_dim, state_dim)
        self.optimizer_q = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.optimizer_model = optim.Adam(self.model.parameters(), lr=model_learning_rate)
        self.gamma = gamma
        self.epsilon = epsilon
        self.model_epochs = model_epochs

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, self.q_network.fc3.out_features)
        else:
            with torch.no_grad():
                q_values = self.q_network(torch.tensor(state, dtype=torch.float32))
                return torch.argmax(q_values).item()

    def update(self, state, action, reward, next_state, terminal):
        self.optimizer_q.zero_grad()
        q_values = self.q_network(torch.tensor(state, dtype=torch.float32))
        next_q_values = self.target_q_network(torch.tensor(next_state, dtype=torch.float32))
        target_q = reward + (1 - terminal) * self.gamma * torch.max(next_q_values).item()
        loss = nn.MSELoss()(q_values[action], target_q)
        loss.backward()
        self.optimizer_q.step()

        # Update the model
        self.optimizer_model.zero_grad()
        input_model = torch.cat((torch.tensor(state, dtype=torch.float32), torch.tensor(action, dtype=torch.float32)))
        predicted_next_state = self.model(input_model)
        target_next_state = torch.tensor(next_state, dtype=torch.float32)
        model_loss = nn.MSELoss()(predicted_next_state, target_next_state)
        model_loss.backward()
        self.optimizer_model.step()

    def plan(self, replay_buffer, num_steps):
        for _ in range(num_steps):
            state, action, reward, next_state, terminal = replay_buffer.sample()
            self.update(state, action, reward, next_state, terminal)

    def update_target_network(self):
        self.target_q_network.load_state_dict(self.q_network.state_dict())

In [16]:
# Example usage:
# Initialize DynaQ agent
#env = Environment()  # Replace with your environment class
#env_name = 'CartPole-v1'
#env_name = 'MountainCarContinuous-v0'
env_name = 'MountainCar-v0'
#env_name = 'Pendulum-v1'

# Create the CartPole environment
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action = env.action_space.sample()
if isinstance(action, int):
    action_dim = env.observation_space.shape[0] + 1
elif isinstance(action, np.ndarray):
    action_dim = env.observation_space.shape[0] + len(action)
#action_dim = len(env.action_space.sample())
dyna_q_agent = DynaQ(state_dim, action_dim)

num_episodes = 10
max_episode_length = 200

# Training loop
for episode in range(num_episodes):
    state = env.reset()[0]
    total_reward = 0
    episode_length = 0
    
    while episode_length < max_episode_length:
        action = dyna_q_agent.select_action(state)
        next_state, reward, done, _, _ = env.step(action)
        dyna_q_agent.update(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        
        if done:
                break
        episode_length = episode_length + 1

    dyna_q_agent.update_target_network()
    dyna_q_agent.plan(replay_buffer, num_steps)

  if not isinstance(terminated, (bool, np.bool8)):


AttributeError: 'float' object has no attribute 'size'

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pickle
from torch.utils.data import DataLoader, Dataset
from collections import deque

# Define Gaussian Non-deterministic Neural Network for Environment Model
class EnvironmentModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(EnvironmentModel, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, state_dim)
        self.fc3 = nn.Linear(hidden_dim, state_dim)

    def forward(self, state, action):
        x = torch.cat((state, action), dim=-1)
        x = torch.relu(self.fc1(x))
        next_state_mean = self.fc2(x)
        next_state_stddev = torch.exp(self.fc3(x))
        return next_state_mean, next_state_stddev


# Define Normal Deterministic Neural Network for Policy Model
class PolicyModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(PolicyModel, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        action = torch.tanh(self.fc2(x))
        return action


# Dyna-Q Algorithm
class DynaQ:
    def __init__(self, state_dim, action_dim, hidden_dim, gamma, alpha, beta, buffer_size, batch_size):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.env_model = EnvironmentModel(state_dim, action_dim, hidden_dim)
        self.policy_model = PolicyModel(state_dim, action_dim, hidden_dim)
        self.env_optimizer = optim.Adam(self.env_model.parameters(), lr=alpha)
        self.policy_optimizer = optim.Adam(self.policy_model.parameters(), lr=beta)

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        return self.policy_model(state).detach().numpy()

    def update_model(self):
        if len(self.buffer) < self.batch_size:
            return

        batch = np.random.choice(len(self.buffer), self.batch_size, replace=False)
        state_batch, action_batch, next_state_batch, reward_batch = zip(*[self.buffer[i] for i in batch])
        state_batch = torch.FloatTensor(state_batch)
        action_batch = torch.FloatTensor(action_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        reward_batch = torch.FloatTensor(reward_batch)

        # Update policy model
        predicted_action = self.policy_model(state_batch)
        q_values = torch.sum(predicted_action * self.env_model(state_batch, predicted_action)[0], dim=1)
        loss_policy = -torch.mean(q_values)
        self.policy_optimizer.zero_grad()
        loss_policy.backward()
        self.policy_optimizer.step()

        # Update environment model
        next_state_pred_mean, next_state_pred_stddev = self.env_model(state_batch, action_batch)
        loss_env = nn.MSELoss()(next_state_pred_mean, next_state_batch) + torch.mean(next_state_pred_stddev)
        self.env_optimizer.zero_grad()
        loss_env.backward()
        self.env_optimizer.step()

    def update_q_values(self, state, action, next_state, reward):
        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        next_state = torch.FloatTensor(next_state)
        reward = torch.FloatTensor(reward)

        predicted_action = self.policy_model(state)
        q_value = torch.sum(predicted_action * self.env_model(state, predicted_action)[0])

        next_predicted_action = self.policy_model(next_state)
        next_q_value = torch.sum(next_predicted_action * self.env_model(next_state, next_predicted_action)[0])

        target = reward + self.gamma * next_q_value
        loss_q = nn.MSELoss()(q_value, target.detach())

        self.policy_optimizer.zero_grad()
        loss_q.backward()
        self.policy_optimizer.step()

    def train(self, train_loader, episodes):
        for episode in range(episodes):
            for batch_idx, (state, action, next_state, reward) in enumerate(train_loader):
                state = state.float()
                action = action.float()
                next_state = next_state.float()
                reward = reward.float()

                self.buffer.append((state, action, next_state, reward))
                self.update_model()

                for i in range(self.batch_size):
                    self.update_q_values(state[i], action[i], next_state[i], reward[i])

            print("Episode:", episode + 1)


# Example Usage
if __name__ == "__main__":
    # Set random seed for reproducibility
    torch.manual_seed(0)
    np.random.seed(0)
    
    # Initialize CartPole environment
    env_name = 'CartPole-v1'
    #env_name = 'MountainCarContinuous-v0'
    #env_name = 'MountainCar-v0'
    #env_name = 'Pendulum-v1'

    # Create the CartPole environment
    #env = gym.make(env_name)
    batch_size = 32

    # Load datasets
    with open(env_name + '_train_dataset.pkl', 'rb') as f:
        train_dataset_loaded = pickle.load(f)

    with open(env_name + '_test_dataset.pkl', 'rb') as f:
        test_dataset_loaded = pickle.load(f)

    train_loader = DataLoader(train_dataset_loaded, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset_loaded, batch_size=batch_size, shuffle=False)

    # Define environment parameters
    state_dim = train_dataset_loaded[0][0].shape[0]
    action_dim = train_dataset_loaded[0][1].shape[0]
    hidden_dim = 32
    gamma = 0.99
    alpha = 0.001
    beta = 0.001
    buffer_size = 10000
    episodes = 100

    # Initialize Dyna-Q agent
    agent = DynaQ(state_dim, action_dim, hidden_dim, gamma, alpha, beta, buffer_size, batch_size)

    # Train the agent
    agent.train(train_loader, episodes)


ValueError: not enough values to unpack (expected 4, got 2)