In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

In [2]:
class PPOPolicyNetwork(nn.Module): # Actor
    def __init__(self, input_dim, output_dim):
        super(PPOPolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        y = torch.tanh(self.fc3(x))  # Use tanh for continuous action spaces
        return y

In [3]:
class PPOValueNetwork(nn.Module): # Critic
    def __init__(self, input_dim):
        super(PPOValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)  # Output value (single scalar)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [4]:
def compute_gae(rewards, values, gamma=0.99, tau=0.95):
    deltas = []
    gae = 0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * tau * gae
        deltas.insert(0, gae)
    return deltas

def ppo_loss(old_log_probs, new_log_probs, advantages, clip_epsilon=0.2):
    # Compute the ratio (pi_theta / pi_theta_old)
    ratio = torch.exp(new_log_probs - old_log_probs)
    
    # Compute the surrogate loss
    obj_surrogate = ratio * advantages
    obj_clipped = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages

    # Final PPO objective (minimize the negative objective)
    loss = -torch.min(obj_surrogate, obj_clipped).mean()
    return loss

In [5]:
def train_ppo(env, policy, value_net, policy_optimizer, value_optimizer, num_episodes=100, batch_size=64):
    # Hyperparameters for PPO
    gamma = 0.99
    tau = 0.95
    clip_epsilon = 0.2
    n_epochs = 10  # Number of epochs to update the policy after each batch
    
    for episode in range(num_episodes):
        states, actions, rewards, log_probs = [], [], [], []

        # Collect trajectory
        obs, _ = env.reset()
        done = False
        while not done:
            obs_tensor = torch.tensor(obs, dtype=torch.float32)
            action = policy(obs_tensor).detach().numpy()  # Get action from policy
            log_prob = torch.log(policy(obs_tensor))  # Log probability of taken action
            next_obs, reward, done, _, _ = env.step(action)

            states.append(obs)
            actions.append(action)
            rewards.append(reward)
            log_probs.append(log_prob)

            obs = next_obs

        # Convert lists to tensors
        states_tensor = torch.tensor(states, dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.float32)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)

        # Calculate values and advantages using the critic
        values = value_net(states_tensor)
        advantages = compute_gae(rewards_tensor, values)

        # Update policy and critic
        for _ in range(n_epochs):
            # Compute the loss for the policy
            old_log_probs = torch.tensor(log_probs, dtype=torch.float32)
            new_log_probs = torch.log(policy(states_tensor))

            # Calculate PPO loss and perform backpropagation
            policy_loss = ppo_loss(old_log_probs, new_log_probs, advantages)
            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

            # Compute the loss for the value function
            value_loss = ((values - rewards_tensor) ** 2).mean()  # Mean squared error
            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()

        print(f"Episode {episode} completed")

    env.close()

In [6]:
# Reserve this cell for all the modifying parameters functions
def modify_mass_parameters(model):
    # Modify the mass of specific body parts
    model.body_mass[1] = 4.15  # Set mass of torso 
    model.body_mass[2], model.body_mass[5] = 0.6, 0.6  # Set mass of thigh
    model.body_mass[3], model.body_mass[6] = 0.3, 0.3  # Set mass of leg
    model.body_mass[4], model.body_mass[7]= 0.1, 0.1  # Set mass of foot

In [7]:
# Initialize environment
env = gym.make('Walker2d-v5')
# env.reset()
model = env.unwrapped.model
modify_mass_parameters(model) # Modify mass parameters for the environment's model

# Create policy and value networks
policy = PPOPolicyNetwork(input_dim=env.observation_space.shape[0], output_dim=env.action_space.shape[0])
value_net = PPOValueNetwork(input_dim=env.observation_space.shape[0])

# Create optimizers
policy_optimizer = optim.Adam(policy.parameters(), lr=3e-4)
value_optimizer = optim.Adam(value_net.parameters(), lr=3e-4)

# Train PPO
train_ppo(env, policy, value_net, policy_optimizer, value_optimizer)

  states_tensor = torch.tensor(states, dtype=torch.float32)


IndexError: index 12 is out of bounds for dimension 0 with size 12