In [9]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

In [16]:
env = gym.make("LunarLander-v3", render_mode="human")

print("Action Space:", env.action_space)  # Discrete(4)
print("Observation Space:", env.observation_space)  # Box(-inf, inf, (8,), float32)
print("Observation Space High:", env.observation_space.high)
print("Observation Space Low:", env.observation_space.low)

# format the prints properly

Action Space: Discrete(4)
Observation Space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Observation Space High: [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ]
Observation Space Low: [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ]


In [17]:
learning_rate = 0.01
gamma = 0.99  # Discount factor

In [18]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.softmax(x, dim=-1)

In [19]:
policy = PolicyNetwork(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [20]:
def reinforce(policy, optimizer, episodes=1000, max_steps=500):
    """
    Train a policy using REINFORCE.
    Args:
        policy: The policy network.
        optimizer: Optimizer for training.
        episodes: Number of episodes to train.
        max_steps: Max steps per episode.
    """
    for episode in range(episodes):
        state, _ = env.reset()
        log_probs = []
        rewards = []
        total_reward = 0

        for t in range(max_steps):
            state_tensor = torch.tensor(state, dtype=torch.float32)
            action_probs = policy(state_tensor)
            action_dist = Categorical(action_probs)
            action = action_dist.sample()
            log_probs.append(action_dist.log_prob(action))

            next_state, reward, done, truncated, _ = env.step(action.item())
            rewards.append(reward)
            total_reward += reward

            if done or truncated:
                break
            state = next_state

        # Compute the discounted rewards
        discounted_rewards = []
        cumulative_reward = 0
        for r in reversed(rewards):
            cumulative_reward = r + gamma * cumulative_reward
            discounted_rewards.insert(0, cumulative_reward)

        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)  # Normalize rewards

        # Compute the loss
        loss = 0
        for log_prob, reward in zip(log_probs, discounted_rewards):
            loss -= log_prob * reward  # REINFORCE loss

        # Perform gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

In [21]:
reinforce(policy, optimizer, episodes=1000)

Episode 1/1000, Total Reward: -131.77107184455983
Episode 2/1000, Total Reward: -94.66680211962263
Episode 3/1000, Total Reward: -373.1908939005401
Episode 4/1000, Total Reward: -368.308617666904
Episode 5/1000, Total Reward: -105.93620936360325
Episode 6/1000, Total Reward: -151.55454679809594
Episode 7/1000, Total Reward: -105.3103958923069
Episode 8/1000, Total Reward: -120.40409673863729
Episode 9/1000, Total Reward: -208.68411825060628
Episode 10/1000, Total Reward: -300.12074119293516
Episode 11/1000, Total Reward: -127.08048550514302
Episode 12/1000, Total Reward: -216.32593486052565
Episode 13/1000, Total Reward: -226.07969650017725
Episode 14/1000, Total Reward: -136.96993909683272
Episode 15/1000, Total Reward: -102.6773581873331
Episode 16/1000, Total Reward: -112.85734743274034
Episode 17/1000, Total Reward: -188.9916938805147
Episode 18/1000, Total Reward: -125.55122726010467
Episode 19/1000, Total Reward: -149.90441680087622
Episode 20/1000, Total Reward: -234.46327709258

In [24]:
policy = PolicyNetwork(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)
policy.load_state_dict(torch.load("policy.pth"))
policy.eval()  # Set the policy to evaluation mode

# Test the model
def test_model(policy, env, episodes=5, max_steps=500):
    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        for t in range(max_steps):
            state_tensor = torch.tensor(state, dtype=torch.float32)
            with torch.no_grad():  # Disable gradient computation for testing
                action_probs = policy(state_tensor)
                action = torch.argmax(action_probs).item()  # Select the action with the highest probability
            state, reward, done, truncated, _ = env.step(action)
            total_reward += reward
            if done or truncated:
                break
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

# Run the test
test_model(policy, env, episodes=5)

  policy.load_state_dict(torch.load("policy.pth"))


Episode 1/5, Total Reward: -65.24008461725293
Episode 2/5, Total Reward: -76.54082740677666
Episode 3/5, Total Reward: 145.44502031333118
Episode 4/5, Total Reward: -112.52386275861734
Episode 5/5, Total Reward: 13.164954692355296


: 