In [2]:
%pip install torch gym gymnasium matplotlib numpy

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting gym_notices>=0.0.4
  Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Collecting farama-notifications>=0.0.1
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827694 sha256=6c15a2e2e32d47f6266a08fcac1419

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters
EPISODES = 1000       # number of training episodes
LEARNING_RATE = 1e-2  # learning rate handed to the optimizer
GAMMA = 0.99          # discount factor for the return computation

# Build the CartPole-v1 environment
env = gym.make("CartPole-v1")

# Define the Policy Network (MLP) that outputs action probabilities
class PolicyNetwork(nn.Module):
    """Two-layer MLP policy that turns an input vector into action probabilities.

    Args:
        input_dim: number of input features.
        output_dim: number of outputs (one probability per action).
        hidden_dim: width of the single hidden layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            # Softmax over the last dimension so the outputs sum to 1
            nn.Softmax(dim=-1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network's softmax output for input ``x``."""
        probabilities = self.fc(x)
        return probabilities

# Size the network from the environment's observation and action spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Build the policy, move it to the selected device, and attach an Adam optimizer
policy_net = PolicyNetwork(input_dim=state_dim, output_dim=action_dim)
policy_net = policy_net.to(device)
optimizer = optim.Adam(params=policy_net.parameters(), lr=LEARNING_RATE)

# Compute the discounted return for every step of an episode
def compute_discounted_returns(rewards, gamma=GAMMA):
    """Return the discounted return G_t = r_t + gamma * G_{t+1} for each step.

    Args:
        rewards: sequence of per-step rewards, in the order they were received.
        gamma: discount factor applied to the return of the following step.

    Returns:
        A 1-D float32 tensor on ``device`` with one entry per reward; entry
        ``t`` is the discounted sum of rewards from step ``t`` to the end.
        An empty ``rewards`` yields an empty tensor.
    """
    running_return = 0
    returns = []
    # Walk backwards so each return reuses the one computed for the next step.
    # Appending and reversing once keeps this linear in the episode length;
    # inserting at the front of the list would shift every element each time.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32, device=device)

# Training loop: REINFORCE with one policy update per completed episode
reward_history = []  # undiscounted total reward of each episode, in order

for episode in range(EPISODES):
    state, _ = env.reset()
    log_probs, rewards = [], []
    done = False

    # Generate an episode by sampling actions from the current policy
    while not done:
        state_tensor = torch.FloatTensor(state).to(device)
        # Get action probabilities from policy network
        action_probs = policy_net(state_tensor)
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()

        # Store log probability for policy gradient update
        log_probs.append(dist.log_prob(action))
        state, reward, terminated, truncated, _ = env.step(action.item())
        rewards.append(reward)
        # The episode ends on either flag. Checking only `terminated` would
        # keep stepping the environment after it has signalled truncation.
        done = terminated or truncated

    # Compute discounted returns and normalize for stability
    returns = compute_discounted_returns(rewards)
    # std() of a single value is NaN, which would turn the loss (and then the
    # weights) into NaN, so a one-step episode keeps its raw return.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + 1e-9)

    # Compute policy loss: negative log-likelihood weighted by returns
    policy_loss = -torch.stack(log_probs) * returns
    loss = policy_loss.sum()

    # Update policy network
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_reward = sum(rewards)
    reward_history.append(total_reward)

    # Report progress every 50 episodes
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode+1}/{EPISODES} - Total Reward: {total_reward:.2f}")

# Draw the per-episode total reward curve
plt.plot(reward_history)

# Title and axis labels so the figure can be read on its own
plt.title(label="REINFORCE on CartPole-v1")
plt.xlabel(xlabel="Episode")
plt.ylabel(ylabel="Total Reward")
plt.show()

# Shut the environment down now that training and plotting are finished
env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode 50/1000 - Total Reward: 79.00
Episode 100/1000 - Total Reward: 173.00


KeyboardInterrupt: 