In [4]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

In [5]:
# set randoms seeds
torch.manual_seed(42)
np.random.seed(42)

In [6]:
# Create enviroment
env = gym.make("CartPole-v1")

In [7]:
# Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()

        # Simple NN with one hidden layer
        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128,action_dim),
            nn.Softmax(dim=1)
        )

    def forward(self, x):

        # Convert State to tensor
        if not isinstance(x, torch.Tensor):
            x = torch.FloatTensor(x).unsqueeze(0)

        # Forward pass through the network to get action probabilities
        action_probs = self.network(x)
        return action_probs

In [8]:
# Initalize Policy Network
state_dim = env.observation_space.shape[0]  # 4 for CartPole (position, velocity, angle, angular velocity)
action_dim = env.action_space.n # 2 for CartPole (push left, push right)
policy_network = PolicyNetwork(state_dim, action_dim) 

In [9]:
# Set up optimizer
optimizer = optim.Adam(policy_network.parameters(), lr=0.01)

In [10]:
# Function to collect a single episode
def collect_episode(policy_network, env):

    # Initialize lists to store the episode data
    states = []
    actions = []
    rewards = []
    
    # Reset the enviroment
    state, _ = env.reset()
    done = False

    # Collect episode data
    while not done:

        # Store current state
        states.append(state)

        # Get action probabilities
        action_probs = policy_network(state)

        # Sample action from the policy
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        # Store action
        actions.append(action)

        # Take the action in the enviroment
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        # Store reward
        rewards.append(reward)

        # Update state
        state = next_state

    return states, actions, rewards


# Function to compute returns (Discounted Future Rewards)
def compute_returns(rewards, gamma=0.99):
    returns = []
    G = 0

    # Iterate over the rewards in reverse order
    for reward in reversed(rewards):
        G = reward + gamma * G
        returns.insert(0, G) # Insert at the beginning

    # Convert to tensor and normalize
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-9) # Normalize returns for stable learning

    return returns

# Main Training Loop
def train_reinforce(policy_network, env, num_episodes=1000):

    # List to store metrics
    episode_rewards = []

    for episode in range(num_episodes):

        # Collect episode data
        states, actions, rewards = collect_episode(policy_network, env)

        # Compute returns
        returns = compute_returns(rewards)

        # Compute loss
        optimizer.zero_grad()
        loss = 0

         # Compute log probabilities for each action taken
        for t in range(len(states)):
            state = states[t]
            action = actions[t]
            G = returns[t]

            #Get action probabilities
            action_probs = policy_network(state)

            # Create a distribution and compute log probability of the taken action
            action_dist = Categorical(action_probs)
            log_prob = action_dist.log_prob(torch.tensor(action))

            # Add negative log probability multiplied by return to loss
            # (negative because we're minimizing loss but want to maximize expected return)
            loss += -log_prob * G

        # Backpropagate loss
        loss.backward()

        # Update the policy network
        optimizer.step()

        # Record total reward for the episode
        total_reward = sum(rewards)
        episode_rewards.append(total_reward)

        # Print progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            print(f"Episode {episode+1}/{num_episodes}, Average Reward (last 10): {avg_reward:.2f}")
    
    return episode_rewards
        
            
            
        
    



In [None]:
# Train the agent
reward_history = train_reinforce(policy_network, env)



  log_prob = action_dist.log_prob(torch.tensor(action))


Episode 10/1000, Average Reward (last 10): 22.40
Episode 20/1000, Average Reward (last 10): 28.80
Episode 30/1000, Average Reward (last 10): 32.50
Episode 40/1000, Average Reward (last 10): 103.10
Episode 50/1000, Average Reward (last 10): 241.00
Episode 60/1000, Average Reward (last 10): 64.60
Episode 70/1000, Average Reward (last 10): 28.40
Episode 80/1000, Average Reward (last 10): 39.60
Episode 90/1000, Average Reward (last 10): 128.70
Episode 100/1000, Average Reward (last 10): 384.50
Episode 110/1000, Average Reward (last 10): 500.00
Episode 120/1000, Average Reward (last 10): 455.80
Episode 130/1000, Average Reward (last 10): 429.00
Episode 140/1000, Average Reward (last 10): 480.20
Episode 150/1000, Average Reward (last 10): 491.40
Episode 160/1000, Average Reward (last 10): 467.60
Episode 170/1000, Average Reward (last 10): 479.20
Episode 180/1000, Average Reward (last 10): 477.00
Episode 190/1000, Average Reward (last 10): 489.90
Episode 200/1000, Average Reward (last 10): 50

: 

In [None]:
def visualize_policy_simple(policy_network, env_name='CartPole-v1', episodes=3):
    """
    A simplified function to evaluate and report on a trained policy network.
    This version avoids heavy rendering that might crash the kernel.
    """
    # Create environment without rendering
    env = gym.make(env_name)
    
    episode_rewards = []
    episode_steps = []
    
    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        steps = 0
        done = False
        
        while not done:
            # Get action probabilities from policy network
            action_probs = policy_network(state)
            
            # Select the action with highest probability
            action = torch.argmax(action_probs).item()
            
            # Take the action
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            total_reward += reward
            steps += 1
            state = next_state
        
        episode_rewards.append(total_reward)
        episode_steps.append(steps)
        print(f"Episode {episode+1}: Reward = {total_reward}, Steps = {steps}")
    
    env.close()
    
    # Calculate statistics
    avg_reward = sum(episode_rewards) / len(episode_rewards)
    avg_steps = sum(episode_steps) / len(episode_steps)
    
    print(f"\nEvaluation Results:")
    print(f"Average Reward: {avg_reward:.2f}")
    print(f"Average Steps: {avg_steps:.2f}")
    print(f"Best Episode Reward: {max(episode_rewards)}")
    
    # Create a simple bar chart of episode rewards
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, episodes+1), episode_rewards, color='blue')
    plt.axhline(y=avg_reward, color='r', linestyle='--', label=f'Average: {avg_reward:.2f}')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Performance of Trained Policy')
    plt.legend()
    plt.show()
    
    return episode_rewards


# After training your policy network
results = visualize_policy_simple(policy_network, episodes=5)

Episode 1: Reward = 500.0, Steps = 500
Episode 2: Reward = 500.0, Steps = 500
Episode 3: Reward = 500.0, Steps = 500
Episode 4: Reward = 500.0, Steps = 500
Episode 5: Reward = 500.0, Steps = 500

Evaluation Results:
Average Reward: 500.00
Average Steps: 500.00
Best Episode Reward: 500.0
