# In [1]:
# Niveen Abdul-Mohsen (bvn9ad)
# Reinforcement Learning (CS 4771) - Figure 13.2 REINFORCE vs REINFORCE with Baseline
# This is the code for the right graph of Figure 13.2 in Sutton and Barto's RL textbook
# I used NumPy for numerical operations and Matplotlib for plotting.

import numpy as np
import matplotlib.pyplot as plt

class SimpleCorridorEnv:
    """
    Four-state corridor in which one state swaps the effect of the actions.

    States: 0 (start), 1, 2, 3 (terminal)
    States 0 and 2: "right" moves to state + 1, "left" moves to state - 1
                    (clamped at 0, so "left" in state 0 stays in place)
    State 1:        the effects are swapped ("right" moves to state - 1,
                    "left" moves to state + 1)
    """

    def __init__(self):
        # Every episode begins in the left-most state.
        self.state = 0

    def reset(self):
        """Put the agent back in the starting state (state 0)."""
        self.state = 0

    def take_action(self, action_is_right):
        """
        Advance the environment by one step.

        Args:
            action_is_right (bool): True selects "right", False selects "left"

        Returns:
            reward (int): -1 unless the step ends in the terminal state, then 0
            is_done (bool): True once the state is 3 (terminal)
        """
        position = self.state

        if position == 1:
            # Reversed state: the two actions trade their usual effects.
            self.state = position - 1 if action_is_right else position + 1
        elif position == 0 or position == 2:
            # Ordinary states; moving left can never go below state 0.
            self.state = position + 1 if action_is_right else max(0, position - 1)
        # Any other state (e.g. the terminal one) is left untouched.

        if self.state != 3:
            return -1, False  # per-step penalty, episode continues
        return 0, True  # terminal state reached, no penalty on this step
        
class PolicyGradientAgent:
    """
    REINFORCE agent with an optional learned scalar baseline.

    The policy is a softmax over linear action preferences:
        h(a) = theta^T * x(a)
        pi(a) = softmax(h)[a]

    The action features x(a) are one-hot and do not depend on the state, so
    the same action distribution is used at every step of an episode.

    Expected call sequence for one episode:
        1. select_action(None) for the first step,
        2. select_action(previous_reward) for every later step,
        3. learn_from_episode(final_reward) once the episode has ended.
    """

    # Lower bound on the probability of the less likely action. Keeping the
    # policy stochastic ensures both actions keep being tried.
    MIN_ACTION_PROB = 0.05

    def __init__(self, step_size_policy, discount=1.0, step_size_baseline=None):
        """
        Initialize the agent.

        Args:
            step_size_policy (float): Alpha for policy parameter updates
            discount (float): Gamma (discount factor)
            step_size_baseline (float): Alpha_w for baseline updates; passing
                None (the default) disables the baseline entirely
        """
        # Policy parameters theta (one weight per feature component)
        self.theta = np.array([-1.47, 1.47])

        # Action feature matrix: column a is the feature vector x(a).
        # Column 0 belongs to "left", column 1 to "right". The rows are
        # feature components, not states: every state shares these features.
        self.features = np.array([
            [0, 1],
            [1, 0],
        ])

        self.step_size_policy = step_size_policy
        self.discount = discount

        # Decide on "is None" only, so the flag and the stored step size are
        # derived from the same test.
        self.has_baseline = step_size_baseline is not None
        self.step_size_baseline = step_size_baseline if self.has_baseline else 0

        # Learned baseline: a single scalar shared by all states
        self.baseline_value = 0.0

        # Trajectory of the episode currently being played
        self.episode_rewards = []
        self.episode_actions = []

    def compute_policy(self):
        """
        Compute action probabilities using softmax.

        Returns:
            pmf (array): [prob_left, prob_right]
        """
        # Preferences: h[a] = theta^T * x(a)
        h = np.dot(self.theta, self.features)

        # Subtracting the max before exponentiating avoids overflow and does
        # not change the softmax result
        exp_h = np.exp(h - np.max(h))
        pmf = exp_h / np.sum(exp_h)

        # Ensure exploration: if the rarer action fell below the floor, pin
        # it to the floor and give the remaining mass to the other action
        floor = self.MIN_ACTION_PROB
        min_idx = np.argmin(pmf)
        if pmf[min_idx] < floor:
            pmf[:] = 1.0 - floor
            pmf[min_idx] = floor

        return pmf

    def select_action(self, step_reward):
        """
        Select action according to policy and record step.

        Args:
            step_reward (float or None): Reward from previous step; None on
                the first step of an episode, when there is no reward yet

        Returns:
            action_is_right (bool): Action choice
        """
        if step_reward is not None:
            self.episode_rewards.append(step_reward)

        pmf = self.compute_policy()
        action_is_right = np.random.uniform() < pmf[1]
        self.episode_actions.append(action_is_right)

        return action_is_right

    def learn_from_episode(self, final_reward):
        """
        Update parameters after episode completes, then clear the trajectory.

        Args:
            final_reward (float): Final step reward

        Raises:
            ValueError: If fewer actions than rewards were recorded. The
                check runs before anything is modified, so the parameters
                and the stored trajectory are left untouched.
        """
        # Every reward (including final_reward) must belong to an action.
        # Checking first avoids failing half-way through the update loop
        # with theta already partially changed.
        num_steps = len(self.episode_rewards) + 1
        if len(self.episode_actions) < num_steps:
            raise ValueError(
                "trajectory has %d action(s) but %d reward(s); call "
                "select_action() once per step before learn_from_episode()"
                % (len(self.episode_actions), num_steps)
            )

        self.episode_rewards.append(final_reward)

        # Returns G_t, accumulated backward from the end of the episode
        returns = np.zeros(num_steps)
        running_return = 0.0
        for step_idx in range(num_steps - 1, -1, -1):
            running_return = (
                self.episode_rewards[step_idx] + self.discount * running_return
            )
            returns[step_idx] = running_return

        # gamma^t, updated incrementally as t advances
        discount_power = 1.0

        # zip stops at the shorter sequence (the returns), so any surplus
        # recorded actions are ignored
        for took_right, return_t in zip(self.episode_actions, returns):
            action_idx = 1 if took_right else 0

            # Recomputed every step because theta changes inside this loop
            pmf = self.compute_policy()

            # Score function gradient: nabla log pi(a) = x(a) - E[x]
            score_grad = self.features[:, action_idx] - np.dot(self.features, pmf)

            if self.has_baseline:
                # Advantage uses the baseline value from before this step's
                # baseline update
                advantage = return_t - self.baseline_value

                # Update baseline: w <- w + alpha_w * gamma^t * (G_t - w)
                self.baseline_value += (
                    self.step_size_baseline * discount_power * advantage
                )
            else:
                # No baseline: the raw return drives the update
                advantage = return_t

            # Update policy: theta <- theta + alpha * gamma^t * advantage * nabla_log_pi
            self.theta += (
                self.step_size_policy * discount_power * advantage * score_grad
            )

            discount_power *= self.discount

        # Clear trajectory
        self.episode_rewards = []
        self.episode_actions = []
