# In [None]:
# Niveen Abdul-Mohsen (bvn9ad)
# Reinforcement Learning (CS 4771) - Figure 6.5 Maximization Bias Q-learning and Double Q-learning 
# Comparison of Q-learning and Double Q-learning on a simple episodic MDP environment
# uses numpy for numerical operations and matplotlib for plotting

import numpy as np
import matplotlib.pyplot as plt

class MaxBiasEnvironment:
    """
    tiny episodic MDP used to expose maximization bias.

    layout:
    - every episode begins in state A, which offers two actions
    - action 0 (left): move to state B, reward 0, episode continues
    - any other action (right): episode ends at once with reward 0
    - in state B every action ends the episode with reward ~ N(-0.1, 1)

    going RIGHT is optimal because the expected return of LEFT is -0.1.
    q-learning nevertheless tends to favour LEFT early on, since the max
    over noisy estimates in B is biased upward.
    """

    def __init__(self, seed=None):
        # private generator so reward noise is reproducible per environment
        self.rng = np.random.RandomState(seed)
        self.current_state = 'A'
        self.is_terminal = False

    def reset(self):
        """begin a fresh episode and return the start state ('A')"""
        self.is_terminal = False
        self.current_state = 'A'
        return self.current_state

    def step(self, action):
        """
        advance the environment by one action

        returns: (next_state, reward, is_terminal); next_state is None
        once the episode has ended

        raises: RuntimeError if called after the episode terminated
        """
        if self.is_terminal:
            raise RuntimeError("episode already terminated")

        here = self.current_state

        # the only non-terminating move: LEFT out of A
        if here == 'A' and action == 0:
            self.current_state = 'B'
            return ('B', 0.0, False)

        if here == 'A':
            # RIGHT out of A: deterministic zero reward
            payoff = 0.0
        elif here == 'B':
            # the chosen action is ignored; every action in B is equally noisy
            payoff = self.rng.normal(loc=-0.1, scale=1.0)
        else:
            # unrecognised state: nothing happens and None is returned
            return None

        self.is_terminal = True
        return (None, payoff, True)

    def get_num_states(self):
        """number of non-terminal states (A and B)"""
        return 2

    def get_num_actions(self):
        """number of actions selectable in state A"""
        return 2


# ============================================================================
# ALGORITHM: q-learning (with maximization bias)
# ============================================================================

class QLearningAgent:
    """
    standard q-learning with epsilon-greedy exploration.

    key equation: Q(s, a) += alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]

    problem: uses the SAME estimate for both:
    1) selecting which action maximizes value in next state
    2) estimating that maximum value

    in state B with many actions, the max of noisy estimates is biased upward
    even though true max is negative. this causes Q(A, left) to be overestimated.

    parameters:
    - num_states: number of non-terminal states (stored, not otherwise used)
    - num_actions: number of actions in state A
    - alpha: step size
    - gamma: discount factor
    - epsilon: exploration probability
    - num_b_actions: number of actions in state B (defaults to 10)
    """

    def __init__(self, num_states, num_actions, alpha=0.1, gamma=1.0, epsilon=0.1,
                 num_b_actions=10):
        self.num_states = num_states
        self.num_actions = num_actions
        self.num_b_actions = num_b_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.rng = np.random.RandomState()

        # q-values indexed by state name ('A' or 'B'); the length of each
        # array defines how many actions that state offers
        self.q_values = {
            'A': np.zeros(num_actions),      # left (0) and right (1)
            'B': np.zeros(num_b_actions),
        }

    def set_seed(self, seed):
        """set random seed for reproducibility"""
        self.rng = np.random.RandomState(seed)

    def get_action(self, state):
        """
        epsilon-greedy action selection

        with probability epsilon: select a uniformly random action
        otherwise: select an action with the highest q-value, breaking
        ties uniformly at random

        raises: KeyError if state is not 'A' or 'B'
        """
        q_vals = self.q_values[state]

        if self.rng.rand() < self.epsilon:
            # explore: the action count comes from the table itself, so it
            # always agrees with how the table was sized
            return self.rng.randint(0, len(q_vals))

        # exploit: random tie-breaking matters early on, when all estimates
        # are still zero and a fixed argmax would always pick action 0
        best_actions = np.flatnonzero(q_vals == np.max(q_vals))
        return self.rng.choice(best_actions)

    def update(self, state, action, reward, next_state, next_action=None):
        """
        q-learning update rule

        uses max of next_state values as bootstrap target
        this is where maximization bias occurs: the max of noisy estimates
        in state B tends to be positive even though true values are ~-0.1

        next_state is None for a terminal transition; next_action is
        accepted for interface compatibility and ignored
        """
        if next_state is None:
            # terminal transition: nothing to bootstrap from
            target = reward
        else:
            target = reward + self.gamma * np.max(self.q_values[next_state])

        # temporal difference error, applied with step size alpha
        td_error = target - self.q_values[state][action]
        self.q_values[state][action] += self.alpha * td_error

    def get_greedy_action_from_a(self):
        """
        return greedy action from state A (0=left, 1=right)
        based on current q-value estimates; ties are broken at random
        """
        q_left = self.q_values['A'][0]
        q_right = self.q_values['A'][1]

        if q_left > q_right:
            return 0  # choose left
        if q_right > q_left:
            return 1  # choose right
        # tied: random choice
        return self.rng.choice([0, 1])


# ============================================================================
# ALGORITHM: double q-learning (corrects maximization bias)
# ============================================================================

class DoubleQLearningAgent:
    """
    double q-learning algorithm that addresses maximization bias.

    maintain TWO independent q-value estimates (Q1 and Q2).
    on each step, randomly update one using the other.

    update for Q1: Q1(s,a) += alpha * [r + gamma * Q2(argmax_a' Q1(s',a')) - Q1(s,a)]
    update for Q2: Q2(s,a) += alpha * [r + gamma * Q1(argmax_a' Q2(s',a')) - Q2(s,a)]

    the table that SELECTS the maximizing next action is never the table
    that EVALUATES it. an action that looks good in Q1 only because of
    noise is scored by Q2, whose noise is independent, so the bootstrap
    target is no longer biased upward.

    parameters:
    - num_states: number of non-terminal states (stored, not otherwise used)
    - num_actions: number of actions in state A
    - alpha: step size
    - gamma: discount factor
    - epsilon: exploration probability
    - num_b_actions: number of actions in state B (defaults to 10)
    """

    def __init__(self, num_states, num_actions, alpha=0.1, gamma=1.0, epsilon=0.1,
                 num_b_actions=10):
        self.num_states = num_states
        self.num_actions = num_actions
        self.num_b_actions = num_b_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.rng = np.random.RandomState()

        # two separately learned tables, each indexed by state name
        self.q_values_1 = {
            'A': np.zeros(num_actions),
            'B': np.zeros(num_b_actions),
        }
        self.q_values_2 = {
            'A': np.zeros(num_actions),
            'B': np.zeros(num_b_actions),
        }

    def set_seed(self, seed):
        """set random seed for reproducibility"""
        self.rng = np.random.RandomState(seed)

    def _average_q(self, state):
        """mean of the two estimates for every action in the given state"""
        return (self.q_values_1[state] + self.q_values_2[state]) / 2.0

    def get_action(self, state):
        """
        epsilon-greedy using average of both q-estimates

        with probability epsilon: select a uniformly random action
        otherwise: select an action maximizing the averaged estimate,
        breaking ties uniformly at random

        raises: KeyError if state is not 'A' or 'B'
        """
        avg_q = self._average_q(state)

        if self.rng.rand() < self.epsilon:
            # explore: action count is taken from the table for this state
            return self.rng.randint(0, len(avg_q))

        # exploit: collect ALL maximizers so ties are not always resolved
        # in favour of the lowest index
        best_actions = np.flatnonzero(avg_q == np.max(avg_q))
        return self.rng.choice(best_actions)

    def update(self, state, action, reward, next_state, next_action=None):
        """
        double q-learning update

        randomly choose to update either Q1 or Q2 on this step (50/50).
        the updated table picks the greedy next action; the other table
        supplies that action's value.

        next_state is None for a terminal transition; next_action is
        accepted for interface compatibility and ignored
        """
        if self.rng.rand() < 0.5:
            learner, evaluator = self.q_values_1, self.q_values_2
        else:
            learner, evaluator = self.q_values_2, self.q_values_1

        if next_state is None:
            # terminal transition: nothing to bootstrap from
            target = reward
        else:
            # select with the table being updated, evaluate with the other
            best_action = np.argmax(learner[next_state])
            target = reward + self.gamma * evaluator[next_state][best_action]

        td_error = target - learner[state][action]
        learner[state][action] += self.alpha * td_error

    def get_greedy_action_from_a(self):
        """
        return greedy action from state A (0=left, 1=right)
        based on average of both q-value estimates; ties are broken at random
        """
        avg_q = self._average_q('A')
        q_left = avg_q[0]
        q_right = avg_q[1]

        if q_left > q_right:
            return 0  # choose left
        if q_right > q_left:
            return 1  # choose right
        # tied: random choice
        return self.rng.choice([0, 1])


# ============================================================================
# TRAINING LOOP
# ============================================================================

def run_episode(agent, env):
    """
    run one complete episode with given agent and environment

    the agent learns online: after every environment step it is handed
    the (state, action, reward, next_state) transition via agent.update.

    parameters:
    - agent: object with get_action(state) and
      update(state, action, reward, next_state)
    - env: object with reset() -> state and
      step(action) -> (next_state, reward, is_terminal)

    returns: the action the agent took from state 'A' (None if the
    episode never visited 'A')
    """
    state = env.reset()
    action_in_a = None
    done = False

    # loop on the flag returned by step() rather than on env internals
    while not done:
        action = agent.get_action(state)
        if state == 'A':
            action_in_a = action

        next_state, reward, done = env.step(action)
        agent.update(state, action, reward, next_state)
        state = next_state

    return action_in_a


def run_experiment(num_episodes, num_runs, algorithm_type='qlearning'):
    """
    run complete experiment for one algorithm

    tracks which action is taken from state A in each episode.
    we want to track LEFT action percentage (action 0).

    parameters:
    - num_episodes: episodes per independent run
    - num_runs: number of independent runs to average over
    - algorithm_type: 'qlearning', or 'double_qlearning' / 'double'

    returns: numpy array of length num_episodes; entry i is the fraction
    (0.0 - 1.0) of runs in which LEFT was taken from A in episode i

    raises: ValueError for an unknown algorithm_type or num_runs < 1
    """
    agent_classes = {
        'qlearning': QLearningAgent,
        'double_qlearning': DoubleQLearningAgent,
        'double': DoubleQLearningAgent,
    }
    if algorithm_type not in agent_classes:
        raise ValueError(
            f"unknown algorithm_type {algorithm_type!r}; "
            f"expected one of {sorted(agent_classes)}"
        )
    if num_runs < 1:
        # the result is an average over runs, so at least one is required
        raise ValueError("num_runs must be at least 1")

    agent_cls = agent_classes[algorithm_type]
    left_counts = np.zeros(num_episodes)

    for run_idx in range(num_runs):
        # fresh environment and agent per run so runs are independent
        env = MaxBiasEnvironment(seed=run_idx)
        agent = agent_cls(env.get_num_states(), env.get_num_actions())
        # offset the agent seed: giving both generators the same seed
        # would make exploration draws and reward noise come from the
        # same underlying random stream
        agent.set_seed(num_runs + run_idx)

        for episode_idx in range(num_episodes):
            if run_episode(agent, env) == 0:
                left_counts[episode_idx] += 1

    return left_counts / num_runs


# ============================================================================
# VISUALIZATION
# ============================================================================

def create_figure_6_5(num_episodes=300, num_runs=10000, output_path="figure_6_5.png"):
    """
    recreate figure 6.5 from the textbook

    runs q-learning and double q-learning on the maximization-bias MDP
    and plots, per episode, the percentage of runs that chose LEFT from
    state A.

    parameters:
    - num_episodes: episodes per run
    - num_runs: number of independent runs averaged per curve
    - output_path: file the figure is saved to

    returns: None (the figure is saved to output_path and shown)
    """
    print("=" * 70)
    print("figure 6.5: maximization bias - q-learning vs double q-learning")
    print("=" * 70)

    print(f"\nrunning experiment with:")
    print(f"  episodes per run: {num_episodes}")
    print(f"  number of independent runs: {num_runs}")

    # run experiments for both algorithms
    left_q = run_experiment(num_episodes, num_runs, algorithm_type='qlearning')
    left_double = run_experiment(num_episodes, num_runs,
                                 algorithm_type='double_qlearning')

    # episodes are numbered from 1 on the x axis; fractions become percentages
    episodes = np.arange(1, num_episodes + 1)
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(episodes, 100.0 * np.asarray(left_q),
            color='red', label='Q-learning')
    ax.plot(episodes, 100.0 * np.asarray(left_double),
            color='green', label='Double Q-learning')

    # reference line: an epsilon-greedy agent that prefers RIGHT still
    # goes LEFT epsilon/2 of the time, i.e. 5% for epsilon = 0.1
    ax.axhline(5.0, color='black', linestyle='--', linewidth=1, label='optimal (5%)')

    # formatting
    ax.set_xlabel('episodes')
    ax.set_ylabel('% left actions from A')
    ax.set_title('maximization bias: q-learning vs double q-learning')
    ax.set_xlim(1, max(num_episodes, 2))
    ax.set_ylim(0, 100)
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    # save figure and show
    fig.savefig(output_path, dpi=150)
    plt.show()


# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # run the full experiment and render figure 6.5 with default settings
    create_figure_6_5()


