# In [None]:
#Niveen Abdul-Mohsen (bvn9ad)
# Reinforcement Learning (CS 4771) - Figure 5.1
# Monte Carlo on-policy
# I used numpy for numerical operations and matplotlib for plotting

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict

class BlackjackEnvironment:
    """
    Blackjack environment (Sutton & Barto Section 5.1, Example 5.1).

    Rules implemented:
    - The object of blackjack is to obtain cards whose sum is as great as
      possible without exceeding 21
    - All face cards count as 10, and an ace can count either as 1 or as 11
    - The game begins with two cards dealt to both dealer and player
    - One of the dealer's cards is face up (showing card) and one is face down

    State Space (200 states total):
    - Player's current sum (12-21): 10 possible values
    - Dealer's showing card (Ace-10): 10 possible values
    - Whether player holds a usable ace (True/False): 2 possible values
    - Total: 10 × 10 × 2 = 200 states

    Player sums below 12 never appear as decision states: ``reset`` keeps
    hitting for the player until the sum is at least 12, because hitting
    below 12 can never bust.

    Reward Structure:
    - +1 for winning, -1 for losing, 0 for a draw
    - All intermediate rewards are 0

    Deck model:
    - Cards are drawn with replacement (infinite deck), so draws are
      independent and card counting gives no advantage.
    - Values 1-9 each have probability 1/13; value 10 has probability 4/13
      because 10, J, Q and K all count as 10.

    Attributes:
        player_hand: List of card values currently held by the player.
        dealer_hand: List of card values held by the dealer; index 0 is the
            face-up card.
    """

    # One suit of the infinite deck: ace (1), 2-9, and four ten-valued ranks.
    # Sampling uniformly from this tuple gives the 4/13 weight for value 10.
    CARD_VALUES = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10)

    # Action encoding used by step() and by policies.
    STICK = 0
    HIT = 1

    def __init__(self, seed=None):
        """
        Initialize the blackjack environment.

        Args:
            seed: Optional seed for the random number generator. Passing the
                same seed reproduces the same sequence of cards. ``None``
                (the default) seeds from OS entropy.
        """
        self._rng = np.random.default_rng(seed)
        # Hands stay empty until reset() deals the first episode.
        self.player_hand = []
        self.dealer_hand = []

    def draw_card(self):
        """
        Draw a card from the infinite deck (with replacement).

        Returns:
            int: Card value in 1-10, where 1 is an ace and 10 stands for
            10, J, Q or K.
        """
        return self.CARD_VALUES[self._rng.integers(len(self.CARD_VALUES))]

    def usable_ace(self, hand):
        """
        Determine whether the hand has a usable ace.

        An ace is usable when counting it as 11 (i.e. adding 10 to the sum
        with all aces counted as 1) does not push the hand over 21.

        Args:
            hand: List of card values (aces stored as 1).

        Returns:
            bool: True if the hand has a usable ace, False otherwise.
        """
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(self, hand):
        """
        Calculate the sum of a hand, counting a usable ace as 11.

        Args:
            hand: List of card values (aces stored as 1).

        Returns:
            int: Sum of the hand with the ace counted optimally.
        """
        total = sum(hand)
        # At most one ace can ever count as 11 (two would already be 22).
        return total + 10 if self.usable_ace(hand) else total

    def is_bust(self, hand):
        """
        Check whether the hand is bust (sum > 21).

        Args:
            hand: List of card values.

        Returns:
            bool: True if bust, False otherwise.
        """
        return self.sum_hand(hand) > 21

    def _state(self):
        """Return the observation (player_sum, dealer_showing, usable_ace)."""
        return (
            self.sum_hand(self.player_hand),
            self.dealer_hand[0],
            self.usable_ace(self.player_hand),
        )

    def reset(self):
        """
        Start a new episode.

        Deals two cards each to the player and the dealer, then keeps
        hitting for the player while the sum is below 12 so the returned
        state always lies in the 12-21 decision range. A natural is not
        resolved here; it is scored when the player sticks in ``step``.

        Returns:
            state: Tuple of (player_sum, dealer_showing, usable_ace)
        """
        self.player_hand = [self.draw_card(), self.draw_card()]
        self.dealer_hand = [self.draw_card(), self.draw_card()]
        # A sum below 12 cannot bust on the next card (11 + 10 = 21), so
        # there is no decision to make until the sum reaches 12.
        while self.sum_hand(self.player_hand) < 12:
            self.player_hand.append(self.draw_card())
        return self._state()

    def step(self, action):
        """
        Execute one player action.

        Args:
            action: 0 = stick, 1 = hit

        Returns:
            next_state: Tuple (player_sum, dealer_showing, usable_ace) after
                the action. When the player busts the tuple is still
                returned, with player_sum > 21.
            reward: 0 while the game continues; +1 / -1 / 0 for a
                win / loss / draw when the episode ends.
            done: Whether the episode is complete.

        Raises:
            RuntimeError: If called before ``reset`` has dealt the hands.
            ValueError: If ``action`` is neither 0 nor 1.
        """
        if not self.player_hand or not self.dealer_hand:
            raise RuntimeError("reset() must be called before step()")

        if action == self.HIT:
            self.player_hand.append(self.draw_card())
            if self.is_bust(self.player_hand):
                return self._state(), -1, True
            return self._state(), 0, False

        if action == self.STICK:
            # A player natural wins outright unless the dealer's two cards
            # are also a natural; the dealer does not draw in that case.
            if self.natural(self.player_hand):
                reward = 0 if self.natural(self.dealer_hand) else 1
                return self._state(), reward, True

            dealer_sum, dealer_bust = self.dealer_policy()
            player_sum = self.sum_hand(self.player_hand)
            if dealer_bust or player_sum > dealer_sum:
                reward = 1
            elif player_sum < dealer_sum:
                reward = -1
            else:
                reward = 0
            return self._state(), reward, True

        raise ValueError(f"action must be 0 (stick) or 1 (hit), got {action!r}")

    def dealer_policy(self):
        """
        Play out the dealer's fixed strategy on ``self.dealer_hand``.

        The dealer sticks on any sum of 17 or greater and hits otherwise.
        Drawn cards are appended to ``self.dealer_hand``.

        Returns:
            final_sum: Dealer's final hand sum
            is_bust: Whether dealer went bust
        """
        while self.sum_hand(self.dealer_hand) < 17:
            self.dealer_hand.append(self.draw_card())
        final_sum = self.sum_hand(self.dealer_hand)
        return final_sum, final_sum > 21

    def natural(self, hand):
        """
        Check whether the hand is a natural (exactly an ace and a 10-card).

        Args:
            hand: List of card values.

        Returns:
            bool: True if the hand is a two-card 21, False otherwise.
        """
        return len(hand) == 2 and sorted(hand) == [1, 10]



def simple_policy(state):
    """
    The policy to evaluate (from Figure 5.1 caption)

    From textbook Section 5.1:
    "Consider the policy that sticks if the player's sum is 20 or 21,
    and otherwise hits."

    This is a fixed, deterministic policy that looks only at the player's
    sum; the dealer's card and the usable-ace flag are ignored.

    Args:
        state: Tuple of (player_sum, dealer_showing, usable_ace)

    Returns:
        action: 0 = stick, 1 = hit
    """
    player_sum = state[0]
    return 0 if player_sum >= 20 else 1

def first_visit_mc_prediction(policy, env, num_episodes, gamma=1.0):
    """
    First-visit Monte Carlo policy evaluation

    Algorithm from Sutton & Barto Section 5.1:

    Input: a policy π to be evaluated
    Initialize:
        V(s) ∈ ℝ, arbitrarily, for all s ∈ S
        Returns(s) ← an empty list, for all s ∈ S
    Loop (for each episode):
        Generate an episode following π: S₀, A₀, R₁, ..., S_{T-1}, A_{T-1}, R_T
        G ← 0
        Loop for each step of episode, t = T-1, T-2, ..., 0:
            G ← γG + R_{t+1}
            Unless S_t appears in S₀, S₁, ..., S_{t-1}:
                Append G to Returns(S_t)
                V(S_t) ← average(Returns(S_t))

    Instead of storing every return in a list, this implementation keeps a
    running sum and count per state; their ratio is the same sample mean
    with constant memory per state.

    Args:
        policy: Function mapping a state to an action
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``. States must be
            hashable.
        num_episodes: Number of episodes to run (10,000 or 500,000 for Figure 5.1)
        gamma: Discount factor (1.0 for blackjack as per textbook)

    Returns:
        V: defaultdict(float) mapping each visited state to the average
           return following its first visits. States never visited are not
           stored and read as 0.0.
    """
    V = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)

    for _ in range(num_episodes):
        # Generate an episode following the policy; each entry is
        # (S_t, A_t, R_{t+1}).
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward))
            state = next_state

        # Record the earliest time step at which each state occurs so the
        # first-visit check in the backward pass is a single dict lookup.
        first_visit = {}
        for t, (visited, _, _) in enumerate(episode):
            first_visit.setdefault(visited, t)

        # Walk backwards accumulating the return G ← γG + R_{t+1}.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            visited, _, reward = episode[t]
            G = gamma * G + reward
            if first_visit[visited] == t:
                returns_sum[visited] += G
                returns_count[visited] += 1
                V[visited] = returns_sum[visited] / returns_count[visited]

    return V


def state_value_to_arrays(V, usable_ace):
    """
    Convert state-value dictionary to a 2D array for plotting

    The state space for plotting is:
    - X-axis: Dealer showing card (1-10, representing Ace through 10)
    - Y-axis: Player sum (12-21)
    - Z-axis: Value V(s)
    - Separate plots for usable ace vs. no usable ace

    Args:
        V: Dictionary mapping (player_sum, dealer_showing, usable_ace) -> value
        usable_ace: Boolean, which ace condition to extract

    Returns:
        values: 2D numpy array of shape (10, 10) for plotting
                rows = player sum (12-21)
                cols = dealer showing (1-10)
                States missing from V are filled with 0.0.
    """
    player_sums = range(12, 22)
    dealer_cards = range(1, 11)
    values = np.zeros((len(player_sums), len(dealer_cards)))
    for row, player_sum in enumerate(player_sums):
        for col, dealer_showing in enumerate(dealer_cards):
            # .get() avoids inserting keys when V is a defaultdict.
            values[row, col] = V.get((player_sum, dealer_showing, usable_ace), 0.0)
    return values


def plot_value_function(V, num_episodes, title_suffix=""):
    """
    Create 3D surface plots of a value function in the style of Figure 5.1

    One figure is drawn per call, with two side-by-side panels:
    - Left: states with a usable ace
    - Right: states with no usable ace

    Each panel shows:
    - X-axis: Dealer showing (A, 2, 3, ..., 10)
    - Y-axis: Player sum (12, 13, ..., 21)
    - Z-axis: State value, limited to [-1, +1]

    The value grids are obtained from ``state_value_to_arrays``, which is
    expected to return a (10, 10) array with rows = player sum and
    cols = dealer showing.

    Args:
        V: Value function dictionary
        num_episodes: Number of episodes run (for title)
        title_suffix: Additional text appended to the figure title
    """
    dealer_cards = np.arange(1, 11)
    player_sums = np.arange(12, 22)
    # Default 'xy' indexing gives grids of shape (len(player_sums),
    # len(dealer_cards)), matching the rows/cols of the value array.
    X, Y = np.meshgrid(dealer_cards, player_sums)

    fig = plt.figure(figsize=(14, 6))
    panels = ((True, "Usable ace"), (False, "No usable ace"))
    for index, (usable_ace, panel_title) in enumerate(panels, start=1):
        Z = state_value_to_arrays(V, usable_ace)
        ax = fig.add_subplot(1, 2, index, projection="3d")
        ax.plot_surface(X, Y, Z, cmap="viridis", edgecolor="k", linewidth=0.3)
        ax.set_xlabel("Dealer showing")
        ax.set_ylabel("Player sum")
        ax.set_zlabel("Value")
        ax.set_title(panel_title)
        ax.set_xticks(dealer_cards)
        ax.set_xticklabels(["A"] + [str(card) for card in range(2, 11)])
        ax.set_yticks(player_sums)
        ax.set_zlim(-1, 1)

    fig.suptitle(f"After {num_episodes:,} episodes {title_suffix}".strip())
    fig.tight_layout()
    plt.show()

def main():
    """
    Main function to recreate Figure 5.1

    Evaluates ``simple_policy`` with first-visit Monte Carlo prediction and
    plots the estimated state-value function twice:
    1. After 10,000 episodes
    2. After 500,000 episodes

    From textbook Section 5.1:
    "The estimates for states with a usable ace are less certain and less
    regular because these states are less common. In any event, after
    500,000 games the value function is very well approximated."
    """

    print("Recreating Figure 5.1: Blackjack Monte Carlo Policy Evaluation")
    print("=" * 70)

    # One environment is reused for both runs; each episode starts with
    # env.reset(), so no state leaks between them.
    env = BlackjackEnvironment()

    # Run First-Visit MC for 10,000 episodes
    print("Running First-Visit Monte Carlo Prediction...")
    print("Episodes: 10,000")
    V_10k = first_visit_mc_prediction(simple_policy, env, 10000)
    print("Completed 10,000 episodes")

    # Plot results for 10,000 episodes
    plot_value_function(V_10k, 10000)

    # Run First-Visit MC for 500,000 episodes
    print("\nEpisodes: 500,000")
    V_500k = first_visit_mc_prediction(simple_policy, env, 500000)
    print("Completed 500,000 episodes")

    # Plot results for 500,000 episodes
    plot_value_function(V_500k, 500000)
    

# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

