<div style="background: linear-gradient(90deg, #17a2b8 0%, #0e5a63 60%, #0a3d44 100%); color: white; padding: 18px 25px; margin-bottom: 20px;">
    <div style="display: flex; justify-content: space-between; align-items: baseline;">
        <h1 style="font-family: 'Helvetica Neue', sans-serif; font-size: 24px; margin: 0; font-weight: 300;">
            Lab 5-1: Blackjack with Monte Carlo ES
        </h1>
    </div>
    <p style="font-size: 13px; margin-top: 6px; margin-bottom: 0; opacity: 0.9;">
        IE 7295 Reinforcement Learning | Sutton and Barto Chapter 5 | 75 minutes
    </p>
</div>

<div style="background: white; padding: 15px 20px; margin-bottom: 12px; border-left: 3px solid #17a2b8;">
    <h3 style="color: #17a2b8; font-size: 14px; margin: 0 0 8px 0;">Background</h3>
    <p style="color: #555; line-height: 1.6; margin: 0; font-size: 13px;">
        This lab implements Monte Carlo ES exactly as described in the textbook Figure 5.2. 
        The key insight is Exploring Starts: each episode begins with a randomly chosen state-action pair 
        (here, a randomly dealt hand followed by a random first action), so every pair has a nonzero chance of being explored.
    </p>
</div>

<table style="width: 100%; border-spacing: 12px;">
<tr>
<td style="background: white; padding: 12px 15px; border-top: 3px solid #17a2b8; width: 50%;">
    <h4 style="color: #17a2b8; font-size: 13px; margin: 0 0 8px 0;">Learning Objectives</h4>
    <ul style="color: #555; line-height: 1.4; margin: 0; padding-left: 18px; font-size: 12px;">
        <li>Implement Monte Carlo ES algorithm</li>
        <li>Understand exploring starts mechanism</li>
        <li>Learn optimal Blackjack policy</li>
        <li>Reproduce textbook results</li>
    </ul>
</td>
<td style="background: white; padding: 12px 15px; border-top: 3px solid #00acc1; width: 50%;">
    <h4 style="color: #00acc1; font-size: 13px; margin: 0 0 8px 0;">Blackjack Rules</h4>
    <div style="color: #555; font-size: 12px; line-height: 1.6;">
        <div>Actions: 0=Stick, 1=Hit</div>
        <div>States: (player sum, dealer's showing card, usable ace)</div>
        <div>Rewards: +1 (win), 0 (draw), -1 (loss)</div>
    </div>
</td>
</tr>
</table>

---
<div style="border-left: 4px solid #17a2b8; padding-left: 12px; margin: 20px 0;">
  <h2 style="color: #17a2b8; margin: 0; font-size: 18px;">Section 1: Environment Setup</h2>
</div>

In [None]:
import sys
import gymnasium as gym
import numpy as np
from collections import defaultdict
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import warnings
# NOTE: this silences *all* warnings for the rest of the notebook, including
# deprecation warnings from matplotlib/gymnasium. Remove while debugging.
warnings.filterwarnings('ignore')

plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = (12, 8)

# Seed used to make the stochastic experiment reproducible across runs.
SEED = 42

# Best-effort: pull in the course's pretty-printing helpers. Any failure
# (no network, HTTP error, missing `requests`, bad payload) falls back to a
# plain message instead of aborting the notebook.
try:
    import requests
    url = 'https://raw.githubusercontent.com/mdehghani86/RL_labs/master/utility/rl_utility.py'
    # A timeout keeps the cell from hanging forever on a dead connection, and
    # raise_for_status() prevents an HTTP error page from being executed.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # SECURITY: exec() runs whatever code the remote URL serves, with full
    # access to this kernel. Only keep this if the repository is trusted.
    exec(response.text)
    # pretty_print is presumably defined by the downloaded utility module
    # (not visible in this notebook).
    pretty_print("Environment Ready", 
                 "Implementing MC ES from Figure 5.2", 
                 style='success')
except Exception:
    # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupt
    # still propagates.
    print("Libraries loaded")

env = gym.make('Blackjack-v1')

# Seed the environment's RNG (card draws) and the action-space sampler
# (exploring-start actions) so a fresh Restart & Run All gives the same result.
env.reset(seed=SEED)
env.action_space.seed(SEED)

---
<div style="border-left: 4px solid #17a2b8; padding-left: 12px; margin: 20px 0;">
  <h2 style="color: #17a2b8; margin: 0; font-size: 18px;">Section 2: Monte Carlo ES Algorithm</h2>
</div>

<div style="text-align: center; margin: 20px 0;">
    <img src="https://github.com/mdehghani86/RL_labs/blob/master/Lab%2005/MCM_ES.jpg?raw=true" 
         alt="Monte Carlo ES" 
         style="width: 70%; border: 2px solid #17a2b8; border-radius: 8px;">
</div>

In [None]:
def generate_episode_es(env, policy):
    """Generate one episode using an exploring start.

    The start state is whatever ``env.reset()`` returns; the first action is
    drawn at random from ``env.action_space``. Every later action comes from
    ``policy`` when it has an entry for the current state, and is sampled at
    random otherwise.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()`` in the Gymnasium 5-tuple step style.
        policy: Mapping from state to action. May be empty or partial.

    Returns:
        List of ``(state, action, reward)`` tuples, one per step, in the order
        they occurred. ``state`` is the state in which ``action`` was taken.
    """
    episode = []
    state, _ = env.reset()

    # EXPLORING START: random first action, regardless of the current policy.
    action = env.action_space.sample()
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    episode.append((state, action, reward))

    # Follow the current policy for the rest of the episode.
    state = next_state
    while not done:
        # Sample a fallback action only when the state has no policy entry.
        # Using policy.get(state, env.action_space.sample()) would draw (and
        # discard) a random action on every step, wasting RNG calls.
        if state in policy:
            action = policy[state]
        else:
            action = env.action_space.sample()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode.append((state, action, reward))
        state = next_state

    return episode

print("Episode generation ready")

In [None]:
def monte_carlo_es(env, num_episodes=500000):
    """Estimate action values and a greedy policy with first-visit Monte Carlo ES.

    Each episode is produced by ``generate_episode_es`` under the current
    policy. Returns are undiscounted (``G = reward + G``). For the first
    occurrence of every (state, action) pair in an episode, ``Q`` is moved to
    the running average of the returns observed for that pair, and the policy
    for that state is set to the greedy action.

    Args:
        env: Environment passed through to ``generate_episode_es``; its
            ``action_space.n`` sizes each state's action-value vector.
        num_episodes: Number of episodes to simulate.

    Returns:
        Tuple ``(Q, policy)``: ``Q`` is a ``defaultdict`` mapping state to a
        NumPy array of action values; ``policy`` is a dict mapping state to
        the greedy action index.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # Number of returns averaged into Q for each (state, action) pair. Keeping
    # a count instead of the full list of returns lets the mean be updated in
    # O(1) rather than re-averaging an ever-growing list on every visit.
    visit_counts = defaultdict(int)
    policy = {}

    print(f"Starting MC ES with {num_episodes:,} episodes...")

    for ep in range(1, num_episodes + 1):
        episode = generate_episode_es(env, policy)

        # Index of the earliest step at which each pair occurs. A backward
        # pass with a "seen" set would instead pick the *last* occurrence,
        # which is not first-visit MC when a pair repeats within an episode.
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + G
            sa = (state, action)

            if first_visit[sa] != t:
                continue

            visit_counts[sa] += 1
            # Incremental mean: Q <- Q + (G - Q) / N
            Q[state][action] += (G - Q[state][action]) / visit_counts[sa]
            policy[state] = np.argmax(Q[state])

        if ep % 100000 == 0:
            print(f"Episode {ep:,}")

    print("\nMC ES complete")
    return Q, policy

print("MC ES algorithm ready")

---
<div style="border-left: 4px solid #17a2b8; padding-left: 12px; margin: 20px 0;">
  <h2 style="color: #17a2b8; margin: 0; font-size: 18px;">Section 3: Visualization</h2>
</div>

In [None]:
def plot_value_function(Q):
    """Draw the state-value surface implied by Q as two stacked 3D plots.

    The value of a state is the maximum of its action values; states that are
    not present in ``Q`` are plotted as 0. The top panel covers states with a
    usable ace, the bottom panel states without one. Player sums 12-21 and
    dealer cards 1-10 are shown.

    Args:
        Q: Mapping from ``(player_sum, dealer_card, usable_ace)`` to an array
            of action values.
    """
    player_sums = np.arange(12, 22)
    dealer_cards = np.arange(1, 11)

    def state_value(player_sum, dealer_card, usable_ace):
        # Membership test first so a defaultdict is not populated by lookups.
        key = (player_sum, dealer_card, usable_ace)
        if key not in Q:
            return 0
        return np.max(Q[key])

    def draw_surface(usable_ace, ax):
        grid_x, grid_y = np.meshgrid(player_sums, dealer_cards)
        heights = np.array([
            [state_value(p, d, usable_ace) for p in player_sums]
            for d in dealer_cards
        ])
        surface = ax.plot_surface(
            grid_x, grid_y, heights,
            cmap=cm.coolwarm, vmin=-1, vmax=1, alpha=0.8,
        )
        ax.set_xlabel('Player Sum')
        ax.set_ylabel('Dealer')
        ax.set_zlabel('Value')
        ax.set_zlim(-1, 1)
        ax.view_init(25, -130)
        return surface

    fig = plt.figure(figsize=(14, 10))
    panels = (
        (211, 'With Usable Ace', True),
        (212, 'No Usable Ace', False),
    )
    for position, title, usable_ace in panels:
        ax = fig.add_subplot(position, projection='3d')
        ax.set_title(title, fontweight='bold')
        surface = draw_surface(usable_ace, ax)
        fig.colorbar(surface, ax=ax, shrink=0.5)
    plt.tight_layout()
    plt.show()

def plot_policy(pol):
    """Draw the policy as two side-by-side heatmaps (usable ace / no usable ace).

    Each cell is one ``(player_sum, dealer_card)`` combination for player sums
    12-21 and dealer cards 1-10, coloured by action (0=STICK, 1=HIT). States
    missing from ``pol`` are shown as HIT.

    Args:
        pol: Mapping from ``(player_sum, dealer_card, usable_ace)`` to an
            action index.
    """
    def get_a(ps, dc, ua):
        # Default to HIT (1) for states the policy has no entry for.
        return pol.get((ps, dc, ua), 1)

    def create_hm(ua, ax):
        pr = np.arange(12, 22)
        dr = np.arange(1, 11)
        Z = np.array([[get_a(p, d, ua) for p in pr] for d in dr])
        # shading='flat' needs cell *edges*: one more coordinate than cells on
        # each axis. Passing the 10 centres directly either raises TypeError
        # (current Matplotlib) or silently drops the last row and column
        # (older releases). Edges at +/-0.5 centre each cell on its tick.
        x_edges = np.arange(pr[0], pr[-1] + 2) - 0.5
        y_edges = np.arange(dr[0], dr[-1] + 2) - 0.5
        im = ax.pcolormesh(x_edges, y_edges, Z, cmap='RdYlGn_r', edgecolors='black',
                          linewidth=0.5, vmin=0, vmax=1, shading='flat')
        ax.set_xticks(pr)
        ax.set_yticks(dr)
        ax.set_yticklabels(['A'] + list(range(2, 11)))
        ax.set_xlabel('Player Sum')
        ax.set_ylabel('Dealer')
        ax.set_aspect('equal')
        # Two labelled ticks inside the 0..1 colour range stand in for the
        # two discrete actions.
        cbar = plt.colorbar(im, ax=ax, ticks=[0.25, 0.75])
        cbar.ax.set_yticklabels(['STICK', 'HIT'])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    ax1.set_title('With Usable Ace', fontweight='bold')
    create_hm(True, ax1)
    ax2.set_title('No Usable Ace', fontweight='bold')
    create_hm(False, ax2)
    plt.tight_layout()
    plt.show()

print("Visualization functions ready")

---
<div style="border-left: 4px solid #17a2b8; padding-left: 12px; margin: 20px 0;">
  <h2 style="color: #17a2b8; margin: 0; font-size: 18px;">Section 4: Run Experiments</h2>
</div>

In [None]:
# Train: learn action values and the greedy policy from simulated episodes.
Q, policy = monte_carlo_es(env, num_episodes=500000)

# Summarise: how many learned states prefer each action.
total = len(policy)
stick = len([a for a in policy.values() if a == 0])
hit = len([a for a in policy.values() if a == 1])

print(f"\nResults:")
print(f"States: {total}")
print(f"STICK: {stick} ({100*stick/total:.1f}%)")
print(f"HIT: {hit} ({100*hit/total:.1f}%)")

In [None]:
# Render the learned results: value surfaces first, then the policy heatmaps.
# Relies on Q and policy produced by the training cell above.
print("Generating visualizations...")
plot_value_function(Q)  # 3D surfaces of max-over-actions value per state
plot_policy(policy)     # STICK/HIT heatmaps, with and without a usable ace
print("Complete - results should match textbook Figure 5.2")

<div style="background: linear-gradient(90deg, #17a2b8 0%, #0e5a63 60%, #0a3d44 100%); color: white; padding: 15px 20px; margin-top: 30px; text-align: center;">
    <p style="margin: 0;">End of Lab 5-1: Monte Carlo ES</p>
</div>