# DCEO Agent Training with Visualization

This notebook runs the DCEO training while capturing and displaying visualizations properly.

In [1]:
# Import necessary libraries
import os
import sys
import torch
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, clear_output
import time

# Check CUDA
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device: NVIDIA GeForce RTX 3070 Ti


In [2]:
# Import your environment and agent classes
from key_door_maze_env import KeyDoorMazeEnv
from key_maze_wrapper import KeyMazeWrapper
from key_maze_dceo_agent import KeyMazeDCEOAgent
import compare_key_maze_agents as cmp

usage: ipykernel_launcher.py [-h] [--episodes EPISODES]
                             [--eval_episodes EVAL_EPISODES]
                             [--eval_interval EVAL_INTERVAL]
                             [--maze_size MAZE_SIZE] [--max_steps MAX_STEPS]
                             [--num_keys NUM_KEYS] [--results_dir RESULTS_DIR]
                             [--fixed_maze] [--maze_seed MAZE_SEED]
                             [--seed SEED] [--epsilon_start EPSILON_START]
                             [--epsilon_end EPSILON_END]
                             [--epsilon_decay EPSILON_DECAY]
                             [--buffer_size BUFFER_SIZE]
                             [--skip {standard,count,rnd,dceo} [{standard,count,rnd,dceo} ...]]
                             [--plot_results]
ipykernel_launcher.py: error: argument --fixed_maze: ignored explicit argument 'c:\\Users\\abrio\\AppData\\Roaming\\jupyter\\runtime\\kernel-v3ac9c7fdb91c411eeb96a04f211519d35dbdeca0a.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [None]:
# Temporarily modify the rendering behavior for notebook display
# Original savefig function
original_savefig = plt.savefig

# Create a patched version that will display in notebook
def patched_savefig(*args, **kwargs):
    # Call the original savefig function
    result = original_savefig(*args, **kwargs)
    
    # Get the current figure and display it
    fig = plt.gcf()
    display(fig)
    plt.close(fig)  # Close to prevent double display
    
    return result

# Replace the savefig function with our patched version
plt.savefig = patched_savefig

# Make other necessary adjustments for notebook display
plt.ion()  # Turn on interactive mode

In [None]:
# Create a visualization function that works with the notebook
def visualize_maze(env):
    """Display the current state of the maze environment"""
    plt.figure(figsize=(8, 8))
    maze_img = env.render(mode='rgb_array')
    if maze_img is not None:
        plt.imshow(maze_img)
        plt.title("Maze Visualization")
        plt.axis('off')
        plt.show()
    else:
        print("Unable to render maze image")

In [None]:
# Set up parameters (same as your command line)
episodes = 20
eval_interval = 10
maze_size = 10
num_keys = 1
max_steps = 300
fixed_maze = True
maze_seed = 42
seed = 42
buffer_size = 100000

# Set seeds for reproducibility
cmp.set_seeds(seed)

# Print configuration
print(f"Training for {episodes} episodes")
print(f"Evaluating every {eval_interval} episodes with {eval_interval} eval episodes")
print(f"Maze size: {maze_size}, Keys: {num_keys}, Max steps: {max_steps}")
print(f"Using fixed maze: {fixed_maze}, Maze seed: {maze_seed}")

# Configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Create environment
env = KeyDoorMazeEnv(
    maze_size=maze_size, 
    num_keys=num_keys, 
    max_steps=max_steps,
    use_fixed_layout=fixed_maze,
    use_fixed_seed=fixed_maze,
    fixed_seed=maze_seed
)

# Visualize the initial maze
visualize_maze(env)

# Wrap environment
env = KeyMazeWrapper(
    env,
    frame_stack=4,
    resize_shape=(84, 84),
    proximity_reward=True
)

# Get observation shape
obs, _ = env.reset()
pytorch_shape = obs.shape
print(f"Environment: Key-Door Maze")
print(f"Observation shape: {obs.shape}, PyTorch shape: {pytorch_shape}")
print(f"Action size: {env.action_space.n}")

In [None]:
# Create DCEO agent
print("\n===== Creating Rainbow DCEO Agent =====")
num_options = 8
rep_dim = 8
gamma = 0.99
learning_rate = 1e-4

dceo_agent = KeyMazeDCEOAgent(
    state_shape=pytorch_shape,
    action_size=env.action_space.n,
    buffer_size=buffer_size,
    learning_rate=learning_rate,
    gamma=gamma,
    num_options=num_options,
    rep_dim=rep_dim,
    option_prob=0.8,
    option_duration=15,
    prioritized_replay=True,
    noisy_nets=True,
    dueling=True,
    double_dqn=True,
    eps_start=1.0,
    eps_end=0.01,
    eps_decay=0.995
)

# Move agent to device
dceo_agent.to(device)

In [None]:
# Modified train_single_episode function with visualization for notebook
def train_single_episode(agent, env, viz_interval=10, max_steps=300):
    """Train the agent for a single episode with visualization."""
    # Reset the environment
    obs, _ = env.reset()
    done = False
    episode_reward = 0
    steps = 0
    keys_collected = 0
    doors_opened = 0
    option_changes = 0
    prev_option = None
    
    # Get current epsilon for display
    if hasattr(agent, 'epsilon'):
        epsilon = agent.epsilon
        print(f"Starting episode with epsilon: {epsilon:.4f}")
    
    # Create figure for visualization
    fig, ax = plt.subplots(figsize=(8, 8))
    
    # Episode loop
    while not done and steps < max_steps:
        # Select action
        action = agent.select_action(obs)
        
        # Apply action to environment
        step_result = env.step(action)
        if len(step_result) == 5:
            next_obs, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_obs, reward, done, info = step_result
        
        # Update agent
        agent.step(obs, action, reward, next_obs, done)
        obs = next_obs
        
        # Track metrics
        steps += 1
        episode_reward += reward
        
        # Track option changes
        if hasattr(agent, 'current_option'):
            if agent.current_option != prev_option:
                option_changes += 1
                prev_option = agent.current_option
                print(f"  Step {steps}: Switched to option {agent.current_option}")
        
        # Track key collection and door opening
        if info.get('key_collected', False):
            keys_collected += 1
            print(f"  Step {steps}: Collected key {keys_collected}")
            
        if info.get('door_opened', False):
            doors_opened += 1
            print(f"  Step {steps}: Opened door {doors_opened}")
        
        # Periodic progress report
        if steps % 50 == 0:
            if hasattr(agent, 'epsilon'):
                epsilon = agent.epsilon
            print(f"  Step {steps}: Reward so far: {episode_reward:.2f}, Keys: {keys_collected}, Doors: {doors_opened}")
            print(f"  Current epsilon: {epsilon:.4f}")
            print(f"  Current option: {agent.current_option if hasattr(agent, 'current_option') else 'N/A'}")
        
        # Visualization
        if steps % viz_interval == 0 or info.get('key_collected', False) or info.get('door_opened', False) or done:
            # Clear previous plot
            ax.clear()
            
            # Get maze image from the unwrapped environment
            maze_img = env.unwrapped.render(mode='rgb_array')
            if maze_img is not None:
                ax.imshow(maze_img)
                ax.set_title(f"Step: {steps}, Reward: {episode_reward:.2f}\nKeys: {keys_collected}, Doors: {doors_opened}")
                ax.axis('off')
                display(fig)
                clear_output(wait=True)
                time.sleep(0.1)  # Short pause for animation effect
    
    # Close the figure
    plt.close(fig)
    
    # Episode summary
    print(f"Episode completed in {steps} steps")
    print(f"  Total reward: {episode_reward:.2f}")
    print(f"  Keys collected: {keys_collected}/{num_keys}")
    print(f"  Doors opened: {doors_opened}")
    print(f"  Goal reached: {info.get('goal_reached', False)}")
    print(f"  Option changes: {option_changes}")
    
    return episode_reward, keys_collected, doors_opened, info.get('goal_reached', False)

In [None]:
# Training loop with visualization
def train_with_viz():
    # Tracking metrics
    rewards = []
    eval_rewards = []
    eval_success_rates = []
    eval_key_rates = []
    eval_door_rates = []
    eval_episodes = []
    
    # Training loop
    for episode in range(1, episodes + 1):
        print(f"\n===== Starting Episode {episode}/{episodes} =====")
        
        # Train for one episode
        episode_reward, keys, doors, success = train_single_episode(
            dceo_agent, env, viz_interval=5, max_steps=max_steps)
        
        rewards.append(episode_reward)
        
        # Evaluation phase
        if episode % eval_interval == 0 or episode == episodes:
            print(f"\n----- Evaluating after episode {episode} -----")
            
            # Use your evaluation function
            eval_results = cmp.evaluate_agent(
                dceo_agent, env, num_episodes=eval_interval, epsilon=0.01)
            
            # Store evaluation results
            eval_rewards.append(eval_results['avg_reward'])
            eval_success_rates.append(eval_results['success_rate'])
            eval_key_rates.append(eval_results['avg_keys'])
            eval_door_rates.append(eval_results['avg_doors'])
            eval_episodes.append(episode)
            
            # Print evaluation results
            print(f"Evaluation Results:")
            print(f"  Average Reward: {eval_results['avg_reward']:.2f}")
            print(f"  Success Rate: {eval_results['success_rate']:.2f}")
            print(f"  Average Keys: {eval_results['avg_keys']:.2f}")
            print(f"  Average Doors: {eval_results['avg_doors']:.2f}")
            
            # Plot current progress
            plt.figure(figsize=(12, 8))
            
            plt.subplot(2, 2, 1)
            plt.plot(range(1, len(rewards) + 1), rewards, 'b-')
            plt.title('Training Rewards')
            plt.xlabel('Episode')
            plt.ylabel('Reward')
            plt.grid(True)
            
            if len(eval_rewards) > 0:
                plt.subplot(2, 2, 2)
                plt.plot(eval_episodes, eval_rewards, 'r-')
                plt.title('Evaluation Rewards')
                plt.xlabel('Episode')
                plt.ylabel('Reward')
                plt.grid(True)
                
                plt.subplot(2, 2, 3)
                plt.plot(eval_episodes, eval_success_rates, 'g-')
                plt.title('Success Rate')
                plt.xlabel('Episode')
                plt.ylabel('Rate')
                plt.grid(True)
                
                plt.subplot(2, 2, 4)
                plt.plot(eval_episodes, eval_key_rates, 'y-', label='Keys')
                plt.plot(eval_episodes, eval_door_rates, 'b-', label='Doors')
                plt.title('Key and Door Collection Rates')
                plt.xlabel('Episode')
                plt.ylabel('Count')
                plt.legend()
                plt.grid(True)
            
            plt.tight_layout()
            plt.show()
            
            # Save the model checkpoint
            torch.save(dceo_agent.state_dict(), f'dceo_agent_ep{episode}.pth')
            print(f"Model checkpoint saved to dceo_agent_ep{episode}.pth")
    
    # Final results plotting
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 2, 1)
    plt.plot(range(1, len(rewards) + 1), rewards, 'b-')
    plt.title('Training Rewards')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.grid(True)
    
    plt.subplot(2, 2, 2)
    plt.plot(eval_episodes, eval_rewards, 'r-')
    plt.title('Evaluation Rewards')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.grid(True)
    
    plt.subplot(2, 2, 3)
    plt.plot(eval_episodes, eval_success_rates, 'g-')
    plt.title('Success Rate')
    plt.xlabel('Episode')
    plt.ylabel('Rate')
    plt.grid(True)
    
    plt.subplot(2, 2, 4)
    plt.plot(eval_episodes, eval_key_rates, 'y-', label='Keys')
    plt.plot(eval_episodes, eval_door_rates, 'b-', label='Doors')
    plt.title('Key and Door Collection Rates')
    plt.xlabel('Episode')
    plt.ylabel('Count')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig('final_results.png')
    plt.show()
    
    print("Training complete! Final results saved to 'final_results.png'")
    
    # Save the final model
    torch.save(dceo_agent.state_dict(), 'dceo_agent_final.pth')
    print("Final model saved to 'dceo_agent_final.pth'")
    
    # Return results data
    return {
        'train_rewards': rewards,
        'eval_rewards': eval_rewards,
        'success_rates': eval_success_rates,
        'key_rates': eval_key_rates,
        'door_rates': eval_door_rates,
        'eval_episodes': eval_episodes
    }

In [None]:
# Run the training with visualization
results = train_with_viz()

In [None]:
# Save results (optional)
import pickle
with open('training_results.pkl', 'wb') as f:
    pickle.dump(results, f)
print("Results saved to 'training_results.pkl'")