# Algorithm Comparison for Behavioral Cloning

This notebook provides a comparison of different neural network architectures for behavioral cloning in the risky navigation environment.

**Algorithms Evaluated:**
- **Linear**: Simple linear regression baseline
- **AutoEncoder**: Neural network encoder-decoder architecture  
- **Bayesian**: Bayesian neural network with uncertainty quantification
- **Transformer**: Self-attention based model
- **VAE**: Variational AutoEncoder with probabilistic latent representations

**Workflow:**
1. **Data Collection**: Collect expert demonstrations by rolling out a visibility-graph-based expert policy in the environment
2. **Model Training**: Train each algorithm with a simple, transparent training loop
3. **Evaluation**: Test models in environment and compare performance
4. **Analysis**: Visualize results and compare metrics

This simplified approach prioritizes debuggability and clarity over automated hyperparameter optimization.

## Import Required Libraries

In [None]:
!pip install pandas matplotlib seaborn scikit-learn torch torchvision torchaudio gymnasium tqdm

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import time
import os
import warnings
from tqdm import tqdm
from datetime import datetime
import random

# Print the absolute working directory; relative paths used later in the notebook
# (e.g. the dataset pickle) resolve against it.
print(os.path.abspath('.'))

# Set plotting style
# The matplotlib style sheet is applied first, then seaborn's palette is switched to "husl".
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and numerical warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')


/


In [2]:
import os
import sys

# Location of the project checkout that provides the `src` package. Defaults to the
# original hard-coded path; set the RISKY_NAVIGATION_ROOT environment variable to run
# the notebook against a checkout that lives somewhere else.
PROJECT_ROOT = os.environ.get('RISKY_NAVIGATION_ROOT', '/risky_navigation')
if PROJECT_ROOT not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

# Project-local imports; these must come after the sys.path adjustment above.
from src.env.continuous_nav_env import ContinuousNavigationEnv
from src.algorithms.AutoEncoder.agent import AutoEncoderAgent
from src.algorithms.Bayesian.agent import BayesianAgent
from src.algorithms.Transformer.agent import TransformerAgent
from src.algorithms.Linear.agent import LinearAgent
from src.algorithms.VAE.agent import VAEAgent
from src.utils.file_management import save_pickle, load_pickle

print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}")


All libraries imported successfully!
PyTorch version: 2.8.0+cu128
Device available: CUDA


## Config

In [3]:
# Central experiment settings shared by data collection, training and evaluation.
CONFIG = {
    'seed': 42,                # seed applied to the Python, NumPy and PyTorch RNGs below
    'num_episodes': 1000,      # expert episodes rolled out when collecting the dataset
    'max_steps': 200,          # per-episode step cap (collection and evaluation)
    'batch_size': 256,         # mini-batch size of the training loop
    'num_epochs': 100,         # passes over the training split per algorithm
    'val_ratio': 0.2,          # fraction of transitions held out for validation
    'num_test_episodes': 50,   # evaluation rollouts per trained agent
    'lr': 1e-3,                # learning rate handed to every agent constructor
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
}

# Per-algorithm keyword arguments forwarded to the matching agent constructor.
MODEL_CONFIGS = {
    'AutoEncoder': {'latent_dim': 32, 'hidden_dims': [128, 64]},
    'Linear': {},
    'Transformer': {'d_model': 64, 'nhead': 4, 'num_layers': 2, 'dropout': 0.1},
    'Bayesian': {'hidden_dim': 128, 'prior_std': 1.0},
    'VAE': {'latent_dim': 32, 'hidden_dim': 128, 'beta': 1.0}
}

# Seed every RNG the notebook draws from (data shuffling, train/val split, weight init)
# so that Restart & Run All reproduces the same numbers.
# NOTE(review): the environment may keep its own RNG that these calls do not reach — confirm.
random.seed(CONFIG['seed'])
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])

print(f"Using device: {CONFIG['device']}")
print(f"Batch size: {CONFIG['batch_size']}")
print(f"Training epochs: {CONFIG['num_epochs']}")
print(f"Random seed: {CONFIG['seed']}")


Using device: cuda
Batch size: 256
Training epochs: 100


In [4]:
# Opt into faster (slightly lower precision / non-deterministic) GPU kernels when CUDA is present.
if torch.cuda.is_available():
    # Let cuDNN auto-tune its algorithm choice for the input shapes it sees.
    torch.backends.cudnn.benchmark = True
    # Allow TF32 math for matmuls and cuDNN ops, and request the matching matmul precision.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision('high')
    torch.cuda.empty_cache()
    # Report the GPU that is actually in use instead of a hard-coded model name.
    print(f"GPU optimizations enabled for {torch.cuda.get_device_name()}")
    print(f"Batch size: {CONFIG['batch_size']}")
else:
    print("WARNING: CUDA not available. Running on CPU will be very slow!")


GPU optimizations enabled for RTX 4090
Batch size: 256


## Training Function

In [5]:
def train_agent_simple(agent, train_states, train_expert_actions, train_goals,
                       val_states, val_expert_actions, val_goals,
                       num_epochs=100, batch_size=256, device='cpu', verbose=True):
    """Train `agent` with mini-batch behavioral cloning and track validation MSE.

    Each epoch shuffles the training transitions, feeds them to
    `agent.train_step(states, None, None, actions)` batch by batch, and then
    measures the mean-squared error of the agent's predictions on the whole
    validation set.

    Args:
        agent: Object exposing `train_step(states, _, _, actions)` that returns the
            batch loss. For validation it is queried through `agent.model`,
            `agent.encoder`/`agent.decoder`, or `agent.predict_action`, in that
            order of preference.
        train_states, train_expert_actions: Indexable arrays of training inputs
            and target actions with the same length.
        train_goals, val_goals: Accepted for interface compatibility; not used.
        val_states, val_expert_actions: Validation inputs and target actions.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        device: Torch device the batches are created on.
        verbose: If True, print progress every 10 epochs and on the last epoch.

    Returns:
        (train_losses, val_losses): per-epoch average training loss and validation
        MSE as lists of floats. Training stops early (after recording the epoch)
        as soon as either value is NaN.

    Raises:
        ValueError: If `train_states` is empty.
    """
    n_train = len(train_states)
    if n_train == 0:
        raise ValueError("train_states is empty; nothing to train on")

    train_losses = []
    val_losses = []

    # The validation set never changes, so move it to the device once.
    val_states_t = torch.tensor(val_states, dtype=torch.float32, device=device)
    val_actions_t = torch.tensor(val_expert_actions, dtype=torch.float32, device=device)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        indices = np.random.permutation(n_train)

        for start_idx in range(0, n_train, batch_size):
            # Slicing past the end is clamped, so the last batch may be smaller.
            batch_indices = indices[start_idx:start_idx + batch_size]

            batch_states = torch.tensor(train_states[batch_indices], dtype=torch.float32, device=device)
            batch_actions = torch.tensor(train_expert_actions[batch_indices], dtype=torch.float32, device=device)

            loss = agent.train_step(batch_states, None, None, batch_actions)
            epoch_loss += float(loss)  # accept python floats and 0-d tensors alike
            num_batches += 1

        avg_train_loss = epoch_loss / num_batches
        train_losses.append(avg_train_loss)

        # Validation: pick the modules that are called directly so they can be
        # switched to eval mode and put back into their previous mode afterwards.
        if hasattr(agent, 'model'):
            eval_modules = [agent.model]
        elif hasattr(agent, 'encoder'):  # VAE
            eval_modules = [agent.encoder, agent.decoder]
        else:
            eval_modules = []
        previous_modes = [module.training for module in eval_modules]

        for module in eval_modules:
            module.eval()
        try:
            with torch.no_grad():
                if hasattr(agent, 'model'):
                    predictions = agent.model(val_states_t)
                elif hasattr(agent, 'encoder'):  # VAE: decode the posterior mean
                    mu, _ = agent.encoder(val_states_t)
                    predictions = agent.decoder(mu)
                else:
                    predictions = agent.predict_action(val_states_t, None)
                    if not isinstance(predictions, torch.Tensor):
                        predictions = torch.tensor(predictions, dtype=torch.float32, device=device)

                val_loss = torch.nn.functional.mse_loss(predictions, val_actions_t).item()
        finally:
            # Without this the next epoch would train with dropout etc. disabled.
            for module, was_training in zip(eval_modules, previous_modes):
                module.train(was_training)

        val_losses.append(val_loss)

        if verbose and (epoch % 10 == 0 or epoch == num_epochs - 1):
            print(f"Epoch {epoch:3d}/{num_epochs}: Train Loss = {avg_train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # Checked every epoch, independent of `verbose`, so a diverged run stops at once.
        if np.isnan(avg_train_loss) or np.isnan(val_loss):
            print(f"WARNING: NaN detected at epoch {epoch}!")
            break

    return train_losses, val_losses

print("Training function defined!")


Training function defined!


## Data Collection and Preparation

In [None]:
def _steer_towards(env, position, heading, target, dist_to_goal):
    """Build a [throttle, steering] action that turns from `heading` towards `target`."""
    direction = target - position
    desired_theta = np.arctan2(direction[1], direction[0])
    # Wrap the heading error into [-pi, pi) so the vehicle turns the short way round.
    angle_diff = (desired_theta - heading + np.pi) % (2 * np.pi) - np.pi
    steering = np.clip(angle_diff, env.action_space.low[1], env.action_space.high[1])

    # Slow down to 20% throttle within two goal radii of the goal, full throttle otherwise.
    max_throttle = env.action_space.high[0]
    throttle = max_throttle * 0.2 if dist_to_goal < env.goal_radius * 2 else max_throttle
    return np.array([throttle, steering])


def _expert_action(env, state, goal):
    """Return (action, used_fallback) for the expert policy at `state`.

    When the environment has a `vgraph`, the action steers towards the second
    point of `env.vgraph.shortest_path(position, goal)`; a path with at most one
    point yields the stop action. Without a `vgraph` the action steers straight
    at the goal. If the planner branch raises, the straight-line action is used
    and `used_fallback` is True.
    """
    position = state[:2]
    heading = state[2]
    dist_to_goal = np.linalg.norm(position - goal)

    if hasattr(env, 'vgraph'):
        try:
            path = env.vgraph.shortest_path(position, goal)
            if len(path) > 1:
                return _steer_towards(env, position, heading, path[1], dist_to_goal), False
            # At goal, stop
            return np.array([0.0, 0.0]), False
        except Exception:
            # Deliberate best-effort: a planner failure must not abort data collection.
            return _steer_towards(env, position, heading, goal, dist_to_goal), True

    return _steer_towards(env, position, heading, goal, dist_to_goal), False


def collect_rl_experience(env, num_episodes=100, max_steps=200):
    """Collect RL training data by rolling out the visibility-graph expert policy.

    Args:
        env: Environment whose `reset()` returns a state and whose `step(action)`
            returns `(next_state, reward, done, info)`. The first two state entries
            are read as the position and the third as the heading. `env.goal` is
            used as the target when present (otherwise the origin).
        num_episodes: Number of episodes to roll out.
        max_steps: Maximum number of steps per episode.

    Returns:
        List of transition dicts with keys 'state', 'action', 'reward',
        'next_state', 'done' and 'goal'.
    """
    data = []
    successful_episodes = 0
    planner_fallbacks = 0

    for ep in tqdm(range(num_episodes), desc='Collecting RL experience'):
        state = env.reset()
        goal = env.goal.copy() if hasattr(env, 'goal') else np.zeros(2)

        for t in range(max_steps):
            action, used_fallback = _expert_action(env, state, goal)
            planner_fallbacks += int(used_fallback)

            action = np.clip(action, env.action_space.low, env.action_space.high)
            next_state, reward, done, info = env.step(action)

            data.append({
                'state': state.copy(),
                'action': action.copy(),
                'reward': reward,
                'next_state': next_state.copy(),
                'done': done,
                'goal': goal.copy()
            })

            state = next_state

            if done:
                if info.get('reason') == 'goal_reached':
                    successful_episodes += 1
                break

    # Guard the ratio so that num_episodes=0 reports 0% instead of dividing by zero.
    success_rate = successful_episodes / num_episodes if num_episodes else 0.0
    print(f"Collected {len(data)} transitions from {num_episodes} episodes")
    print(f"Success rate: {success_rate:.2%}")
    if planner_fallbacks:
        # Surface planner failures that would otherwise be swallowed silently.
        print(f"Planner fallbacks (straight-line steering used): {planner_fallbacks}")

    return data

print("Data collection function defined!")
print("FIXED: Expert now slows down near goal to satisfy velocity constraint")


Data collection function defined!


In [None]:
# Load the cached expert dataset, or (re)collect it with the current expert policy.
dataset_path = 'rl_experience_dataset.pickle'

# True reproduces the previous behaviour of this cell: any cached dataset is discarded
# and a fresh one is collected. Set to False to reuse an existing dataset file.
FORCE_RECOLLECT = True

if os.path.exists(dataset_path) and not FORCE_RECOLLECT:
    print(f"Loading cached dataset from {dataset_path}...")
    # NOTE(review): assumes load_pickle(path) is the inverse of save_pickle(obj, path) — confirm.
    data = load_pickle(dataset_path)
else:
    if os.path.exists(dataset_path):
        print(f"Deleting old dataset at {dataset_path}...")
        os.remove(dataset_path)

    print("Collecting new dataset with fixed expert policy...")
    env_collect = ContinuousNavigationEnv()
    data = collect_rl_experience(env_collect, num_episodes=CONFIG['num_episodes'], max_steps=CONFIG['max_steps'])
    save_pickle(data, dataset_path)
    print(f"✓ Saved dataset to {dataset_path}")

# Fail here with a clear message rather than later on an array with no feature axis.
if len(data) == 0:
    raise ValueError("Expert dataset is empty; check num_episodes / max_steps and the environment")

# Extract states, actions, and goals from data (one array per transition field)
states = np.array([d['state'] for d in data])
actions = np.array([d['action'] for d in data])
next_states = np.array([d['next_state'] for d in data])
rewards = np.array([d['reward'] for d in data])
dones = np.array([d['done'] for d in data])
goals = np.array([d['goal'] for d in data])

print("\nData extracted:")
print(f"  States shape: {states.shape}")
print(f"  Actions shape: {actions.shape}")
print(f"  Goals shape: {goals.shape}")
print(f"  Rewards shape: {rewards.shape}")


Collecting RL experience: 100%|██████████| 1000/1000 [00:04<00:00, 202.75it/s]
Collecting RL experience: 100%|██████████| 1000/1000 [00:04<00:00, 202.75it/s]


Collected 56010 transitions from 1000 episodes
Success rate: 0.00%
✓ Saved dataset to rl_experience_dataset.pickle

Data extracted:
  States shape: (56010, 8)
  Actions shape: (56010, 2)
  Goals shape: (56010, 2)
  Rewards shape: (56010,)
✓ Saved dataset to rl_experience_dataset.pickle

Data extracted:
  States shape: (56010, 8)
  Actions shape: (56010, 2)
  Goals shape: (56010, 2)
  Rewards shape: (56010,)


In [None]:
# Feature sizes are read off the second axis of the collected arrays.
STATE_DIM, ACTION_DIM, GOAL_DIM = (arr.shape[1] for arr in (states, actions, goals))

print("Data dimensions:")
print("  STATE_DIM = {}".format(STATE_DIM))
print("  ACTION_DIM = {}".format(ACTION_DIM))
print("  GOAL_DIM = {}".format(GOAL_DIM))
print("  Total samples = {}".format(len(states)))

In [None]:
# Environment instance used for the evaluation rollouts further down.
env = ContinuousNavigationEnv()
print("Environment initialized: {}".format(env))
print("  State space: {}".format(env.observation_space.shape))
print("  Action space: {}".format(env.action_space.shape))

## Train All Algorithms

In [None]:
print("=" * 60)
print("TRAINING ALL ALGORITHMS")
print("=" * 60)

# Random train/validation partition over the individual transitions.
n_samples = len(states)
n_train = int(n_samples * (1 - CONFIG['val_ratio']))
indices = np.random.permutation(n_samples)
train_indices, val_indices = indices[:n_train], indices[n_train:]

train_states, train_actions, train_goals = (
    states[train_indices], actions[train_indices], goals[train_indices]
)
val_states, val_actions, val_goals = (
    states[val_indices], actions[val_indices], goals[val_indices]
)

print("Data split: {} train, {} val\n".format(len(train_states), len(val_states)))

all_results = {}

# name -> (agent class, extra constructor kwargs); insertion order is the training order.
algorithms_to_train = dict(
    Linear=(LinearAgent, {}),
    AutoEncoder=(AutoEncoderAgent, MODEL_CONFIGS['AutoEncoder']),
    Transformer=(TransformerAgent, MODEL_CONFIGS['Transformer']),
    Bayesian=(BayesianAgent, MODEL_CONFIGS['Bayesian']),
    VAE=(VAEAgent, MODEL_CONFIGS['VAE']),
)

for algo_name, (AgentClass, model_config) in algorithms_to_train.items():
    print("\n" + "=" * 60)
    print(f"Training {algo_name}")
    print("=" * 60)

    # Every agent shares the data dimensions, learning rate and device.
    agent = AgentClass(
        state_dim=STATE_DIM, action_dim=ACTION_DIM,
        lr=CONFIG['lr'], device=CONFIG['device'],
        **model_config
    )

    # Wall-clock time of the full training run for this algorithm.
    start_time = time.time()
    train_losses, val_losses = train_agent_simple(
        agent,
        train_states, train_actions, train_goals,
        val_states, val_actions, val_goals,
        num_epochs=CONFIG['num_epochs'],
        batch_size=CONFIG['batch_size'],
        device=CONFIG['device'],
        verbose=True,
    )
    train_time = time.time() - start_time

    all_results[algo_name] = dict(
        agent=agent,
        train_losses=train_losses,
        val_losses=val_losses,
        train_time=train_time,
    )

    print(f"\n{algo_name} Complete! Time: {train_time:.2f}s")
    print(f"Final train loss: {train_losses[-1]:.6f}, Val loss: {val_losses[-1]:.6f}")

    # Release cached GPU memory before the next model is built.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)


In [None]:
# Loss curves: training on the left panel, validation on the right panel.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for algo_name, results in all_results.items():
    epochs = range(len(results['train_losses']))
    for panel, history_key in zip(axes, ('train_losses', 'val_losses')):
        panel.plot(epochs, results[history_key], label=algo_name, alpha=0.7)

# Both panels share the same decoration apart from the quantity they show.
for panel, quantity in zip(axes, ('Training', 'Validation')):
    panel.set_xlabel('Epoch')
    panel.set_ylabel(f'{quantity} Loss')
    panel.set_title(f'{quantity} Loss Curves')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Plain-text table of the last recorded losses and the training time per algorithm.
print("\nFinal Loss Summary:")
print(f"{'Algorithm':<15} {'Train Loss':<15} {'Val Loss':<15} {'Train Time (s)':<15}")
print("-" * 60)
for algo_name, results in all_results.items():
    final_train = results['train_losses'][-1]
    final_val = results['val_losses'][-1]
    print(f"{algo_name:<15} {final_train:<15.6f} {final_val:<15.6f} {results['train_time']:<15.2f}")

## Evaluate Trained Agents

Now test the trained agents in the environment.

In [None]:
print("="*60)
print("EVALUATING ALL AGENTS")
print("="*60)

def evaluate_agent_simple(agent, env, num_episodes=50, max_steps=200, device=None):
    """Roll out `agent` in `env` and summarise reward, success and episode length.

    Args:
        agent: Object exposing `predict_action(state_tensor, None)`; the result may
            be a torch tensor or anything `env.step` accepts directly.
        env: Environment whose `reset()` returns a state and whose `step(action)`
            returns `(next_state, reward, done, info)`.
        num_episodes: Number of evaluation episodes.
        max_steps: Maximum number of steps per episode.
        device: Torch device for the state tensors. Defaults to `CONFIG['device']`
            when omitted, which matches the previous behaviour.

    Returns:
        Dict with 'avg_reward', 'std_reward', 'success_rate' and 'avg_steps'.
        An episode counts as a success when it terminates with
        `info['reason'] == 'goal_reached'`.
    """
    if device is None:
        device = CONFIG['device']

    results = {'rewards': [], 'successes': [], 'steps': []}

    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        done = False
        info = {}
        steps_taken = 0  # stays 0 when max_steps is 0

        for step in range(max_steps):
            state_t = torch.tensor(state, dtype=torch.float32, device=device)
            # Inference only: no autograd graph is needed for rollouts.
            with torch.no_grad():
                action = agent.predict_action(state_t, None)

            if isinstance(action, torch.Tensor):
                # detach() keeps .numpy() from failing on tensors that require grad.
                action = action.detach().cpu().numpy()

            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            state = next_state
            steps_taken = step + 1

            if done:
                break

        success = bool(done) and info.get('reason', '') == 'goal_reached'
        results['rewards'].append(episode_reward)
        results['successes'].append(1 if success else 0)
        results['steps'].append(steps_taken)

    return {
        'avg_reward': np.mean(results['rewards']),
        'std_reward': np.std(results['rewards']),
        'success_rate': np.mean(results['successes']),
        'avg_steps': np.mean(results['steps'])
    }

# Evaluation metrics per algorithm, keyed like `all_results`.
eval_results = {}

for algo_name, results_dict in all_results.items():
    print(f"\nEvaluating {algo_name}...")
    agent = results_dict['agent']

    eval_res = evaluate_agent_simple(
        agent, env,
        num_episodes=CONFIG['num_test_episodes'],
        max_steps=CONFIG['max_steps'],
    )
    eval_results[algo_name] = eval_res

    # One print call, three lines: reward, success rate, episode length.
    print("\n".join([
        f"  Avg Reward: {eval_res['avg_reward']:.3f} ± {eval_res['std_reward']:.3f}",
        f"  Success Rate: {eval_res['success_rate']:.1%}",
        f"  Avg Steps: {eval_res['avg_steps']:.1f}",
    ]))

print("\n" + "=" * 60)
print("EVALUATION COMPLETE!")
print("=" * 60)


In [None]:
# Combine training and evaluation metrics into one table, best validation loss first.
import pandas as pd

results_data = []
for algo_name in all_results.keys():
    train_res = all_results[algo_name]
    eval_res = eval_results[algo_name]

    results_data.append(dict([
        ('Algorithm', algo_name),
        ('Train Loss', train_res['train_losses'][-1]),
        ('Val Loss', train_res['val_losses'][-1]),
        ('Train Time (s)', train_res['train_time']),
        ('Avg Reward', eval_res['avg_reward']),
        ('Success Rate', eval_res['success_rate']),
        ('Avg Steps', eval_res['avg_steps']),
    ]))

df_results = pd.DataFrame(results_data).sort_values('Val Loss')

print("\n" + "=" * 80)
print("FINAL RESULTS SUMMARY")
print("=" * 80)
print(df_results.to_string(index=False))
print("=" * 80)

# One bar chart per metric: (table column, axis label). Panels are drawn left to right.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metric_panels = (
    ('Success Rate', 'Success Rate'),
    ('Avg Reward', 'Average Reward'),
    ('Val Loss', 'Validation Loss'),
)
for panel, (column, axis_label) in zip(axes, metric_panels):
    panel.bar(df_results['Algorithm'], df_results[column])
    panel.set_ylabel(axis_label)
    panel.set_title(f'{axis_label} by Algorithm')
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
print("=" * 60)
print("DEBUGGING: Action and State Analysis")
print("=" * 60)

# Step the Linear agent through a fresh episode and log what it does.
agent = all_results['Linear']['agent']
state = env.reset()
goal = env.goal.copy()

print(f"\nInitial State: {state}")
print(f"Goal Position: {goal}")
print("Distance to Goal: {:.3f}".format(np.linalg.norm(state[:2] - goal)))
print(f"\nAction Space: [{env.action_space.low}, {env.action_space.high}]")

print(f"\n{'Step':<6} {'Action':<20} {'State[:2]':<20} {'Distance':<10} {'Reward':<8}")
print("-" * 70)

# Trace at most five steps of the rollout.
for step in range(5):
    state_t = torch.tensor(state, dtype=torch.float32, device=CONFIG['device'])
    action = agent.predict_action(state_t, None)
    # Tensors are moved to the CPU and converted before being handed to the environment.
    action = action.cpu().numpy() if isinstance(action, torch.Tensor) else action

    next_state, reward, done, info = env.step(action)
    distance = np.linalg.norm(next_state[:2] - goal)

    print(f"{step:<6} {str(action):<20} {str(next_state[:2]):<20} {distance:<10.3f} {reward:<8.3f}")

    state = next_state
    if done:
        print(f"\nEpisode ended: {info.get('reason', 'unknown')}")
        break

print("\n" + "=" * 60)
print("Training Data Statistics:")
print("=" * 60)
# Per-dimension summary of the expert actions; labels are padded so the values line up.
for stat_label, stat_fn in (("Mean: ", np.mean), ("Std:  ", np.std), ("Min:  ", np.min), ("Max:  ", np.max)):
    print(f"Expert actions - {stat_label}{stat_fn(actions, axis=0)}")
