# Operator Action Prediction for Teleoperation with Communication Delays

Comparing neural network architectures for predicting operator actions during communication delays in space teleoperation.

## Problem

Space teleoperation suffers from large communication delays (Mars: up to about 22 min; Moon: about 1.25 s), so a remote operator cannot respond to obstacles in real time.

## Approach

Learn models that predict operator actions based on current state, previous action, and goal position.

## Algorithms

1. **Linear** - Simple baseline
2. **Bayesian** - Uncertainty-aware predictions
3. **VAE** - Latent action distributions
4. **Transformer** - Temporal sequence modeling

## Workflow

1. Collect expert demonstrations from a visibility-graph policy
2. Train models with grid search
3. Evaluate prediction accuracy and success rate

## Import Required Libraries

In [None]:
!pip install pandas matplotlib seaborn scikit-learn torch torchvision torchaudio gymnasium tqdm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import time
import os
import warnings
from tqdm import tqdm
from datetime import datetime
import random

# Show the directory the notebook is running from (relative paths such as the
# dataset pickle and the model save directory resolve against it).
print(os.path.abspath('.'))

# Set plotting style
# NOTE(review): the 'seaborn-v0_8' style name depends on the installed
# matplotlib version — confirm it is available in the target environment.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Suppresses every warning category for the rest of the session, including
# deprecation notices; consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')


In [None]:
import sys
sys.path.append('/risky_navigation')

from src.env.continuous_nav_env import ContinuousNavigationEnv
from src.env.layouts import read_layout_dict
from src.algorithms.Bayesian.agent import BayesianAgent
from src.algorithms.Transformer.agent import TransformerAgent
from src.algorithms.Linear.agent import LinearAgent
from src.algorithms.VAE.agent import VAEAgent
from src.utils.file_management import save_pickle, load_pickle
from src.utils.visibility_graph import VisibilityGraph

# Confirm that the imports above succeeded and report the PyTorch build and
# whether a CUDA device is visible.
print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

## Config

In [None]:
# Run-wide settings: dataset size, optimisation budget and compute device.
CONFIG = dict(
    num_episodes=1000,
    max_steps=200,
    batch_size=512,  # Increased for RTX 4090
    num_epochs=200,
    val_ratio=0.2,
    num_test_episodes=50,
    lr=1e-3,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    num_workers=4,  # Parallel data loading
    prefetch_factor=2,
)

# Grid search configurations for each algorithm: every key maps to the list of
# candidate values that is combined exhaustively with the other keys.
GRID_SEARCH_CONFIGS = dict(
    Transformer=dict(
        d_model=[32, 64, 128],
        nhead=[4, 8],
        num_layers=[2, 3, 4],
        dropout=[0.0, 0.1, 0.2],
        sequence_len=[1, 5, 10],  # Temporal sequence length for time-series modeling
        lr=[1e-3, 5e-4, 1e-4],
    ),
    Bayesian=dict(
        hidden_dim=[64, 128, 256],
        prior_std=[0.5, 1.0, 2.0],
        kl_weight=[1e-5, 1e-4, 1e-3],  # KL divergence weight for ELBO
        lr=[1e-3, 5e-4, 1e-4],
    ),
    VAE=dict(
        latent_dim=[16, 32, 64],
        hidden_dim=[64, 128, 256],
        beta=[0.5, 1.0, 2.0],
        lr=[1e-3, 5e-4, 1e-4],
    ),
)

# Best baseline configs (for quick comparison)
MODEL_CONFIGS = dict(
    Linear=dict(),
    Transformer=dict(d_model=64, nhead=4, num_layers=2, dropout=0.1, sequence_len=1),
    Bayesian=dict(hidden_dim=128, prior_std=1.0, kl_weight=1e-5),
    VAE=dict(latent_dim=32, hidden_dim=128, beta=1.0),
)

# Echo the effective settings so each run's log records them.
print("Using device: {}".format(CONFIG['device']))
print("Batch size: {} (optimized for RTX 4090)".format(CONFIG['batch_size']))
print("Max epochs: {} (with early stopping)".format(CONFIG['num_epochs']))
print("Grid search enabled for: {}".format(list(GRID_SEARCH_CONFIGS)))
print("\nTransformer will test sequence lengths: {}".format(GRID_SEARCH_CONFIGS['Transformer']['sequence_len']))
print("  - sequence_len=1: Single state")
print("  - sequence_len>1: Temporal sequences (time-series modeling)")

In [None]:
# Choose between the plain CPU path and the tuned CUDA path. Either way the
# notebook-level flags `use_amp` and `scaler` end up defined.
if not torch.cuda.is_available():
    use_amp = False
    scaler = None
    print("WARNING: CUDA not available. Running on CPU will be very slow!")
else:
    # RTX 4090 optimizations: let cuDNN benchmark kernels and allow TF32 math.
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision('high')

    # Keep cuDNN switched on and non-deterministic so the autotuner is free
    # to pick the fastest algorithms.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = False

    # Release cached allocator blocks before training starts.
    torch.cuda.empty_cache()

    # Mixed precision training setup
    from torch.cuda.amp import autocast, GradScaler
    use_amp = True
    scaler = GradScaler()

    # Summary banner of what was enabled and which GPU is in use.
    print("="*60)
    print("GPU OPTIMIZATIONS ENABLED FOR RTX 4090")
    print("="*60)
    print(f"✓ Batch size: {CONFIG['batch_size']}")
    print("✓ TF32 matmul: Enabled")
    print("✓ cuDNN benchmark: Enabled")
    print("✓ Mixed precision (AMP): Enabled")
    print(f"✓ CUDA version: {torch.version.cuda}")
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✓ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    print("="*60)


## Training Functions with Grid Search

In [None]:
def _clone_state_dict(module):
    """Return a detached copy of ``module.state_dict()`` that later optimizer steps cannot mutate."""
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


def _snapshot_agent_weights(agent):
    """Copy the weights of the networks the agent exposes (``model`` or ``encoder``/``decoder``)."""
    if hasattr(agent, 'model'):
        return {'model': _clone_state_dict(agent.model)}
    if hasattr(agent, 'encoder'):
        return {
            'encoder': _clone_state_dict(agent.encoder),
            'decoder': _clone_state_dict(agent.decoder),
        }
    return None


def _restore_agent_weights(agent, snapshot):
    """Load a snapshot produced by ``_snapshot_agent_weights`` back into the agent."""
    for attr_name, state in snapshot.items():
        getattr(agent, attr_name).load_state_dict(state)


def train_agent_optimized(agent, train_states, train_prev_actions, train_expert_actions, train_goals,
                          val_states, val_prev_actions, val_expert_actions, val_goals,
                          num_epochs=50, batch_size=64, patience=10, verbose=True):
    """Train ``agent`` on expert data with LR-on-plateau scheduling and early stopping.

    Each epoch shuffles the training samples, feeds them to the agent in
    mini-batches, then scores the agent on the full validation set with MSE.
    The learning rate is halved when the validation loss has not improved for
    5 epochs, and training stops after ``patience`` epochs without improvement.
    The weights with the lowest validation loss are loaded back into the agent
    before returning, whether or not early stopping triggered.

    Args:
        agent: Object exposing ``optimizer`` and ``train_step(states,
            prev_actions, goals, actions)``, plus either ``model`` or
            ``encoder``/``decoder`` networks that accept the concatenation
            ``[state, prev_action, goal]`` along dim 1.
        train_states, train_prev_actions, train_expert_actions, train_goals:
            Training arrays indexed along their first axis (one row per sample).
        val_states, val_prev_actions, val_expert_actions, val_goals:
            Validation arrays with the same layout.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        patience: Epochs without validation improvement before stopping.
        verbose: Print progress every 10 epochs and on early stop.

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean training loss and
        validation MSE, both as lists of Python floats.

    Raises:
        ValueError: If the training set is empty.
        AttributeError: If the agent exposes neither ``model`` nor ``encoder``.
    """
    n_train = len(train_states)
    if n_train == 0:
        raise ValueError("train_agent_optimized needs at least one training sample")

    device = CONFIG['device']
    use_amp = device.type == 'cuda'
    scaler = torch.cuda.amp.GradScaler() if use_amp else None
    # The mixed-precision path is only taken for these agent classes.
    amp_agent = use_amp and isinstance(agent, (BayesianAgent, VAEAgent))

    # `torch.optim` is reachable through the `torch` import; the bare name
    # `optim` is not defined anywhere in this notebook.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(agent.optimizer, 'min', patience=5, factor=0.5)

    # The validation set never changes, so move it to the device once.
    val_states_t = torch.tensor(val_states, dtype=torch.float32, device=device)
    val_prev_actions_t = torch.tensor(val_prev_actions, dtype=torch.float32, device=device)
    val_actions_t = torch.tensor(val_expert_actions, dtype=torch.float32, device=device)
    val_goals_t = torch.tensor(val_goals, dtype=torch.float32, device=device)
    val_inputs = torch.cat([val_states_t, val_prev_actions_t, val_goals_t], dim=1)

    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_snapshot = None

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0

        indices = torch.randperm(n_train)
        for i in range(0, n_train, batch_size):
            # Convert to a NumPy index array before indexing the training arrays.
            batch_idx = indices[i:i + batch_size].numpy()
            batch_states = torch.tensor(train_states[batch_idx], dtype=torch.float32, device=device)
            batch_prev_actions = torch.tensor(train_prev_actions[batch_idx], dtype=torch.float32, device=device)
            batch_actions = torch.tensor(train_expert_actions[batch_idx], dtype=torch.float32, device=device)
            batch_goals = torch.tensor(train_goals[batch_idx], dtype=torch.float32, device=device)

            if amp_agent:
                with torch.cuda.amp.autocast():
                    if hasattr(agent, 'model'):
                        agent.model.train()
                        agent.optimizer.zero_grad()
                        inputs = torch.cat([batch_states, batch_prev_actions, batch_goals], dim=1)
                        predictions = agent.model(inputs)
                        loss = agent.loss_fn(predictions, batch_actions)
                    else:
                        # The agent performs its own update inside train_step;
                        # only the scalar loss value is kept.
                        loss = float(agent.train_step(batch_states, batch_prev_actions, batch_goals, batch_actions))

                if isinstance(loss, torch.Tensor) and loss.requires_grad:
                    scaler.scale(loss).backward()
                    scaler.step(agent.optimizer)
                    scaler.update()
            else:
                loss = agent.train_step(batch_states, batch_prev_actions, batch_goals, batch_actions)

            # Accumulate plain floats so the returned history holds no tensors.
            epoch_loss += float(loss)
            num_batches += 1

        avg_train_loss = epoch_loss / num_batches
        train_losses.append(avg_train_loss)

        with torch.no_grad():
            if hasattr(agent, 'model'):
                agent.model.eval()
                predictions = agent.model(val_inputs)
                agent.model.train()
            elif hasattr(agent, 'encoder'):
                agent.encoder.eval()
                agent.decoder.eval()
                # Decode from the encoder's first output rather than a sample.
                mu, _ = agent.encoder(val_inputs)
                predictions = agent.decoder(mu)
                agent.encoder.train()
                agent.decoder.train()
            else:
                raise AttributeError("agent must expose either 'model' or 'encoder'/'decoder' for validation")

            val_loss = torch.nn.functional.mse_loss(predictions, val_actions_t).item()

        val_losses.append(val_loss)

        # ReduceLROnPlateau monitors a metric: step it once per epoch with the
        # validation loss instead of once per batch with no argument.
        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Clone the tensors: state_dict() returns references to the live
            # parameters, which the next optimizer step would overwrite.
            best_snapshot = _snapshot_agent_weights(agent)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                if verbose:
                    print(f"Early stop at epoch {epoch}. Best val: {best_val_loss:.6f}")
                break

        if verbose and epoch % 10 == 0:
            print(f"Epoch {epoch}: Train={avg_train_loss:.6f}, Val={val_loss:.6f}")

    # Always hand back the best weights, not just when early stopping fired.
    if best_snapshot is not None:
        _restore_agent_weights(agent, best_snapshot)

    return train_losses, val_losses

print("Training function defined!")

In [None]:
# Exhaustive grid search: train one agent per hyper-parameter combination and
# keep the one with the lowest validation loss for each algorithm.
# NOTE: this cell relies on names created in the data-preparation and
# "Grid Search Training" cells (train/val arrays, STATE_DIM, ACTION_DIM,
# GOAL_DIM, algorithms_with_grid, grid_search_results) — run those first.
from itertools import product

for alg_name, (AgentClass, param_grid) in algorithms_with_grid.items():
    print(f"\n{'='*60}")
    print(f"Grid Search: {alg_name}")
    print(f"{'='*60}")

    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())

    configs = list(product(*param_values))
    total_configs = len(configs)

    print(f"Testing {total_configs} configurations for {alg_name}...")

    best_config = None
    best_val_loss = float('inf')
    best_agent = None
    best_train_time = 0
    best_train_losses, best_val_losses = [], []
    # Loss histories of every configuration that trained successfully; only
    # lists of floats are kept, not the agents themselves.
    all_results = []

    for config_idx, config_vals in enumerate(configs):
        config = dict(zip(param_names, config_vals))

        print(f"  [{config_idx+1}/{total_configs}] Testing {config}...")

        try:
            agent = AgentClass(
                state_dim=STATE_DIM,
                action_dim=ACTION_DIM,
                goal_dim=GOAL_DIM,
                **config,
                device=CONFIG['device']
            )

            start_time = time.time()
            train_losses, val_losses = train_agent_optimized(
                agent,
                train_states, train_prev_actions, train_actions, train_goals,
                val_states, val_prev_actions, val_actions, val_goals,
                num_epochs=CONFIG['num_epochs'],
                batch_size=CONFIG['batch_size'],
                verbose=False
            )
            elapsed_time = time.time() - start_time
        except Exception as e:
            # Best effort: a configuration that cannot be built or trained is
            # reported and skipped so the rest of the grid still runs.
            print(f"    ✗ Failed: {e}")
            continue

        val_loss = min(val_losses) if val_losses else float('inf')
        all_results.append({
            'config': config,
            'final_val_loss': val_loss,
            'train_time': elapsed_time,
            'train_losses': list(train_losses),
            'val_losses': list(val_losses),
        })

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_config = config
            best_agent = agent
            best_train_time = elapsed_time
            best_train_losses, best_val_losses = list(train_losses), list(val_losses)
            print(f"    ✓ New best: {val_loss:.6f}")

    if best_agent is None:
        print(f"WARNING: no {alg_name} configuration trained successfully; 'best_agent' is None")

    all_results.sort(key=lambda record: record['final_val_loss'])

    # The first four keys are the ones later cells read; the loss histories
    # and ranked per-config records are additional information for plotting.
    grid_search_results[alg_name] = {
        'best_config': best_config,
        'best_val_loss': best_val_loss,
        'best_agent': best_agent,
        'train_time': best_train_time,
        'train_losses': best_train_losses,
        'val_losses': best_val_losses,
        'all_results': all_results,
    }

    print(f"\n{alg_name} Best Config: {best_config}")
    print(f"{alg_name} Best Val Loss: {best_val_loss:.6f}")
    print(f"{alg_name} Train Time: {best_train_time:.2f}s")

print("\n" + "="*60)
print("GRID SEARCH COMPLETE")
print("="*60)
for alg_name, result in grid_search_results.items():
    print(f"{alg_name:12} | Val Loss: {result['best_val_loss']:.6f} | Time: {result['train_time']:.2f}s")
    print(f"             | Config: {result['best_config']}")

## Data Collection and Preparation

In [None]:
def collect_rl_experience(env, vgraph, num_episodes=100, max_steps=200):
    """Roll out a waypoint-following expert and record its transitions.

    At every step the expert asks ``vgraph`` for a path from the current
    position, steers toward the second path point (or the goal when the path
    has a single point) and drives at a fraction of the maximum throttle.
    The code treats ``state[:2]`` as the position and ``state[2]`` as the
    heading angle.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(next_state, reward, done, info)``, ``goal``, ``goal_radius``
            and ``action_space.low``/``high`` (index 0 throttle, 1 steering).
        vgraph: Callable returning ``(_, path)`` for a position.
        num_episodes: Number of episodes to roll out.
        max_steps: Step limit per episode.

    Returns:
        A list of dicts with keys ``'state'``, ``'prev_action'``, ``'action'``
        and ``'goal'``, one per executed step.
    """
    data = []
    successful_episodes = 0

    for ep in tqdm(range(num_episodes), desc='Collecting experience'):
        state = env.reset()
        goal = env.goal.copy()
        prev_action = np.zeros(2)

        for t in range(max_steps):
            current_pos = state[:2]
            current_theta = state[2]

            # Next waypoint along the planned path, falling back to the goal.
            _, path = vgraph(current_pos)
            if len(path) > 1:
                target = np.array(path[1])
            else:
                target = goal

            direction = target - current_pos
            desired_theta = np.arctan2(direction[1], direction[0])
            angle_diff = desired_theta - current_theta
            # Wrap the heading error into [-pi, pi) so the turn is the short way round.
            angle_diff = (angle_diff + np.pi) % (2 * np.pi) - np.pi

            # Proportional steering, limited to the steering bounds.
            steering = np.clip(angle_diff * 2.0, env.action_space.low[1], env.action_space.high[1])
            dist_to_target = np.linalg.norm(direction)

            # Slow down when the target is within three goal radii.
            if dist_to_target < env.goal_radius * 3:
                throttle = env.action_space.high[0] * 0.3
            else:
                throttle = env.action_space.high[0] * 0.8

            action = np.array([throttle, steering])
            action = np.clip(action, env.action_space.low, env.action_space.high)

            next_state, reward, done, info = env.step(action)

            # Record the state the action was chosen in, not the resulting state.
            data.append({
                'state': state.copy(),
                'prev_action': prev_action.copy(),
                'action': action.copy(),
                'goal': goal.copy()
            })

            prev_action = action
            state = next_state

            if done:
                if info.get('reason') == 'goal_reached':
                    successful_episodes += 1
                break

    print(f"Collected {len(data)} transitions from {num_episodes} episodes")
    # Guard the ratio so a zero-episode call reports 0% instead of raising.
    success_rate = successful_episodes / num_episodes if num_episodes > 0 else 0.0
    print(f"Success rate: {success_rate:.2%}")
    return data

print("Data collection function defined!")

In [None]:
# Reuse the cached expert dataset when it is on disk; otherwise generate it
# with the visibility-graph expert and cache it for the next run.
dataset_path = 'expert_dataset.pickle'

if not os.path.exists(dataset_path):
    print(f"Collecting new dataset...")
    layout_dict = read_layout_dict('example0')
    env_collect = ContinuousNavigationEnv(**layout_dict)
    vgraph = VisibilityGraph(
        env_collect.goal,
        env_collect.obstacles,
        env_collect.bounds,
        resolution=(20, 20),
    )
    data = collect_rl_experience(
        env_collect,
        vgraph,
        num_episodes=CONFIG['num_episodes'],
        max_steps=CONFIG['max_steps'],
    )
    save_pickle(data, dataset_path)
    print(f"✓ Saved dataset to {dataset_path}")
else:
    print(f"Loading existing dataset from {dataset_path}...")
    data = load_pickle(dataset_path)
    print(f"✓ Loaded {len(data)} transitions")

# Stack each field of the transition dicts into its own array.
states, prev_actions, actions, goals = (
    np.array([transition[field] for transition in data])
    for field in ('state', 'prev_action', 'action', 'goal')
)

print(f"\nDataset Summary:")
print(f"  Total transitions: {len(data)}")
print(f"  States shape: {states.shape}")
print(f"  Prev actions shape: {prev_actions.shape}")
print(f"  Actions shape: {actions.shape}")

In [None]:
# Feature sizes are read from the second axis of the stacked dataset arrays.
STATE_DIM = states.shape[1]
ACTION_DIM = actions.shape[1]
# Build an environment from the 'example0' layout; later cells read its
# action-space bounds when constructing the Linear agent.
layout_dict = read_layout_dict('example0')
env = ContinuousNavigationEnv(**layout_dict)

# Print the dataset dimensions next to the environment's declared spaces so a
# mismatch between the two is visible.
print(f"Environment & Data Configuration:")
print(f"  STATE_DIM = {STATE_DIM}")
print(f"  ACTION_DIM = {ACTION_DIM}")
print(f"  Total samples = {len(states)}")
print(f"  State space: {env.observation_space.shape}")
print(f"  Action space: {env.action_space.shape}")

## Grid Search Training - All Algorithms

In [None]:
print("="*60)
print("GRID SEARCH TRAINING - ALL ALGORITHMS")
print("="*60)

# NOTE(review): the goal dimensionality is hard-coded; presumably it matches
# goals.shape[1] — confirm against the dataset.
GOAL_DIM = 2
n_samples = len(states)
n_train = int(n_samples * (1 - CONFIG['val_ratio']))
# Random train/validation split over individual transitions.
# NOTE(review): no random seed is set in the visible cells, so the split (and
# every validation loss derived from it) differs between runs.
indices = np.random.permutation(n_samples)
train_indices = indices[:n_train]
val_indices = indices[n_train:]

train_states = states[train_indices]
train_prev_actions = prev_actions[train_indices]
train_actions = actions[train_indices]
train_goals = goals[train_indices]

val_states = states[val_indices]
val_prev_actions = prev_actions[val_indices]
val_actions = actions[val_indices]
val_goals = goals[val_indices]

print(f"Data split: {len(train_states)} train, {len(val_states)} val")
print(f"Batch size: {CONFIG['batch_size']}")
print(f"Max epochs: {CONFIG['num_epochs']}\n")

# Results container filled by this cell (Linear) and by the grid-search cell
# (all other algorithms): algorithm name -> summary dict.
grid_search_results = {}
# Algorithms searched over a hyper-parameter grid: name -> (agent class, grid).
algorithms_with_grid = {
    'Transformer': (TransformerAgent, GRID_SEARCH_CONFIGS['Transformer']),
    'Bayesian': (BayesianAgent, GRID_SEARCH_CONFIGS['Bayesian']),
    'VAE': (VAEAgent, GRID_SEARCH_CONFIGS['VAE'])
}

print(f"\n{'='*60}")
print(f"Training Linear (Baseline - No Grid Search)")
print(f"{'='*60}")

# The baseline is trained once with the default learning rate; it is also
# given the action bounds of the environment built in the previous cell.
linear_agent = LinearAgent(
    state_dim=STATE_DIM,
    action_dim=ACTION_DIM,
    goal_dim=GOAL_DIM,
    lr=CONFIG['lr'],
    device=CONFIG['device'],
    action_low=env.action_space.low,
    action_high=env.action_space.high
)

start_time = time.time()
train_losses, val_losses = train_agent_optimized(
    linear_agent,
    train_states, train_prev_actions, train_actions, train_goals,
    val_states, val_prev_actions, val_actions, val_goals,
    num_epochs=CONFIG['num_epochs'],
    batch_size=CONFIG['batch_size'],
    verbose=False
)

elapsed_time = time.time() - start_time
# Lowest validation loss seen in any epoch (inf if no epoch ran).
best_val_loss = min(val_losses) if val_losses else float('inf')

print(f"Linear - Train time: {elapsed_time:.2f}s, Best val loss: {best_val_loss:.6f}")

# Stored with the same keys the grid-search cell uses so later cells can treat
# every algorithm uniformly.
grid_search_results['Linear'] = {
    'best_config': {'lr': CONFIG['lr']},
    'best_val_loss': best_val_loss,
    'best_agent': linear_agent,
    'train_time': elapsed_time
}

In [None]:
# Grid Search Results Visualization
print("\n" + "="*60)
print("VISUALIZING GRID SEARCH RESULTS")
print("="*60)


def _ranked_results(entry):
    """Normalise one ``grid_search_results`` value into per-config records, best first.

    Accepts either a list of records (keys ``config``, ``final_val_loss``,
    ``train_time``, ``train_losses``, ``val_losses``) or the summary dict the
    training cells store (``best_config``, ``best_val_loss``, ``train_time``,
    optionally ``train_losses``/``val_losses``/``all_results``).
    """
    if isinstance(entry, dict):
        if entry.get('all_results'):
            records = list(entry['all_results'])
        else:
            # Only the winning configuration is known: expose it as a single record.
            records = [{
                'config': entry.get('best_config') or {},
                'final_val_loss': entry.get('best_val_loss', float('inf')),
                'train_time': entry.get('train_time', 0.0),
                'train_losses': entry.get('train_losses') or [],
                'val_losses': entry.get('val_losses') or [],
            }]
    else:
        records = list(entry)
    return sorted(records, key=lambda record: record['final_val_loss'])


ranked_results = {algo_name: _ranked_results(entry) for algo_name, entry in grid_search_results.items()}

# Figure 1: Training and Validation Curves for Top 5 Configs per Algorithm
if ranked_results:
    # squeeze=False keeps `axes1` two-dimensional even for a single algorithm.
    fig1, axes1 = plt.subplots(len(ranked_results), 2, figsize=(18, 4*len(ranked_results)), squeeze=False)

    for idx, (algo_name, results_list) in enumerate(ranked_results.items()):
        ax_train = axes1[idx, 0]
        ax_val = axes1[idx, 1]

        # Plot top 5 configs
        for i, result in enumerate(results_list[:5]):
            train_curve = result.get('train_losses') or []
            val_curve = result.get('val_losses') or []

            # Config label with key parameters
            if result.get('config'):
                key_params = {k: v for k, v in result['config'].items() if k in ['lr', 'latent_dim', 'd_model', 'hidden_dim']}
                config_label = f"Rank {i+1}: " + ", ".join([f"{k}={v}" for k, v in key_params.items()])
            else:
                config_label = "Baseline"

            # Records without a stored history contribute no curve.
            if train_curve:
                ax_train.plot(range(len(train_curve)), train_curve, label=config_label, alpha=0.8, linewidth=2)
            if val_curve:
                ax_val.plot(range(len(val_curve)), val_curve, label=config_label, alpha=0.8, linewidth=2)

        # Configure plots
        for ax, y_label in ((ax_train, 'Training Loss'), (ax_val, 'Validation Loss')):
            ax.set_xlabel('Epoch', fontsize=11)
            ax.set_ylabel(y_label, fontsize=11)
            ax.set_title(f'{algo_name} - {y_label} (Top 5 Configs)', fontsize=12, fontweight='bold')
            if ax.get_legend_handles_labels()[0]:
                ax.legend(fontsize=9, loc='best')
            else:
                ax.text(0.5, 0.5, 'No per-epoch history recorded',
                        transform=ax.transAxes, ha='center', va='center')
            ax.grid(True, alpha=0.3, linestyle='--')
            ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

# Figure 2: Performance Comparison - Val Loss vs Training Time
fig2, ax2 = plt.subplots(1, 1, figsize=(14, 8))
colors = plt.cm.tab10(np.linspace(0, 1, max(len(ranked_results), 1)))
markers = ['o', 's', '^', 'D']

for idx, (algo_name, results_list) in enumerate(ranked_results.items()):
    # Non-finite losses (e.g. an algorithm whose configs all failed) cannot be
    # placed on a log axis, so they are left out of the scatter.
    plotted = [r for r in results_list[:10] if np.isfinite(r['final_val_loss'])]
    if not plotted:
        continue
    scatter_val_losses = [r['final_val_loss'] for r in plotted]
    scatter_train_times = [r['train_time'] for r in plotted]

    ax2.scatter(scatter_train_times, scatter_val_losses, s=200, alpha=0.6,
               color=colors[idx], marker=markers[idx % len(markers)],
               label=algo_name, edgecolors='black', linewidth=1.5)

    # Annotate best config
    ax2.annotate(f'{algo_name}\nBest',
                xy=(scatter_train_times[0], scatter_val_losses[0]),
                xytext=(10, 10), textcoords='offset points',
                fontsize=9, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.5', facecolor=colors[idx], alpha=0.3),
                arrowprops=dict(arrowstyle='->', lw=1.5))

ax2.set_xlabel('Training Time (seconds)', fontsize=12, fontweight='bold')
ax2.set_ylabel('Final Validation Loss', fontsize=12, fontweight='bold')
ax2.set_title('Grid Search: Validation Loss vs Training Time', fontsize=14, fontweight='bold')
if ax2.get_legend_handles_labels()[0]:
    ax2.legend(fontsize=10, loc='best', framealpha=0.9)
ax2.grid(True, alpha=0.3, linestyle='--')
ax2.set_yscale('log')
plt.tight_layout()
plt.show()

# Print summary tables
print("\n" + "="*120)
print("GRID SEARCH RESULTS - TOP 5 CONFIGS PER ALGORITHM")
print("="*120)

for algo_name, results_list in ranked_results.items():
    print(f"\n{algo_name}:")
    print(f"{'Rank':<6} {'Val Loss':<12} {'Train Loss':<12} {'Time(s)':<10} {'Epochs':<8} {'Config'}")
    print("-" * 120)

    for rank, result in enumerate(results_list[:5], 1):
        config_str = ', '.join([f"{k}={v}" for k, v in (result.get('config') or {}).items()]) or "N/A (baseline)"
        history = result.get('train_losses') or []
        # NaN / 0 epochs mark records for which no history was stored.
        final_train_loss = history[-1] if history else float('nan')
        epochs_run = len(history)

        print(f"{rank:<6} {result['final_val_loss']:<12.6f} {final_train_loss:<12.6f} "
              f"{result['train_time']:<10.1f} {epochs_run:<8} {config_str}")

# Summary statistics
print("\n" + "="*120)
print("SUMMARY STATISTICS")
print("="*120)
print(f"{'Algorithm':<15} {'Configs':<10} {'Best Val':<12} {'Worst Val':<12} {'Avg Val':<12} {'Std Val':<12}")
print("-" * 120)

for algo_name, results_list in ranked_results.items():
    summary_val_losses = [r['final_val_loss'] for r in results_list]
    if not summary_val_losses:
        continue
    print(f"{algo_name:<15} {len(results_list):<10} {min(summary_val_losses):<12.6f} "
          f"{max(summary_val_losses):<12.6f} {np.mean(summary_val_losses):<12.6f} {np.std(summary_val_losses):<12.6f}")

print("="*120)

## Evaluate Best Models from Grid Search

In [None]:
print("="*60)
print("EVALUATING BEST MODELS FROM GRID SEARCH")
print("="*60)

def evaluate_agent_optimized(agent, env, num_episodes=50, max_steps=200, device='cpu'):
    """Run ``agent`` in ``env`` for several episodes and average the outcomes.

    At each step the agent's ``predict_action(state, prev_action, goal)`` is
    called with float32 tensors on ``device`` and the result is passed to
    ``env.step``. An episode counts as a success when it terminates with
    ``info['reason'] == 'goal_reached'``.

    Args:
        agent: Object exposing ``predict_action``.
        env: Environment exposing ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done, info)``; ``env.goal`` is used when
            present, otherwise a zero goal.
        num_episodes: Number of evaluation episodes.
        max_steps: Step limit per episode.
        device: Device the input tensors are created on.

    Returns:
        Dict with ``avg_reward``, ``success_rate``, ``avg_steps``,
        ``avg_final_dist`` (distance between ``state[:2]`` and the goal at the
        end of the episode) and ``avg_final_vel`` (mean of the final
        ``state[3]``).
    """
    results = {
        'rewards': [],
        'successes': [],
        'steps': [],
        'final_distances': [],
        'final_velocities': []
    }

    for episode in range(num_episodes):
        state = env.reset()
        goal = env.goal.copy() if hasattr(env, 'goal') else np.zeros(2)
        prev_action = np.zeros(ACTION_DIM)
        episode_reward = 0
        done = False
        # Defined up front so an episode with no steps (max_steps <= 0) still
        # produces a result instead of hitting unbound names below.
        info = {}
        steps_taken = 0

        for step in range(max_steps):
            state_t = torch.tensor(state, dtype=torch.float32, device=device)
            prev_action_t = torch.tensor(prev_action, dtype=torch.float32, device=device)
            goal_t = torch.tensor(goal, dtype=torch.float32, device=device)

            with torch.no_grad():
                action = agent.predict_action(state_t, prev_action_t, goal_t)

            # The environment is stepped with NumPy actions.
            if isinstance(action, torch.Tensor):
                action = action.cpu().numpy()

            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            prev_action = action
            state = next_state
            steps_taken = step + 1

            if done:
                break

        results['rewards'].append(episode_reward)
        # Hitting the step limit without termination is never a success.
        success = info.get('reason', '') == 'goal_reached' if done else False
        results['successes'].append(1 if success else 0)
        results['steps'].append(steps_taken)
        results['final_distances'].append(np.linalg.norm(state[:2] - goal))
        results['final_velocities'].append(state[3])

    return {
        'avg_reward': np.mean(results['rewards']),
        'success_rate': np.mean(results['successes']),
        'avg_steps': np.mean(results['steps']),
        'avg_final_dist': np.mean(results['final_distances']),
        'avg_final_vel': np.mean(results['final_velocities'])
    }

# Roll out the best agent of every algorithm and collect its averaged metrics.
eval_results = {}
# Build the evaluation environment from the same 'example0' layout used for
# data collection and training, so agents are scored on the map they were
# trained for rather than on the constructor's defaults.
env_eval = ContinuousNavigationEnv(**read_layout_dict('example0'))

for alg_name, result in grid_search_results.items():
    agent = result['best_agent']
    print(f"\nEvaluating {alg_name}...")
    eval_res = evaluate_agent_optimized(agent, env_eval, num_episodes=CONFIG['num_test_episodes'], device=CONFIG['device'])
    eval_results[alg_name] = eval_res
    print(f"{alg_name} - Success: {eval_res['success_rate']:.1%}, Avg Reward: {eval_res['avg_reward']:.2f}")

print("\n" + "="*60)
print("EVALUATION COMPLETE")
print("="*60)

In [None]:
# One table row per algorithm, combining training and evaluation metrics.
results_data = []
for alg_name, result in grid_search_results.items():
    eval_res = eval_results[alg_name]
    row = {
        'Algorithm': alg_name,
        'Val Loss': result['best_val_loss'],
        'Train Time (s)': result['train_time'],
    }
    row['Success Rate'] = eval_res['success_rate']
    row['Avg Reward'] = eval_res['avg_reward']
    row['Avg Steps'] = eval_res['avg_steps']
    results_data.append(row)

# Best (lowest) validation loss first.
df_results = pd.DataFrame(results_data).sort_values('Val Loss')

rule = "=" * 80
print("\n" + rule)
print("FINAL RESULTS SUMMARY")
print(rule)
print(df_results.to_string(index=False))
print(rule)

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# (column, y-axis label, title, bar colour) for the four panels, listed in
# row-major order to match `axes.flat`.
panel_specs = [
    ('Success Rate', 'Success Rate', 'Success Rate by Algorithm', 'skyblue'),
    ('Avg Reward', 'Average Reward', 'Average Reward by Algorithm', 'lightgreen'),
    ('Val Loss', 'Validation Loss', 'Validation Loss by Algorithm', 'salmon'),
    ('Train Time (s)', 'Training Time (s)', 'Training Time by Algorithm', 'plum'),
]
for panel, (column, y_label, panel_title, bar_color) in zip(axes.flat, panel_specs):
    panel.bar(df_results['Algorithm'], df_results[column], color=bar_color, edgecolor='black')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3, axis='y')

# Panel-specific axis tweaks: fixed range for rates, log scale for losses.
axes[0, 0].set_ylim([0, 1.1])
axes[1, 0].set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Persist the winning weights and their metadata for every algorithm.
save_dir = 'trained_models/grid_search_best'
os.makedirs(save_dir, exist_ok=True)

for alg_name, result in grid_search_results.items():
    agent = result['best_agent']
    model_path = os.path.join(save_dir, alg_name + '_best.pth')

    # Single-network agents store one state dict; encoder/decoder agents
    # store both under named keys. Agents exposing neither write no weights.
    weights = None
    if hasattr(agent, 'model'):
        weights = agent.model.state_dict()
    elif hasattr(agent, 'encoder'):
        weights = {
            'encoder': agent.encoder.state_dict(),
            'decoder': agent.decoder.state_dict(),
        }
    if weights is not None:
        torch.save(weights, model_path)

    # Metadata is written for every algorithm, next to its weights file.
    config_path = os.path.join(save_dir, alg_name + '_config.pkl')
    metadata = {
        'config': result['best_config'],
        'val_loss': result['best_val_loss'],
        'train_time': result['train_time'],
        'eval_results': eval_results[alg_name],
    }
    save_pickle(metadata, config_path)

print(f"Models and configs saved to {save_dir}/")