In [1]:
import gymnasium as gym

# Instantiate the continuous-control Lunar Lander task and show its spaces.
env = gym.make('LunarLanderContinuous-v3')

obs_space = env.observation_space   # 8-dimensional Box
act_space = env.action_space        # 2-dimensional Box
print(f"Observation space: {obs_space}")
print(f"Action space: {act_space}")

# Observation layout:
#   [x, y, vx, vy, angle, angular_velocity, left_leg_contact, right_leg_contact]
# Action layout:
#   [main_engine_power, side_engine_power], each component in [-1, 1]

Observation space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Action space: Box(-1.0, 1.0, (2,), float32)


In [None]:
import sys
sys.path.append('streaming_diffusion_policy')

import torch
import numpy as np
import gymnasium as gym
from diffusion_policy.policy.tedi_unet_lowdim_policy import TEDiUnetLowdimPolicy
from diffusion_policy.model.diffusion.conditional_unet1d_tedi import ConditionalUnet1D
from diffusion_policy.policy.schedulers import DDPMTEDiScheduler

# Lunar Lander specifics
obs_dim = 8           # LunarLander state dimension
action_dim = 2        # [main_engine, side_engine]

# Temporal parameters (keep these or adjust)
horizon = 16          # Length of the predicted action sequence; must be divisible by 4
n_obs_steps = 2       # Recent observations to condition on
n_action_steps = 4    # Actions to execute per prediction (reduced for LL)

# Diffusion parameters
num_train_timesteps = 100
num_inference_steps = 16

# Fail fast on an invalid configuration instead of surfacing a shape error
# later, when the network is built or first run.
assert horizon > 0 and horizon % 4 == 0, (
    f"horizon must be a positive multiple of 4, got {horizon}"
)
assert n_obs_steps > 0, f"n_obs_steps must be positive, got {n_obs_steps}"
assert n_action_steps > 0, f"n_action_steps must be positive, got {n_action_steps}"
assert num_train_timesteps > 0 and num_inference_steps > 0, (
    "diffusion step counts must be positive"
)

ImportError: cannot import name 'hf_cache_home' from 'huggingface_hub.constants' (c:\Users\puyua\AppData\Local\Programs\Python\Python312\Lib\site-packages\huggingface_hub\constants.py)

In [None]:
# Data collection
import pickle
from collections import deque

def collect_demonstrations(num_episodes=100, render=False,
                           env_id='LunarLanderContinuous-v3', policy_fn=None,
                           min_reward=200, max_steps=1000,
                           save_path='lunar_lander_demos.pkl'):
    """
    Roll out a controller in Lunar Lander and keep only the successful episodes.

    Args:
        num_episodes: Number of episodes to roll out.
        render: If True, open the environment with render_mode='human'.
        env_id: Gymnasium environment id. Defaults to the same v3 id that the
            first cell of this notebook creates.
        policy_fn: Optional callable mapping an observation to an action.
            When None, actions are sampled uniformly from the action space
            (a placeholder: random actions will rarely, if ever, pass the
            reward filter, so supply a heuristic or trained agent here).
        min_reward: An episode is kept only if its total reward is strictly
            greater than this value.
        max_steps: Upper bound on the number of steps per episode.
        save_path: Where the kept episodes are pickled.

    Returns:
        List of dicts with keys 'observations', 'actions', 'rewards'
        (numpy arrays, one row per step) and 'total_reward'.
    """
    env = gym.make(env_id, render_mode='human' if render else None)

    episodes = []
    successful_episodes = 0

    # try/finally so the environment (and any render window) is released even
    # if the rollout or the controller raises.
    try:
        for ep in range(num_episodes):
            obs, info = env.reset()
            observations, actions, rewards = [], [], []

            done = False
            truncated = False
            episode_reward = 0

            # One (obs, action) pair is stored per step, so len(actions) is
            # the step count.
            while not (done or truncated) and len(actions) < max_steps:
                if policy_fn is None:
                    action = env.action_space.sample()
                else:
                    action = policy_fn(obs)

                # Store the observation the action was chosen from
                observations.append(obs)
                actions.append(action)

                # Step environment
                obs, reward, done, truncated, info = env.step(action)
                rewards.append(reward)
                episode_reward += reward

            # Only keep successful episodes
            if episode_reward > min_reward:
                episodes.append({
                    'observations': np.array(observations),
                    'actions': np.array(actions),
                    'rewards': np.array(rewards),
                    'total_reward': episode_reward
                })
                successful_episodes += 1
                print(f"Episode {ep}: ✅ Reward = {episode_reward:.1f} (kept)")
            else:
                print(f"Episode {ep}: ❌ Reward = {episode_reward:.1f} (discarded)")
    finally:
        env.close()

    print(f"\n✅ Collected {successful_episodes} successful episodes")

    if not episodes:
        # Do not clobber a previously saved demo file with an empty list.
        print(f"No episode exceeded reward {min_reward}; '{save_path}' was not written.")
        return episodes

    # Save to disk
    with open(save_path, 'wb') as f:
        pickle.dump(episodes, f)

    return episodes

# Uncomment to collect data:
# episodes = collect_demonstrations(num_episodes=50)

In [None]:
# Dataset class
from torch.utils.data import Dataset, DataLoader

class LunarLanderDataset(Dataset):
    """Sliding-window dataset over recorded Lunar Lander episodes.

    Every sample pairs the first ``n_obs_steps`` observations of a window with
    the ``horizon`` actions that start at the same time step. Per-dimension
    mean/std statistics are computed over all episodes and are available
    through ``normalize_obs``, ``normalize_action`` and ``denormalize_action``.
    """

    def __init__(self, episodes, horizon=16, n_obs_steps=2, normalize=True):
        """
        Args:
            episodes: Non-empty list of episode dictionaries with
                'observations' and 'actions' arrays (one row per step; both
                are expected to have the same number of rows).
            horizon: Length of action sequence to predict
            n_obs_steps: Number of observation steps for conditioning
            normalize: If True (default), samples are z-scored with the
                dataset statistics. NOTE(review): if the policy that consumes
                these batches applies its own normalizer, pass False so the
                data is not normalized twice -- confirm against the policy.

        Raises:
            ValueError: If ``episodes`` is empty (no statistics can be computed).
        """
        if len(episodes) == 0:
            raise ValueError(
                "LunarLanderDataset needs at least one episode; "
                "the demonstration list is empty."
            )

        self.horizon = horizon
        self.n_obs_steps = n_obs_steps
        self.episodes = episodes
        self.normalize = normalize

        # Both slices in __getitem__ start at start_idx, so a window only
        # needs max(horizon, n_obs_steps) steps. range() of a non-positive
        # count is empty, which skips episodes that are too short.
        window = max(horizon, n_obs_steps)
        self.indices = [
            (ep_idx, start_idx)
            for ep_idx, episode in enumerate(episodes)
            for start_idx in range(len(episode['actions']) - window + 1)
        ]

        print(f"Dataset created with {len(self.indices)} samples from {len(episodes)} episodes")

        # Compute normalization statistics over every recorded step
        all_obs = np.concatenate([ep['observations'] for ep in episodes])
        all_actions = np.concatenate([ep['actions'] for ep in episodes])

        # The epsilon keeps constant dimensions from dividing by zero
        self.obs_mean = all_obs.mean(axis=0)
        self.obs_std = all_obs.std(axis=0) + 1e-8
        self.action_mean = all_actions.mean(axis=0)
        self.action_std = all_actions.std(axis=0) + 1e-8

    def normalize_obs(self, obs):
        """Z-score observations with the dataset statistics."""
        return (obs - self.obs_mean) / self.obs_std

    def normalize_action(self, action):
        """Z-score actions with the dataset statistics."""
        return (action - self.action_mean) / self.action_std

    def denormalize_action(self, action):
        """Invert ``normalize_action``."""
        return action * self.action_std + self.action_mean

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        """Return {'obs': (n_obs_steps, obs_dim), 'action': (horizon, action_dim)} float32 tensors."""
        ep_idx, start_idx = self.indices[idx]
        episode = self.episodes[ep_idx]

        # Observation window and action sequence share the same start step
        obs_seq = episode['observations'][start_idx:start_idx + self.n_obs_steps]
        action_seq = episode['actions'][start_idx:start_idx + self.horizon]

        if self.normalize:
            obs_seq = self.normalize_obs(obs_seq)
            action_seq = self.normalize_action(action_seq)

        # torch.tensor copies, so samples never alias the stored episode arrays
        return {
            'obs': torch.tensor(np.asarray(obs_seq), dtype=torch.float32),
            'action': torch.tensor(np.asarray(action_seq), dtype=torch.float32)
        }

# Read the saved demonstrations back from disk and wrap them for batched training.
# The pickle is the file written by collect_demonstrations; only load files you trust.
with open('lunar_lander_demos.pkl', 'rb') as demo_file:
    episodes = pickle.load(demo_file)

dataset = LunarLanderDataset(
    episodes,
    horizon=horizon,
    n_obs_steps=n_obs_steps,
)
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)

In [None]:
# Training
from diffusion_policy.common.normalize_util import LinearNormalizer

# Denoising network: conditional 1-D U-Net over the action sequence, with the
# flattened observation window (obs_dim * n_obs_steps) as global conditioning.
unet_config = dict(
    input_dim=action_dim,
    global_cond_dim=obs_dim * n_obs_steps,
    local_cond_dim=None,
    diffusion_step_embed_dim=256,
    down_dims=[256, 512, 1024],
    kernel_size=5,
    n_groups=8,
    cond_predict_scale=True,
    horizon=horizon,
)
model = ConditionalUnet1D(**unet_config)

# Noise schedule used for training and sampling.
scheduler_config = dict(
    num_train_timesteps=num_train_timesteps,
    beta_start=0.0001,
    beta_end=0.02,
    beta_schedule='squaredcos_cap_v2',
    variance_type='fixed_small',
    clip_sample=True,
    prediction_type='epsilon',
)
noise_scheduler = DDPMTEDiScheduler(**scheduler_config)

# Policy wrapper tying the network and the scheduler together.
policy_config = dict(
    model=model,
    noise_scheduler=noise_scheduler,
    horizon=horizon,
    obs_dim=obs_dim,
    action_dim=action_dim,
    n_action_steps=n_action_steps,
    n_obs_steps=n_obs_steps,
    num_inference_steps=num_inference_steps,
    obs_as_global_cond=True,
    obs_as_local_cond=False,
    pred_action_steps_only=False,
    temporally_constant_weight=0.2,
    temporally_increasing_weight=0.0,
    temporally_random_weights=0.0,
    chunk_wise_weight=0.8,
    buffer_init="zero",
)
policy = TEDiUnetLowdimPolicy(**policy_config)

# Prefer the GPU when one is visible to torch.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
policy = policy.to(device)

# Fit the policy's normalizer on the raw (un-normalized) demonstration data.
# NOTE(review): the DataLoader batches are already z-scored by
# LunarLanderDataset, while evaluate_policy feeds raw observations to the
# policy. If policy.compute_loss applies this normalizer as well, training
# data is normalized twice but evaluation data only once -- confirm.
# NOTE(review): confirm that fitting through normalizer['obs'].fit(...)
# actually stores the statistics on `normalizer` in this library version.
demo_obs = torch.from_numpy(np.concatenate([ep['observations'] for ep in episodes]))
demo_actions = torch.from_numpy(np.concatenate([ep['actions'] for ep in episodes]))

normalizer = LinearNormalizer()
normalizer['obs'] = LinearNormalizer()
normalizer['obs'].fit(demo_obs)
normalizer['action'] = LinearNormalizer()
normalizer['action'].fit(demo_actions)
policy.set_normalizer(normalizer)

# Optimizer
optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-4, weight_decay=1e-6)

# Training loop
num_epochs = 100

# An empty dataloader would otherwise only show up as a ZeroDivisionError when
# the first epoch's average loss is computed.
num_batches = len(dataloader)
if num_batches == 0:
    raise RuntimeError(
        "The dataloader yields no batches: the demonstrations contain no "
        "window long enough for the configured horizon."
    )

print(f"Starting training for {num_epochs} epochs...")

for epoch in range(num_epochs):
    policy.model.train()
    epoch_loss = 0.0

    for batch in dataloader:
        # Move every tensor of the batch to the training device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Compute loss
        loss = policy.compute_loss(batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs}: Loss = {avg_loss:.4f}")

    # Save checkpoint every 10 epochs. The stored 'epoch' is the 0-based loop
    # index, while the file name uses the 1-based epoch number.
    if (epoch + 1) % 10 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': policy.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, f'lunar_lander_ckpt_epoch_{epoch+1}.pt')

print("Training complete!")

In [None]:
# Evaluation

def evaluate_policy(policy, num_episodes=10, render=True,
                    env_id='LunarLanderContinuous-v3', max_steps=1000):
    """Evaluate trained policy on Lunar Lander.

    Relies on the notebook-level ``n_obs_steps``, ``n_action_steps`` and
    ``device`` defined in earlier cells.

    Args:
        policy: Trained policy exposing ``model``, ``reset_buffer()`` and
            ``predict_action({'obs': tensor})``.
        num_episodes: Number of evaluation episodes.
        render: If True, open the environment with render_mode='human'.
        env_id: Gymnasium environment id. Defaults to the same v3 id that the
            first cell of this notebook creates.
        max_steps: Upper bound on environment steps per episode.

    Returns:
        List with the total reward of each episode.
    """
    env = gym.make(env_id, render_mode='human' if render else None)

    policy.model.eval()
    all_rewards = []

    # try/finally so the environment (and any render window) is released even
    # if the policy raises mid-episode.
    try:
        for ep in range(num_episodes):
            obs, info = env.reset()
            policy.reset_buffer()  # Important: reset for each episode!

            # Seed the history by repeating the first observation
            obs_history = deque([obs] * n_obs_steps, maxlen=n_obs_steps)

            episode_reward = 0
            done = False
            truncated = False
            step_count = 0

            while not (done or truncated) and step_count < max_steps:
                # Prepare observation: add a leading batch dimension of 1
                obs_seq = np.array(list(obs_history))
                obs_tensor = torch.tensor(obs_seq, dtype=torch.float32).unsqueeze(0).to(device)

                # Predict actions
                with torch.no_grad():
                    result = policy.predict_action({'obs': obs_tensor})
                    action_seq = result['action'].cpu().numpy()[0]

                # Execute up to n_action_steps actions before re-planning
                for act_idx in range(min(n_action_steps, len(action_seq))):
                    # Also enforce the step cap inside the chunk so an episode
                    # never runs past max_steps.
                    if done or truncated or step_count >= max_steps:
                        break

                    action = action_seq[act_idx]
                    action = np.clip(action, -1, 1)  # Ensure valid action

                    obs, reward, done, truncated, info = env.step(action)
                    obs_history.append(obs)
                    episode_reward += reward
                    step_count += 1

            all_rewards.append(episode_reward)
            print(f"Episode {ep+1}: Reward = {episode_reward:.1f}")
    finally:
        env.close()

    if not all_rewards:
        # Nothing to summarise; avoids a NaN mean and a division by zero below.
        print("No evaluation episodes were run.")
        return all_rewards

    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)
    print(f"\nEvaluation Results:")
    print(f"   Mean Reward: {mean_reward:.1f} ± {std_reward:.1f}")
    print(f"   Success Rate: {sum(r > 200 for r in all_rewards) / len(all_rewards) * 100:.1f}%")

    return all_rewards

# Run evaluation
rewards = evaluate_policy(policy, num_episodes=5, render=True)