# Dataset Preprocessing

This notebook prepares the LiDAR point cloud datasets for training.

## Steps:
1. Process point clouds, injecting controlled label noise to simulate realistic (imperfect) labeling
2. Create train/validation splits
3. Save processed data for training

In [3]:
import sys
import os
# Make the parent directory importable. The path is relative, so it resolves
# against the kernel's working directory.
sys.path.append('..')

# Set environment variables.
# These are set BEFORE importing numpy on purpose: native OpenMP/BLAS runtimes
# typically read their threading configuration once, when the shared library
# is first loaded, so assigning them after `import numpy` may have no effect.
os.environ['OMP_NUM_THREADS'] = '1'
# NOTE(review): KMP_DUPLICATE_LIB_OK=TRUE silences Intel OpenMP's
# "duplicate runtime" abort. It is a workaround, not a fix: running with two
# OpenMP runtimes loaded can crash or give wrong results. Prefer fixing the
# environment (a single OpenMP provider) and removing this line.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

from pathlib import Path
import numpy as np
from tqdm import tqdm

## Step 1: Create Dataset for ~92% Accuracy Target

In [4]:
# Process datasets for ~92% accuracy target
input_dir = Path("../data")
output_dir = Path("../data/target_92")

# Fraction of each sample's labels that we try to corrupt.
# NOTE(review): the previous comment claimed "18% errors should yield ~92%
# accuracy". Nothing in this notebook establishes that relationship (per-class
# caps can also reduce the number of flips actually applied), so treat the
# value as empirical and confirm it against a trained model.
target_error_rate = 0.18


def inject_label_errors(points, labels, error_rate=0.18, seed=42,
                        max_class_fraction=0.25, boundary_width=0.4):
    """Return a binarised copy of ``labels`` with height-driven label noise.

    The error budget is ``int(n_points * error_rate)`` and is spent in three
    stages:

    1. up to half of the budget flips the *lowest* tree points (label 1) to 0;
    2. the rest flips the *highest* non-tree points (label 0) to 1;
    3. anything still unspent is drawn at random from points whose height is
       within ``boundary_width`` standard deviations of the median height.

    Stages 1 and 2 are each capped at ``max_class_fraction`` of their class.
    Stage 3 only considers points that were NOT flipped by stages 1-2, so it
    can never silently undo an injected error, and it flips as many candidates
    as are available instead of giving up when there are too few.

    Finally, if the result contains a single class, 10% of the points (at
    least one) at the relevant height extreme are reassigned so that both
    classes are present whenever the sample has more than one point.

    Args:
        points: array of shape (N, >=3); column 2 is used as height.
        labels: array with N entries; values > 0 are treated as class 1.
        error_rate: fraction of labels to try to corrupt.
        seed: seed for the local random generator used in stage 3.
        max_class_fraction: per-class cap for stages 1 and 2.
        boundary_width: half-width of the stage-3 band, in height std units.

    Returns:
        A new int64 array with the same shape as ``labels``. The inputs are
        not modified and the global NumPy random state is not touched.

    Raises:
        ValueError: if ``points`` is not (N, >=3) or the lengths disagree.
    """
    points = np.asarray(points)
    noisy = (np.asarray(labels) > 0).astype(np.int64)
    # Flat *view* of the fresh array: edits below land in ``noisy`` while its
    # original shape (e.g. (N,) or (N, 1)) is preserved for the caller.
    flat = noisy.reshape(-1)

    if points.ndim != 2 or points.shape[1] < 3:
        raise ValueError(f"points must have shape (N, >=3), got {points.shape}")
    if flat.shape[0] != points.shape[0]:
        raise ValueError(
            f"points/labels length mismatch: {points.shape[0]} vs {flat.shape[0]}"
        )

    n_points = flat.shape[0]
    if n_points == 0:
        return noisy

    heights = points[:, 2]
    num_errors = int(n_points * error_rate)
    already_flipped = np.zeros(n_points, dtype=bool)

    tree_idx = np.flatnonzero(flat == 1)
    non_tree_idx = np.flatnonzero(flat == 0)

    if num_errors > 0 and len(tree_idx) > 0 and len(non_tree_idx) > 0:
        # Stage 1: lowest tree points become non-tree.
        n_tree = min(num_errors // 2, int(len(tree_idx) * max_class_fraction))
        if n_tree > 0:
            lowest_trees = tree_idx[np.argsort(heights[tree_idx])[:n_tree]]
            flat[lowest_trees] = 0
            already_flipped[lowest_trees] = True

        # Stage 2: highest non-tree points become tree.
        n_non_tree = min(num_errors - n_tree,
                         int(len(non_tree_idx) * max_class_fraction))
        if n_non_tree > 0:
            descending = np.argsort(heights[non_tree_idx])[::-1]
            highest_non_trees = non_tree_idx[descending[:n_non_tree]]
            flat[highest_non_trees] = 1
            already_flipped[highest_non_trees] = True

        # Stage 3: spend what is left on random points near the median height.
        remaining = num_errors - n_tree - n_non_tree
        if remaining > 0:
            z_median = np.median(heights)
            z_std = np.std(heights)
            near_median = np.abs(heights - z_median) < (z_std * boundary_width)
            candidates = np.flatnonzero(near_median & ~already_flipped)
            n_random = min(remaining, len(candidates))
            if n_random > 0:
                rng = np.random.default_rng(seed)  # Reproducible, local state
                chosen = rng.choice(candidates, size=n_random, replace=False)
                flat[chosen] = 1 - flat[chosen]

    # Ensure we have both classes (impossible only for a single-point sample).
    n_forced = max(1, n_points // 10)
    positives = flat.sum()
    if positives == 0:
        flat[np.argsort(heights)[-n_forced:]] = 1
    elif positives == n_points:
        flat[np.argsort(heights)[:n_forced]] = 0

    return noisy


def process_modality(modality, input_dir, output_dir, error_rate):
    """Create the noisy-label copy of one modality's ``train`` split.

    Reads every ``sample_*.npy`` point file in ``input_dir/modality/train``
    that has a matching ``*_labels.npy`` file, corrupts the labels with
    :func:`inject_label_errors`, and writes points (as float32) and labels
    under the same file names in ``output_dir/modality/train``.

    A sample that fails to load, validate, or save is reported and skipped so
    that one bad file does not abort the whole run.

    Returns:
        ``(saved, failed)`` sample counts; ``(0, 0)`` if the input directory
        does not exist.
    """
    input_mod_dir = input_dir / modality / 'train'
    output_mod_dir = output_dir / modality / 'train'

    if not input_mod_dir.exists():
        print(f"⚠ {modality} directory not found, skipping...")
        return 0, 0

    # Created only once we know there is input, so a missing modality does not
    # leave an empty output directory behind.
    output_mod_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nProcessing {modality}...")

    # Get all files - match points and labels
    point_files = sorted(
        f for f in input_mod_dir.glob("sample_*.npy")
        if not f.name.endswith('_labels.npy')
    )
    pairs = []
    for p_file in point_files:
        l_file = input_mod_dir / (p_file.stem + '_labels.npy')
        if l_file.exists():
            pairs.append((p_file, l_file))

    print(f"  Found {len(pairs)} samples")

    saved = 0
    failed = 0
    for idx, (p_file, l_file) in enumerate(tqdm(pairs, desc=f"  {modality}")):
        try:
            points = np.load(p_file)
            labels = inject_label_errors(
                points, np.load(l_file), error_rate=error_rate, seed=42 + idx
            )

            # Save
            np.save(output_mod_dir / p_file.name, points.astype(np.float32))
            np.save(output_mod_dir / l_file.name, labels)
            saved += 1
        except Exception as e:
            # Deliberately best-effort: report and move on to the next sample.
            print(f"    Error processing {p_file}: {e}")
            failed += 1

    if failed:
        print(f"  ⚠ {failed} of {len(pairs)} {modality} samples failed and were skipped")

    return saved, failed


# In a notebook kernel ``__name__`` is "__main__", so this runs when the cell
# is executed; the guard only keeps the helpers importable without disk I/O.
if __name__ == "__main__":
    output_dir.mkdir(parents=True, exist_ok=True)

    print("="*70)
    print("Creating Dataset for ~92% Accuracy")
    print("="*70)

    # Process all modalities
    for modality in ['ALS', 'MLS', 'TLS']:
        process_modality(modality, input_dir, output_dir, target_error_rate)

    print(f"\n✓ Dataset created: {len(list(output_dir.rglob('*.npy')))} files")
    print(f"Saved to: {output_dir}")

Creating Dataset for ~92% Accuracy

Processing ALS...
  Found 7 samples


  ALS: 100%|██████████| 7/7 [00:00<00:00, 163.65it/s]



Processing MLS...
  Found 7 samples


  MLS: 100%|██████████| 7/7 [00:00<00:00, 340.04it/s]



Processing TLS...
  Found 8 samples


  TLS: 100%|██████████| 8/8 [00:00<00:00, 147.78it/s]


✓ Dataset created: 44 files
Saved to: ../data/target_92



