# Fair Comparison Validation: 5-Channel vs Multi-Channel Hilbert

This notebook ensures both experiments use:
1. Correct data paths
2. Same data loading approach
3. Same train/val/test splits
4. Same evaluation methodology

In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import LabelEncoder

def load_parquet_data_properly(base_path, dataset_name):
    """Load a class-per-directory parquet dataset, split-wise and mixed.

    The expected layout is ``<base_path>/<class_name>/<split>/*.parquet``
    with ``<split>`` being ``train``, ``val`` or ``test``. Every row's
    ``image_data`` value is converted to a ``float32`` array; parquet files
    without an ``image_data`` column are skipped. A file that cannot be read
    or converted is reported on stdout and skipped as a whole.

    Args:
        base_path: Dataset root directory, with or without a trailing slash.
        dataset_name: Name used only in the progress output.

    Returns:
        A tuple ``(X_mixed, y_mixed, X_train_orig, X_val_orig, X_test_orig)``.
        ``X_mixed`` is the train, val and test samples concatenated in that
        order, ``y_mixed`` holds the matching class-name labels, and the
        remaining three arrays are the samples of each original split.
        Labels of the original splits are not returned.

    Raises:
        FileNotFoundError: If ``base_path`` is not an existing directory.
    """
    import os  # local import: the file header does not import os

    split_names = ('train', 'val', 'test')

    print(f"\n{'='*60}")
    print(f"Loading {dataset_name} from: {base_path}")
    print(f"{'='*60}")

    # A wrong path must surface as a failure instead of an "empty but loaded"
    # dataset; the notebook exists to validate the data paths.
    if not os.path.isdir(base_path):
        raise FileNotFoundError(f"Dataset directory not found: {base_path}")

    # Method 1: Load respecting original splits (RECOMMENDED)
    print("\nMethod 1: Respecting original train/val/test splits")
    data_splits = {split: [] for split in split_names}
    label_splits = {split: [] for split in split_names}

    # Collect class directories in sorted order. Split names are compared with
    # the directory's own name only: a substring test on the whole path would
    # drop every class whenever base_path (or a class name) merely contains
    # 'train', 'val' or 'test'.
    class_dirs = []
    class_names = []
    for class_dir in sorted(glob.glob(os.path.join(base_path, "*", ""))):
        class_name = os.path.basename(os.path.dirname(class_dir))
        if class_name in split_names:
            continue
        class_dirs.append(class_dir)
        class_names.append(class_name)
    print(f"Found {len(class_names)} classes: {class_names}")

    for class_name, class_dir in zip(class_names, class_dirs):
        for split in split_names:
            pattern = os.path.join(class_dir, split, "*.parquet")
            for file_path in sorted(glob.glob(pattern)):
                try:
                    df = pd.read_parquet(file_path)
                    if 'image_data' not in df.columns:
                        continue
                    # Convert the whole file before touching the accumulators
                    # so a failure midway never leaves a partially loaded file.
                    images = [np.array(item, dtype=np.float32)
                              for item in df['image_data']]
                except Exception as e:
                    # Best effort: report the broken file and keep loading.
                    print(f"Error loading {file_path}: {e}")
                    continue
                data_splits[split].extend(images)
                label_splits[split].extend([class_name] * len(images))

    # Convert to arrays
    X_train_orig = np.array(data_splits['train'], dtype=np.float32)
    X_val_orig = np.array(data_splits['val'], dtype=np.float32)
    X_test_orig = np.array(data_splits['test'], dtype=np.float32)

    print(f"\nOriginal splits:")
    print(f"  Train: {len(X_train_orig):,} samples")
    print(f"  Val:   {len(X_val_orig):,} samples")
    print(f"  Test:  {len(X_test_orig):,} samples")
    print(f"  Total: {len(X_train_orig) + len(X_val_orig) + len(X_test_orig):,} samples")

    # Method 2: Mix all data (for backward compatibility)
    print("\nMethod 2: Mixed data (current notebook approach)")
    all_data = []
    all_labels = []
    for split in split_names:
        all_data.extend(data_splits[split])
        all_labels.extend(label_splits[split])

    X_mixed = np.array(all_data, dtype=np.float32)
    y_mixed = np.array(all_labels)

    print(f"\nMixed approach:")
    print(f"  Total samples: {len(X_mixed):,}")
    print(f"  Will be re-split: 70% train, 15% val, 15% test")
    print(f"  New sizes: Train={int(len(X_mixed)*0.7):,}, Val={int(len(X_mixed)*0.15):,}, Test={int(len(X_mixed)*0.15):,}")

    return X_mixed, y_mixed, X_train_orig, X_val_orig, X_test_orig

# Dataset roots to validate, keyed by experiment name
datasets = {
    '5channel': '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/5channel_32x32/',
    'multichannel_hilbert': '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/multichannel_hilbert_32x32/'
}

# Outcome per dataset: array shapes when loading worked, the error text otherwise
results = {}
for name, path in datasets.items():
    try:
        X_mixed, y_mixed, X_train, X_val, X_test = load_parquet_data_properly(path, name)
    except Exception as e:
        # Record the failure and continue so the other dataset is still checked
        results[name] = {'loaded': False, 'error': str(e)}
        print(f"\n❌ Error loading {name}: {e}")
    else:
        results[name] = {
            'loaded': True,
            'mixed_shape': X_mixed.shape,
            'orig_train': X_train.shape,
            'orig_val': X_val.shape,
            'orig_test': X_test.shape
        }

In [None]:
# Summary and recommendations: report the per-dataset load results collected in
# `results`, followed by the fixed list of findings and fixes.
print("\n" + "="*80)
print("SUMMARY AND RECOMMENDATIONS")
print("="*80)

if any(not r.get('loaded', False) for r in results.values()):
    # At least one dataset failed: list only the failures with their error text
    print("\n❌ One or more datasets failed to load")
    for name, result in results.items():
        if result.get('loaded', False):
            continue
        print(f"\n{name}: {result.get('error', 'Unknown error')}")
else:
    print("\n✅ Both datasets loaded successfully!")

    print("\n📊 Data Summary:")
    # Every entry is loaded in this branch, so no per-entry check is needed
    for name, result in results.items():
        print(f"\n{name}:")
        print(f"  Mixed approach: {result['mixed_shape']}")
        print(f"  Original splits: Train={result['orig_train']}, Val={result['orig_val']}, Test={result['orig_test']}")

    # Static report text, emitted line by line exactly as individual prints would
    print("\n".join((
        "\n⚠️  IMPORTANT FINDINGS:",
        "\n1. Data Leakage Issue:",
        "   - Current notebooks mix train/val/test then re-split",
        "   - This can lead to similar samples in train and test sets",
        "   - Results may be overly optimistic",
        "\n2. Path Issue in 5-channel notebook:",
        "   - Wrong: '/home/ubuntu/analyst/pcap-dataset-samples/parquet/5channel_32x32/'",
        "   - Correct: '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/5channel_32x32/'",
        "\n🔧 FIXES NEEDED:",
        "\n1. Fix the data path in 5-channel notebook",
        "\n2. Choose one approach:",
        "   Option A: Keep mixed approach (current) - easier but has data leakage",
        "   Option B: Use original splits - more correct but requires notebook changes",
        "\n3. Ensure both notebooks use the SAME approach",
        "\n📋 Quick Fix for Path (5-channel notebook):",
        "   Change line:",
        "   data_path = '/home/ubuntu/analyst/pcap-dataset-samples/parquet/5channel_32x32/'",
        "   To:",
        "   data_path = '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/5channel_32x32/'",
    )))

print("\n" + "="*80)

In [None]:
# Generate fixed data loading code for both notebooks.
# `fixed_code` is a source snippet meant to be pasted into each notebook; it is
# only printed here, never executed by this cell.
print("\n" + "="*80)
print("FIXED DATA LOADING CODE")
print("Copy this into both notebooks for fair comparison:")
print("="*80)

fixed_code = '''
# Fixed data loading code for fair comparison
# Requires in the target notebook: import glob, numpy as np, pandas as pd
# UPDATE THIS PATH for each notebook:
# For 5-channel: data_path = '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/5channel_32x32/'
# For multichannel_hilbert: data_path = '/home/ubuntu/analyst/notebooks/ViT-experiment/pcap-dataset-samples/parquet/multichannel_hilbert_32x32/'

def load_parquet_data(base_path):
    """Load all parquet files from train/val/test splits"""
    import os

    all_image_data = []
    all_labels = []
    splits = ['train', 'val', 'test']
    
    # Get all class directories - SORT for consistency.
    # Compare split names with the directory's own name only: a substring test
    # on the full path drops every class when base_path (or a class name)
    # merely contains 'train', 'val' or 'test'. os.path.join also makes the
    # trailing slash on base_path optional.
    class_dirs = sorted(d for d in glob.glob(os.path.join(base_path, "*", ""))
                        if os.path.basename(os.path.dirname(d)) not in splits)
    class_names = [os.path.basename(os.path.dirname(d)) for d in class_dirs]
    print(f"Found classes: {class_names}")
    
    for class_name, class_dir in zip(class_names, class_dirs):
        print(f"Loading {class_name}...")
        
        for split in splits:
            # SORT for consistency
            parquet_files = sorted(glob.glob(os.path.join(class_dir, split, "*.parquet")))
            
            for file_path in parquet_files:
                try:
                    df = pd.read_parquet(file_path)
                    
                    if 'image_data' in df.columns:
                        # Convert the whole file first so a failure midway
                        # never leaves a partially loaded file behind
                        images = [np.array(item, dtype=np.float32) for item in df['image_data']]
                        all_image_data.extend(images)
                        all_labels.extend([class_name] * len(images))
                        
                        print(f"   Loaded {len(df)} samples from {os.path.basename(file_path)}")
                        
                except Exception as e:
                    print(f"   Error loading {file_path}: {e}")
    
    if not all_image_data:
        raise ValueError("No image data was loaded successfully!")
    
    # Convert to numpy arrays
    X = np.array(all_image_data, dtype=np.float32)
    y = np.array(all_labels)
    
    print(f"\\n✓ Total samples loaded: {len(X)}")
    print(f"✓ Image data shape: {X.shape}")
    print(f"✓ Unique labels: {np.unique(y)}")
    
    return X, y
'''

print(fixed_code)
print("\n" + "="*80)