In [7]:
#!/usr/bin/env python
# coding=utf-8
"""
HBN and TUEG EEG Data Processing with OmnEEG
===========================================

This notebook processes HBN and TUEG EEG datasets using OmnEEG to create 
standardized HDF5 files for tokenizable timeseries processing.

Author: Created for EEG tokenization pipeline
Date: 2025-01-14
"""

import os
import sys
import shutil
import glob
import yaml
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Add OmnEEG to path
sys.path.append('./OmnEEG')
from omneeg.io import EEG

print("🧠 HBN and TUEG EEG Processing Pipeline")
print("=" * 50)


🧠 HBN and TUEG EEG Processing Pipeline


In [9]:
# Configuration
# =============

# Data paths
HBN_DATA_PATH = "/home/mahta/projects/ctb-gdumas85/data/HBN/EEG/"
SCRATCH_OUTPUT_PATH = "/home/mahta/scratch/omneeg_hdf5/"  # Adjust scratch path as needed
TUEG_DATA_PATH = "/path/to/tueg/data/"  # Update with actual TUEG path

# Create output directory
os.makedirs(SCRATCH_OUTPUT_PATH, exist_ok=True)
print(f"📁 Output directory: {SCRATCH_OUTPUT_PATH}")

# OmnEEG configuration
OMNEEG_CONFIG = {
    'data': SCRATCH_OUTPUT_PATH,
    'sfreq': 128,           # Sampling frequency in Hz
    'duration': 1,          # Epoch duration in seconds
    'epochs': 100,          # Number of epochs to extract (increased for more data)
    'resolution': 64,       # Resolution for topographic maps (increased for better detail)
    'overwrite': True       # Overwrite existing HDF5 files
}

print("⚙️  Configuration loaded")
print(f"   • Sampling rate: {OMNEEG_CONFIG['sfreq']} Hz")
print(f"   • Epoch duration: {OMNEEG_CONFIG['duration']} sec")
print(f"   • Number of epochs: {OMNEEG_CONFIG['epochs']}")
print(f"   • Topomap resolution: {OMNEEG_CONFIG['resolution']}x{OMNEEG_CONFIG['resolution']}")


📁 Output directory: /home/mahta/scratch/omneeg_hdf5/
⚙️  Configuration loaded
   • Sampling rate: 128 Hz
   • Epoch duration: 1 sec
   • Number of epochs: 100
   • Topomap resolution: 64x64


In [10]:
# Setup OmnEEG Configuration Files
# ===============================

def create_omneeg_config():
    """Create the main config.yaml file for OmnEEG"""
    config_path = "./OmnEEG/config.yaml"
    
    with open(config_path, 'w') as f:
        yaml.dump(OMNEEG_CONFIG, f, default_flow_style=False)
    
    print(f"✅ Created OmnEEG config: {config_path}")

def create_cohort_configs():
    """Create cohort configuration files for HBN and TUEG"""
    
    # HBN cohort configuration
    hbn_config = {
        'regexp': f'{HBN_DATA_PATH}**/*.tar.gz',  # HBN files are in tar.gz format
        'create_epochs': True,
        'rename_channels': False,  # We'll handle this based on actual channel names
        'set_montage': 'standard_1020'  # Standard 10-20 montage
    }
    
    # TUEG cohort configuration  
    tueg_config = {
        'regexp': f'{TUEG_DATA_PATH}**/*.edf',  # Assuming TUEG uses EDF format
        'create_epochs': True,
        'rename_channels': False,
        'set_montage': 'standard_1020'
    }
    
    # Write cohort configs
    hbn_path = "./OmnEEG/data/hbn.yaml"
    tueg_path = "./OmnEEG/data/tueg.yaml"
    
    with open(hbn_path, 'w') as f:
        yaml.dump(hbn_config, f, default_flow_style=False)
    
    with open(tueg_path, 'w') as f:
        yaml.dump(tueg_config, f, default_flow_style=False)
    
    print(f"✅ Created HBN cohort config: {hbn_path}")
    print(f"✅ Created TUEG cohort config: {tueg_path}")
    
    return hbn_config, tueg_config

# Create configuration files
create_omneeg_config()
hbn_config, tueg_config = create_cohort_configs()


✅ Created OmnEEG config: ./OmnEEG/config.yaml
✅ Created HBN cohort config: ./OmnEEG/data/hbn.yaml
✅ Created TUEG cohort config: ./OmnEEG/data/tueg.yaml


In [11]:
# Explore HBN Data Structure
# ==========================

def explore_hbn_data():
    """Explore the HBN data structure to understand file organization"""
    print("🔍 Exploring HBN data structure...")
    
    # List HBN files
    hbn_files = glob.glob(os.path.join(HBN_DATA_PATH, "*.tar.gz"))
    print(f"📊 Found {len(hbn_files)} HBN tar.gz files")
    
    if hbn_files:
        # Show first few files
        print("\n📋 Sample HBN files:")
        for i, file in enumerate(hbn_files[:5]):
            basename = os.path.basename(file)
            size_mb = os.path.getsize(file) / (1024*1024)
            print(f"   {i+1}. {basename} ({size_mb:.1f} MB)")
        
        if len(hbn_files) > 5:
            print(f"   ... and {len(hbn_files) - 5} more files")
    
    return hbn_files

def extract_and_process_hbn_sample(tar_file, extract_dir):
    """Extract a sample HBN tar.gz file to examine its contents"""
    import tarfile
    
    print(f"\n🔧 Extracting sample file: {os.path.basename(tar_file)}")
    
    # Create extraction directory
    os.makedirs(extract_dir, exist_ok=True)
    
    # Extract tar.gz file
    with tarfile.open(tar_file, 'r:gz') as tar:
        tar.extractall(extract_dir)
    
    # Find EEG files in extracted directory
    eeg_files = []
    for ext in ['*.edf', '*.fif', '*.mff']:
        eeg_files.extend(glob.glob(os.path.join(extract_dir, '**', ext), recursive=True))
    
    print(f"📁 Extracted to: {extract_dir}")
    print(f"🧠 Found {len(eeg_files)} EEG files:")
    
    for eeg_file in eeg_files[:3]:  # Show first 3
        rel_path = os.path.relpath(eeg_file, extract_dir)
        print(f"   • {rel_path}")
    
    return eeg_files

# Explore HBN data
hbn_files = explore_hbn_data()


🔍 Exploring HBN data structure...
📊 Found 3567 HBN tar.gz files

📋 Sample HBN files:
   1. NDARHZ255RA6.tar.gz (2834.9 MB)
   2. NDARMC325JCN.tar.gz (2952.9 MB)
   3. NDARZN148PMN.tar.gz (1414.9 MB)
   4. NDARZR412TBP.tar.gz (2592.3 MB)
   5. NDARME656MTN.tar.gz (2375.5 MB)
   ... and 3562 more files


In [16]:
# Process HBN Data with OmnEEG
# ============================

def create_hbn_extraction_pipeline():
    """Create a pipeline to extract and process HBN data"""
    import tarfile
    from tqdm import tqdm
    
    # Create temporary extraction directory
    temp_extract_dir = os.path.join(SCRATCH_OUTPUT_PATH, "temp_extract")
    os.makedirs(temp_extract_dir, exist_ok=True)
    
    print("🚀 Starting HBN data processing pipeline...")
    print(f"   Processing {len(hbn_files)} HBN files")
    
    processed_files = []
    
    # let's just do one participant for now
    one_participant = hbn_files[0]
    
    for i, tar_file in enumerate(tqdm(hbn_files[:1], desc="Processing HBN files")):  # Process first 3 for testing
        try:
            # Extract tar file
            subject_id = os.path.basename(tar_file).replace('.tar.gz', '')
            subject_extract_dir = os.path.join(temp_extract_dir, subject_id)
            
            print(f"\n📦 Processing subject: {subject_id}")
            
            # Extract tar.gz file
            with tarfile.open(tar_file, 'r:gz') as tar:
                tar.extractall(subject_extract_dir)
            
            # Find EEG files
            eeg_files = []
            for ext in ['*.edf', '*.fif', '*.mff']:
                eeg_files.extend(glob.glob(os.path.join(subject_extract_dir, '**', ext), recursive=True))
            
            print(f"   Found {len(eeg_files)} EEG files for {subject_id}")
            
            # Update HBN config for this specific subject
            if eeg_files:
                subject_config = {
                    'regexp': os.path.join(subject_extract_dir, '**/*.edf'),  # Assuming EDF format
                    'create_epochs': True,
                    'rename_channels': False,
                    'set_montage': 'standard_1020'
                }
                
                # Write subject-specific config
                subject_config_path = f"./OmnEEG/data/hbn_{subject_id}.yaml"
                with open(subject_config_path, 'w') as f:
                    yaml.dump(subject_config, f, default_flow_style=False)
                
                processed_files.append({
                    'subject_id': subject_id,
                    'config_path': subject_config_path,
                    'eeg_files': eeg_files,
                    'extract_dir': subject_extract_dir
                })
            
        except Exception as e:
            print(f"❌ Error processing {tar_file}: {str(e)}")
            continue
    
    return processed_files

# Create and run the extraction pipeline
processed_hbn_files = create_hbn_extraction_pipeline()


🚀 Starting HBN data processing pipeline...
   Processing 3567 HBN files


Processing HBN files:   0%|          | 0/1 [00:00<?, ?it/s]


📦 Processing subject: NDARHZ255RA6


Processing HBN files:   0%|          | 0/1 [00:11<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Generate HDF5 Files using OmnEEG
# =================================

def process_with_omneeg(processed_files):
    """Process extracted EEG files with OmnEEG to create HDF5 files"""
    
    print("🧠 Converting EEG data to HDF5 using OmnEEG...")
    
    # Change to OmnEEG directory to use relative paths
    original_cwd = os.getcwd()
    os.chdir('./OmnEEG')
    
    try:
        hdf5_outputs = []
        
        for file_info in processed_files:
            subject_id = file_info['subject_id']
            cohort_name = f"hbn_{subject_id}"
            
            print(f"\n🔄 Processing subject {subject_id} with OmnEEG...")
            
            try:
                # Create EEG dataset for this subject
                dataset = EEG(cohort=cohort_name)
                
                print(f"   📊 Dataset created for {len(dataset)} files")
                
                # Process all files in the dataset
                for i in range(len(dataset)):
                    print(f"   Processing file {i+1}/{len(dataset)}...")
                    
                    # This will automatically create HDF5 files
                    data = dataset[i]
                    
                    print(f"   ✅ Generated tensor shape: {data.shape}")
                    # Shape should be: (epochs, height, width, timepoints)
                
                hdf5_outputs.append({
                    'subject_id': subject_id,
                    'dataset': dataset,
                    'tensor_shape': data.shape if 'data' in locals() else None
                })
                
            except Exception as e:
                print(f"   ❌ Error processing {subject_id}: {str(e)}")
                continue
        
        return hdf5_outputs
        
    finally:
        # Return to original directory
        os.chdir(original_cwd)

# Process files with OmnEEG
if processed_hbn_files:
    hdf5_results = process_with_omneeg(processed_hbn_files)
else:
    print("⚠️  No processed files available. Skipping OmnEEG processing.")
    hdf5_results = []


In [None]:
# Visualize Results and Summary
# =============================

def visualize_hdf5_outputs(hdf5_results):
    """Visualize the generated HDF5 data"""
    
    if not hdf5_results:
        print("⚠️  No HDF5 results to visualize")
        return
    
    print("📈 Visualizing OmnEEG outputs...")
    
    # Create visualization for first subject
    first_result = hdf5_results[0]
    subject_id = first_result['subject_id']
    dataset = first_result['dataset']
    
    # Get sample data
    sample_data = dataset[0]  # Shape: (epochs, height, width, timepoints)
    
    print(f"🧠 Sample data from subject {subject_id}:")
    print(f"   Shape: {sample_data.shape}")
    print(f"   Data type: {sample_data.dtype}")
    print(f"   Value range: [{sample_data.min():.3f}, {sample_data.max():.3f}]")
    
    # Plot topographic maps
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle(f'EEG Topographic Maps - Subject {subject_id}', fontsize=16)
    
    # Plot 6 different timepoints from first epoch
    epoch_idx = 0
    timepoints = np.linspace(0, sample_data.shape[3]-1, 6, dtype=int)
    
    for i, (ax, t) in enumerate(zip(axes.flat, timepoints)):
        topomap = sample_data[epoch_idx, :, :, t]
        
        im = ax.imshow(topomap, cmap='RdBu_r', 
                      vmin=-np.abs(topomap).max(), 
                      vmax=np.abs(topomap).max())
        ax.set_title(f'Time: {t}/{sample_data.shape[3]-1}')
        ax.axis('off')
        
        # Add colorbar
        plt.colorbar(im, ax=ax, shrink=0.6)
    
    plt.tight_layout()
    plt.show()

def generate_summary_report(hdf5_results):
    """Generate a summary report of the processing"""
    
    print("\n" + "="*60)
    print("📋 PROCESSING SUMMARY REPORT")
    print("="*60)
    
    print(f"🗂️  Output Directory: {SCRATCH_OUTPUT_PATH}")
    print(f"📊 Total subjects processed: {len(hdf5_results)}")
    
    if hdf5_results:
        print(f"✅ Successfully processed subjects:")
        for result in hdf5_results:
            subject_id = result['subject_id']
            tensor_shape = result['tensor_shape']
            print(f"   • {subject_id}: {tensor_shape}")
    
    # List generated HDF5 files
    hdf5_files = glob.glob(os.path.join(SCRATCH_OUTPUT_PATH, "**/*.h5"), recursive=True)
    
    print(f"\n💾 Generated HDF5 files: {len(hdf5_files)}")
    total_size_mb = 0
    
    for hdf5_file in hdf5_files[:10]:  # Show first 10
        rel_path = os.path.relpath(hdf5_file, SCRATCH_OUTPUT_PATH)
        size_mb = os.path.getsize(hdf5_file) / (1024*1024)
        total_size_mb += size_mb
        print(f"   📄 {rel_path} ({size_mb:.1f} MB)")
    
    if len(hdf5_files) > 10:
        print(f"   ... and {len(hdf5_files) - 10} more files")
    
    print(f"\n💽 Total data size: {total_size_mb:.1f} MB")
    
    print("\n🎯 Next steps for tokenization:")
    print("   1. HDF5 files are ready for tokenizable timeseries processing")
    print("   2. Each file contains 4D tensors: (epochs, height, width, timepoints)")
    print("   3. Use these files as input for transformer-based EEG analysis")
    
    print("="*60)

# Generate visualizations and summary
if hdf5_results:
    visualize_hdf5_outputs(hdf5_results)

generate_summary_report(hdf5_results)
