<a href="https://colab.research.google.com/github/papertuc2000/CL-Drive/blob/dev/Remove_Nan_EEG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import pandas as pd
import numpy as np
from glob import glob
from scipy.signal import butter, iirnotch, filtfilt

dataset_path = '/content/drive/MyDrive/Colab Notebooks/CL-Drive'
modalities = ['EEG']
clean_data_lists = {m: [] for m in modalities}

# Parameters based on the paper
EEG_SAMPLING_RATE = 256  # Hz
LOW_CUTOFF = 0.4         # Hz
HIGH_CUTOFF = 75.0       # Hz
NOTCH_FREQ = 60.0        # Hz
Q_FACTOR = 30.0          # Quality factor

def butter_bandpass_filter(data, lowcut, highcut, fs, order=2):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    if len(data.shape) == 1:
        return filtfilt(b, a, data)
    else:
        return np.apply_along_axis(lambda x: filtfilt(b, a, x), axis=0, arr=data)

def notch_filter(data, freq, fs, q=30):
    b, a = iirnotch(freq, Q=q, fs=fs)
    if len(data.shape) == 1:
        return filtfilt(b, a, data)
    else:
        return np.apply_along_axis(lambda x: filtfilt(b, a, x), axis=0, arr=data)

print("Starting robust EEG preprocessing (Fixed Header Handling)...")

for mod in modalities:
    print(f"\n--- Processing Modality: {mod} ---")

    pattern = os.path.join(dataset_path, mod, '*', f'{mod.lower()}_*_level_*.csv')
    files = glob(pattern)

    if not files:
        print(f"Warning: No files found for {mod}.")
        continue

    print(f"Found {len(files)} files for {mod}. Cleaning...")

    for file_path in files:
        try:
            # FIX 1: Use header=0 to treat the first row as column names automatically
            # This prevents the text headers from being read as data and turning into NaNs
            df_raw = pd.read_csv(file_path, header=0, low_memory=False)

            # Ensure all data columns are numeric (Timestamp might be float, others float)
            # We select only numeric columns to be safe for filtering
            df_numeric = df_raw.select_dtypes(include=[np.number])

            if df_numeric.empty:
                print(f"\n>> Warning: No numeric data found in {os.path.basename(file_path)}")
                continue

            data_values = df_numeric.values

            # Check for initial NaNs before filtering
            initial_nans = np.isnan(data_values).sum()

            # --- STEP 1: FILTERING ---
            # Apply filters only to the numeric data
            try:
                filtered_data = butter_bandpass_filter(data_values, LOW_CUTOFF, HIGH_CUTOFF, EEG_SAMPLING_RATE, order=2)
                filtered_data = notch_filter(filtered_data, NOTCH_FREQ, EEG_SAMPLING_RATE, Q_FACTOR)
            except ValueError as ve:
                # Handle cases where data length might be too short for filter
                print(f"\n>> Error filtering {os.path.basename(file_path)}: {ve}")
                continue

            df_filtered = pd.DataFrame(filtered_data, columns=df_numeric.columns)

            # Re-attach the Timestamp column if it was excluded from numeric selection but exists in original
            # Usually 'Timestamp' is numeric, so it should be in df_numeric.
            # If 'Timestamp' was string, we need to handle alignment, but based on your sample it looks numeric.
            # Let's assume all columns needed for analysis are in df_filtered now.

            # --- STEP 2: REMOVE MISSING DATA ---
            # Drop columns that are entirely empty (just in case)
            df_dropped_cols = df_filtered.dropna(axis=1, how='all')

            # Drop ANY row that has even a single NaN (Strict policy per paper)
            df_clean = df_dropped_cols.dropna(how='any')

            data_array = df_clean.values

            if data_array.size > 0:
                clean_data_lists[mod].append(data_array)

                # Report stats
                rows_removed = len(data_values) - len(data_array)
                print(f"\n>> Success: {os.path.basename(file_path)}")
                print(f"   Original Rows: {len(data_values)} | Cleaned Rows: {len(data_array)}")
                if rows_removed > 0:
                    print(f"   Removed {rows_removed} rows due to NaNs or filtering artifacts.")

                # Show sample
                print(df_clean.head())

            else:
                print(f"\n>> Warning: File {os.path.basename(file_path)} resulted in empty data after cleaning.")
                # Debug: Check how many NaNs were there initially
                print(f"   Initial NaN count in file: {initial_nans}")

        except Exception as e:
            print(f"Error processing {os.path.basename(file_path)}: {e}")
            import traceback
            traceback.print_exc()

# Final summary
print("\n" + "="*50)
print("PREPROCESSING COMPLETE.")
print("="*50)
for mod in modalities:
    count = len(clean_data_lists[mod])
    if count > 0:
        total_rows = sum(arr.shape[0] for arr in clean_data_lists[mod])
        cols = clean_data_lists[mod][0].shape[1]
        print(f"[{mod}] Loaded: {count} files | Total Valid Rows: {total_rows} | Cols: {cols}")
    else:
        print(f"[{mod}] - NO DATA LOADED. Check header handling.")

Starting robust EEG preprocessing (Fixed Header Handling)...

--- Processing Modality: EEG ---
Found 363 files for EEG. Cleaning...

>> Success: eeg_data_level_1.csv
   Original Rows: 46920 | Cleaned Rows: 46920
   Timestamp       TP9        AF7       AF8       TP10
0  -0.112873 -2.698526  -8.558943  0.327518  -1.591817
1  -0.111152 -4.634950  -5.137913 -4.349075   8.733447
2  -0.109426 -2.689655  -5.896077 -5.018556  13.586396
3  -0.107687 -2.061365 -10.775855 -0.196296  13.232770
4  -0.105972 -4.965163 -10.436949  2.287526  16.147916

>> Success: eeg_baseline_level_1.csv
   Original Rows: 30977 | Cleaned Rows: 30977
   Timestamp
0  -0.112873
1  -0.111152
2  -0.109426
3  -0.107687
4  -0.105972

>> Success: eeg_data_level_2.csv
   Original Rows: 47219 | Cleaned Rows: 47219
   Timestamp        TP9        AF7        AF8       TP10
0  -0.112873  -7.316270  -4.861035   1.417909   9.428047
1  -0.111152 -10.965125 -10.519540  -5.902217   4.801911
2  -0.109426  -5.055448 -13.684436 -11.896219