<a href="https://colab.research.google.com/github/papertuc2000/CL-Drive/blob/dev/Clean_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Since you have 363 files and over 13 million rows of data, printing the entire dataset would fill the screen and make it unreadable. Therefore, in the code below, I have added a section that displays the first 5 rows of each processed file as a sample.

1 Clean Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import os
dataset_path = '/content/drive/MyDrive/Colab Notebooks/CL-Drive'
if os.path.exists(dataset_path):
    print("Folders found:", os.listdir(dataset_path))
else:
    print("Path does not exist! Check your Google Drive mount.")

Folders found: ['ECG', 'Gaze', 'EDA', 'Labels', 'EEG']


In [None]:
import pandas as pd
import numpy as np
from glob import glob

In [2]:



dataset_path = '/content/drive/MyDrive/Colab Notebooks/CL-Drive'
# modalities = ['EEG', 'ECG', 'EDA', 'Gaze']
modalities = ['EEG']
clean_data_lists = {m: [] for m in modalities}

print("Starting robust data preprocessing with Glob...")

for mod in modalities:
    print(f"\n--- Processing Modality: {mod} ---")

    # Search pattern
    pattern = os.path.join(dataset_path, mod, '*', f'{mod.lower()}_*_level_*.csv')
    files = glob(pattern)

    if not files:
        print(f"Warning: No files found for {mod}.")
        continue

    print(f"Found {len(files)} files for {mod}. Cleaning...")

    for file_path in files:
        try:
            # Important change: added low_memory=False to remove Mixed Types warning
            df_raw = pd.read_csv(file_path, header=None, low_memory=False)

            # Drop columns that are completely empty
            df_dropped_cols = df_raw.dropna(axis=1, how='all')

            # Drop rows that are completely empty
            df_clean = df_dropped_cols.dropna(how='all')

            data_array = df_clean.values

            if data_array.size > 0:
                clean_data_lists[mod].append(data_array)

                # --- NEW SECTION: Print sample content of the cleaned file ---
                print(f"\n>> Sample content for: {os.path.basename(file_path)}")
                # Convert back to DataFrame temporarily for nice printing (optional)
                # If you strictly want numpy array output, just print data_array[:5]
                df_sample = pd.DataFrame(data_array)
                print(df_sample.head())
                # --------------------------------------------------------------

        except Exception as e:
            print(f"Error processing {os.path.basename(file_path)}: {e}")

# Final summary
print("\n" + "="*50)
print("PREPROCESSING COMPLETE.")
print("="*50)
for mod in modalities:
    count = len(clean_data_lists[mod])
    if count > 0:
        total_rows = sum(arr.shape[0] for arr in clean_data_lists[mod])
        cols = clean_data_lists[mod][0].shape[1]
        print(f"[{mod}] Loaded: {count} files | Total Rows: {total_rows} | Cols: {cols}")

        # Optional: Print a final combined sample if you want to see mixed data from all files
        # Uncomment the lines below if you want to see a sample from the combined list
        # print(f"\n>> Combined Sample for {mod} (First 5 rows of first file):")
        # print(pd.DataFrame(clean_data_lists[mod][0]).head())

    else:
        print(f"[{mod}] - NO DATA LOADED")

Starting robust data preprocessing with Glob...

--- Processing Modality: EEG ---
Found 363 files for EEG. Cleaning...

>> Sample content for: eeg_data_level_1.csv
              0             1            2             3             4
0     Timestamp           TP9          AF7           AF8          TP10
1   120.0078125  -21.97265625  -41.9921875   -24.4140625        -31.25
2  120.01171875  -22.94921875     -39.0625  -27.83203125   -20.5078125
3    120.015625      -23.4375  -34.1796875   -30.2734375  -15.13671875
4  120.01953125   -18.5546875  -43.9453125      -23.4375    -17.578125

>> Sample content for: eeg_baseline_level_1.csv
            0    1    2    3     4
0   Timestamp  TP9  AF7  AF8  TP10
1  0.00390625  NaN  NaN  NaN   NaN
2   0.0078125  NaN  NaN  NaN   NaN
3  0.01171875  NaN  NaN  NaN   NaN
4    0.015625  NaN  NaN  NaN   NaN

>> Sample content for: eeg_data_level_2.csv
              0             1             2             3              4
0     Timestamp           TP9    

2 Raw Data

In [3]:



dataset_path = '/content/drive/MyDrive/Colab Notebooks/CL-Drive'
# modalities = ['EEG', 'ECG', 'EDA', 'Gaze']
modalities = ['EEG']
raw_data_lists = {m: [] for m in modalities}

print("Starting raw data loading with Glob (No Cleaning)...")

for mod in modalities:
    print(f"\n--- Processing Modality: {mod} ---")

    # Search pattern
    pattern = os.path.join(dataset_path, mod, '*', f'{mod.lower()}_*_level_*.csv')
    files = glob(pattern)

    if not files:
        print(f"Warning: No files found for {mod}.")
        continue

    print(f"Found {len(files)} files for {mod}. Loading raw data...")

    for file_path in files:
        try:
            # Load CSV without cleaning (low_memory=False to avoid warnings)
            df_raw = pd.read_csv(file_path, header=None, low_memory=False)

            # Convert to numpy array (keeping all NaNs and empty cells)
            data_array = df_raw.values

            if data_array.size > 0:
                raw_data_lists[mod].append(data_array)

                # Display sample content of the raw file
                print(f"\n>> Raw content sample for: {os.path.basename(file_path)}")
                print(df_raw.head())
                print("-" * 30)

        except Exception as e:
            print(f"Error processing {os.path.basename(file_path)}: {e}")

# Final summary
print("\n" + "="*50)
print("RAW DATA LOADING COMPLETE.")
print("="*50)
for mod in modalities:
    count = len(raw_data_lists[mod])
    if count > 0:
        total_rows = sum(arr.shape[0] for arr in raw_data_lists[mod])
        cols = raw_data_lists[mod][0].shape[1]
        print(f"[{mod}] Loaded: {count} files | Total Rows (including empties): {total_rows} | Cols: {cols}")
    else:
        print(f"[{mod}] - NO DATA LOADED")

Starting raw data loading with Glob (No Cleaning)...

--- Processing Modality: EEG ---
Found 363 files for EEG. Loading raw data...

>> Raw content sample for: eeg_data_level_1.csv
              0             1            2             3             4
0     Timestamp           TP9          AF7           AF8          TP10
1   120.0078125  -21.97265625  -41.9921875   -24.4140625        -31.25
2  120.01171875  -22.94921875     -39.0625  -27.83203125   -20.5078125
3    120.015625      -23.4375  -34.1796875   -30.2734375  -15.13671875
4  120.01953125   -18.5546875  -43.9453125      -23.4375    -17.578125
------------------------------

>> Raw content sample for: eeg_baseline_level_1.csv
            0    1    2    3     4
0   Timestamp  TP9  AF7  AF8  TP10
1  0.00390625  NaN  NaN  NaN   NaN
2   0.0078125  NaN  NaN  NaN   NaN
3  0.01171875  NaN  NaN  NaN   NaN
4    0.015625  NaN  NaN  NaN   NaN
------------------------------

>> Raw content sample for: eeg_data_level_2.csv
              0    