In [None]:
# Once the first pipeline is ready, we can extend it with more advanced visualizations and preprocessing steps as needed. and those steps are done here ..

In [1]:
import pandas as pd
from pathlib import Path
df = pd.read_csv('pain_dataset_outputs/files_metadata.csv')

print(df.shape)

(37, 7)


### **1) Show problematic files (duration dev > 1s, missing annotation, duplicates)**

In [2]:

# extract basename and numeric id
df['basename'] = df['file'].apply(lambda x: x.split('\\')[-1])
df['id'] = df['basename'].str.extract(r'ID(\d+)').astype(float).astype('Int64')

# missing IDs (0..43)
expected = set(range(44))
present_ids = set(df['id'].dropna().astype(int))
missing_ids = sorted(expected - present_ids)
print('Missing IDs:', missing_ids)

# duplicates
dups = df['id'][df['id'].duplicated(keep=False)]
print('Duplicate ID entries (check these files):')
print(df[df['id'].isin(dups)][['basename','file']])



Missing IDs: [12, 17, 28, 29, 32, 34, 36, 42]
Duplicate ID entries (check these files):
      basename                      file
6  ID15(1).gdf  Pain-Dataset\ID15(1).gdf
7     ID15.gdf     Pain-Dataset\ID15.gdf


In [3]:

# duration deviations (more than 1s)
df['duration_dev_sec'] = df['duration_sec'] - 600.0
bad_dur = df[df['duration_dev_sec'].abs() > 1.0][['basename','duration_sec','duration_dev_sec','n_samples']]
print('Files with duration deviating > 1s from 600s:')
print(bad_dur.to_string(index=False))

# files with unusual annotations
print('Distinct annotation sets and counts:')
print(df['annotations'].value_counts())

Files with duration deviating > 1s from 600s:
   basename  duration_sec  duration_dev_sec  n_samples
    ID1.gdf       605.824             5.824     151456
   ID10.gdf       605.952             5.952     151488
   ID11.gdf       605.824             5.824     151456
   ID13.gdf       605.824             5.824     151456
   ID14.gdf       605.824             5.824     151456
ID15(1).gdf       605.824             5.824     151456
   ID15.gdf       605.824             5.824     151456
   ID16.gdf       605.824             5.824     151456
   ID18.gdf       605.824             5.824     151456
   ID19.gdf       605.824             5.824     151456
    ID2.gdf       605.824             5.824     151456
   ID20.gdf       605.824             5.824     151456
   ID21.gdf       605.824             5.824     151456
   ID22.gdf       605.824             5.824     151456
   ID23.gdf       605.952             5.952     151488
   ID24.gdf       605.952             5.952     151488
   ID25.gdf       6

### **2) Inspect annotations inside a given GDF file to get event times (run per-file)**

In [4]:
import mne
from pathlib import Path
f = Path('Pain-Dataset') / 'ID1.gdf'   # change filename as needed
raw = mne.io.read_raw_gdf(str(f), preload=False, verbose='ERROR')
print('Sampling rate:', raw.info['sfreq'])
print('Channels:', raw.info['ch_names'][:6], '...')

# Show annotations (onset, duration, description)
ann = raw.annotations
print('Annotations object:', ann)
if ann is not None:
    for onset, duration, desc in zip(ann.onset, ann.duration, ann.description):
        print(f'onset(s)={onset:.3f}, dur={duration:.3f}, desc={desc}')
else:
    print('No annotations found in this file.')


Sampling rate: 250.0
Channels: ['FP1', 'FP2', 'F3', 'F4', 'C3', 'C4'] ...
Annotations object: <Annotations | 3 segments: 33024 (1), 33025 (1), 33029 (1)>
onset(s)=5.000, dur=0.004, desc=33024
onset(s)=305.000, dur=0.004, desc=33025
onset(s)=605.000, dur=0.004, desc=33029


In [7]:
import mne
from pathlib import Path

# Path to the folder containing .gdf files
folder_path = Path('Pain-Dataset')

# Loop through all .gdf files in the folder
for f in folder_path.glob('*.gdf'):
    print(f"Processing file: {f.name}")
    
    # Read the raw EEG data
    raw = mne.io.read_raw_gdf(str(f), preload=False, verbose='ERROR')
    
    # Print the sampling rate and first 6 channel names
    print('Sampling rate:', raw.info['sfreq'])
    print('Channels:', raw.info['ch_names'][:6], '...')
    
    # Show annotations (onset, duration, description)
    ann = raw.annotations
    print('Annotations object:', ann)
    
    if ann is not None:
        for onset, duration, desc in zip(ann.onset, ann.duration, ann.description):
            print(f'onset(s)={onset:.3f}, dur={duration:.3f}, desc={desc}')
    else:
        print('No annotations found in this file.')
    
    print('-' * 40)  # Print a separator between files


Processing file: ID0.gdf
Sampling rate: 250.0
Channels: ['FP1', 'FP2', 'F3', 'F4', 'C3', 'C4'] ...
Annotations object: <Annotations | 2 segments: 33024 (1), 33025 (1)>
onset(s)=0.000, dur=0.004, desc=33024
onset(s)=300.000, dur=0.004, desc=33025
----------------------------------------
Processing file: ID1.gdf
Sampling rate: 250.0
Channels: ['FP1', 'FP2', 'F3', 'F4', 'C3', 'C4'] ...
Annotations object: <Annotations | 3 segments: 33024 (1), 33025 (1), 33029 (1)>
onset(s)=5.000, dur=0.004, desc=33024
onset(s)=305.000, dur=0.004, desc=33025
onset(s)=605.000, dur=0.004, desc=33029
----------------------------------------
Processing file: ID10.gdf
Sampling rate: 250.0
Channels: ['FP1', 'FP2', 'F3', 'F4', 'C3', 'C4'] ...
Annotations object: <Annotations | 3 segments: 33024 (1), 33025 (1), 33029 (1)>
onset(s)=5.000, dur=0.004, desc=33024
onset(s)=305.000, dur=0.004, desc=33025
onset(s)=605.000, dur=0.004, desc=33029
----------------------------------------
Processing file: ID11.gdf
Sampling r

In [8]:
import mne
import pandas as pd
from pathlib import Path

DATA_DIR = Path('Pain-Dataset')

summary_rows = []

for f in sorted(DATA_DIR.glob('*.gdf')):
    try:
        raw = mne.io.read_raw_gdf(str(f), preload=False, verbose='ERROR')
        ann = raw.annotations
        sampling_rate = raw.info['sfreq']
        ch_names = ','.join(raw.info['ch_names'])
        
        if ann is not None and len(ann) > 0:
            for onset, duration, desc in zip(ann.onset, ann.duration, ann.description):
                summary_rows.append({
                    'file': f.name,
                    'sampling_rate': sampling_rate,
                    'num_channels': len(raw.info['ch_names']),
                    'onset_s': round(onset, 3),
                    'duration_s': round(duration, 3),
                    'event_code': desc
                })
        else:
            summary_rows.append({
                'file': f.name,
                'sampling_rate': sampling_rate,
                'num_channels': len(raw.info['ch_names']),
                'onset_s': None,
                'duration_s': None,
                'event_code': 'None'
            })
    except Exception as e:
        print(f"Error reading {f.name}: {e}")

# Create DataFrame
df_ann = pd.DataFrame(summary_rows)

# Save for later
df_ann.to_csv('annotations_summary.csv', index=False)

print("✅ Annotation summary saved as annotations_summary.csv")
display(df_ann.head())


✅ Annotation summary saved as annotations_summary.csv


Unnamed: 0,file,sampling_rate,num_channels,onset_s,duration_s,event_code
0,ID0.gdf,250.0,24,0.0,0.004,33024
1,ID0.gdf,250.0,24,300.0,0.004,33025
2,ID1.gdf,250.0,24,5.0,0.004,33024
3,ID1.gdf,250.0,24,305.0,0.004,33025
4,ID1.gdf,250.0,24,605.0,0.004,33029


### **Save the Data into EYE OPEN(EO) and EYE CLOSE(EC) Segments and save it for further use**

In [None]:
'''
# here we dont take the ID0, ID26 special cases into consideration ...

import mne
from pathlib import Path
import numpy as np

base = Path('Pain-Dataset')
save_dir = Path('Segments')
save_dir.mkdir(exist_ok=True)

for f in sorted(base.glob('*.gdf')):
    print(f"\n📁 Processing {f.name}")
    raw = mne.io.read_raw_gdf(str(f), preload=True, verbose='ERROR')
    ann = raw.annotations
    sfreq = raw.info['sfreq']
    total_dur = raw.times[-1]
    
    # --- Find event onsets safely ---
    codes = {desc: onset for onset, desc in zip(ann.onset, ann.description)}
    start1 = codes.get('33024', 0.0)      # EO start
    start2 = codes.get('33025', start1 + 300.0)  # EC start
    end    = codes.get('33029', start2 + 300.0)  # recording end or EC end fallback

    # --- Fix missing/extra durations ---
    # EO desired = [5s, 305s], EC = [305s, 605s]
    eo_start = max(start1, 5.0)
    eo_end   = eo_start + 300.0
    ec_start = eo_end
    ec_end   = ec_start + 300.0

    # Clamp within available recording length
    eo_end = min(eo_end, total_dur)
    ec_end = min(ec_end, total_dur)

    # Round down to nearest multiple of 5
    def round_down_5(x):
        return np.floor(x / 5) * 5

    eo_start = round_down_5(eo_start)
    eo_end   = round_down_5(eo_end)
    ec_start = round_down_5(ec_start)
    ec_end   = round_down_5(ec_end)

    # Skip if too short (<5s)
    if ec_end - ec_start < 5 or eo_end - eo_start < 5:
        print(f"⚠️ Skipped (too short): {f.name}")
        continue

    # --- Crop segments ---
    eo = raw.copy().crop(tmin=eo_start, tmax=eo_end)
    ec = raw.copy().crop(tmin=ec_start, tmax=ec_end)

    # --- Save both segments ---
    eo.save(save_dir / f"{f.stem}_EO_raw.fif", overwrite=True)
    ec.save(save_dir / f"{f.stem}_EC_raw.fif", overwrite=True)

    # Optionally save arrays
    np.save(save_dir / f"{f.stem}_EO.npy", eo.get_data())
    np.save(save_dir / f"{f.stem}_EC.npy", ec.get_data())

    print(f"✅ EO: {eo_start:.1f}-{eo_end:.1f}s | EC: {ec_start:.1f}-{ec_end:.1f}s | Total duration: {total_dur:.1f}s")

print("\n🎯 Segmentation complete: all EO & EC files saved cleanly.")


'''


In [10]:

import mne
from pathlib import Path
import numpy as np

base = Path('Pain-Dataset')
save_dir = Path('Segments')
save_dir.mkdir(exist_ok=True)

for f in sorted(base.glob('*.gdf')):
    print(f"\n📁 Processing {f.name}")
    raw = mne.io.read_raw_gdf(str(f), preload=True, verbose='ERROR')
    ann = raw.annotations
    sfreq = raw.info['sfreq']
    total_dur = raw.times[-1]

    # --- Handle ID0 and ID26 separately ---
    if 'ID0' in f.name or 'ID26' in f.name:
        eo_start, eo_end = 0.0, 300.0
        ec_start = eo_end
        ec_end = np.floor(total_dur / 5) * 5  # round down to multiple of 5
        print(f"⚙️ Using fixed timing for {f.name}")
    else:
        # --- Normal case: derive timings from annotations ---
        codes = {desc: onset for onset, desc in zip(ann.onset, ann.description)}
        start1 = codes.get('33024', 0.0)       # EO start
        start2 = codes.get('33025', start1 + 300.0)  # EC start
        end = codes.get('33029', start2 + 300.0)     # End

        # EO desired = [5s, 305s], EC = [305s, 605s]
        eo_start = max(start1, 5.0)
        eo_end = eo_start + 300.0
        ec_start = eo_end
        ec_end = ec_start + 300.0

        # Clamp within total duration
        eo_end = min(eo_end, total_dur)
        ec_end = min(ec_end, total_dur)

        # Round to nearest multiple of 5
        def round_down_5(x):
            return np.floor(x / 5) * 5

        eo_start = round_down_5(eo_start)
        eo_end = round_down_5(eo_end)
        ec_start = round_down_5(ec_start)
        ec_end = round_down_5(ec_end)

        # Skip if segment too short
        if ec_end - ec_start < 5 or eo_end - eo_start < 5:
            print(f"⚠️ Skipped (too short): {f.name}")
            continue

    # --- Crop both segments ---
    eo = raw.copy().crop(tmin=eo_start, tmax=eo_end)
    ec = raw.copy().crop(tmin=ec_start, tmax=ec_end)

    # --- Save both segments ---
    eo.save(save_dir / f"{f.stem}_EO_raw.fif", overwrite=True)
    ec.save(save_dir / f"{f.stem}_EC_raw.fif", overwrite=True)

    np.save(save_dir / f"{f.stem}_EO.npy", eo.get_data())
    np.save(save_dir / f"{f.stem}_EC.npy", ec.get_data())

    print(f"✅ EO: {eo_start:.1f}-{eo_end:.1f}s | EC: {ec_start:.1f}-{ec_end:.1f}s | Total duration: {total_dur:.1f}s")

print("\n🎯 Segmentation complete: all EO & EC files saved cleanly.")




📁 Processing ID0.gdf
⚙️ Using fixed timing for ID0.gdf
Writing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID0_EO_raw.fif
Closing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID0_EO_raw.fif
[done]
Writing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID0_EC_raw.fif
Closing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID0_EC_raw.fif
[done]
✅ EO: 0.0-300.0s | EC: 300.0-595.0s | Total duration: 599.8s

📁 Processing ID1.gdf
Writing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID1_EO_raw.fif
Closing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID1_EO_raw.fif
[done]
Writing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID1_EC_raw.fif
Closing c:\Users\User\Desktop\TCS-PROJECTS\Project-1-EEG-SIGNALS-PAIN\Segments\ID1_EC_raw.fif
[done]
✅ EO: 5.0-305.0s | EC: 305.0-605.0s | Total duration: 605.8s

📁 Processing ID10.g

In [11]:
import mne
import numpy as np
from pathlib import Path


# directory containing your processed files
DATA_DIR = Path("Segments")  
SAMPLING_RATE = 250  # Hz
EXPECTED_SEGMENT_SEC = [300, 600]  # 5 min or 10 min possible

print(f"\nChecking all .fif and .npy files in: {DATA_DIR}\n{'='*80}")

for file in sorted(DATA_DIR.glob("*")):
    if file.suffix == ".fif":
        raw = mne.io.read_raw_fif(file, preload=False, verbose="ERROR")
        n_channels = len(raw.ch_names)
        n_samples = raw.n_times
        duration_sec = n_samples / SAMPLING_RATE

        # Status check
        status = (
            "✅ Exact 5 min"
            if abs(duration_sec - 300) < 0.5
            else "✅ Exact 10 min"
            if abs(duration_sec - 600) < 0.5
            else "⚠️ Irregular length"
        )

        print(f"[FIF] {file.name}")
        print(f"   Channels: {n_channels}, Samples: {n_samples}, Duration: {duration_sec:.2f}s → {status}")

    elif file.suffix == ".npy":
        data = np.load(file)
        if data.ndim == 2:
            n_channels, n_samples = data.shape
        elif data.ndim == 3:
            n_channels, n_samples, _ = data.shape
        else:
            n_channels, n_samples = "?", "?"

        duration_sec = n_samples / SAMPLING_RATE if isinstance(n_samples, int) else "?"
        status = (
            "✅ Exact 5 min"
            if isinstance(duration_sec, float) and abs(duration_sec - 300) < 0.5
            else "✅ Exact 10 min"
            if isinstance(duration_sec, float) and abs(duration_sec - 600) < 0.5
            else "⚠️ Irregular length"
        )

        print(f"[NPY] {file.name}")
        print(f"   Shape: {data.shape}, Duration: {duration_sec}s → {status}")

print("\nCheck complete ✅\n")



Checking all .fif and .npy files in: Segments
[NPY] ID0_EC.npy
   Shape: (24, 73751), Duration: 295.004s → ⚠️ Irregular length
[FIF] ID0_EC_raw.fif
   Channels: 24, Samples: 73751, Duration: 295.00s → ⚠️ Irregular length
[NPY] ID0_EO.npy
   Shape: (24, 75001), Duration: 300.004s → ✅ Exact 5 min
[FIF] ID0_EO_raw.fif
   Channels: 24, Samples: 75001, Duration: 300.00s → ✅ Exact 5 min
[NPY] ID10_EC.npy
   Shape: (24, 75001), Duration: 300.004s → ✅ Exact 5 min
[FIF] ID10_EC_raw.fif
   Channels: 24, Samples: 75001, Duration: 300.00s → ✅ Exact 5 min
[NPY] ID10_EO.npy
   Shape: (24, 75001), Duration: 300.004s → ✅ Exact 5 min
[FIF] ID10_EO_raw.fif
   Channels: 24, Samples: 75001, Duration: 300.00s → ✅ Exact 5 min
[NPY] ID11_EC.npy
   Shape: (24, 75001), Duration: 300.004s → ✅ Exact 5 min
[FIF] ID11_EC_raw.fif
   Channels: 24, Samples: 75001, Duration: 300.00s → ✅ Exact 5 min
[NPY] ID11_EO.npy
   Shape: (24, 75001), Duration: 300.004s → ✅ Exact 5 min
[FIF] ID11_EO_raw.fif
   Channels: 24, Sampl