# Overview
Compute per-channel summary statistics (std, mean, max, min) for each trial in an EDF file and collect them into a dataframe

Observe dataframe dimension and data types

Save dataframe to csv file

In [1]:
# Import the necessary packages from utils file
from utils import pd, np, mne

In [5]:
# Load one EDF file (PhysioNet uses .edf format)
path_name = '../data/raw/'
file_name = 'S001R04.edf'
raw = mne.io.read_raw_edf(path_name + file_name, preload=True)
raw.filter(1., 40.)  # Bandpass filtering (1-40 Hz)

# Assume annotations are already present
events, event_id = mne.events_from_annotations(raw)

# Epoching (trial segmentation, e.g., 0–1s after event)
epochs = mne.Epochs(raw, events, event_id=event_id, tmin=0, tmax=1, baseline=None, preload=True)

# Trial ids and labels are both read from epochs.events so that they line up
# row-for-row with the data returned by epochs.get_data().
trial_ids = epochs.events[:, 0]      # Sample index of event onset
epoch_labels = epochs.events[:, -1]  # Integer event code of each trial

# epoch_data shape: (n_epochs, n_channels, n_times)
epoch_data = epochs.get_data()
n_epochs, n_channels = epoch_data.shape[0], epoch_data.shape[1]

# Create column names for features.
# The channel count comes from the data itself (not a hard-coded 64), so the
# header always has exactly as many columns as each feature row.
stat_names = ('std', 'mean', 'max', 'min')
feature_names = ['trial_id', 'subject_id']
for ch in range(1, n_channels + 1):
    feature_names.extend(f'ch{ch}_{stat}' for stat in stat_names)

# Reduce over the time axis once for all epochs and channels.
# Stacking on a new last axis and flattening yields, per epoch,
# [ch1_std, ch1_mean, ch1_max, ch1_min, ch2_std, ...] -- the same order as feature_names.
channel_stats = np.stack(
    [
        epoch_data.std(axis=-1),
        epoch_data.mean(axis=-1),
        epoch_data.max(axis=-1),
        epoch_data.min(axis=-1),
    ],
    axis=-1,
).reshape(n_epochs, n_channels * len(stat_names))  # explicit size: also valid when n_epochs == 0

# Create a table of features: one row per trial, prefixed with its identifiers
features = [
    [trial_id, file_name, *stats_row]
    for trial_id, stats_row in zip(trial_ids, channel_stats)
]
labels = list(epoch_labels)

Extracting EDF parameters from /Users/miriamlandau/Documents/predict_hand_imagery/data/raw/S001R04.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 19999  =      0.000 ...   124.994 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 1 - 40 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 1.00
- Lower transition bandwidth: 1.00 Hz (-6 dB cutoff frequency: 0.50 Hz)
- Upper passband edge: 40.00 Hz
- Upper transition bandwidth: 10.00 Hz (-6 dB cutoff frequency: 45.00 Hz)
- Filter length: 529 samples (3.306 s)

Used Annotations descriptions: [np.str_('T0'), np.str_('T1'), np.str_('T2')]
Not setting metadata
30 matching events found
No baseline correction applied
0 projection items activated
Using data 

[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:    0.1s finished


In [6]:
# Build the feature table: one row per trial, one column per entry of
# feature_names, with the trial labels appended as the final 'label' column.
df = pd.DataFrame(data=features, columns=feature_names).assign(label=labels)

In [9]:
# Analyze dataframe shape, columns, data types, and sample rows
# df.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
display(df.describe())
display(df.head())
print(f" dataframe shape {df.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Columns: 259 entries, trial_id to label
dtypes: float64(256), int64(2), object(1)
memory usage: 60.8+ KB
None


Unnamed: 0,trial_id,ch1_std,ch1_mean,ch1_max,ch1_min,ch2_std,ch2_mean,ch2_max,ch2_min,ch3_std,...,ch62_min,ch63_std,ch63_mean,ch63_max,ch63_min,ch64_std,ch64_mean,ch64_max,ch64_min,label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,9632.0,4e-05,-5.384013e-07,9.9e-05,-9.7e-05,4.3e-05,-7.832657e-07,0.000103,-0.000102,4.5e-05,...,-0.000101,4.5e-05,7.744705e-07,0.000123,-0.000105,4.2e-05,7.138032e-07,0.000127,-9.8e-05,1.733333
std,5845.699628,1.1e-05,8.887085e-06,3.1e-05,2.7e-05,1.1e-05,9.368872e-06,3.1e-05,2.5e-05,1.1e-05,...,2.5e-05,9e-06,9.352656e-06,3.4e-05,3.1e-05,6e-06,6.93876e-06,2.3e-05,2.1e-05,0.827682
min,0.0,2.7e-05,-2.107512e-05,5.4e-05,-0.000175,2.9e-05,-2.335466e-05,5.6e-05,-0.000179,2.7e-05,...,-0.000151,3.3e-05,-2.406747e-05,5.7e-05,-0.000164,3e-05,-1.407863e-05,6.5e-05,-0.000145,1.0
25%,4820.0,3.3e-05,-4.23378e-06,7.4e-05,-0.000111,3.6e-05,-5.94979e-06,7.7e-05,-0.000113,3.8e-05,...,-0.000123,3.8e-05,-4.782154e-06,0.0001,-0.000136,3.8e-05,-2.997388e-06,0.000114,-0.000111,1.0
50%,9632.0,3.9e-05,-1.635169e-06,9.9e-05,-8.7e-05,4e-05,-2.778214e-06,0.0001,-9.4e-05,4.2e-05,...,-0.0001,4.3e-05,2.071886e-06,0.000116,-0.000103,4.2e-05,-1.138494e-06,0.000127,-9.9e-05,1.5
75%,14444.0,4.2e-05,2.895811e-06,0.000108,-7.5e-05,4.6e-05,3.191965e-06,0.000117,-8.5e-05,5e-05,...,-8.1e-05,5.3e-05,4.80674e-06,0.000138,-8e-05,4.4e-05,2.65805e-06,0.000138,-8.3e-05,2.0
max,19264.0,7.9e-05,2.923329e-05,0.000169,-6e-05,8e-05,3.139056e-05,0.000175,-6.5e-05,8.1e-05,...,-5.8e-05,6.6e-05,2.461599e-05,0.000192,-5.9e-05,5.4e-05,2.113615e-05,0.00018,-5.3e-05,3.0


Unnamed: 0,trial_id,subject_id,ch1_std,ch1_mean,ch1_max,ch1_min,ch2_std,ch2_mean,ch2_max,ch2_min,...,ch62_min,ch63_std,ch63_mean,ch63_max,ch63_min,ch64_std,ch64_mean,ch64_max,ch64_min,label
0,0,S001R04.edf,4e-05,-2.1e-05,0.000101,-0.000134,3.8e-05,-2.3e-05,9.2e-05,-0.000138,...,-5.8e-05,3.8e-05,1.294466e-05,0.000113,-5.9e-05,3.8e-05,-3.76182e-07,0.000114,-8.1e-05,1
1,672,S001R04.edf,2.7e-05,5e-06,6.7e-05,-6e-05,3.2e-05,1e-05,7.5e-05,-8.1e-05,...,-6.9e-05,3.3e-05,1.274179e-05,0.0001,-7.9e-05,3e-05,1.495558e-05,0.00013,-5.3e-05,3
2,1328,S001R04.edf,5.1e-05,2e-06,9.3e-05,-0.000122,5.2e-05,2e-06,9.3e-05,-0.000113,...,-8.2e-05,3.3e-05,-4.349705e-07,8e-05,-8e-05,4.7e-05,1.516147e-06,0.000112,-0.000106,1
3,2000,S001R04.edf,3.8e-05,1e-06,9.5e-05,-8.8e-05,4.6e-05,3e-06,0.000101,-0.000105,...,-0.000117,4.8e-05,9.954523e-06,0.000135,-0.000113,4.6e-05,8.908144e-06,0.000133,-0.000104,2
4,2656,S001R04.edf,3.8e-05,-1.3e-05,9.7e-05,-0.000105,3.7e-05,-1.1e-05,0.000103,-8.8e-05,...,-8e-05,4e-05,-1.790916e-05,0.000138,-8.5e-05,3.9e-05,-1.407863e-05,0.000152,-7.9e-05,1


 dataframe shape (30, 259)


In [10]:
# Write the feature table to the interim data folder as CSV, without the row index
output_csv = '../data/interim/eeg_motor_imagery.csv'
df.to_csv(output_csv, index=False)


# Key Takeaways

**Dimension**

30 rows X 259 columns

All 256 feature columns are float64; `trial_id` and `label` are int64, and `subject_id` is a string (object) column

**Column Naming Convention**

Features follow the pattern: `ch{channel_number}_{statistic}`

 Example: `ch1_std`, `ch1_mean`, `ch1_max`, `ch1_min`

**Key Identifiers**

`subject_id`: Name of the source EDF file (e.g. `S001R04.edf`), used to identify the subject the trial came from

`trial_id`: Sample index of the event onset, i.e. the position in the recording at which the trial starts (a sample count, not a clock timestamp)

**Note:**

 The dataset has far fewer rows (30) than columns (259); more trials should be added.