In [None]:
#Project: Rest vs. Load Brain Dynamics via Microstate Features
#Dated: 08-01-2025

In [1]:
import pathlib as pl
import pyedflib
import pandas as pd
import numpy as np
# I imported .edf files through built-in python edf format library. I used for-loop iteration method to iterate through all files and then presented the data in python DataFrame format.
# I have define the path where all the .edf files are palaced (almost 72 files)
path = pl.Path('cod')

# List to hold dataframes
dfs = []

# Iterate through files in directory
for file in path.iterdir():
    if file.suffix == '.edf':  # Only process EDF files
        subject = file.stem  # Remove file extension
        try:
            # Open the EDF file
            with pyedflib.EdfReader(str(file)) as edf:
                num_signals = edf.signals_in_file
                signal_labels = edf.getSignalLabels()

                # Collect all signal data (ensure it's 2D: samples x signals)
                all_signal_data = np.array([edf.readSignal(i) for i in range(num_signals)]).T  # Transpose to (samples, signals)

                # Convert to DataFrame
                df = pd.DataFrame(all_signal_data, columns=signal_labels)
                
                # Add 'Subject' column
                df['Subject'] = subject
                
                # Append DataFrame to the list
                dfs.append(df)

        except Exception as e:
            print(f"Error processing file {file}: {e}")

#concatenate all dataframes into one
final_df = pd.concat(dfs, ignore_index=True)

In [2]:
final_df

Unnamed: 0,EEG Fp1,EEG Fp2,EEG F3,EEG F4,EEG F7,EEG F8,EEG T3,EEG T4,EEG C3,EEG C4,...,EEG P3,EEG P4,EEG O1,EEG O2,EEG Fz,EEG Cz,EEG Pz,EEG A2-A1,ECG ECG,Subject
0,-3.647938,-3.581866,-4.081247,-0.237508,-3.460168,-1.021940,-1.619108,4.479832,-1.957309,2.807482,...,1.076818,7.086144,4.313824,10.168745,-0.227751,0.358397,2.822136,1.291542,5.413076e-03,Subject00_1
1,-4.236482,-4.279388,-4.766219,-0.617699,-3.709403,-1.308793,-0.877716,5.607640,-1.426855,3.299914,...,2.962960,9.159618,6.237584,12.274858,-0.602733,4.160351,5.311727,0.973219,1.988144e-03,Subject00_1
2,-4.954218,-5.020849,-5.783455,-1.193804,-3.903253,-1.523457,0.171455,7.056246,-0.864033,3.822244,...,5.318983,11.767794,8.766892,14.835672,-1.163293,8.861065,8.367278,0.579025,-3.129572e-03,Subject00_1
3,-5.703854,-5.656125,-7.079280,-1.928968,-4.019562,-1.548153,1.453587,8.670264,-0.369542,4.274226,...,7.870239,14.621829,11.722487,17.598852,-1.882648,13.699181,11.597705,0.182633,-9.795724e-03,Subject00_1
4,-6.370552,-6.046078,-8.518234,-2.747541,-4.076794,-1.278398,2.842846,10.249195,-0.027893,4.578479,...,10.264316,17.350356,14.857342,20.304570,-2.682355,17.717742,14.534023,-0.119747,-1.766913e-02,Subject00_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337995,-0.000739,-0.000688,0.000232,-0.000037,-0.000067,0.000607,-0.000103,0.000069,-0.000939,-0.000238,...,-0.000305,-0.000935,-0.000853,0.000042,0.001176,-0.000409,-0.000890,0.000430,-1.241016e-07,Subject35_2
4337996,-0.000739,-0.000688,0.000232,-0.000037,-0.000067,0.000607,-0.000103,0.000069,-0.000939,-0.000238,...,-0.000305,-0.000935,-0.000853,0.000042,0.001176,-0.000409,-0.000890,0.000430,-1.241016e-07,Subject35_2
4337997,-0.000739,-0.000688,0.000232,-0.000037,-0.000067,0.000607,-0.000103,0.000069,-0.000939,-0.000238,...,-0.000305,-0.000935,-0.000853,0.000042,0.001176,-0.000409,-0.000890,0.000430,-1.241016e-07,Subject35_2
4337998,-0.000739,-0.000688,0.000232,-0.000037,-0.000067,0.000607,-0.000103,0.000069,-0.000939,-0.000238,...,-0.000305,-0.000935,-0.000853,0.000042,0.001176,-0.000409,-0.000890,0.000430,-1.241016e-07,Subject35_2


In [3]:
new_df = final_df.drop_duplicates()
new_df

Unnamed: 0,EEG Fp1,EEG Fp2,EEG F3,EEG F4,EEG F7,EEG F8,EEG T3,EEG T4,EEG C3,EEG C4,...,EEG P3,EEG P4,EEG O1,EEG O2,EEG Fz,EEG Cz,EEG Pz,EEG A2-A1,ECG ECG,Subject
0,-3.647938,-3.581866,-4.081247,-0.237508,-3.460168,-1.021940,-1.619108,4.479832,-1.957309,2.807482,...,1.076818,7.086144,4.313824,10.168745,-0.227751,0.358397,2.822136,1.291542,5.413076e-03,Subject00_1
1,-4.236482,-4.279388,-4.766219,-0.617699,-3.709403,-1.308793,-0.877716,5.607640,-1.426855,3.299914,...,2.962960,9.159618,6.237584,12.274858,-0.602733,4.160351,5.311727,0.973219,1.988144e-03,Subject00_1
2,-4.954218,-5.020849,-5.783455,-1.193804,-3.903253,-1.523457,0.171455,7.056246,-0.864033,3.822244,...,5.318983,11.767794,8.766892,14.835672,-1.163293,8.861065,8.367278,0.579025,-3.129572e-03,Subject00_1
3,-5.703854,-5.656125,-7.079280,-1.928968,-4.019562,-1.548153,1.453587,8.670264,-0.369542,4.274226,...,7.870239,14.621829,11.722487,17.598852,-1.882648,13.699181,11.597705,0.182633,-9.795724e-03,Subject00_1
4,-6.370552,-6.046078,-8.518234,-2.747541,-4.076794,-1.278398,2.842846,10.249195,-0.027893,4.578479,...,10.264316,17.350356,14.857342,20.304570,-2.682355,17.717742,14.534023,-0.119747,-1.766913e-02,Subject00_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337000,6.068225,11.341059,11.880622,8.704864,7.149732,2.870794,3.841710,-0.370489,4.694464,4.894113,...,3.596736,2.244284,-3.371009,-1.516534,14.620034,8.177969,-0.508389,-1.934205,-1.636739e-03,Subject35_2
4337001,7.019116,12.248549,12.594159,9.542338,6.573288,4.867168,1.733452,1.027583,4.945969,5.777945,...,3.499022,3.041885,-3.030298,-0.636897,15.447058,8.832793,0.324818,-1.605837,-1.587349e-03,Subject35_2
4337002,8.219520,13.526545,13.599800,10.963628,6.304100,7.081677,0.438521,3.076330,6.031199,7.366763,...,4.342815,4.717389,-1.673226,1.112122,16.521885,10.287248,2.070767,-0.185645,-1.133856e-03,Subject35_2
4337003,-1.528249,-2.248759,1.688192,-0.641564,-2.141335,1.230323,-0.693304,0.793943,-2.686290,-0.779050,...,-2.594910,-1.312907,-0.887858,-0.223285,0.543910,-1.307928,-3.667004,2.093092,-2.925377e-03,Subject35_2


In [4]:
new_df.columns

Index(['EEG Fp1', 'EEG Fp2', 'EEG F3', 'EEG F4', 'EEG F7', 'EEG F8', 'EEG T3',
       'EEG T4', 'EEG C3', 'EEG C4', 'EEG T5', 'EEG T6', 'EEG P3', 'EEG P4',
       'EEG O1', 'EEG O2', 'EEG Fz', 'EEG Cz', 'EEG Pz', 'EEG A2-A1',
       'ECG ECG', 'Subject'],
      dtype='object')

In [5]:
eeg_columns = [col for col in final_df.columns if 'EEG' in col]
eeg_data = new_df[eeg_columns].values  # Extract EEG signals as a NumPy array

In [7]:
#Labeling A,B,C and D
from sklearn.cluster import KMeans

# Perform k-means clustering (4 clusters for microstates A, B, C, D)
kmeans = KMeans(n_clusters=4, random_state=42)
microstates = kmeans.fit_predict(eeg_data)  # eeg_data is your EEG signals as a NumPy array

# Add the microstate labels to your DataFrame
new_df['Microstate'] = microstates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Microstate'] = microstates


In [8]:
new_df['Microstate']

0          2
1          2
2          2
3          2
4          2
          ..
4337000    0
4337001    0
4337002    3
4337003    0
4337514    0
Name: Microstate, Length: 4267195, dtype: int32

In [10]:
state_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
new_df['Microstate'] = new_df['Microstate'].map(state_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Microstate'] = new_df['Microstate'].map(state_mapping)


In [11]:
new_df['Microstate']

0          C
1          C
2          C
3          C
4          C
          ..
4337000    A
4337001    A
4337002    D
4337003    A
4337514    A
Name: Microstate, Length: 4267195, dtype: object

In [12]:
new_df.head()

Unnamed: 0,EEG Fp1,EEG Fp2,EEG F3,EEG F4,EEG F7,EEG F8,EEG T3,EEG T4,EEG C3,EEG C4,...,EEG P4,EEG O1,EEG O2,EEG Fz,EEG Cz,EEG Pz,EEG A2-A1,ECG ECG,Subject,Microstate
0,-3.647938,-3.581866,-4.081247,-0.237508,-3.460168,-1.02194,-1.619108,4.479832,-1.957309,2.807482,...,7.086144,4.313824,10.168745,-0.227751,0.358397,2.822136,1.291542,0.005413,Subject00_1,C
1,-4.236482,-4.279388,-4.766219,-0.617699,-3.709403,-1.308793,-0.877716,5.60764,-1.426855,3.299914,...,9.159618,6.237584,12.274858,-0.602733,4.160351,5.311727,0.973219,0.001988,Subject00_1,C
2,-4.954218,-5.020849,-5.783455,-1.193804,-3.903253,-1.523457,0.171455,7.056246,-0.864033,3.822244,...,11.767794,8.766892,14.835672,-1.163293,8.861065,8.367278,0.579025,-0.00313,Subject00_1,C
3,-5.703854,-5.656125,-7.07928,-1.928968,-4.019562,-1.548153,1.453587,8.670264,-0.369542,4.274226,...,14.621829,11.722487,17.598852,-1.882648,13.699181,11.597705,0.182633,-0.009796,Subject00_1,C
4,-6.370552,-6.046078,-8.518234,-2.747541,-4.076794,-1.278398,2.842846,10.249195,-0.027893,4.578479,...,17.350356,14.857342,20.30457,-2.682355,17.717742,14.534023,-0.119747,-0.017669,Subject00_1,C


In [17]:
#Task 1: Global Explained Variance (gev)
import numpy as np

# Placeholder for storing GEV values
gev_values = {}

# Loop through each microstate
for state in ['A', 'B', 'C', 'D']:
    # Select the data for the current microstate
    state_indices = new_df['Microstate'] == state
    state_data = new_df[state_indices].select_dtypes(include=[np.number])  # Numeric columns only (EEG, ECG)

    # Compute variance for this microstate
    state_variance = state_data.var().sum()

    # Compute total variance across all data
    total_variance = new_df.select_dtypes(include=[np.number]).var().sum()

    # Compute GEV
    gev_values[state] = state_variance / total_variance

# Convert GEV values into a dictionary
gev_row = {f"{state}_gev": gev_values[state] for state in gev_values}
print("GEV Values:", gev_row)

GEV Values: {'A_gev': 0.4583691916961327, 'B_gev': 0.6812122825202646, 'C_gev': 0.48234031335352734, 'D_gev': 0.7161840735536413}


In [21]:
len(new_df['Microstate'].isnull())

4267195

In [26]:
len(new_df['Microstate'])

4267195

In [29]:
#Task 2: Time coverage (timecov) 
# Placeholder for storing time coverage values
timecov_values = {}

# Total number of time points
total_time_points = len(new_df)

# Loop through each microstate
for state in ['A', 'B', 'C', 'D']:
    # Count the number of time points assigned to this microstate
    time_points_in_state = (new_df['Microstate'] == state).sum()

    # Compute time coverage
    timecov_values[state] = time_points_in_state / total_time_points

# Convert time coverage values into a dictionary
timecov_row = {f"{state}_timecov": timecov_values[state] for state in timecov_values}
print("Time Coverage Values:", timecov_row)

Time Coverage Values: {'A_timecov': 0.3021094653513608, 'B_timecov': 0.22020671659017224, 'C_timecov': 0.2719287494478223, 'D_timecov': 0.2057550686106447}


In [30]:
#Task 3: Mean durations (meandurs)
# Placeholder for storing mean duration values
meandurs_values = {}

# Sampling rate (in Hz), adjust based on your dataset
sampling_rate = 100  # Example: 100 Hz

# Loop through each microstate
for state in ['A', 'B', 'C', 'D']:
    # Find consecutive segment durations
    durations = []
    current_duration = 0

    for i in range(len(new_df)):
        if new_df['Microstate'].iloc[i] == state:
            current_duration += 1
        else:
            if current_duration > 0:
                durations.append(current_duration)
                current_duration = 0

    # Add the last segment if it exists
    if current_duration > 0:
        durations.append(current_duration)

    # Compute mean duration in seconds
    if len(durations) > 0:
        meandurs_values[state] = np.mean(durations) / sampling_rate
    else:
        meandurs_values[state] = 0  # No segments found for this state

# Convert mean duration values into a dictionary
meandurs_row = {f"{state}_meandurs": meandurs_values[state] for state in meandurs_values}
print("Mean Duration Values:", meandurs_row)

Mean Duration Values: {'A_meandurs': 0.12752975160010682, 'B_meandurs': 0.15499373206215156, 'C_meandurs': 0.1185360397172394, 'D_meandurs': 0.15003109994702757}


In [31]:
#Task 4: Occurrence per seconds (occurrence)
# Placeholder for storing occurrence values
occurrence_values = {}

# Total duration of the dataset in seconds
total_duration = len(new_df) / sampling_rate  # len(new_df) = total time points

# Loop through each microstate
for state in ['A', 'B', 'C', 'D']:
    # Create a boolean mask for rows where Microstate equals the current state
    mask = new_df['Microstate'] == state

    # Identify where transitions into the state occur (new segments start)
    transitions = mask & ~mask.shift(fill_value=False)

    # Count the number of transitions (segments)
    num_segments = transitions.sum()

    # Compute occurrence per second
    occurrence_values[state] = num_segments / total_duration

# Convert occurrence values into a dictionary
occurrence_row = {f"{state}_occurrence": occurrence_values[state] for state in occurrence_values}
print("Occurrence Values:", occurrence_row)


Occurrence Values: {'A_occurrence': 2.368933221940877, 'B_occurrence': 1.4207459466933197, 'C_occurrence': 2.294059680891077, 'D_occurrence': 1.3714161176135613}


In [32]:
# Combine all feature rows into a single dictionary
final_row = {**gev_row, **timecov_row, **meandurs_row, **occurrence_row}

# Add Target (Rest or Load) based on the subject's condition
final_row['Target'] = 'Rest'  # Replace 'Rest' with 'Load' for Load data

# Convert to a DataFrame for saving or further processing
final_row_df = pd.DataFrame([final_row])
print(final_row_df)


      A_gev     B_gev    C_gev     D_gev  A_timecov  B_timecov  C_timecov  \
0  0.458369  0.681212  0.48234  0.716184   0.302109   0.220207   0.271929   

   D_timecov  A_meandurs  B_meandurs  C_meandurs  D_meandurs  A_occurrence  \
0   0.205755     0.12753    0.154994    0.118536    0.150031      2.368933   

   B_occurrence  C_occurrence  D_occurrence Target  
0      1.420746       2.29406      1.371416   Rest  


In [36]:
#Final_dataset including new feautres and target Load/Rest
# Placeholder for the final dataset
final_dataset = []

# Iterate over unique subjects
for subject in new_df['Subject'].unique():
    # Filter data for the current subject
    subject_data = new_df[new_df['Subject'] == subject]

    # Determine the target label (e.g., based on '_1' or '_2' in the subject name)
    target = 'Rest' if '_1' in subject else 'Load'

    # Calculate features for the subject
    gev_row = {f"{state}_gev": gev_values[state] for state in gev_values}  # Task 1
    timecov_row = {f"{state}_timecov": timecov_values[state] for state in timecov_values}  # Task 3
    meandurs_row = {f"{state}_meandurs": meandurs_values[state] for state in meandurs_values}  # Task 4
    occurrence_row = {f"{state}_occurrence": occurrence_values[state] for state in occurrence_values}  # Task 5

    # Combine all features into a single row
    final_row = {**gev_row, **timecov_row, **meandurs_row, **occurrence_row}
    final_row['Target'] = target

    # Append to the final dataset
    final_dataset.append(final_row)

# Convert the dataset into a DataFrame
final_dataset_df = pd.DataFrame(final_dataset)

# Save the dataset to a CSV file
final_dataset_df.to_csv("final_microstates_dataset.csv", index=False)
print("Final dataset created and saved successfully!")


Final dataset created and saved successfully!


In [37]:
final_dataset_df

Unnamed: 0,A_gev,B_gev,C_gev,D_gev,A_timecov,B_timecov,C_timecov,D_timecov,A_meandurs,B_meandurs,C_meandurs,D_meandurs,A_occurrence,B_occurrence,C_occurrence,D_occurrence,Target
0,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Rest
1,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Load
2,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Rest
3,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Load
4,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Rest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Load
68,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Rest
69,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Load
70,0.458369,0.681212,0.48234,0.716184,0.302109,0.220207,0.271929,0.205755,0.12753,0.154994,0.118536,0.150031,2.368933,1.420746,2.29406,1.371416,Rest


In [41]:
len(final_dataset_df[final_dataset_df['Target'] == 'Load'])

36

In [42]:
len(final_dataset_df[final_dataset_df['Target'] == 'Rest'])

36