In [153]:
import pyedflib
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import statistics
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa import stattools
from scipy.signal import periodogram
import logging
from itertools import groupby


def read_edf_file(file_path):
    """
    Read the first two channels of an .edf file into a single pandas DataFrame.

    Channel 0 is assumed to be the EEG stream and channel 1 the EMG stream.
    Both are assumed to share the sample frequency reported for channel 0.

    Parameters
    ----------
    file_path : str
        Path of the .edf file to open.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns:
        'Time' (seconds since the first sample, i.e. sample index divided by
        the sample frequency) followed by one column per channel, named after
        the channel label stored in the file. If the two channels differ in
        length, both are truncated to the shorter one.
    """
    f = pyedflib.EdfReader(file_path)
    try:
        # Assuming the EEG channel is the first channel and EMG is the second channel
        eeg_signal = f.readSignal(0)
        emg_signal = f.readSignal(1)

        # Channel labels become the DataFrame column names
        channel_labels = f.getSignalLabels()
        eeg_channel_name = channel_labels[0]
        emg_channel_name = channel_labels[1]

        # Assuming both streams have the same frequency
        sample_frequency = f.getSampleFrequency(0)
    finally:
        # Release the file handle even if one of the reads above fails
        f.close()

    # Timestamps in seconds, one per retained sample
    n_samples = min(len(eeg_signal), len(emg_signal))
    time = np.arange(n_samples) / sample_frequency

    return pd.DataFrame({
        'Time': time,
        eeg_channel_name: eeg_signal[:n_samples],
        emg_channel_name: emg_signal[:n_samples],
    })

# Recording analysed by the rest of the notebook.
file = 'edf_293.edf'

# One row per sample: 'Time' plus one column per channel label.
data = read_edf_file(file)

# Handles on the time axis and the EEG trace
# (relies on the first channel of the file being labelled 'EEG').
x = data["Time"]
y = data["EEG"]

In [154]:
# Labels
# Stage codes are read from the "NAPS_Numeric" column and converted to plain ints.
# NOTE(review): the first row is skipped — presumably it is not a scored segment; confirm against the CSV.
label_df = pd.read_csv("Data_293.csv")
labels = list(map(int, label_df["NAPS_Numeric"].iloc[1:]))

# Label List

Label 1: W (Awake)

Label 2: WA (Awake Artifact)?

Label 3: NR (NREM)

Label 4: Not defined

Label 5: R (REM)

Label 7: U (Artifacts?)


# Summary Statistics

In [155]:
# How many segments per label do you want to analyze?
# Used as the upper bound on the number of segment indices kept per label.
no_segments = 100

In [156]:
# For every distinct label, keep the positions of its first `no_segments`
# occurrences in `labels`.
indices_dict = {
    label: [pos for pos, value in enumerate(labels) if value == label][:no_segments]
    for label in set(labels)
}

In [157]:
def segment_data(df, segment_size, step_size = 2):
    """
    Cut the EEG and EMG columns of ``df`` into consecutive, non-overlapping segments.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; must contain the columns "EEG" and "EMG".
    segment_size : int or float
        Segment length in seconds.
    step_size : int or float, default 2
        Interpreted as the sampling interval in milliseconds: every segment
        holds ``int(segment_size * 1000 / step_size)`` samples.

    Returns
    -------
    tuple of (list of list of float, list of list of float)
        ``(eeg_segments, emg_segments)``. Only complete segments are returned;
        trailing samples that do not fill a whole segment are dropped. An
        empty ``df`` yields two empty lists.

    Raises
    ------
    ValueError
        If the arguments result in less than one sample per segment.
    """
    samples_per_segment = int(segment_size * 1000 / step_size)
    if samples_per_segment < 1:
        raise ValueError(
            "segment_size and step_size must yield at least one sample per segment"
        )

    # Count segments from the number of rows rather than from the truncated
    # last timestamp, which loses the final complete segment.
    n_rows = len(df)
    eeg_column = df["EEG"]
    emg_column = df["EMG"]

    eeg_segments = []
    emg_segments = []

    for i in range(n_rows // samples_per_segment):
        start_idx = int(i * segment_size * 1000 / step_size)
        end_idx = start_idx + samples_per_segment
        # With a fractional samples-per-segment ratio the start index can drift
        # forward; stop as soon as a segment would run past the data.
        if end_idx > n_rows:
            break
        eeg_segments.append(eeg_column.iloc[start_idx:end_idx].tolist())
        emg_segments.append(emg_column.iloc[start_idx:end_idx].tolist())

    return eeg_segments, emg_segments

In [158]:
# Segment the data
# Each entry of eeg_segments / emg_segments is the list of samples of one segment.
segment_length = 4  # seconds
eeg_segments, emg_segments = segment_data(
    data,
    segment_size=segment_length,
    step_size=2,
)

In [159]:
# Tunable parameters read by the feature-extraction loops.
window_size = 30  # divisor used to pick how many chunks a segment is split into for lumpiness/stability
freq = 1  # passed to scipy.signal.periodogram as the sampling frequency
lag_size = 30  # exclusive upper bound for the lags used in the Hurst exponent fit

In [160]:
# Statistical features for the EEG segments labelled 1 (W, awake).
# Every feature is computed from the current segment only.
# NOTE: the label-3 cell further down repeats this logic; keep the two in sync.
means1 = []
variance1 = []
entropy1 = []
lumpiness1 = []
stability1 = []
hurst1 = []
std_der1 = []
crossing_points1 = []
binarized_means1 = []

# Declared but not filled by this cell. Flat spots in particular were skipped
# because they take too long to compute.
flat_spots1 = []
unitroot_kpss1 = []
heterogeneity = []
histogram_mode = []
linearity = []

for label_idx in indices_dict[1]:
    segment = eeg_segments[label_idx]
    values = np.asarray(segment)  # converted once and reused by every feature below

    means1.append(statistics.mean(segment))
    variance1.append(statistics.variance(segment))

    # Entropy: Shannon entropy of the normalised periodogram, scaled by
    # log2(number of frequency bins). nansum discards the 0 * log2(0) terms.
    _, psd = periodogram(values, freq)
    psd_norm = psd / np.sum(psd)
    entropy = np.nansum(psd_norm * np.log2(psd_norm))
    entropy1.append(-(entropy / np.log2(psd_norm.size)))

    # Hurst exponent
    # The lag range is bounded by the length of this segment.
    lags = range(2, min(lag_size, len(values) - 1))
    # Standard deviation of the lagged differences, one value per lag
    tau = [np.std(values[lag:] - values[:-lag]) for lag in lags]
    # Slope of the log-log linear fit is the Hurst exponent estimate
    poly = np.polyfit(np.log(lags), np.log(tau), 1)
    hurst1.append(poly[0] if not np.isnan(poly[0]) else 0)

    # Standard deviation of the first derivative
    std_der1.append(np.std(np.gradient(values)))

    # Crossing points: number of times consecutive samples of the segment
    # pass the segment's median, in either direction.
    median = np.median(values)
    earlier, later = values[:-1], values[1:]
    upward = (earlier <= median) & (median < later)
    downward = (earlier >= median) & (median > later)
    crossing_points1.append(int(np.sum(upward | downward)))

    # Binarized mean: fraction of samples of the segment lying above its mean.
    binarized_means1.append(np.mean(values > np.mean(values)))

    # Lumpiness / stability: variance of the per-chunk variances / means.
    chunks = np.array_split(values, len(values) // window_size + 1)
    lumpiness1.append(np.var([np.var(chunk) for chunk in chunks]))
    stability1.append(np.var([np.mean(chunk) for chunk in chunks]))

In [161]:
# Assemble the label-1 features into one table, one row per analysed segment,
# and tag every row with its label.
feature_df1 = pd.DataFrame({
    "Mean": means1,
    "Variance": variance1,
    "Entropy": entropy1,
    "Lumpiness": lumpiness1,
    "Stability": stability1,
    "Hurst": hurst1,
    "STD_Derivative": std_der1,
    "Crossing_Points": crossing_points1,
    "Binarized_Means": binarized_means1,
}).assign(Label=1)

In [None]:
# Statistical features for the EEG segments labelled 3 (NR, NREM).
# Every feature is computed from the current segment only.
# NOTE: this mirrors the label-1 cell above; keep the two in sync.
means3 = []
variance3 = []
entropy3 = []
lumpiness3 = []
stability3 = []
hurst3 = []
std_der3 = []
crossing_points3 = []
binarized_means3 = []


for label_idx in indices_dict[3]:
    segment = eeg_segments[label_idx]
    values = np.asarray(segment)  # converted once and reused by every feature below

    means3.append(statistics.mean(segment))
    variance3.append(statistics.variance(segment))

    # Entropy: Shannon entropy of the normalised periodogram, scaled by
    # log2(number of frequency bins). nansum discards the 0 * log2(0) terms.
    _, psd = periodogram(values, freq)
    psd_norm = psd / np.sum(psd)
    entropy = np.nansum(psd_norm * np.log2(psd_norm))
    entropy3.append(-(entropy / np.log2(psd_norm.size)))

    # Hurst exponent
    # The lag range is bounded by the length of this segment.
    lags = range(2, min(lag_size, len(values) - 1))
    # Standard deviation of the lagged differences, one value per lag
    tau = [np.std(values[lag:] - values[:-lag]) for lag in lags]
    # Slope of the log-log linear fit is the Hurst exponent estimate
    poly = np.polyfit(np.log(lags), np.log(tau), 1)
    hurst3.append(poly[0] if not np.isnan(poly[0]) else 0)

    # Standard deviation of the first derivative
    std_der3.append(np.std(np.gradient(values)))

    # Crossing points: number of times consecutive samples of the segment
    # pass the segment's median, in either direction.
    median = np.median(values)
    earlier, later = values[:-1], values[1:]
    upward = (earlier <= median) & (median < later)
    downward = (earlier >= median) & (median > later)
    crossing_points3.append(int(np.sum(upward | downward)))

    # Binarized mean: fraction of samples of the segment lying above its mean.
    binarized_means3.append(np.mean(values > np.mean(values)))

    # Lumpiness / stability: variance of the per-chunk variances / means.
    chunks = np.array_split(values, len(values) // window_size + 1)
    lumpiness3.append(np.var([np.var(chunk) for chunk in chunks]))
    stability3.append(np.var([np.mean(chunk) for chunk in chunks]))

In [None]:
# Assemble the label-3 features into one table, one row per analysed segment,
# and tag every row with its label.
feature_df3 = pd.DataFrame({
    "Mean": means3,
    "Variance": variance3,
    "Entropy": entropy3,
    "Lumpiness": lumpiness3,
    "Stability": stability3,
    "Hurst": hurst3,
    "STD_Derivative": std_der3,
    "Crossing_Points": crossing_points3,
    "Binarized_Means": binarized_means3,
}).assign(Label=3)


In [None]:
# Stack the per-label tables. ignore_index renumbers the rows 0..n-1 so the
# combined frame does not carry each per-label table's 0-based index twice.
feature_df = pd.concat([feature_df1, feature_df3], ignore_index=True)

In [None]:
# Write the combined feature table to disk; with pandas' default settings the
# DataFrame index is written as the first (unnamed) CSV column.
feature_df.to_csv("Statistical_Features.csv")

This notebook contains modified functions from the Python package Kats, which is
only available for earlier Python versions.

MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
