In [190]:
import pyedflib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import statistics
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa import stattools
from scipy.signal import periodogram
import logging
from itertools import groupby
import scipy.stats as stats
from statsmodels.tsa.stattools import acf, kpss, pacf
from statsmodels.stats.diagnostic import het_arch
from scipy.signal import find_peaks

In [191]:
# Select which signal to analyse: "EEG" or "EMG".
# Note: the selection cell further below treats every value other than "EEG" as EMG.
data_type = "EEG"
#data_type = "EMG"

In [192]:
# Choose the subject to analyse; the identifier is used as the directory name
# below "Data/" when the time-series CSV files are loaded.
subject = "m292"

In [193]:
# Labels of the runs to load; each label is read from the subject's "run0<label>" directory.
label_list = [0, 1, 2, 3, 4]

In [194]:
# Load the raw EEG & EMG time series of every run, keyed by the run label
data = {}

for label in label_list:
    csv_path = f"Data/{subject}/run0{label}/Time_Series_Data.csv"
    data[label] = pd.read_csv(csv_path)

# Segmenting Data

In [195]:
def segment_data(df, segment_size, step_size = 2):
    """
    Cuts a recording into consecutive, non-overlapping EEG and EMG segments.

    Parameters:
    - df (DataFrame): The input dataframe containing the columns "time", "voltage"
      (the EEG signal) and "emg". The last "time" value is divided by `segment_size`
      to obtain the number of segments, so it is expected to be given in seconds.
    - segment_size (int or float): The desired size of each segment in seconds.
    - step_size (float, optional): The spacing between two consecutive rows in
      milliseconds. Default is 2 milliseconds.

    Returns:
    Tuple of two lists:
    - List of EEG segments (each a list of "voltage" values).
    - List of EMG segments (each a list of "emg" values).

    Note: segments are sliced by row position. If the frame holds fewer rows than the
    "time" column suggests, trailing segments are shorter (or empty).
    """

    # The outer int() keeps the count usable by range() when segment_size is a float
    # (int // float yields a float).
    n_segments = int(int(df["time"].iloc[-1]) // segment_size)
    samples_per_segment = int(segment_size * 1000 / step_size)
    eeg_segments = []
    emg_segments = []

    for i in range(n_segments):
        start_idx = int(i * segment_size * 1000 / step_size)
        end_idx = start_idx + samples_per_segment
        segment = df.iloc[start_idx:end_idx]
        eeg_segments.append(list(segment["voltage"]))
        emg_segments.append(list(segment["emg"]))

    return eeg_segments, emg_segments

In [196]:
# Split every run into fixed-length EEG and EMG segments
segment_size = 4  # seconds
eeg_segments, emg_segments = {}, {}

for label in label_list:
    eeg_of_run, emg_of_run = segment_data(data[label], segment_size, step_size = 2)
    eeg_segments[label] = eeg_of_run
    emg_segments[label] = emg_of_run

In [197]:
# Work on the EEG segments when requested, otherwise on the EMG segments
segments = eeg_segments if data_type == "EEG" else emg_segments

## Compute Features

This section computes all features of the categories "statistics", "level_shift_features", "acfpacf_features" (autocorrelation and partial autocorrelation) and "special_ac" that are provided by the Python package Kats (`kats`), as well as skewness and kurtosis. (The time series length is not computed as a feature because the length stays the same for every segment.)

In [198]:
### KATS FEATURES OF CATEGORY "statistics" ###

def compute_means(segments, label):
    """
    Arithmetic mean of every segment.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Means of the segments, in input order (list of floats)
    """
    return [statistics.mean(samples) for samples in segments]


def compute_percentiles(segments, label, percentile):
    """
    Requested percentile of every segment.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - percentile (float): Percentile as a fraction (e.g., 0.25 for the 25th percentile).

    Returns:
    - Percentiles of the segments, in input order (list of floats)
    """
    # np.percentile expects the rank on a 0-100 scale
    rank = percentile * 100
    return [np.percentile(samples, rank) for samples in segments]


def compute_number_of_peaks(segments, label):
    """
    Number of local maxima (as found by scipy's find_peaks) in every segment.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Number of peaks of the segments, in input order (list of ints)
    """
    # find_peaks returns (peak_indices, properties); only the indices are counted
    return [len(find_peaks(samples)[0]) for samples in segments]


def compute_number_of_valleys(segments, label):
    """
    Number of local minima in every segment.

    A valley of the signal is a peak of the negated signal, so the samples are
    sign-flipped before they are handed to scipy's find_peaks.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Number of valleys of the segments, in input order (list of ints)
    """
    valley_counts = []

    for samples in segments:
        flipped = [-value for value in samples]
        valley_positions = find_peaks(flipped)[0]
        valley_counts.append(len(valley_positions))

    return valley_counts


    
def compute_variance(segments, label):
    """
    Sample variance (statistics.variance) of every segment.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Variances of the segments, in input order (list of floats)
    """
    return [statistics.variance(samples) for samples in segments]


def compute_entropy(segments, label, freq = 1):
    """
    Normalised spectral entropy of every segment.

    The periodogram of a segment is scaled to sum to one, its Shannon entropy
    (base 2) is computed while ignoring NaN terms, and the result is divided by
    log2 of the number of frequency bins.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - freq (float): Sampling frequency handed to scipy's periodogram. Default is 1.

    Returns:
    - Entropies of the segments, in input order (list of floats)
    """
    spectral_entropies = []

    for samples in segments:
        power = periodogram(samples, freq)[1]
        weights = power / np.sum(power)
        # empty bins produce 0 * log2(0) = NaN, which nansum skips
        weighted_log_sum = np.nansum(weights * np.log2(weights))
        spectral_entropies.append(-(weighted_log_sum / np.log2(weights.size)))

    return spectral_entropies


def compute_lumpiness(segments, label, window_size = 30):
    """
    Lumpiness of every segment: the variance of the per-chunk variances.

    Each segment is split with np.array_split into len(segment) // window_size + 1
    chunks of (nearly) equal length.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - window_size (int): Divisor that determines the number of chunks. Default is 30.

    Returns:
    - Lumpinesses of the segments, in input order (list of floats)
    """
    lumpiness_values = []

    for samples in segments:
        n_chunks = len(samples) // window_size + 1
        chunk_variances = [np.var(chunk) for chunk in np.array_split(samples, n_chunks)]
        lumpiness_values.append(np.var(chunk_variances))

    return lumpiness_values


def compute_stabilities(segments, label, window_size = 30):
    """
    Stability of every segment: the variance of the per-chunk means.

    Each segment is split with np.array_split into len(segment) // window_size + 1
    chunks of (nearly) equal length.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - window_size (int): Divisor that determines the number of chunks. Default is 30.

    Returns:
    - Stabilities of the segments, in input order (list of floats)
    """
    stability_values = []

    for samples in segments:
        n_chunks = len(samples) // window_size + 1
        chunk_means = [np.mean(chunk) for chunk in np.array_split(samples, n_chunks)]
        stability_values.append(np.var(chunk_means))

    return stability_values


def compute_flat_spots(segments, label, nbins = 10):
    """
    Computes flat spots of all segments: the maximum run-length of identical
    consecutive values found inside any of the equally-sized windows of a segment.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Accepted for a uniform call signature; not used by this function.
    - nbins (int): Number of windows each segment is divided into
      (window length is int(len(segment) / nbins)).

    Returns:
    - Flat spots of the segments, in input order (list of ints). A segment that is
      not longer than nbins yields np.nan and an error is logged.
    """
    flat_spots = []

    for sgmt in segments:
        if len(sgmt) <= nbins:
            msg = (
                "Length of time series is shorter than nbins, unable to "
                "calculate flat spots feature"
            )
            logging.error(msg)
            flat_spots.append(np.nan)
            continue

        window_size = int(len(sgmt) / nbins)
        max_run_length = 0
        for start in range(0, len(sgmt), window_size):
            window = sgmt[start : start + window_size]
            # groupby yields one group per run of equal consecutive values
            longest_run = max(len(list(run)) for _, run in groupby(window))
            if longest_run > max_run_length:
                max_run_length = longest_run
        flat_spots.append(max_run_length)

    return flat_spots


def compute_hursts(segments, label, lag_size = 30):
    """
    Hurst exponent estimate of every segment.

    For the lags 2 .. min(lag_size, len(segment) - 1) - 1 the standard deviation of
    the lagged differences is computed; the exponent is the slope of a straight
    line fitted to log(std) over log(lag). A NaN slope is reported as 0.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - lag_size (int): Exclusive upper bound of the lags. Default is 30.

    Returns:
    - Hurst exponents of the segments, in input order (list of floats)
    """
    exponents = []

    for samples in segments:
        values = np.asarray(samples)
        lag_range = range(2, min(lag_size, len(samples) - 1))
        # spread of the differences between samples that are `lag` steps apart
        spreads = [np.std(values[lag:] - values[:-lag]) for lag in lag_range]
        slope = np.polyfit(np.log(lag_range), np.log(spreads), 1)[0]
        exponents.append(0 if np.isnan(slope) else slope)

    return exponents



def compute_standard_dev_of_first_der(segments, label):
    """
    Standard deviation of the first derivative of every segment, where the
    derivative is approximated with np.gradient.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Standard deviations of the first derivative, in input order (list of floats)
    """
    derivative_stds = []

    for samples in segments:
        slope = np.gradient(samples)
        derivative_stds.append(np.std(slope))

    return derivative_stds

    
    
def compute_crossing_points(segments, label):
    """
    Number of crossing points of every segment.
    A crossing point is counted whenever two consecutive samples lie on
    different sides of the segment's median line.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Number of crossing points of the segments, in input order (list of ints)
    """
    crossing_counts = []

    for samples in segments:
        median = np.median(samples)
        # pair every sample with its successor and count upward and downward crossings
        n_crossings = sum(
            1
            for current, following in zip(samples, samples[1:])
            if current <= median < following or current >= median > following
        )
        crossing_counts.append(n_crossings)

    return crossing_counts



def compute_binarized_means(segments, label):
    """
    Binarized mean of every segment.
    Samples strictly above the segment mean count as 1, all others as 0; the
    feature is the average of that binary vector, i.e. the fraction of samples
    above the mean.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Binarized means of the segments, in input order (list of floats)
    """
    fractions_above_mean = []

    for samples in segments:
        above_mean = np.asarray(samples) > np.mean(samples)
        fractions_above_mean.append(np.mean(above_mean))

    return fractions_above_mean


def compute_unitroot_kpss(segments, label):
    """
    KPSS test statistic of every segment.

    The KPSS test checks the null hypothesis that a time series is stationary
    around a deterministic trend. It is run with a linear trend
    (regression="ct") and one lag.
    Wiki: https://en.wikipedia.org/wiki/KPSS_test

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - KPSS test statistics of the segments, in input order (list of floats)
    """
    # kpss returns a tuple whose first entry is the test statistic
    return [kpss(samples, regression="ct", nlags=1)[0] for samples in segments]


def compute_heterogenity(segments, label):
    """
    Engle's test for Autoregressive Conditional Heteroscedasticity (ARCH),
    evaluated for every segment.

    reference: https://www.statsmodels.org/dev/generated/statsmodels.stats.diagnostic.het_arch.html

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - Lagrange multiplier test statistics of the segments, in input order (list of floats)
    """
    lm_statistics = []

    for samples in segments:
        # at most 10 lags, fewer for short segments
        n_lags = min(10, len(samples) // 5)
        lm_statistics.append(het_arch(np.array(samples), nlags=n_lags)[0])

    return lm_statistics

def compute_histogram_mode(segments, label, nbins = 10):
    """
    Histogram mode of every segment: the left edge of the fullest bin of a
    histogram with the given number of bins.
    Reference: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - nbins (int): Number of histogram bins. Default value is 10.

    Returns:
    - Histogram modes of the segments, in input order (list of floats)
    """
    histogram_modes = []

    for samples in segments:
        counts, edges = np.histogram(samples, bins=nbins)
        fullest_bin = counts.argmax()
        histogram_modes.append(edges[fullest_bin])

    return histogram_modes

    
def compute_linearity(segments, label):
    """
    Linearity of every segment: R squared of a linear regression of the samples
    on their positions 0 .. len(segment) - 1.

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.

    Returns:
    - R squared values of the segments, in input order (list of floats)
    """
    r_squared_values = []

    for samples in segments:
        positions = np.arange(len(samples))
        # the third entry of the regression result is the correlation coefficient r
        correlation = stats.linregress(positions, samples)[2]
        r_squared_values.append(correlation**2)

    return r_squared_values


### KATS FEATURES OF CATEGORY "level_shift_features" ###

def compute_level_shift_idx(segments, label, window_size = 20):
    """
    Calculates level_shift_idx: Location of the maximum mean value difference,
    between two consecutive sliding windows


    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Accepted for a uniform call signature; not used by this function.
    - window_size (int) Length of the sliding window.

    Returns:
    List of Level Shift indices (index of the first of the two windows with the
    largest absolute difference of means; the first one on ties)

    Raises:
    ValueError: if a segment is not longer than window_size. At least two windows
    are required to form a difference.
    """

    level_shift_indices = []

    for sgmt in segments:
        # A segment of exactly window_size samples yields a single window and hence
        # no difference to take the argmax of, so it is rejected as well.
        if len(sgmt) <= window_size:
            raise ValueError("Segment length must be greater than the window size")

        # Convert segment to NumPy array
        sgmt_array = np.array(sgmt)

        # Create a sliding window view (one row per window position)
        sliding_windows = np.lib.stride_tricks.sliding_window_view(sgmt_array, window_size)

        # Compute the means over the sliding windows
        means = np.mean(sliding_windows, axis=1)

        # Compute the absolute differences between consecutive means
        mean_diff = np.abs(means[:-1] - means[1:])

        # Find the index of the maximum level shift
        level_shift_indices.append(np.argmax(mean_diff))

    return level_shift_indices


def compute_level_shift_size(segments, label, window_size = 20):
    """

    Calculate level_shift_size: Size of the maximum mean value difference,
    between two consecutive sliding windows


    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Accepted for a uniform call signature; not used by this function.
    - window_size (int) Length of the sliding window.

    Returns:
    List of Level Shift sizes (largest absolute difference between the means of
    two consecutive windows)

    Raises:
    ValueError: if a segment is not longer than window_size. At least two windows
    are required to form a difference.
    """

    level_shift_sizes = []

    for sgmt in segments:
        # A segment of exactly window_size samples yields a single window and hence
        # no difference to take the argmax of, so it is rejected as well.
        if len(sgmt) <= window_size:
            raise ValueError("Segment length must be greater than the window size")

        # Convert segment to NumPy array
        sgmt_array = np.array(sgmt)

        # Create a sliding window view (one row per window position)
        sliding_windows = np.lib.stride_tricks.sliding_window_view(sgmt_array, window_size)

        # Compute the means over the sliding windows
        means = np.mean(sliding_windows, axis=1)

        # Compute the absolute differences between consecutive means
        mean_diff = np.abs(means[:-1] - means[1:])

        # Size of the maximum level shift
        level_shift_sizes.append(mean_diff[np.argmax(mean_diff)])

    return level_shift_sizes


### KATS FEATURES OF CATEGORY "acfpacf_features" AND "special_ac" ###
### Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) features ###


def log_helper(x, period = 7):
    """
    Logs an error when a series is unsuitable for the ACF/PACF features.

    A series is reported when it has fewer than 10 samples, fewer samples than
    `period`, or only a single distinct value. Nothing is raised or returned;
    the caller continues regardless.

    Parameters:
    - x (list): Time series segment to check.
    - period (int): Seasonal period the length is compared with. Default is 7.
    """
    too_short = len(x) < 10 or len(x) < period

    # the uniqueness check is only evaluated for series that are long enough
    if too_short or len(np.unique(x)) == 1:
        logging.error(
            "Length is shorter than period, or constant time series, "
            "unable to calculate acf/pacf features"
        )


def compute_y_acf1(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    y_acf1: first ACF value of the original series, for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Part of the shared ACF/PACF signature; has no influence on
      this feature (the ACF of the original series is taken up to `period`).
    - period (int): Seasonal period; largest lag of the computed ACF.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of y_acf1 features
    """
    first_acf_values = []

    for series in segments:
        log_helper(series)

        # drop lag 0 (always 1) so that index 0 is the lag-1 autocorrelation
        acf_values = acf(series, fft=True, nlags=period)[1:]
        first_acf_values.append(acf_values[0])

    return first_acf_values


def compute_y_acf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    y_acf5: sum of squares of the first 5 ACF values of the original series,
    for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Part of the shared ACF/PACF signature; has no influence on
      this feature (the ACF of the original series is taken up to `period`).
    - period (int): Seasonal period; largest lag of the computed ACF.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of y_acf5 features
    """
    squared_acf_sums = []

    for series in segments:
        log_helper(series)

        # lag 0 is dropped, then the first five remaining lags are used
        leading_acf = np.asarray(acf(series, fft=True, nlags=period)[1:])[:5]
        squared_acf_sums.append(np.sum(leading_acf ** 2))

    return squared_acf_sums



def compute_diff1y_acf1(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff1y_acf1: first ACF value of the differenced series, for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed ACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff1y_acf1 features
    """
    first_acf_values = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # first differences: each sample minus its predecessor
        first_diff = [current - previous for previous, current in zip(series, series[1:])]

        acf_values = acf(first_diff, fft=True, nlags=max_lag)[1:]
        first_acf_values.append(acf_values[0])

    return first_acf_values



def compute_diff1y_acf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff1y_acf5: sum of squares of the first 5 ACF values of the differenced
    series, for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed ACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff1y_acf5 features
    """
    squared_acf_sums = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # first differences: each sample minus its predecessor
        first_diff = [current - previous for previous, current in zip(series, series[1:])]

        leading_acf = np.asarray(acf(first_diff, fft=True, nlags=max_lag)[1:])[:5]
        squared_acf_sums.append(np.sum(leading_acf ** 2))

    return squared_acf_sums
                        



def compute_diff2y_acf1(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff2y_acf1: first ACF value of the twice-differenced series, for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed ACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff2y_acf1 features
    """
    first_acf_values = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # difference twice: each value minus its predecessor, applied two times
        first_diff = [current - previous for previous, current in zip(series, series[1:])]
        second_diff = [current - previous for previous, current in zip(first_diff, first_diff[1:])]

        acf_values = acf(second_diff, fft=True, nlags=max_lag)[1:]
        first_acf_values.append(acf_values[0])

    return first_acf_values



def compute_diff2y_acf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff2y_acf5: sum of squares of the first 5 ACF values of the
    twice-differenced series, for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed ACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff2y_acf5 features
    """
    squared_acf_sums = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # difference twice: each value minus its predecessor, applied two times
        first_diff = [current - previous for previous, current in zip(series, series[1:])]
        second_diff = [current - previous for previous, current in zip(first_diff, first_diff[1:])]

        leading_acf = np.asarray(acf(second_diff, fft=True, nlags=max_lag)[1:])[:5]
        squared_acf_sums.append(np.sum(leading_acf ** 2))

    return squared_acf_sums



def compute_y_pacf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    y_pacf5: sum of squares of the first 5 PACF values of the original series,
    for every segment (NaN values are ignored in the sum).

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Part of the shared ACF/PACF signature; has no influence on
      this feature (the PACF of the original series is taken up to `period`).
    - period (int): Seasonal period; largest lag of the computed PACF.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of y_pacf5 features
    """
    squared_pacf_sums = []

    for series in segments:
        log_helper(series)

        # lag 0 is dropped, then the first five remaining lags are used
        leading_pacf = np.asarray(pacf(series, nlags=period)[1:])[:5]
        squared_pacf_sums.append(np.nansum(leading_pacf ** 2))

    return squared_pacf_sums



def compute_diff1y_pacf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff1y_pacf5: sum of squares of the first 5 PACF values of the differenced
    series, for every segment (NaN values are ignored in the sum).

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed PACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff1y_pacf5 features
    """
    squared_pacf_sums = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # first differences: each sample minus its predecessor
        first_diff = [current - previous for previous, current in zip(series, series[1:])]

        leading_pacf = np.asarray(pacf(first_diff, nlags=max_lag)[1:])[:5]
        squared_pacf_sums.append(np.nansum(leading_pacf ** 2))

    return squared_pacf_sums




def compute_diff2y_pacf5(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    diff2y_pacf5: sum of squares of the first 5 PACF values of the
    twice-differenced series, for every segment (NaN values are ignored in the sum).

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Largest lag of the computed PACF (capped at len(segment) - 2).
    - period (int): Part of the shared signature; only log_helper's default period is used.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of diff2y_pacf5 features
    """
    squared_pacf_sums = []

    for series in segments:
        log_helper(series)

        max_lag = min(acfpacf_lag, len(series) - 2)

        # difference twice: each value minus its predecessor, applied two times
        first_diff = [current - previous for previous, current in zip(series, series[1:])]
        second_diff = [current - previous for previous, current in zip(first_diff, first_diff[1:])]

        leading_pacf = np.asarray(pacf(second_diff, nlags=max_lag)[1:])[:5]
        squared_pacf_sums.append(np.nansum(leading_pacf ** 2))

    return squared_pacf_sums




def compute_seas_acf1(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    seas_acf1: autocorrelation coefficient at the first seasonal lag (lag = period),
    for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Part of the shared signature; not used by this function.
    - period (int): Seasonal period; the ACF is computed up to this lag and its
      last value is returned.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of seas_acf1 features
    """
    seasonal_acf_values = []

    for series in segments:
        log_helper(series)

        # the last entry of the ACF computed up to `period` is the seasonal lag
        acf_values = acf(series, fft=True, nlags=period)[1:]
        seasonal_acf_values.append(acf_values[-1])

    return seasonal_acf_values




def compute_seas_pacf1(segments, label, acfpacf_lag = 6, period = 7, default_status = True):
    """
    seas_pacf1: partial autocorrelation coefficient at the first seasonal lag
    (lag = period), for every segment.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - acfpacf_lag (int): Part of the shared signature; not used by this function.
    - period (int): Seasonal period; the PACF is computed up to this lag and its
      last value is returned.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of seas_pacf1 features
    """
    seasonal_pacf_values = []

    for series in segments:
        log_helper(series)

        # the last entry of the PACF computed up to `period` is the seasonal lag
        pacf_values = pacf(series, nlags=period)[1:]
        seasonal_pacf_values.append(pacf_values[-1])

    return seasonal_pacf_values






def compute_firstmin_ac(segments, label, default_status = True):
    """
    firstmin_ac: the time (lag) of the first minimum of the autocorrelation
    function, for every segment.

    The ACF (without lag 0) is walked from the start for as long as it is strictly
    decreasing; the 1-based position where the decrease stops is returned.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): EEG/EMG segments, one inner list of samples per segment.
    - label (int): Accepted for a uniform call signature; not used by this function.
    - default_status (Bool): Part of the shared signature; not used by this function.

    Returns:
    List of firstmin_ac features
    """
    first_minimum_lags = []

    for series in segments:
        autocorr = acf(series, fft=True, nlags=len(series))[1:]
        last_index = len(autocorr) - 1

        position = 0
        # advance while the next ACF value is still smaller than the current one
        while position < last_index and autocorr[position] > autocorr[position + 1]:
            position += 1

        first_minimum_lags.append(position + 1)

    return first_minimum_lags



def compute_firstzero_ac(segments, label, default_status = True):
    """
    Computes firstzero_ac: the lag of the first zero crossing of the
    autocorrelation function (ACF) of every segment.

    For each segment the ACF is requested from statsmodels with nlags set to the
    segment length and the lag-0 entry is dropped. A crossing is detected at the
    first pair of consecutive values where the earlier one is strictly positive
    and the later one strictly negative; the lag of the negative value is
    reported. If no such pair exists, the number of remaining ACF values plus
    one is reported.

    Reference: https://stackoverflow.com/questions/36038927/whats-the-difference-between-pandas-acf-and-statsmodel-acf
    R code: https://cran.r-project.org/web/packages/tsfeatures/vignettes/tsfeatures.html
    Paper: Meta-learning how to forecast time series

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)
    - default_status (Bool): Feature switch inherited from the Kats interface (not used in this function)

    Returns:
    List of firstzero_ac features (one int per segment)
    """
    firstzero_ac_list = []

    for sgmt in segments:
        # Dropping lag 0 means autocorr[k] is the autocorrelation at lag k + 1
        autocorr = acf(sgmt, fft=True, nlags=len(sgmt))[1:]
        last_pos = len(autocorr) - 1

        # Advance until a positive value is directly followed by a negative one
        pos = 0
        while pos < last_pos and not (autocorr[pos] > 0 and autocorr[pos + 1] < 0):
            pos += 1

        # autocorr[pos + 1] sits at lag pos + 2
        firstzero_ac_list.append(pos + 2)

    return firstzero_ac_list



### ADDITIONAL FEATURES NOT CONTAINED IN KATS ###

def compute_medians(segments, label):
    """
    Computes the median of every segment.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Medians of time series segments (list of numbers, one per segment)
    """
    return [statistics.median(sgmt) for sgmt in segments]

    

def compute_minimums(segments, label):
    """
    Computes the smallest value of every segment.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Minimums of time series segments (list of numbers, one per segment)
    """
    return list(map(min, segments))


def compute_maximums(segments, label):
    """
    Computes the largest value of every segment.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Maximums of time series segments (list of numbers, one per segment)
    """
    return list(map(max, segments))


def compute_ranges(segments, label):
    """
    Computes the range (largest minus smallest value) of every segment.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Ranges of time series segments (list of numbers, one per segment)
    """
    ranges = []

    for sgmt in segments:
        highest = max(sgmt)
        lowest = min(sgmt)
        ranges.append(highest - lowest)

    return ranges


def compute_kurtoses(segments, label):
    """
    Computes the kurtosis of every segment using scipy.stats.kurtosis with its
    default settings.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Kurtoses of time series segments (list of floats, one per segment)
    """
    return [stats.kurtosis(sgmt) for sgmt in segments]


def compute_skewnesses(segments, label):
    """
    Computes the skewness of every segment using scipy.stats.skew with its
    default settings.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Skewnesses of time series segments (list of floats, one per segment)
    """
    return list(map(stats.skew, segments))


def compute_longest_strike_above_mean(segments, label):
    """
    Computes, for every segment, the length of the longest run of consecutive
    values that lie strictly above the segment's mean.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments.
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Longest strikes above the mean of time series segments (list of ints);
      0 for a segment in which no value exceeds the mean.
    """
    longest_strikes_above_mean = []

    for sgmt in segments:
        segment_mean = statistics.mean(sgmt)

        # Lengths of all maximal runs whose values are strictly greater than the mean
        run_lengths = [
            sum(1 for _ in run)
            for is_above, run in groupby(sgmt, key=lambda value: value > segment_mean)
            if is_above
        ]

        longest_strikes_above_mean.append(max(run_lengths, default=0))

    return longest_strikes_above_mean


def compute_longest_strike_below_mean(segments, label):
    """
    Computes, for every segment, the length of the longest run of consecutive
    values that lie strictly below the segment's mean.

    Parameters:
    - segments (list of lists): Complete EEG/EMG segments.
    - label (int): Label the segments belong to (not used in this function but included to match signature)

    Returns:
    - Longest strikes below the mean of time series segments (list of ints);
      0 for a segment in which no value is below the mean.
    """
    longest_strikes_below_mean = []

    for sgmt in segments:
        segment_mean = statistics.mean(sgmt)
        best_run = 0
        running = 0

        for sample in sgmt:
            # A value at or above the mean ends the current run
            running = running + 1 if sample < segment_mean else 0
            best_run = max(best_run, running)

        longest_strikes_below_mean.append(best_run)

    return longest_strikes_below_mean





In [199]:
def create_kats_statistics_feature_dataframes(label_list, segments, data_type, subject):
    """
    Create one feature dataframe per label from the segment data and concatenate them.

    Every feature column is named "<data_type>_<Feature>"; each row additionally
    carries the "Label" and "Subject" it belongs to.

    Args:
    - label_list (list): List of labels. Every label must be a key of `segments`.
    - segments (dict): Dictionary containing segments for each label.
    - data_type (str): Type of data; used as prefix of the feature column names.
    - subject (str): Subject identifier.

    Returns:
    - pd.DataFrame: Concatenated feature dataframe. The per-label frames are stacked
      in the order of `label_list` and the index is reset. An empty dataframe is
      returned when `label_list` is empty.
    """
    prefix = str(data_type)
    feature_dataframes = {}

    for label in label_list:
        label_segments = segments[label]

        frame = pd.DataFrame()
        frame[prefix+"_Mean"] = compute_means(label_segments, label)
        frame[prefix+"_Variance"] = compute_variance(label_segments, label)
        frame[prefix+"_Entropy"] = compute_entropy(label_segments, label)
        frame[prefix+"_Lumpiness"] = compute_lumpiness(label_segments, label)
        frame[prefix+"_Stability"] = compute_stabilities(label_segments, label)
        frame[prefix+"_Hurst"] = compute_hursts(label_segments, label)
        frame[prefix+"_STD_Derivative"] = compute_standard_dev_of_first_der(label_segments, label)
        frame[prefix+"_Crossing_Points"] = compute_crossing_points(label_segments, label)
        frame[prefix+"_Binarized_Means"] = compute_binarized_means(label_segments, label)
        frame[prefix+"_Unitroot_KPSS"] = compute_unitroot_kpss(label_segments, label)
        frame[prefix+"_Heterogenity"] = compute_heterogenity(label_segments, label)
        frame[prefix+"_Histogram_Mode"] = compute_histogram_mode(label_segments, label)
        frame[prefix+"_Linearity"] = compute_linearity(label_segments, label)

        frame["Label"] = label
        frame["Subject"] = subject
        feature_dataframes[label] = frame

    # pd.concat raises on an empty sequence, so handle "no labels" explicitly
    if not feature_dataframes:
        return pd.DataFrame()

    # Concatenate exactly the frames built above (not a hard-coded set of labels)
    # while resetting indices (ignore_index=True)
    feature_df = pd.concat(list(feature_dataframes.values()), ignore_index=True)

    return feature_df



In [200]:
#feature_df = create_kats_statistics_feature_dataframes(label_list, segments, data_type, subject)
#feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Statistical_Features_KATS_Statistics.csv")

In [201]:
def create_level_shift_features_dataframes(label_list, segments, data_type, subject):
    """
    Create one level-shift feature dataframe per label from the segment data and
    concatenate them.

    Every feature column is named "<data_type>_<Feature>"; each row additionally
    carries the "Label" and "Subject" it belongs to.

    Args:
    - label_list (list): List of labels. Every label must be a key of `segments`.
    - segments (dict): Dictionary containing segments for each label.
    - data_type (str): Type of data; used as prefix of the feature column names.
    - subject (str): Subject identifier.

    Returns:
    - pd.DataFrame: Concatenated feature dataframe. The per-label frames are stacked
      in the order of `label_list` and the index is reset. An empty dataframe is
      returned when `label_list` is empty.
    """
    prefix = str(data_type)
    feature_dataframes = {}

    for label in label_list:
        label_segments = segments[label]

        frame = pd.DataFrame()
        frame[prefix+"_Level_Shift_Idx"] = compute_level_shift_idx(label_segments, label)
        frame[prefix+"_Level_Shift_Size"] = compute_level_shift_size(label_segments, label)

        frame["Label"] = label
        frame["Subject"] = subject
        feature_dataframes[label] = frame

    # pd.concat raises on an empty sequence, so handle "no labels" explicitly
    if not feature_dataframes:
        return pd.DataFrame()

    # Concatenate exactly the frames built above (not a hard-coded set of labels)
    # while resetting indices (ignore_index=True)
    feature_df = pd.concat(list(feature_dataframes.values()), ignore_index=True)

    return feature_df



In [202]:
#feature_df = create_level_shift_features_dataframes(label_list, segments, data_type, subject)
#feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Statistical_Features_Level_Shift_Features.csv")

In [203]:
def create_autocorrelation_features_dataframes(label_list, segments, data_type, subject):
    """
    Create one autocorrelation feature dataframe per label from the segment data
    and concatenate them.

    Every feature column is named "<data_type>_<feature>"; each row additionally
    carries the "Label" and "Subject" it belongs to.

    Args:
    - label_list (list): List of labels. Every label must be a key of `segments`.
    - segments (dict): Dictionary containing segments for each label.
    - data_type (str): Type of data; used as prefix of the feature column names.
    - subject (str): Subject identifier.

    Returns:
    - pd.DataFrame: Concatenated feature dataframe. The per-label frames are stacked
      in the order of `label_list` and the index is reset. An empty dataframe is
      returned when `label_list` is empty.
    """
    prefix = str(data_type)
    feature_dataframes = {}

    for label in label_list:
        label_segments = segments[label]

        frame = pd.DataFrame()
        frame[prefix+"_y_acf1"] = compute_y_acf1(label_segments, label)
        frame[prefix+"_y_acf5"] = compute_y_acf5(label_segments, label)
        frame[prefix+"_diff1y_acf1"] = compute_diff1y_acf1(label_segments, label)
        frame[prefix+"_diff1y_acf5"] = compute_diff1y_acf5(label_segments, label)
        frame[prefix+"_diff2y_acf1"] = compute_diff2y_acf1(label_segments, label)
        frame[prefix+"_diff2y_acf5"] = compute_diff2y_acf5(label_segments, label)
        frame[prefix+"_diff1y_pacf5"] = compute_diff1y_pacf5(label_segments, label)
        frame[prefix+"_diff2y_pacf5"] = compute_diff2y_pacf5(label_segments, label)
        frame[prefix+"_seas_acf1"] = compute_seas_acf1(label_segments, label)
        frame[prefix+"_seas_pacf1"] = compute_seas_pacf1(label_segments, label)
        frame[prefix+"_firstmin_ac"] = compute_firstmin_ac(label_segments, label)
        frame[prefix+"_firstzero_ac"] = compute_firstzero_ac(label_segments, label)

        frame["Label"] = label
        frame["Subject"] = subject
        feature_dataframes[label] = frame

    # pd.concat raises on an empty sequence, so handle "no labels" explicitly
    if not feature_dataframes:
        return pd.DataFrame()

    # Concatenate exactly the frames built above (not a hard-coded set of labels)
    # while resetting indices (ignore_index=True)
    feature_df = pd.concat(list(feature_dataframes.values()), ignore_index=True)

    return feature_df


In [204]:
# Compute the autocorrelation features for all labels of the selected subject / data type
# and store them as a CSV file.
# NOTE(review): `segments` is expected to be a dict of segments keyed by label; presumably it is
# set to the EEG or EMG segments in an earlier cell — confirm it exists on a fresh kernel run.
feature_df = create_autocorrelation_features_dataframes(label_list, segments, data_type, subject)
# NOTE(review): to_csv does not create directories — assumes "Features/<subject>/<data_type>/" already exists.
feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Statistical_Features_Autocorrelation_Features.csv")

In [205]:
def create_additional_features_dataframes(label_list, segments, data_type, subject):
    """
    Create one dataframe of additional (non-Kats) features per label from the
    segment data and concatenate them.

    Every feature column is named "<data_type>_<Feature>"; each row additionally
    carries the "Label" and "Subject" it belongs to.

    Args:
    - label_list (list): List of labels. Every label must be a key of `segments`.
    - segments (dict): Dictionary containing segments for each label.
    - data_type (str): Type of data; used as prefix of the feature column names.
    - subject (str): Subject identifier.

    Returns:
    - pd.DataFrame: Concatenated feature dataframe. The per-label frames are stacked
      in the order of `label_list` and the index is reset. An empty dataframe is
      returned when `label_list` is empty.
    """
    prefix = str(data_type)
    feature_dataframes = {}

    for label in label_list:
        label_segments = segments[label]

        frame = pd.DataFrame()
        frame[prefix+"_Median"] = compute_medians(label_segments, label)
        frame[prefix+"_Maximum"] = compute_maximums(label_segments, label)
        frame[prefix+"_Minimum"] = compute_minimums(label_segments, label)
        frame[prefix+"_Range"] = compute_ranges(label_segments, label)
        frame[prefix+"_Longest_Strike_Above_Mean"] = compute_longest_strike_above_mean(label_segments, label)
        frame[prefix+"_Longest_Strike_Below_Mean"] = compute_longest_strike_below_mean(label_segments, label)

        frame["Label"] = label
        frame["Subject"] = subject
        feature_dataframes[label] = frame

    # pd.concat raises on an empty sequence, so handle "no labels" explicitly
    if not feature_dataframes:
        return pd.DataFrame()

    # Concatenate exactly the frames built above (not a hard-coded set of labels)
    # while resetting indices (ignore_index=True)
    feature_df = pd.concat(list(feature_dataframes.values()), ignore_index=True)

    return feature_df



In [206]:
#feature_df = create_additional_features_dataframes(label_list, segments, data_type, subject)
#feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Statistical_Features_Additional_Features.csv")

This notebook contains modified functions from the Python package Kats, which itself is only
available for earlier Python versions.

MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
