In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tsfresh.feature_extraction import extract_features, EfficientFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table

In [None]:
# Project-local helper class. The single instance created here is what the
# per-band processing loop uses to read the data and to fetch patient signals,
# labels and gender information.
from utils import Utils
utils = Utils()

In [None]:
def get_balanced_split(all_sequences, labels, gender_info, split_ratios=None, random_state=2):
    """Split patients into train/validation sets separately per gender group.

    For every gender code the matching patients are split with
    ``train_test_split`` (stratified on their labels) and the per-gender
    results are concatenated in the iteration order of ``split_ratios``.

    Parameters
    ----------
    all_sequences : sequence
        One entry per patient; entries are passed through untouched.
    labels : sequence
        One label per patient, aligned with ``all_sequences``.
    gender_info : array-like
        One gender code per patient. Plain lists are accepted as well as
        NumPy arrays.
    split_ratios : dict, optional
        Maps gender code -> validation fraction for that group. Defaults to
        ``{0: 0.15, 1: 0.3}``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 2.

    Returns
    -------
    tuple
        ``(train_sequences, validation_sequences, train_labels,
        validation_labels, train_gender, validation_gender)``. Sequences and
        gender codes are lists; labels are NumPy arrays.
    """
    if split_ratios is None:
        split_ratios = {0: 0.15, 1: 0.3}

    # Coerce to an array so that `== gender` is an element-wise comparison even
    # when the caller hands in a plain Python list.
    gender_codes = np.asarray(gender_info).ravel()

    train_sequences, validation_sequences = [], []
    train_labels, validation_labels = [], []
    train_gender, validation_gender = [], []

    for gender, split_ratio in split_ratios.items():
        print("Gender:", gender)
        # Positions (ascending) of the patients belonging to this gender group.
        gender_ind = np.argwhere(gender_codes == gender)[:, 0].tolist()

        # Direct indexing instead of a membership test per patient.
        group_sequences = [all_sequences[j] for j in gender_ind]
        print(len(group_sequences))
        group_labels = np.array([labels[j] for j in gender_ind])

        (group_train_seq, group_val_seq,
         group_train_lab, group_val_lab) = train_test_split(
            group_sequences,
            group_labels,
            stratify=group_labels,
            test_size=split_ratio,
            random_state=random_state,
        )

        train_sequences.extend(group_train_seq)
        validation_sequences.extend(group_val_seq)
        train_labels.extend(group_train_lab)
        validation_labels.extend(group_val_lab)
        train_gender.extend([gender] * len(group_train_seq))
        validation_gender.extend([gender] * len(group_val_seq))

    return (
        train_sequences,
        validation_sequences,
        np.array(train_labels),
        np.array(validation_labels),
        train_gender,
        validation_gender,
    )

In [None]:
def prepare_data_format(sequences):
    """Flatten nested per-patient signals into a long-format DataFrame.

    ``sequences`` is iterated as patients -> electrodes -> samples. Each
    sample becomes one row with columns ``patient_id``, ``electrode``,
    ``time`` (1-based position within the electrode's series) and ``value``.
    An ``id`` column of the form ``"<patient_id>-<electrode>"`` identifies
    each individual series.

    Returns the assembled DataFrame; the number of rows is printed as a
    progress message.
    """
    # One record per (patient, electrode, time step).
    records = [
        [patient_no, electrode_no, step, sample]
        for patient_no, electrodes in enumerate(sequences)
        for electrode_no, samples in enumerate(electrodes)
        for step, sample in enumerate(samples, start=1)
    ]

    long_df = pd.DataFrame(records, columns=["patient_id", "electrode", "time", "value"])

    # Series identifier combining patient and electrode.
    patient_part = long_df["patient_id"].astype(str)
    electrode_part = long_df["electrode"].astype(str)
    long_df["id"] = patient_part + "-" + electrode_part

    print("Patient data prepared... ", len(long_df))

    return long_df

In [None]:
def remove_highly_correlated_features(df, correlation_threshold=0.9, value_threshold=0.9):
    """Drop redundant and near-constant feature columns.

    Two filters are applied and their results combined:

    1. Correlation: a column is dropped when its absolute correlation with
       any column that precedes it exceeds ``correlation_threshold``.
    2. Dominant value: a column is dropped when its most frequent non-null
       value accounts for at least ``value_threshold`` of its non-null
       entries.

    Columns with no non-null values have no dominant value and are left in
    place rather than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one column per feature. It is not modified.
    correlation_threshold : float, optional
        Absolute correlation above which the later column of a pair is dropped.
    value_threshold : float, optional
        Share of the most frequent value at or above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.
    """
    # Step 1: highly correlated features. Only the strictly upper triangle is
    # inspected, so each pair is checked once and the earlier column is kept.
    corr_matrix = df.corr().abs()
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)
    to_drop_corr = [
        column
        for column in upper_tri.columns
        if (upper_tri[column] > correlation_threshold).any()
    ]

    # Step 2: features dominated by a single value.
    to_drop_value = []
    for column in df.columns:
        frequencies = df[column].value_counts(normalize=True)
        if frequencies.empty:
            # All entries are null: value_counts has nothing to report.
            continue
        # value_counts sorts by frequency, so the first entry is the mode's share.
        if frequencies.iloc[0] >= value_threshold:
            to_drop_value.append(column)

    # Combine both filters, keeping the original column order for determinism.
    flagged = set(to_drop_corr) | set(to_drop_value)
    to_drop = [column for column in df.columns if column in flagged]

    return df.drop(columns=to_drop)

In [None]:
def transform_df(df):
    """Pivot per-(patient, electrode) feature rows into one row per patient.

    The index of ``df`` must hold string ids of the form
    ``"<patient>-<electrode>"``. Every feature column is spread into one
    column per electrode, named ``"<feature>_electrode_<electrode>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient/electrode id, one column per feature. An existing
        ``patient_id`` or ``electrode`` column is replaced by the values
        parsed from the index. The input frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per patient with a ``patient_id`` column (string, as parsed
        from the index) followed by the widened feature columns.
    """
    id_parts = df.index.str.split("-")

    # Build the key columns on a new frame so the caller's DataFrame (often a
    # slice of a larger one) is left untouched. The electrode is cast to int
    # so the pivoted columns are ordered numerically.
    keyed = df.assign(
        electrode=id_parts.str[1].astype(int),
        patient_id=id_parts.str[0],
    )

    # Pivot so each patient has all electrode features in separate columns.
    wide = keyed.set_index(["patient_id", "electrode"]).unstack(level="electrode")

    # Flatten the (feature, electrode) column MultiIndex into plain names.
    wide.columns = [f"{feature}_electrode_{electrode}" for feature, electrode in wide.columns]

    # Bring 'patient_id' back as a regular column.
    return wide.reset_index()


In [None]:
def extract_tsfresh_features(df, labels, n=None, n_jobs=4):
    """Extract tsfresh features per electrode and build a per-patient matrix.

    Features are computed for every series in ``df`` (one series per ``id``),
    then pivoted with ``transform_df`` so that each patient becomes one row.
    Rows are ordered by ascending integer patient id and re-indexed from 0.

    When ``n`` is given, feature selection is performed as well: columns with
    missing values or a single distinct value are removed, then
    ``remove_highly_correlated_features`` is applied, and finally the ``n``
    features with the smallest p-value in tsfresh's relevance table are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with ``id``, ``time`` and ``value`` columns. Ids are
        expected to look like ``"<patient>-<electrode>"``, which is the format
        ``transform_df`` parses.
    labels : array-like
        One label per patient, ordered by ascending patient id.
    n : int, optional
        Number of top-ranked features to keep. ``None`` skips all reduction.
    n_jobs : int, optional
        Number of worker processes passed to tsfresh. Defaults to 4.

    Returns
    -------
    tuple
        ``(final_features, feature_names)`` where ``feature_names`` is the
        list of columns of ``final_features``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of patients.
    """
    # Extract features for all electrodes of all patients.
    extracted_features = extract_features(
        df,
        column_id="id",          # Unique identifier (patient + electrode)
        column_sort="time",      # Sort by time for time series
        column_value="value",    # Observed value column
        default_fc_parameters=EfficientFCParameters(),  # Feature extraction parameters
        n_jobs=n_jobs,           # Parallelization
    )

    # No helper 'patient_id' column is added here: transform_df derives the
    # patient and electrode from the index itself, and keeping such a column
    # out of the frame lets the filters below run on feature columns only.
    print(extracted_features.shape)
    print("Initial", extracted_features.columns)

    select_features = n is not None

    if select_features:
        # Drop features with missing values, then features that never vary.
        candidates = extracted_features.dropna(axis=1)
        candidates = candidates.loc[:, candidates.nunique() > 1]

        candidates = remove_highly_correlated_features(candidates)
        print(candidates.shape)

        aggregated_features = transform_df(candidates)
        print("Reduced", aggregated_features.shape)
    else:
        aggregated_features = transform_df(extracted_features)
        print("Skipping reducing...")

    # transform_df returns patient ids as strings; order rows numerically so
    # that row position matches the position of the patient's label.
    aggregated_features = (
        aggregated_features
        .assign(patient_id=aggregated_features["patient_id"].astype(int))
        .sort_values(by="patient_id", ascending=True)
        .drop(columns=["patient_id"])
        .reset_index(drop=True)
    )

    if len(labels) != len(aggregated_features):
        raise ValueError(
            "Got {} labels for {} patients".format(len(labels), len(aggregated_features))
        )

    if select_features:
        # Positional alignment: np.asarray discards any index the labels carry.
        y_series = pd.Series(np.asarray(labels), index=aggregated_features.index)

        # Compute feature relevance across all patients.
        relevance_table = calculate_relevance_table(aggregated_features, y_series)
        top_features = relevance_table.nsmallest(n, "p_value")["feature"].tolist()

        # Retain only the selected features; copy so callers can add columns
        # without triggering chained-assignment warnings.
        final_features = aggregated_features[top_features].copy()
    else:
        final_features = aggregated_features

    return final_features, list(final_features.columns)

MDD

In [None]:
path='./feature_matrices/'
# Feature extraction below is expensive; create the output directory up front
# so a run cannot fail at the first to_csv call because the folder is missing.
os.makedirs(path, exist_ok=True)

for band in ["alpha", "beta", "theta"]:
    print("***************************************")
    print("***********{}********************".format(band))
    print("***************************************")
    
    n = None   # forwarded as `length` to get_all_patient_signals
    l = 134    # patients [0, l) form the train/validation pool, the rest the test set

    utils.read_data(band=band)
    # Fetch the signals once and slice them, instead of loading them twice.
    # NOTE(review): assumes get_all_patient_signals has no side effects - confirm.
    patient_signals = utils.get_all_patient_signals(length=n)
    all_sequences = patient_signals[0:l]
    labels = np.array(utils.get_labels(labeling=1))[0:l]

    test_sequences = patient_signals[l:]
    test_labels = utils.get_labels(labeling=1, dataset="test")

    all_gender_info = utils.get_gender_info()
    gender_info = all_gender_info[:l]
    test_gender = all_gender_info[l:]

    train_sequences, validation_sequences, train_labels, validation_labels, train_gender, validation_gender = get_balanced_split(all_sequences, labels, gender_info)

    # train
    df_train = prepare_data_format(train_sequences)
    # perform feature selection only in the training dataset
    train_feature_matrix, top_features = extract_tsfresh_features(df_train, train_labels, n=80)
    train_feature_matrix["label"] = train_labels
    print("Writing... train feature matrix")
    train_feature_matrix.to_csv(os.path.join(path, "tsfresh_{}_train.csv".format(band)), index=True)

    # val: extract everything, then keep the features selected on the training set
    df_val = prepare_data_format(validation_sequences)
    val_feature_matrix, _ = extract_tsfresh_features(df_val, validation_labels, n=None)
    # .copy() so that adding the label column does not write into a slice
    val_feature_matrix = val_feature_matrix[top_features].copy()
    print(len(list(val_feature_matrix.columns)))

    val_feature_matrix["label"] = validation_labels
    print("Writing... validation feature matrix")
    val_feature_matrix.to_csv(os.path.join(path, "tsfresh_{}_val.csv".format(band)), index=True)

    # test: same treatment as the validation set
    df_test = prepare_data_format(test_sequences)
    test_feature_matrix, _  = extract_tsfresh_features(df_test, test_labels, n=None)
    test_feature_matrix = test_feature_matrix[top_features].copy()
    print(len(list(test_feature_matrix.columns)))

    test_feature_matrix["label"] = test_labels
    print("Writing... test feature matrix")
    test_feature_matrix.to_csv(os.path.join(path, "tsfresh_{}_test.csv".format(band)), index=True)


SCH