In [19]:
%matplotlib inline
import stumpy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
import time

from sklearn.model_selection import train_test_split

# Notebook-wide plotting defaults: figures 20 wide by 2 high, x ticks drawn
# pointing outwards. `fig_size` stays a module-level name because later
# cells read and modify it.
fig_size = plt.rcParams["figure.figsize"]
fig_size[0], fig_size[1] = 20, 2
plt.rcParams.update({"figure.figsize": fig_size, "xtick.direction": "out"})

In [2]:
# Project-local `utils` module. The shared `utils` instance is used by the
# main cell to read the data (`read_data`) and to fetch patient signals,
# labels and genders.
from utils import Utils
utils = Utils()

### Balanced train-validation split

In [4]:
def get_balanced_split(all_sequences, labels, gender_info, split_ratios=None, random_state=2):
    """Split patients into train/validation sets, stratified by label within each gender.

    Patients are grouped by gender code; each group is split on its own with
    ``train_test_split`` (stratified on the group's labels) and the per-group
    results are concatenated in the order the gender codes are processed.

    Parameters
    ----------
    all_sequences : sequence
        One entry per patient.
    labels : array-like
        One class label per patient, aligned with ``all_sequences``.
    gender_info : array-like
        One gender code per patient, aligned with ``all_sequences``.
    split_ratios : dict, optional
        Maps each gender code to the validation fraction (``test_size``) used
        for that group. Defaults to ``{0: 0.15, 1: 0.3}``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 2.

    Returns
    -------
    tuple
        ``(train_sequences, validation_sequences, train_labels,
        validation_labels, train_gender, validation_gender)``; sequences and
        genders are lists, labels are NumPy arrays.
    """
    if split_ratios is None:
        split_ratios = {0: 0.15, 1: 0.3}

    # Materialise the inputs so positional indexing is always possible.
    # np.asarray also makes the `==` comparisons below element-wise when a
    # plain list is passed (a list compared with an int is just `False`).
    all_sequences = list(all_sequences)
    labels = np.asarray(labels)
    gender_info = np.asarray(gender_info)

    train_sequences = []
    validation_sequences = []
    train_labels = []
    validation_labels = []
    train_gender = []
    validation_gender = []

    for gender, split_ratio in split_ratios.items():
        print("Gender:", gender)
        # Positions (into the original patient list) of this gender group.
        gender_ind = np.flatnonzero(gender_info == gender).tolist()

        # Direct indexing instead of `j in gender_ind` membership tests,
        # which were O(n) per patient.
        all_sequences_ = [all_sequences[j] for j in gender_ind]
        print(len(all_sequences_))
        labels_ = labels[gender_ind]

        # gender_ind is split alongside the data so the original patient
        # positions of each subset can be reported.
        train_sequences_, validation_sequences_, train_labels_, validation_labels_, train_ind, val_ind = train_test_split(
            all_sequences_, labels_, gender_ind,
            stratify=labels_, test_size=split_ratio, random_state=random_state)
        print("--Train:", len(train_sequences_))
        print("patient indexes:", train_ind)
        print("----", sum(train_labels_ == 0))
        print("----", sum(train_labels_ == 1))
        print("--Val:", len(validation_sequences_))
        print("patient indexes:", val_ind)
        print("----", sum(validation_labels_ == 0))
        print("----", sum(validation_labels_ == 1))

        # extend() keeps the accumulators flat, so no quadratic
        # sum(list_of_lists, []) flattening is needed afterwards.
        train_sequences.extend(train_sequences_)
        validation_sequences.extend(validation_sequences_)
        train_labels.extend(train_labels_)
        validation_labels.extend(validation_labels_)
        train_gender.extend([gender] * len(train_sequences_))
        validation_gender.extend([gender] * len(validation_sequences_))

    return (train_sequences, validation_sequences,
            np.array(train_labels), np.array(validation_labels),
            train_gender, validation_gender)

In [52]:
def get_motifs_per_electrode(motifs_bank, input_sequences, input_labels, input_genders, electrode=0):
    """Find the top motif of every patient on one electrode and add it to ``motifs_bank``.

    For each patient the signal of ``electrode`` is processed with
    ``stumpy.scrump`` / ``stumpy.motifs`` (at most one motif is requested).
    The first occurrence of that motif is scored with
    ``compute_distances_to_classes``: the score is the absolute difference
    between the mean of the closest class-0 distances and the mean of the
    closest class-1 distances. Motif, score and patient metadata are then
    appended to the lists in ``motifs_bank``.

    Reads the notebook-level globals ``m`` (window length passed to stumpy and
    used to slice the motif) and ``fig_size``.

    Parameters
    ----------
    motifs_bank : dict
        Dict of lists with keys ``"motifs"``, ``"electrodes"``, ``"indexes"``,
        ``"genders"``, ``"labels"``, ``"train_ind"`` and ``"diff_scores"``.
        Modified in place.
    input_sequences : sequence
        One entry per patient; ``entry[electrode]`` is that patient's signal.
    input_labels, input_genders : sequence
        Per-patient label and gender, aligned with ``input_sequences``.
    electrode : int, optional
        Index of the electrode to process.

    Returns
    -------
    dict
        The same ``motifs_bank`` object that was passed in.
    """
    # Side effect kept from the original cell: reset the notebook-wide
    # figure size.
    fig_size[0] = 20
    fig_size[1] = 2
    plt.rcParams["figure.figsize"] = fig_size

    for j, sequences in enumerate(input_sequences):
        signal = sequences[electrode]

        # Approximate matrix profile, read through the public `P_` accessor
        # rather than the private `_P` array.
        # NOTE(review): `update()` is never called, so only the result of the
        # constructor (with pre_scrump=True) is used - confirm this is intended.
        approx = stumpy.scrump(signal, m, percentage=0.05, pre_scrump=True, s=None)
        approx_mp = approx.P_

        distances, indices = stumpy.motifs(signal, approx_mp, min_neighbors=3, max_distance=np.inf, cutoff=None, max_matches=10, max_motifs=1)

        # Without this guard an empty result raised IndexError below (or, had
        # the loop variables leaked from a previous patient, stored stale data).
        if np.size(indices) == 0:
            print("No motif found for patient index {} on electrode {}; skipping".format(j, electrode))
            continue

        # max_motifs=1, so row 0 is the only motif; column 0 is its first occurrence.
        idx = indices[0][0]
        motif = signal[idx:idx + m]

        # Only the truncated ("closest") per-class distance lists feed the score.
        _, _, class_0_closest, class_1_closest = compute_distances_to_classes(motif, input_sequences, input_labels, electrode, j)
        diff_ = np.abs(np.mean(class_0_closest) - np.mean(class_1_closest))

        # save the motif
        motifs_bank["motifs"].append(motif)
        motifs_bank["electrodes"].append(electrode)
        motifs_bank["indexes"].append(idx)
        motifs_bank["genders"].append(input_genders[j])
        motifs_bank["labels"].append(input_labels[j])
        motifs_bank["train_ind"].append(j)
        motifs_bank["diff_scores"].append(diff_)

    return motifs_bank


### Filter out motifs that are similarly present in both classes (heuristic)

In [53]:
def compute_distances_to_classes(motif, input_sequences, input_labels, electrode, index):
    """Measure how closely ``motif`` matches each patient's signal, grouped by class.

    For every patient, ``stumpy.match`` is run on the signal of ``electrode``
    and the mean distance of the returned matches (at most 10) is recorded;
    ``np.nan`` is recorded when nothing matches. The per-patient distances are
    then gathered per class label (0 and 1), leaving out:

    * the patient at position ``index`` (the caller passes the patient the
      motif was taken from),
    * patients with a non-finite distance (no match),
    * patients whose distance rounds to 0 (i.e. at most 0.5).

    Parameters
    ----------
    motif : array-like
        Query subsequence.
    input_sequences : sequence
        One entry per patient; ``entry[electrode]`` is that patient's signal.
    input_labels : array-like
        Class label (0 or 1) per patient, aligned with ``input_sequences``.
    electrode : int
        Index of the electrode to search.
    index : int
        Patient position to exclude from both class lists.

    Returns
    -------
    tuple of lists
        ``(class_0_distances, class_1_distances, class_0_closest,
        class_1_closest)``. The last two are the smallest
        ``len(input_sequences) // 3`` values of the first two, in ascending order.
    """
    # Element-wise comparison below must also work for plain lists.
    input_labels = np.asarray(input_labels)

    distances = []
    for sequences in input_sequences:
        signal = sequences[electrode]
        matches = stumpy.match(motif, signal, max_distance=None, max_matches=10, normalize=True, p=2.0)
        distances.append(np.mean(matches[:, 0]) if len(matches) else np.nan)

    # Keep at most a third of the cohort size per class in the "closest" lists.
    perc = len(input_sequences) // 3

    def _usable(i):
        # The finiteness check must run first: round() raises on NaN/inf,
        # which used to crash whenever a patient had no match.
        dist = distances[i]
        return i != index and np.isfinite(dist) and round(dist) > 0

    class_0_distances = [distances[i] for i in np.flatnonzero(input_labels == 0) if _usable(i)]
    class_1_distances = [distances[i] for i in np.flatnonzero(input_labels == 1) if _usable(i)]
    class_0_distances_ = sorted(class_0_distances)[:perc]
    class_1_distances_ = sorted(class_1_distances)[:perc]

    return class_0_distances, class_1_distances, class_0_distances_, class_1_distances_

In [54]:
def sort_data(group):
    """Return the 20 rows of ``group`` with the highest ``diff_scores``, best first."""
    ranked = group.sort_values(by="diff_scores", ascending=False)
    return ranked.iloc[:20]

### Feature matrices

In [55]:
def generate_feature_matrix(input_sequences, input_motifs, input_electrodes):
    """Build a patients-by-motifs matrix of mean match distances.

    Entry ``[p][k]`` is the mean distance of the matches (at most 3) that
    ``stumpy.match`` finds for motif ``k`` in patient ``p``'s signal on
    electrode ``input_electrodes[k]``, or ``np.nan`` when there is no match.
    Returns a list of lists, one inner list per patient.
    """
    def mean_match_distance(pattern, channel):
        hits = stumpy.match(pattern, channel, max_distance=None, max_matches=3, normalize=True, p=2.0)
        if len(hits) == 0:
            return np.nan
        return np.mean(hits[:, 0])

    feature_rows = []
    for patient_no, patient_signals in enumerate(input_sequences, start=1):
        print("Patient", patient_no)
        # One feature per discovered motif, each searched on the motif's own electrode.
        feature_rows.append([
            mean_match_distance(pattern, patient_signals[input_electrodes[pos]])
            for pos, pattern in enumerate(input_motifs)
        ])
    return feature_rows

In [56]:
def save_feature_matrices(path='./feature_matrices/'):
    """Compute and write the train / validation / test feature matrices as CSV.

    Reads the notebook-level globals ``train_sequences``,
    ``validation_sequences``, ``final_test_sequences``, the matching
    ``*_labels`` arrays, ``final_motifs``, ``final_electrodes``,
    ``motif_names``, ``band`` and ``m``. For each split a matrix is built with
    ``generate_feature_matrix``, wrapped in a DataFrame whose columns are
    ``motif_names`` plus a trailing ``"label"`` column, and written to::

        <path>motifs_<band>_m<m>_train.csv
        <path>motifs_<band>_m<m>_val.csv
        <path>motifs_<band>_m<m>_test.csv

    The test file used to be built from ``save_path`` and ``name``, which this
    notebook never defines; it now follows the same ``path`` / ``band``
    pattern as the other two files.

    Parameters
    ----------
    path : str, optional
        Prefix prepended verbatim to each file name (include the trailing
        separator when it is a directory). Missing parent directories are created.
    """
    import os

    # (description used in the log line, file-name suffix, sequences, labels)
    splits = [
        ("train", "train", train_sequences, train_labels),
        ("validation", "val", validation_sequences, validation_labels),
        ("testing", "test", final_test_sequences, final_test_labels),
    ]

    # Build all matrices first, as before, so nothing is written if a later
    # split fails during feature generation.
    matrices = []
    for _, _, split_sequences, _ in splits:
        X = generate_feature_matrix(split_sequences, final_motifs, final_electrodes)
        print(len(X))
        print(len(X[0]))
        matrices.append(X)

    for (description, suffix, _, split_labels), X in zip(splits, matrices):
        df = pd.DataFrame(data=np.array(X), columns=motif_names)
        df["label"] = split_labels
        print(df.head())

        target = path + "motifs_{}_m{}_{}.csv".format(band, m, suffix)
        parent = os.path.dirname(target)
        if parent:
            # to_csv does not create directories.
            os.makedirs(parent, exist_ok=True)
        df.to_csv(target, index=False)
        print("Writing... {} feature matrix".format(description))

### Main

In [None]:
# Driver cell: for every band and every window length `m`, discover one motif
# per (training patient, electrode), store the motif bank as CSV, then turn
# the motifs into train / validation / test feature matrices.
#
# NOTE: `band`, `m`, `train_sequences`, `validation_sequences`,
# `final_test_sequences`, the `*_labels` arrays, `final_motifs`,
# `final_electrodes` and `motif_names` are read as notebook-level globals by
# `get_motifs_per_electrode` and `save_feature_matrices`, so the names and the
# order of the assignments below must be kept.

# Labels and genders of the "train" dataset are loaded once, before the band loop.
labels = np.array(utils.get_labels(labeling=1, dataset="train"))
genders = np.array(utils.get_genders(dataset="train"))

for band in ['beta', 'theta', 'alpha']:
    print("***************************************")
    print("***********{}********************".format(band))
    print("***************************************")
    # read and split
    utils.read_data(band=band)
    all_sequences = utils.get_all_patient_signals()
    
    train_sequences, validation_sequences, train_labels, validation_labels, train_gender, validation_gender = get_balanced_split(all_sequences, labels, genders)

    # Held-out "test" dataset; only used when the feature matrices are written.
    final_test_sequences = utils.get_all_patient_signals(dataset="test")
    final_test_labels = np.array(utils.get_labels(labeling=1, dataset="test"))
    
    print("Train:", len(train_sequences))
    print("Val:", len(validation_sequences))

    # `m` is the motif (window) length in samples.
    for m in [50, 100, 250, 500, 1000, 2000]:
        print("--------------- length {}---------------------".format(m))
        # Fresh bank per (band, m); the key order becomes the column order of
        # the DataFrame / CSV below.
        motifs_bank = {
            "motifs": [],
            "electrodes": [],
            "indexes": [],
            "labels": [],
            "genders": [],
            "train_ind": [],
            "diff_scores": []
        }


        # Electrode indices 0..18; each call appends to the bank for every
        # training patient.
        for electrode in range(19):
            print("Electrode", electrode)
            motifs_bank = get_motifs_per_electrode(motifs_bank, train_sequences, train_labels, train_gender, electrode)

        motifs_df = pd.DataFrame.from_dict(motifs_bank)
        print(motifs_df["labels"].value_counts())
        # Written to the current working directory.
        motifs_df.to_csv("motifs_{}_m{}.csv".format(band, m))

        # in case of filtering
        # grouped_df = motifs_df.groupby(["labels", "genders"]).apply(sort_data)
        
        # No filtering is applied: every discovered motif becomes a feature.
        grouped_df = motifs_df.copy()
        final_motifs =  grouped_df["motifs"].tolist()
        final_electrodes = grouped_df["electrodes"].tolist()
        final_labels = grouped_df["labels"].tolist()
        final_genders = grouped_df["genders"].tolist()

        # One column name per motif; the trailing running id keeps names unique.
        motif_names = []
        for i, _ in enumerate(final_motifs):
            motif_names.append("motif_el_{}_class_{}_gender_{}_id_{}".format(final_electrodes[i], final_labels[i], final_genders[i], i))

        save_feature_matrices()