In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import stumpy
import time
import ast
import json
import seaborn as sns

from sklearn.model_selection import train_test_split

from utils import Utils
utils = Utils()

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score as accuracy

### Band

In [None]:
# Frequency band analysed in this notebook; the same value is reused further
# down to build the motif CSV file names.
band = "theta"
utils.read_data(band=band)

In [None]:
# Training cohort: per-patient signals, labels and gender codes.
all_sequences = utils.get_all_patient_signals(dataset="train")
# `dataset` is an argument of utils.get_labels, not of np.array (handing it to
# np.array raises a TypeError).
labels = np.array(utils.get_labels(labeling=1, dataset="train"))
# np.asarray keeps the later element-wise `gender_info == i` comparison working
# even if the helper returns a plain list (its return type is not visible here).
gender_info = np.asarray(utils.get_genders(dataset="train"))

# Held-out test cohort, fetched through the same helpers.
test_sequences = utils.get_all_patient_signals(dataset="test")
test_labels = utils.get_labels(labeling=1, dataset="test")
test_final_gender = utils.get_genders(dataset="test")

In [None]:
# Stratified train/validation split, done separately for each gender code so
# that the label balance is preserved inside every gender group.
train_sequences = []
validation_sequences = []
train_labels = []
validation_labels = []
train_gender = []
validation_gender = []

for i in [0, 1]:
    print("Gender:", i)
    # Positions (ascending) of the patients whose gender code equals i.
    gender_ind = list(np.argwhere(gender_info == i).T[0])

    # Index directly instead of testing `j in gender_ind` for every patient,
    # which was quadratic; the resulting order is the same because gender_ind
    # is ascending.
    all_sequences_ = [all_sequences[j] for j in gender_ind]
    print(len(all_sequences_))
    labels_ = np.array([labels[j] for j in gender_ind])

    # Gender 1 gets a larger validation share than gender 0.
    split_ratio = 0.3 if i == 1 else 0.15

    # gender_ind is split alongside the data so the original patient indexes of
    # each subset can be reported.
    train_sequences_, validation_sequences_, train_labels_, validation_labels_, train_ind, val_ind = train_test_split(
        all_sequences_, labels_, gender_ind, stratify=labels_, test_size=split_ratio, random_state=2
    )
    print("--Train:", len(train_sequences_))
    print("patient indexes:", train_ind)
    print("----", np.count_nonzero(train_labels_ == 0))
    print("----", np.count_nonzero(train_labels_ == 1))
    print("--Val:", len(validation_sequences_))
    print("patient indexes:", val_ind)
    print("----", np.count_nonzero(validation_labels_ == 0))
    print("----", np.count_nonzero(validation_labels_ == 1))

    # Extend the flat accumulators directly; this replaces the former
    # append-then-`sum(..., [])` flattening, which copied the lists repeatedly.
    train_sequences.extend(train_sequences_)
    validation_sequences.extend(validation_sequences_)
    train_labels.extend(train_labels_)
    validation_labels.extend(validation_labels_)
    train_gender.extend([i] * len(train_sequences_))
    validation_gender.extend([i] * len(validation_sequences_))

# Labels are kept as arrays; sequences and gender codes stay plain lists.
train_labels = np.array(train_labels)
validation_labels = np.array(validation_labels)

In [None]:
# Preview the motif table stored for one window length, ordered from the
# highest "diff_scores" value to the lowest.
length = 500
df = pd.read_csv("motifs_{}_m{}.csv".format(band, length))
df.sort_values(by="diff_scores", ascending=False)

In [None]:
def sort_data(group, top_n=20):
    """Return the rows of ``group`` with the highest ``diff_scores``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one groupby group) containing a ``diff_scores`` column.
    top_n : int, default 20
        Maximum number of rows to keep. The default reproduces the previously
        hard-coded value, so ``groupby(...).apply(sort_data)`` is unchanged.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows, sorted by ``diff_scores`` in descending order.
    """
    return group.sort_values("diff_scores", ascending=False).head(top_n)

In [None]:
def generate_feature_matrix(input_sequences, input_motifs, input_electrodes, input_indexes, input_ids,
                            motif_length=None, reference_sequences=None):
    """Build a patient-by-motif matrix of best motif-match distances.

    Every motif is cut out of the signal it was discovered in and then searched
    for in the same electrode of every patient in ``input_sequences`` with
    ``stumpy.match``.

    Parameters
    ----------
    input_sequences : sequence
        One entry per patient; ``entry[electrode]`` must yield that patient's
        signal for the electrode.
    input_motifs : sequence
        One entry per motif. Only its length is used: the motif values are
        re-extracted from ``reference_sequences``.
    input_electrodes : sequence
        Electrode key of each motif.
    input_indexes : sequence
        Start offset of each motif inside its source signal.
    input_ids : sequence
        Position in ``reference_sequences`` of the patient each motif came from.
    motif_length : int, optional
        Number of samples per motif. Defaults to the notebook-level ``m``.
    reference_sequences : sequence, optional
        Signals the motifs are cut from. Defaults to the notebook-level
        ``train_sequences``.

    Returns
    -------
    list of list
        ``X[patient][motif]`` is the smallest value in the first column of the
        ``stumpy.match`` result, or ``np.nan`` when no match was returned.
    """
    # Fall back to the notebook globals so existing five-argument calls keep
    # their behaviour.
    if motif_length is None:
        motif_length = m
    if reference_sequences is None:
        reference_sequences = train_sequences

    # The motif slices do not depend on the patient being scored, so extract
    # them once instead of once per patient.
    motif_specs = []
    for i in range(len(input_motifs)):
        electrode = input_electrodes[i]
        start = input_indexes[i]
        source_signal = reference_sequences[input_ids[i]][electrode]
        motif_specs.append((electrode, source_signal[start:start + motif_length]))

    X = []
    for sequences in input_sequences:
        patient_motif_distances = []
        for electrode, motif in motif_specs:
            signal = sequences[electrode]
            matches = stumpy.match(motif, signal, max_distance=None, max_matches=3, normalize=True, p=2.0)
            if len(matches) == 0:
                patient_motif_distances.append(np.nan)
            else:
                patient_motif_distances.append(np.min(matches[:, 0]))
        X.append(patient_motif_distances)
    return X

In [None]:
def generate_datasets(grouped_df):
    """Turn a table of selected motifs into train/validation/test feature frames.

    Reads the ``motifs``, ``electrodes``, ``genders``, ``labels``, ``indexes``
    and ``train_ind`` columns of ``grouped_df`` and scores the notebook-level
    ``train_sequences``, ``validation_sequences`` and ``test_sequences`` with
    ``generate_feature_matrix``. Each returned frame has one column per motif
    plus a trailing ``label`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_validation, df_test)``.
    """
    final_motifs, final_electrodes, final_genders, final_labels, final_indexes, final_ids = (
        grouped_df[column].tolist()
        for column in ("motifs", "electrodes", "genders", "labels", "indexes", "train_ind")
    )

    # One descriptive column name per motif; the trailing id is its position.
    motif_names = [
        "motif_el_{}_class_{}_gender_{}_id_{}".format(electrode, label, gender, position)
        for position, (electrode, label, gender) in enumerate(
            zip(final_electrodes, final_labels, final_genders)
        )
    ]

    def build_frame(sequences, split_labels):
        # Score one split, report its dimensions, and attach its labels.
        features = generate_feature_matrix(sequences, final_motifs, final_electrodes, final_indexes, final_ids)
        print(len(features))
        print(len(features[0]))
        frame = pd.DataFrame(data=np.array(features), columns=motif_names)
        frame["label"] = split_labels
        return frame

    df_train = build_frame(train_sequences, train_labels)
    df_validation = build_frame(validation_sequences, validation_labels)
    df_test = build_frame(test_sequences, test_labels)

    return df_train, df_validation, df_test

In [None]:
def run_evaluation_result(X_input_train, X_input_test,  y_input_train, y_input_test, train_input_gender, validation_input_gender):
    """Fit every classifier in the notebook-level ``models`` list and tabulate its scores.

    Each model is passed to ``fit_evaluate_model`` (with printing disabled).
    The returned frame has one row per model: overall F1 and accuracy on both
    splits, plus per-gender accuracy, where gender index 0 is stored in the
    ``*_female`` columns and index 1 in the ``*_male`` columns.
    """
    # Insertion order of this dict is the column order of the result table.
    metrics = {
        name: []
        for name in (
            "F1_train", "Acc_train", "Acc_train_female", "Acc_train_male",
            "F1_val", "Acc_val", "Acc_val_female", "Acc_val_male",
        )
    }

    for model in models:
        print(model)
        outcome = fit_evaluate_model(
            model, X_input_train, X_input_test, y_input_train, y_input_test,
            train_input_gender, validation_input_gender, print_evaluation=False,
        )
        _, f1_train, f1_test, acc_train, acc_test, gender_train_acc, gender_test_acc = outcome

        metrics["F1_train"].append(f1_train)
        metrics["Acc_train"].append(acc_train)
        metrics["Acc_train_female"].append(gender_train_acc[0])
        metrics["Acc_train_male"].append(gender_train_acc[1])
        metrics["F1_val"].append(f1_test)
        metrics["Acc_val"].append(acc_test)
        metrics["Acc_val_female"].append(gender_test_acc[0])
        metrics["Acc_val_male"].append(gender_test_acc[1])

    result_df = pd.DataFrame()
    result_df["model"] = models
    for column, values in metrics.items():
        result_df[column] = values
    return result_df

In [None]:
def evaluate_by_gender(y_true, y_pred, gender_labels):
    """Compute classification accuracy separately for gender codes 0 and 1.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, aligned element by element. Both are
        converted to NumPy arrays, so plain lists are accepted.
    gender_labels : array-like
        Gender code of every sample, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy for gender 0, accuracy for gender 1)``. A gender with no
        samples yields ``np.nan`` instead of being scored on empty arrays.
    """
    y_true = np.asarray(y_true)
    # Without this conversion a list of predictions cannot be mask-indexed.
    y_pred = np.asarray(y_pred)
    gender_labels = np.asarray(gender_labels)

    def _group_accuracy(gender_code):
        # Accuracy restricted to the samples carrying this gender code.
        mask = gender_labels == gender_code
        if not mask.any():
            return np.nan
        return accuracy(y_true[mask], y_pred[mask])

    return _group_accuracy(0), _group_accuracy(1)
    
    
#trained_model, f1_train, f1_test, acc_train, acc_test = fit_evaluate_model(models[0], X_train, X_test_final, y_train, y_test_final, print_evaluation=False)

In [None]:
def fit_evaluate_model(model, X_train, X_test, y_train, y_test, train_gender, validation_gender, model_name=None, print_evaluation=True):
    """Fit ``model`` on the training split and score it on both splits.

    For each split the F1 score, the accuracy and the per-gender accuracy
    (via ``evaluate_by_gender``) are computed. With ``print_evaluation`` set, a
    classification report and a confusion-matrix heatmap are shown as well.
    ``model_name`` is accepted but not used.

    Returns
    -------
    tuple
        ``(model, f1_train, f1_test, acc_train, acc_test, gender_train_acc,
        gender_test_acc)``; the last two are the tuples returned by
        ``evaluate_by_gender``.
    """
    def score_split(features, targets, genders):
        # Predict one split and compute all of its metrics.
        predictions = model.predict(features)
        split_f1 = f1_score(targets, predictions)
        split_acc = accuracy(targets, predictions)
        if print_evaluation:
            print(classification_report(targets, predictions))
            sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt='d')
            plt.show()
        by_gender = evaluate_by_gender(targets, predictions, genders)
        return split_f1, split_acc, by_gender

    model.fit(X_train, y_train)

    f1_train, acc_train, gender_train_acc = score_split(X_train, y_train, train_gender)
    f1_test, acc_test, gender_test_acc = score_split(X_test, y_test, validation_gender)

    return model, f1_train, f1_test, acc_train, acc_test, gender_train_acc, gender_test_acc

In [None]:
def fit_evaluate_model_old(model, X_train, X_test, y_train, y_test, model_name=None, print_evaluation=True):
    """Fit ``model`` and return its F1 score and accuracy on both splits.

    Variant of ``fit_evaluate_model`` without the per-gender breakdown. With
    ``print_evaluation`` set, a classification report and a confusion-matrix
    heatmap are shown for each split. ``model_name`` is accepted but not used.

    Returns
    -------
    tuple
        ``(model, f1_train, f1_test, acc_train, acc_test)``.
    """
    model.fit(X_train, y_train)

    # Training split first, then the held-out split.
    scores = []
    for features, targets in ((X_train, y_train), (X_test, y_test)):
        predictions = model.predict(features)
        split_f1 = f1_score(targets, predictions)
        split_acc = accuracy(targets, predictions)

        if print_evaluation:
            print(classification_report(targets, predictions))
            sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt='d')
            plt.show()

        scores.append((split_f1, split_acc))

    (f1_train, acc_train), (f1_test, acc_test) = scores
    return model, f1_train, f1_test, acc_train, acc_test

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Classifier settings obtained after hyperparameter tuning.
# The decision tree is seeded like the random forest so that repeated runs give
# the same tree (scikit-learn randomly permutes candidate features at each split).
# The order matters: run_feature_selection defaults to models[3].
models = [
    SVC(kernel='linear', C=0.005),
    LinearSVC(penalty="l1", dual=False),
    DecisionTreeClassifier(max_depth=3, criterion="gini", random_state=2),
    RandomForestClassifier(n_estimators=5, max_depth=3, min_samples_leaf=2, min_samples_split=4, random_state=2),
    LogisticRegression(C=0.7),
]

In [None]:
from sklearn.feature_selection import RFE

def run_feature_selection(selector_class, model=models[3], estimator=None):
    """Search for the feature subset size that maximises validation F1.

    Works on the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. For every subset size from 1 to ``n_features - 1`` a selector
    is fitted on the training data, ``model`` is refitted on the selected
    columns through ``fit_evaluate_model_old``, and the F1 scores on both
    splits are recorded. The search stops early when the training F1 exceeds
    0.95 while being more than 0.4 above the validation F1. Both F1 curves are
    plotted and the best subset is printed.

    Parameters
    ----------
    selector_class : callable
        Selector factory called as
        ``selector_class(est, n_features_to_select=k, step=1)``, e.g. ``RFE``.
    model : estimator, default ``models[3]``
        Classifier that is fitted and scored on each candidate subset.
    estimator : estimator, optional
        Estimator used by the selector to rank features; ``model`` is used
        when omitted.

    Returns
    -------
    numpy.ndarray
        Column indexes of the subset with the highest validation F1 (empty if
        no subset scored above 0).
    """
    f1s = []
    f1s_train = []
    best_chosen = []
    f1_max = 0.0
    f1_train_ = None

    # Rank features with the dedicated estimator when one is supplied.
    ranking_estimator = model if estimator is None else estimator

    for i in range(1, X_train.shape[1]):
        selector = selector_class(ranking_estimator, n_features_to_select=i, step=1)
        selector = selector.fit(X_train, y_train)

        chosen = selector.get_support()
        X_train_selected = X_train[:, chosen]
        X_test_selected = X_test[:, chosen]

        # The helper already returns the F1 scores of the freshly fitted model,
        # so there is no need to predict on both splits a second time.
        _, f1_train, f1, _, _ = fit_evaluate_model_old(
            model, X_train_selected, X_test_selected, y_train, y_test,
            model_name=None, print_evaluation=False,
        )
        f1s_train.append(f1_train)
        f1s.append(f1)

        if f1 > f1_max:
            f1_max = f1
            f1_train_ = f1_train
            best_chosen = chosen

        if f1_train > 0.95 and f1_train - f1 > 0.4:
            print("Overfitting..")
            break

    # As before, this changes the default figure size for the whole notebook.
    plt.rcParams["figure.figsize"] = [10, 5]

    subset_sizes = range(1, len(f1s) + 1)
    plt.plot(subset_sizes, f1s, label="Validation set")
    plt.plot(subset_sizes, f1s_train, label="Training set")
    plt.xlabel("Number of features")
    plt.ylabel("F1 score")
    plt.legend()
    plt.show()

    print("--------------")
    # Translate the boolean support mask of the best subset into column indexes.
    chosen_indexes = np.arange(X_train.shape[1])[best_chosen]
    print(chosen_indexes)
    print(len(chosen_indexes))
    print("Training:", f1_train_)
    print("Testing:", f1_max)

    return chosen_indexes

#### Main

In [None]:
# Full pipeline, repeated for every motif window length.
# NOTE: `m` must keep this name - generate_feature_matrix reads it as a global
# to know how many samples to slice for each motif.
for m in [50, 100, 250, 500, 1000, 2000]:
    print("m =", m)
    df = pd.read_csv("motifs_{}_m{}.csv".format(band, m))
    # Keep the highest-"diff_scores" motifs of every (label, gender) group.
    grouped_df = df.groupby(["labels", "genders"]).apply(sort_data)
    
    # The three frames are the train, validation and held-out test splits.
    training_set, testing_set, final_testing_set = generate_datasets(grouped_df)
    target = "label"
    # X_train / X_test / y_train / y_test are also read as globals by
    # run_feature_selection, so these names must not change.
    y_train = training_set[target]
    X_train = np.array(training_set.drop(target, axis=1))
    X_test = np.array(testing_set.drop(target, axis=1))
    y_test = testing_set[target]

    X_test_final = np.array(final_testing_set.drop(target, axis=1))
    y_test_final = final_testing_set[target]
    
    # Baseline: every model on all motif features (train vs. validation).
    result_df = run_evaluation_result(X_train, X_test,  y_train, y_test, train_gender, validation_gender)
    display(result_df)

    # Feature selection with RFE and the default model (models[3]); afterwards
    # all three matrices are reduced to the selected columns.
    features = run_feature_selection(RFE)
    X_train = X_train[:, features]
    X_test = X_test[:, features]
    X_test_final = X_test_final[:, features]
    # Positions line up with training_set.columns because "label" is appended
    # after the motif columns.
    selected_columns = training_set.columns[features]
    print(sorted(selected_columns))
    
    # Final check: fit on train + validation, evaluate on the held-out test
    # split. The result columns are still named "*_val" but hold test scores here.
    print("Train+validation, test")
    result_df = run_evaluation_result(np.concatenate((X_train, X_test), axis=0), X_test_final, np.concatenate((y_train,y_test)), y_test_final, np.concatenate((train_gender,validation_gender)), test_final_gender)
    display(result_df)
    
    print("===============================================================")