In [1]:
from tabnanny import verbose
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import LearningRateScheduler
import random
import joblib
import seaborn as sns
import pandas as pd
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
from scipy.signal import welch, resample, medfilt
from scipy.interpolate import splev, splrep, interp1d
from scipy import stats
import statsmodels.stats.multicomp as mc
import hrvanalysis
# Record the NumPy version in the cell output so the run can be reproduced.
print(np.__version__)
# Apply seaborn's default theme to all figures drawn later in the notebook.
sns.set_theme()

1.21.3


In [2]:
from scipy.stats import kendalltau, pearsonr, spearmanr, ttest_ind, f_oneway
def pearsonr_pval(x,y):
    """Return only the p-value of the Pearson correlation test between ``x`` and ``y``.

    The correlation coefficient computed by ``pearsonr`` is discarded.
    """
    _coefficient, p_value = pearsonr(x, y)
    return p_value

def ttest_pval(x,y):
    """Return only the p-value of the independent two-sample t-test of ``x`` vs ``y``.

    The t statistic computed by ``ttest_ind`` is discarded.
    """
    _statistic, p_value = ttest_ind(x, y)
    return p_value


def find_outliers(df, feature, low, up):
    """Compute outlier fences for one column of ``df``.

    The fences sit 1.5 times the inter-quantile range outside the ``low`` and
    ``up`` quantiles of ``df[feature]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    feature : str
        Name of the column.
    low, up : float
        Lower and upper quantile levels (between 0 and 1).

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)`` -- note the upper fence comes first.
    """
    column = df[feature]
    lower_quantile = column.quantile(low)
    upper_quantile = column.quantile(up)
    spread = upper_quantile - lower_quantile
    margin = 1.5 * spread
    return upper_quantile + margin, lower_quantile - margin

def p_value_anova(df):
    """Return the p-value of a one-way ANOVA across several groups.

    ``f_oneway`` takes each group as a separate positional argument and needs
    at least two of them, so handing it ``df`` as one argument cannot produce
    a result. The groups are therefore unpacked here.

    Parameters
    ----------
    df : pandas.DataFrame or sequence of array-likes
        For a DataFrame every column is treated as one group; for any other
        iterable every element is treated as one group.
        NOTE(review): the wide "one column per group" layout is an assumption
        about how callers shape their data -- confirm against actual usage.

    Returns
    -------
    float
        The ANOVA p-value.
    """
    if isinstance(df, pd.DataFrame):
        groups = [column for _, column in df.items()]
    else:
        groups = list(df)
    _, p = f_oneway(*groups)
    return p

def p_value_kruskal(df):
    """Return the p-value of a Kruskal-Wallis H-test across several groups.

    ``stats.kruskal`` takes each group as a separate positional argument and
    needs at least two of them, so handing it ``df`` as one argument cannot
    produce a result. The groups are therefore unpacked here.

    Parameters
    ----------
    df : pandas.DataFrame or sequence of array-likes
        For a DataFrame every column is treated as one group; for any other
        iterable every element is treated as one group.
        NOTE(review): the wide "one column per group" layout is an assumption
        about how callers shape their data -- confirm against actual usage.

    Returns
    -------
    float
        The Kruskal-Wallis p-value.
    """
    if isinstance(df, pd.DataFrame):
        groups = [column for _, column in df.items()]
    else:
        groups = list(df)
    _, p = stats.kruskal(*groups)
    return p

In [233]:
# Load the CAP feature table into module-level globals.
# NOTE(review): this cell's execution count (233) is out of sequence with the
# cells around it, and full_worker reads the same file again on its own, so
# check whether anything still needs these globals. The path is an absolute
# Windows path and will not resolve on another machine.
cap_csv = r'C:\thomas\python\panda_cap_60_sec_overnight.csv'
cap_df = pd.read_csv(cap_csv)

In [3]:
def _drop_invalid_features(df, name):
    """Drop every column of ``df`` that contains +/-inf or NaN, logging the shapes."""
    print('Removing invalid features for %s, ' % name, df.shape)
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    print('Invalid features removed for %s, ' % name, cleaned.shape)
    return cleaned


def _strip_bracketed_numbers(df):
    """Turn object columns into float64 by dropping each cell's first and last character."""
    for col in df.select_dtypes(include='object').columns:
        # Cells are expected to look like "[1.23]"; [1:-1] removes the brackets.
        df[col] = df[col].map(lambda cell: np.float64(cell[1:-1]))
    return df


def _ahi_band_masks(ahi):
    """Boolean masks for the four AHI bands: <5, [5, 15), [15, 30) and >=30."""
    return [ahi < 5, (ahi >= 5) & (ahi < 15), (ahi >= 15) & (ahi < 30), ahi >= 30]


def _print_ahi_band_counts(ahi):
    """Print the number of rows falling in each AHI band."""
    print(*(int(mask.sum()) for mask in _ahi_band_masks(ahi)))


def _band_outlier_labels(band_df, low, up):
    """Index labels of rows in ``band_df`` lying outside the fences of any column."""
    flagged = pd.Series(False, index=band_df.index)
    for col in band_df.columns:
        upper, lower = find_outliers(band_df, col, low, up)
        values = band_df[col]
        flagged |= (values < lower) | (values > upper)
    return band_df.index[flagged.to_numpy()]


def _drop_band_outliers(apnea_df, ahi_col, low, up):
    """Remove per-band, per-feature outliers from the apnea frame and its AHI column."""
    bands = [apnea_df[mask] for mask in _ahi_band_masks(ahi_col)]
    # The >=30 band is deliberately left untouched: no outliers are searched in it.
    flagged = [_band_outlier_labels(band, low, up) for band in bands[:3]]
    flagged.append(bands[3].index[:0])
    # Share of rows flagged in each band; an empty band reports nan instead of
    # raising ZeroDivisionError.
    print(*(len(labels) / len(band) if len(band) else float('nan')
            for labels, band in zip(flagged, bands)))
    outliers = np.unique(np.concatenate([labels.to_numpy() for labels in flagged]))
    print(outliers.shape)
    return apnea_df.drop(index=outliers), ahi_col.drop(index=outliers)


def _labelled_slice(df, start, stop, label):
    """Copy rows ``start:stop`` (positional) of ``df`` and prepend a constant 'patient' column."""
    part = df.iloc[start:stop].copy()
    part.insert(0, "patient", float(label))
    return part


def _drop_similar_features(df, tol):
    """Drop numeric feature columns whose correlation with an earlier kept column is >= ``tol``."""
    features = df.select_dtypes(include='number').drop(columns='patient', errors='ignore')
    # Pairwise correlations do not depend on which other columns are present,
    # so the matrix is computed once rather than once per column.
    corr = features.corr()
    dropped = set()
    for col in corr.columns:
        if col in dropped:
            continue
        col_corr = corr[col].drop(labels=col)  # never compare a column with itself
        dropped.update(col_corr.index[(col_corr >= tol).to_numpy()])
    return df.drop(columns=list(dropped))


def full_worker(remove_outliers, low, up, remove_similar_features, mn_long):
    """Load the depression, sleep-apnea and CAP feature tables and label them.

    Steps, in order: read the three CSV files; drop bookkeeping columns; drop
    every feature column containing inf/NaN; convert object columns to float;
    optionally remove outliers from the apnea table; attach a numeric
    ``patient`` label column to every returned frame.

    ``patient`` label values written by this function:
        apnea_df      0 (AHI < 5), 1 (5 <= AHI < 15), 1.5 (15 <= AHI < 30), 1.8 (AHI >= 30)
        depressed_df  2
        insomnia_df   3   (cap rows 1:8)
        normal_df     4   (cap rows 8:14)
        narco_df      5   (cap rows 14:19)
        nfle_df       6   (cap rows 19:54)
        plm_df        7   (cap rows 54:63)
        rdb_df        8   (cap rows 63:83)

    Parameters
    ----------
    remove_outliers : bool
        If true, rows of the apnea table lying outside the ``find_outliers``
        fences of any feature are removed, separately within the AHI bands
        <5, [5, 15) and [15, 30). The >=30 band is never filtered.
    low, up : float
        Quantile levels forwarded to ``find_outliers``; unused when
        ``remove_outliers`` is false.
    remove_similar_features : bool
        If true, reports how many features of the combined depression + apnea
        table would remain after dropping features correlated >= 0.95 with an
        earlier one. This is diagnostic only: the returned frames are not
        pruned, because the return signature has no slot for the pruned table.
    mn_long : bool
        If true, the "60_sec_overnight" variants of the depression and apnea
        files are read. The CAP table is always read from its
        "60_sec_overnight" file, the only CAP path this notebook defines.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(apnea_df, depressed_df, insomnia_df, normal_df, narco_df, nfle_df,
        plm_df, rdb_df)``, each with ``patient`` as first column.
    """
    remove_outliers_mode = 'single_feature'
    normalize = False
    tol_sim = 0.95

    depressed_csv = r'C:\thomas\python\panda_depressed_patient.csv'
    apnea_csv = r'C:\thomas\python\panda_sleep_apnea_patient.csv'
    # Assigned unconditionally: when it was only set under `if mn_long`, a call
    # with mn_long=False failed with UnboundLocalError at read time.
    cap_csv = r'C:\thomas\python\panda_cap_60_sec_overnight.csv'

    if mn_long:
        depressed_csv = r'C:\thomas\python\panda_depressed_patient_60_sec_overnight.csv'
        apnea_csv = r'C:\thomas\python\panda_sleep_apnea_patient_60_sec_overnight.csv'

    depressed_df = pd.read_csv(depressed_csv)
    apnea_df = pd.read_csv(apnea_csv)
    cap_df = pd.read_csv(cap_csv)

    depressed_df = depressed_df.drop(columns=['Depressed', 'id'])
    depressed_df = depressed_df.drop(columns=depressed_df.columns[0])

    ahi_col = apnea_df['AHI'].copy()
    apnea_df = apnea_df.drop(columns=['Unnamed: 0', 'AHI', 'id'])
    print(apnea_df.columns)
    # NOTE(review): 'Unnamed: 0' is already gone at this point, so this drops
    # the first real feature column of the apnea table -- confirm it is intended.
    apnea_df = apnea_df.drop(columns=apnea_df.columns[0])

    apnea_df = _drop_invalid_features(apnea_df, 'apnea')
    depressed_df = _drop_invalid_features(depressed_df, 'depression')
    cap_df = _drop_invalid_features(cap_df, 'cap')

    cap_df = _strip_bracketed_numbers(cap_df)
    apnea_df = _strip_bracketed_numbers(apnea_df)
    depressed_df = _strip_bracketed_numbers(depressed_df)

    if remove_outliers:
        print('Before removing outliers for apnea dataset: ', apnea_df.shape)
        _print_ahi_band_counts(ahi_col)
        if remove_outliers_mode == 'single_feature':
            apnea_df, ahi_col = _drop_band_outliers(apnea_df, ahi_col, low, up)
            print('After removing outliers for apnea dataset: ', apnea_df.shape)
            _print_ahi_band_counts(ahi_col)

    if normalize:
        depressed_df = (depressed_df - depressed_df.mean()) / depressed_df.std()
        apnea_df = (apnea_df - apnea_df.mean()) / apnea_df.std()

    # Severity label built in one vectorised step. The first matching condition
    # wins, and the lowest cut is `>= 5` so it agrees with the `< 5` / `>= 5`
    # split used for the band counts and the outlier removal.
    severity = np.select([ahi_col >= 30, ahi_col >= 15, ahi_col >= 5],
                         [1.8, 1.5, 1.0], default=0.0)
    apnea_df.insert(0, "patient", pd.Series(severity, index=ahi_col.index))
    depressed_df.insert(0, "patient", 2.0)
    # NOTE(review): rows with index labels 0..99 of the apnea table are
    # discarded here without a stated reason -- confirm the purpose.
    apnea_df = apnea_df.drop(index=np.arange(0, 100, 1), errors='ignore')

    insomnia_df = _labelled_slice(cap_df, 1, 8, 3)
    normal_df = _labelled_slice(cap_df, 8, 14, 4)
    narco_df = _labelled_slice(cap_df, 14, 19, 5)
    nfle_df = _labelled_slice(cap_df, 19, 54, 6)
    plm_df = _labelled_slice(cap_df, 54, 63, 7)
    rdb_df = _labelled_slice(cap_df, 63, 83, 8)

    if remove_similar_features:
        full_df = pd.concat([depressed_df, apnea_df])
        print('Before removing similar features: ', full_df.shape)
        pruned_df = _drop_similar_features(full_df, tol_sim)
        print('After removing similar features: ', pruned_df.shape)

    return(apnea_df, depressed_df, insomnia_df, normal_df, narco_df, nfle_df, plm_df, rdb_df)

def merge_df(df1, df2):
    """Stack two labelled frames and keep only the features complete in both.

    The frames are concatenated row-wise. Any column containing NaN after the
    concatenation is dropped, which includes columns present in only one of
    the inputs. The 'Unnamed: 0' column is discarded if present, and the
    'patient' label column is put back as the first column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that each carry a 'patient' column.

    Returns
    -------
    pandas.DataFrame
        The stacked frame with 'patient' first, followed by the NaN-free columns.
    """
    stacked = pd.concat([df1, df2])
    labels = stacked['patient'].copy()
    features = stacked.drop(columns=['patient', 'Unnamed: 0'], errors='ignore')
    features = features.dropna(axis=1)
    features.insert(0, 'patient', labels)
    return features

In [4]:
# Build the labelled per-group frames from the 60-second overnight files.
# With remove_outliers=False the low/up quantile levels are not used.
apnea_df, depressed_df, insomnia_df, normal_df, narco_df, nfle_df, plm_df, rdb_df = full_worker(
    remove_outliers=False, low=0.05, up=0.95, remove_similar_features=False, mn_long=True)

Index(['sleep_time', 'sleep_efficiency_index', 'sleep_onset_latency',
       'wake_time_after_sleep_onset', 'n1_percentage', 'n2_percentage',
       'n3_percentage', 'rem_percentage', 'latency_until_n2',
       'latency_until_n3',
       ...
       '4_std_range_resp', '4_std_n_resp', '4_std_f1_resp', '4_std_f2_resp',
       '4_std_f3_resp', '4_std_f4_resp', '4_std_f5_resp', '4_std_f6_resp',
       '4_std_f7_resp', '4_std_f8_resp'],
      dtype='object', length=719)
Removing invalid features for apnea,  (824, 718)
Invalid features removed for apnea,  (824, 548)
Removing invalid features for depression,  (5, 188)
Invalid features removed for depression,  (5, 160)
Removing invalid features for cap,  (84, 721)
Invalid features removed for cap,  (84, 644)


In [None]:
# Rebuild the labelled per-group frames (same call as the executed In [4] cell).
# NOTE(review): low=0.5 here differs from the low=0.05 used in that cell. It
# has no effect while remove_outliers=False, but looks like a typo -- confirm
# before enabling outlier removal.
apnea_df, depressed_df, insomnia_df, normal_df, narco_df, nfle_df, plm_df, rdb_df = full_worker(
    remove_outliers=False, low=0.5, up=0.95, remove_similar_features=False, mn_long=True)

In [None]:
# Compare the frame labelled patient == 4 (normal_df) with the frame labelled
# patient == 2 (depressed_df), feature by feature. Features whose
# Kruskal-Wallis p-value is below p_tol_high get follow-up tests and a histogram.
test_df = merge_df(normal_df, depressed_df)
p_tol_low = 0
p_tol_high = 0.05
p_levene_count = 0

# Split once; both groups are reused for every feature below.
control_df = test_df[test_df['patient'] == 4]
pathology_df = test_df[test_df['patient'] == 2]

# Passing whole frames relies on kruskal working column-wise (along axis 0)
# and returning one p-value per column.
_, p_values = stats.kruskal(control_df, pathology_df)
p_values = np.asarray(p_values)
assert p_values.shape == (test_df.shape[1],), (
    "stats.kruskal did not return one p-value per column; "
    "a SciPy version whose kruskal works along axis 0 is required")

# Shapes of the two groups actually being compared.
print(normal_df.shape, depressed_df.shape)

# Number of columns below the threshold (this count includes the 'patient' label column).
print(np.sum(p_values < p_tol_high))

stats_records = []
for col_pos in np.flatnonzero(p_values < p_tol_high):
    ind1 = test_df.columns[col_pos]
    if ind1 == 'patient':
        continue
    p_kruskal = p_values[col_pos]
    control_values = control_df[ind1]
    pathology_values = pathology_df[ind1]

    # Levene test to check homogeneity of variance among groups
    _, p_levene = stats.levene(pathology_values, control_values)
    # Shapiro-Wilk normality test within each group
    _, p_shapiro_normal = stats.shapiro(control_values)
    _, p_shapiro_pathology = stats.shapiro(pathology_values)
    # Two-sample Kolmogorov-Smirnov test between the groups
    _, p_ks = stats.kstest(pathology_values, control_values)

    stats_records.append({'feature': ind1, 'p': p_kruskal, 'p_ks': p_ks, 'p_levene': p_levene,
                          'p_shapiro_normal': p_shapiro_normal,
                          'p_shapiro_pathology': p_shapiro_pathology})

    fig, ax = plt.subplots(figsize=(10, 3))
    sns.histplot(data=test_df, x=ind1, hue='patient', palette="bright", common_norm=False,
                 element="step", kde=False, stat='probability', ax=ax)
    title_plot = str(ind1) + ', p=' + str(np.round(p_kruskal,3)) + ', p_ks = ' + str(np.round(p_ks,3))
    ax.set_title(title_plot, fontsize=15)
    # File name to use if the figure is saved (saving is currently disabled).
    title = str(ind1) + ', p=' + str(np.round(p_kruskal,3)) + 'outliers.png'
    plt.show()
    plt.close(fig)  # release the figure; many may be drawn in this loop
    print(p_kruskal, p_levene, p_shapiro_normal, p_shapiro_pathology)

# Built once from the collected rows instead of concatenating inside the loop.
# The variable name is kept so later cells that refer to it still work, even
# though this cell compares the normal and depressed groups.
stats_df_insomnia = pd.DataFrame(
    stats_records,
    columns=['feature', 'p', 'p_levene', 'p_ks', 'p_shapiro_normal', 'p_shapiro_pathology'])