In [2]:
from tabnanny import verbose
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import LearningRateScheduler
import random
import joblib
import seaborn as sns
import pandas as pd
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
from scipy.signal import welch, resample, medfilt
from scipy.interpolate import splev, splrep, interp1d
import hrvanalysis
from scipy.signal import periodogram, welch
from scipy.integrate import trapz
# Record the NumPy version this run used (the saved output of this cell shows 1.21.3).
print(np.__version__)
# Switch matplotlib figures to seaborn's default theme for the rest of the notebook.
sns.set_theme()

1.21.3


In [3]:
def load_data_cap(file_name, base_dir = './'):
    """Load the preprocessed recording dump and return its signals, labels and groups.

    Parameters
    ----------
    file_name : str
        Name of the dump inside ``base_dir``.
    base_dir : str, optional
        Directory that contains the dump (default: current directory).

    Returns
    -------
    tuple
        ``(X, sleep_stages, groups)`` where ``X`` and ``sleep_stages`` are
        NumPy arrays built from the ``"X"`` and ``"y"`` entries, and ``groups``
        is the ``"groups"`` entry returned as stored.

    Notes
    -----
    Labels are renumbered in place: 4 becomes 3, then 5 becomes 4.
    Presumably this merges the two deepest NREM labels so that REM ends up as 4
    (the coding used by ``extract_hypnogram_features``) -- TODO confirm against
    the preprocessing script.
    """
    pkl_path = os.path.join(base_dir, file_name)
    # NOTE(security): joblib.load unpickles arbitrary objects; only open dumps
    # produced by a trusted preprocessing run.
    with open(pkl_path, 'rb') as handle:
        payload = joblib.load(handle)

    signals = np.array(payload["X"])
    stages = np.array(payload['y'])
    group_ids = payload["groups"]

    # The order of these two assignments matters: doing 5 -> 4 first would
    # send the former 5s on to 3 as well.
    stages[stages == 4] = 3
    stages[stages == 5] = 4
    return signals, stages, group_ids


def balance_class(X, y):
    """Randomly undersample every class down to the size of the rarest one.

    Parameters
    ----------
    X : array-like
        Samples, indexed along axis 0.
    y : array-like of int
        Integer class label per sample (shape ``(n,)`` or ``(n, 1)``).

    Returns
    -------
    tuple
        ``(new_X, new_y)`` -- new arrays holding the retained samples in their
        original order. The inputs are not modified.

    Notes
    -----
    * Class counts come from ``np.bincount`` instead of materialising a full
      one-hot matrix, and all removals are applied through a single boolean
      mask instead of one ``np.delete`` copy of ``X`` per class.
    * The ``random.sample`` calls (population size, sample size, class order)
      are the same as before, so a given ``random.seed`` still selects the
      same samples.
    * If a label value between 0 and ``max(y)`` never occurs, the rarest class
      has size 0 and every sample is removed.
    """
    samples = np.asarray(X)
    targets = np.asarray(y)
    labels = np.ravel(targets).astype(int)
    if labels.size == 0:
        # Nothing to balance; hand back copies like the non-empty path does.
        return(np.copy(samples), np.copy(targets))

    n_class = labels.max() + 1
    counts = np.bincount(labels, minlength=n_class)
    # argmin returns the first minimum, like list.index(min(...)) did.
    less_rpz_class = int(np.argmin(counts))

    keep = np.ones(labels.shape[0], dtype=bool)
    for cls in range(n_class):
        if cls == less_rpz_class:
            continue
        members = np.flatnonzero(labels == cls)
        too_much = int(counts[cls] - counts[less_rpz_class])
        # Positions are drawn within the class, then mapped to row indices.
        drop_positions = random.sample(range(0, len(members)), too_much)
        keep[members[drop_positions]] = False

    return(samples[keep], targets[keep])

def convert_to_CNN(y, n_class_ori, n_class):
    """Remap labels with ``convert_class`` and return them one-hot encoded.

    ``convert_class`` is not defined in the visible part of this notebook; its
    arguments are forwarded unchanged as ``(y, n_class_ori, n_class)``.

    Returns
    -------
    numpy.ndarray
        ``to_categorical`` output for the remapped labels.
    """
    remapped = convert_class(y, n_class_ori, n_class)
    one_hot = to_categorical(remapped)
    return(np.array(one_hot))


def harmonize_data(X,y,groups):
    """Resample a one-hot labelled dataset to a 1.5 : 1 ratio of class 0 to class 1.

    Class-0 rows are drawn *with replacement* (``random.choices``) until there
    are ``int(1.5 * n_class_1)`` of them; every class-1 row is kept once. The
    output lists the sampled class-0 rows first, followed by the class-1 rows
    in their original order.

    Parameters
    ----------
    X : array-like
        Samples, indexed along axis 0.
    y : array-like, shape (n, n_classes)
        One-hot labels. Column 0 marks class 0, column 1 marks class 1.
    groups : array-like
        Per-sample group identifier.

    Returns
    -------
    tuple
        ``(X, y, groups)`` restricted / resampled as described above.

    Notes
    -----
    The previous implementation located class-0 rows through the column
    positions of the zeros in ``y``, which only coincides with row indices
    when ``y`` has exactly two columns. Rows are now selected by testing the
    relevant column directly; for two-column one-hot input the selected rows
    (and the random draws for a given seed) are the same as before. Rows of any
    other class are left out.
    """
    labels = np.asarray(y)
    class_0_rows = np.flatnonzero(labels[:, 0] == 1)
    class_1_rows = np.flatnonzero(labels[:, 1] == 1)

    n_class_0_draws = int(len(class_1_rows) * 1.5)
    # Force an integer dtype: an empty draw would otherwise turn the
    # concatenated index into a float array that cannot be used for indexing.
    sampled_0 = np.asarray(random.choices(class_0_rows, k=n_class_0_draws), dtype=int)
    idx = np.concatenate([sampled_0, class_1_rows])

    return(np.asarray(X)[idx], labels[idx], np.asarray(groups)[idx])



In [4]:
### Cap database
# Load the preprocessed dump and build the module-level arrays
# (X_clean, sleep_stages_cnn_clean, groups_clean) that later cells read.
file_name='cap_data.pkl'
print('Loading following pkl:' , file_name)
X, sleep_stages, groups = load_data_cap(base_dir='./cap/', file_name=file_name)
print('pkl loaded.')
# One-hot encode the integer stage labels.
sleep_stages_cnn = np.array(to_categorical(sleep_stages))
# Exchange axes 1 and 2 for the NaN scan below; the swap is undone further down.
X = np.swapaxes(X, 1, 2)

# Collect the indices of samples whose probe slice sums to NaN.
# NOTE(review): after the swap, X[i,:,2] is index 2 along the *last* axis. Given the
# (41180, 180, 3) shape printed at the end of this cell, that appears to be a single
# time sample across all channels rather than channel 2 across time -- confirm intent.
del_idx = []
for i in range(X.shape[0]):
        if np.isnan(np.sum(X[i,:,2])):
                del_idx.append(i)
        
# Groups that own at least one flagged sample, and the sample indices of each.
corrupted_patient = np.unique(np.array(groups)[del_idx])
del_patient = []
for i in range(len(corrupted_patient)):
    print(corrupted_patient[i])
    del_patient.append(np.where(np.array(groups) == corrupted_patient[i])[0])

# NOTE(review): del_patient is not used again in the visible source, so the
# "*_clean" arrays below still contain every sample -- confirm whether the
# flagged groups were meant to be removed here.
# Swap the axes back to the layout returned by load_data_cap.
X_clean = np.swapaxes(X, 1, 2)
sleep_stages_cnn_clean = sleep_stages_cnn
groups_clean = np.array(groups)

# Keep samples 360..539 along axis 1 (180 samples per row).
# Presumably a 60 s window at 3 Hz, matching fs=3 in resp_features -- TODO confirm.
X_clean = X_clean[:,360:540,:]
print(X_clean.shape)

#X_clean, sleep_apnea_cnn_clean, groups_clean = harmonize_data(X_clean, sleep_apnea_cnn_clean, groups_clean)

Loading following pkl: cap_data.pkl
pkl loaded.
(41180, 180, 3)


In [35]:
############################# FEATURES
def extract_hypnogram_features(hypnogram, size_segment=60):
    """Summarise a hypnogram into sleep-architecture features.

    Stage coding: 0 = wake, 1-3 = N1-N3, 4 = REM.

    Parameters
    ----------
    hypnogram : array-like of int
        One stage label per epoch, in chronological order.
    size_segment : number, optional
        Epoch duration in seconds (default 60).

    Returns
    -------
    dict
        ``sleep_time`` (min), ``sleep_efficiency_index`` (asleep / all epochs),
        ``sleep_onset_latency`` (min), ``wake_time_after_sleep_onset`` (min),
        ``n1/n2/n3/rem_percentage`` (fraction of asleep epochs, 0-1) and
        ``latency_until_n2/n3/rem_sleep`` (min from recording start).

    Notes
    -----
    A feature that is undefined for the given hypnogram (stage never reached,
    no sleep at all, empty hypnogram) is returned as ``nan``. Previously these
    cases raised ``IndexError`` or divided by zero.
    """
    hypnogram = np.asarray(hypnogram)
    n_epochs = len(hypnogram)
    asleep = hypnogram != 0
    n_asleep = np.sum(asleep)

    def to_minutes(n_segments):
        # Same operation order as before so results are bit-identical.
        return n_segments * size_segment / 60

    def latency_until(mask):
        hits = np.flatnonzero(mask)
        return to_minutes(hits[0]) if hits.size else np.nan

    def share_of_sleep(stage):
        return np.sum(hypnogram == stage) / n_asleep if n_asleep else np.nan

    if n_asleep:
        sleep_onset = np.flatnonzero(asleep)[0]
        # Wake epochs counted from the first asleep epoch to the end of the recording.
        waso = to_minutes(np.sum(hypnogram[sleep_onset:] == 0))
    else:
        waso = np.nan

    sleep_features = {
        "sleep_time" : to_minutes(n_asleep), # Total sleep time in minutes
        "sleep_efficiency_index" : n_asleep / n_epochs if n_epochs else np.nan,
        "sleep_onset_latency" : latency_until(asleep), # Time to fall asleep in minutes
        "wake_time_after_sleep_onset" : waso,
        "n1_percentage" : share_of_sleep(1),
        "n2_percentage" : share_of_sleep(2),
        "n3_percentage" : share_of_sleep(3),
        "rem_percentage" : share_of_sleep(4),
        "latency_until_n2" : latency_until(hypnogram == 2),
        "latency_until_n3" : latency_until(hypnogram == 3),
        "latency_until_rem_sleep" : latency_until(hypnogram == 4)
    }
    return(sleep_features)

def _hrv_feature_dict(rr_intervals):
    """Merge every hrvanalysis feature family used in this notebook into one dict."""
    feats = hrvanalysis.get_time_domain_features(rr_intervals)
    feats.update(hrvanalysis.get_frequency_domain_features(rr_intervals))
    feats.update(hrvanalysis.get_geometrical_features(rr_intervals))
    feats.update(hrvanalysis.get_csi_cvi_features(rr_intervals))
    feats.update(hrvanalysis.get_poincare_plot_features(rr_intervals))
    feats.update(hrvanalysis.get_sampen(rr_intervals))
    return feats


def get_mean_std_features_stage(x, hypnogram, stage):
    """Aggregate hrvanalysis features of an RR-interval signal for one stage.

    Parameters
    ----------
    x : numpy.ndarray
        One row per epoch; each row is the series handed to hrvanalysis.
    hypnogram : numpy.ndarray
        Stage label per epoch, aligned with axis 0 of ``x``.
    stage : int or str
        * an integer stage -- only epochs with ``hypnogram == stage`` are used;
        * ``'full'`` -- every epoch is used;
        * ``'overnight'`` -- all epochs are concatenated into one series and
          the features are computed once on it.

    Returns
    -------
    dict
        * ``'overnight'``: each feature under its hrvanalysis name and under
          ``'overnight_<name>'``.
        * otherwise: the per-epoch mean under ``'<stage>_mean_<name>'`` and the
          per-epoch standard deviation under ``'<stage>_std_<name>'`` (for
          ``'full'`` the keys are ``'mean_<name>'`` / ``'std_<name>'``).
          The un-prefixed hrvanalysis names are also present and hold the mean;
          they are kept because the previous implementation returned them.

    Raises
    ------
    IndexError
        If no epoch matches ``stage``.

    Notes
    -----
    * The standard deviation now covers *all* selected epochs. It used to
      start at the second epoch, silently dropping the first one.
    * A feature whose value is ``None`` in the first epoch is reported as
      ``None`` (mean) and ``nan`` (std), as before.
    """
    if stage == 'overnight':
        # Row-major flattening equals appending the epochs one after another.
        x_overnight = np.ravel(x).astype(np.float64)
        print(x_overnight.shape)
        loc_features = _hrv_feature_dict(x_overnight)
        stage_features_renamed = loc_features.copy()
        for name, value in loc_features.items():
            stage_features_renamed['overnight_' + name] = value
        return(stage_features_renamed)

    # Only build the boolean mask for real stage labels.
    x_stage = x if stage == 'full' else x[hypnogram == stage]
    stage_dicts = [_hrv_feature_dict(epoch) for epoch in x_stage]
    if not stage_dicts:
        raise IndexError("no epoch available for stage %r" % (stage,))

    n_epochs = len(stage_dicts)
    means = {}
    stds = {}
    for name, first_value in stage_dicts[0].items():
        if first_value is None:
            means[name] = None
            stds[name] = np.nan
            continue
        values = [epoch_features[name] for epoch_features in stage_dicts]
        # Left-to-right accumulation keeps the mean bit-identical to the old code.
        total = values[0]
        for value in values[1:]:
            total += value
        means[name] = total / n_epochs
        stds[name] = np.std(values)

    prefix = '' if stage == 'full' else str(stage) + '_'
    features = dict(means)  # un-prefixed keys, kept for backward compatibility
    for name, value in means.items():
        features[prefix + 'mean_' + name] = value
    for name, value in stds.items():
        features[prefix + 'std_' + name] = value
    return(features)



def extract_features(x, hypnogram, patient_id):
    """Build the full per-recording feature dict from a 3-channel epoch array.

    Channel 0 is scaled by 1000 and passed to ``get_mean_std_features_stage``;
    channels 1 and 2 get a trailing singleton axis and are passed to
    ``amp_features`` and ``resp_features``. Each extractor is run for
    ``'overnight'``, ``'full'`` and stages 0-4, in that order, and the results
    are merged into one dict (later keys overwrite earlier ones).

    Parameters
    ----------
    x : numpy.ndarray
        Indexed as ``x[epoch, sample, channel]`` with at least three channels.
    hypnogram : numpy.ndarray
        Stage label per epoch.
    patient_id
        Stored under the ``'id'`` key.

    Returns
    -------
    dict
    """
    stage_order = ('overnight', 'full', 0, 1, 2, 3, 4)
    # Presumably seconds -> milliseconds for the RR channel; TODO confirm units.
    per_channel = (
        (get_mean_std_features_stage, x[:,:,0] * 1000),
        (amp_features, np.expand_dims(x[:,:,1], -1)),
        (resp_features, np.expand_dims(x[:,:,2], -1)),
    )

    features = {'id' : patient_id}
    for extractor, signal in per_channel:
        for stage in stage_order:
            features.update(extractor(signal, hypnogram, stage))
    return(features)
    
def compute_ahi(sleep_apnea, size_segment):
    """Return the number of column-1 events per hour of recording.

    Parameters
    ----------
    sleep_apnea : numpy.ndarray, shape (n_segments, >=2)
        Per-segment indicator matrix; column 1 is summed as the event count
        (presumably the one-hot "apnea" column -- TODO confirm).
    size_segment : number
        Segment duration in seconds.

    Returns
    -------
    float
        Event count divided by the recording length in hours. The value and
        the recording length are also printed.
    """
    recording_hours = len(sleep_apnea) * size_segment / 60 / 60
    event_count = np.sum(sleep_apnea, 0)[1]
    ahi = event_count / recording_hours
    print(ahi, recording_hours)
    return(ahi)

_AMP_STAT_NAMES = ("mean", "sd", "sdsd", "med", "range", "qth25", "qth75")


def _amp_stats(signal):
    """Return the seven NaN-aware amplitude statistics of one flattened signal."""
    sig = np.ravel(signal).astype(np.float64)
    if sig.size == 0 or np.isnan(sig).all():
        # Nothing usable: report NaN everywhere without NumPy's all-NaN warnings.
        return [np.nan] * len(_AMP_STAT_NAMES)
    return [
        np.nanmean(sig),
        np.nanstd(sig),
        # First-order successive differences. np.diff(sig, 0) is a 0-th order
        # difference, i.e. the signal itself, which made sdsd equal to sd.
        np.nanstd(np.diff(sig)),
        np.nanmedian(sig),
        # NaN-aware like every other statistic here.
        np.abs(np.nanmax(sig) - np.nanmin(sig)),
        np.nanquantile(sig, 0.25),
        np.nanquantile(sig, 0.75),
    ]


def amp_features(x, hypnogram, stage):
    """Aggregate amplitude statistics of a signal for one stage.

    Parameters
    ----------
    x : numpy.ndarray
        One entry per epoch along axis 0.
    hypnogram : numpy.ndarray
        Stage label per epoch, aligned with axis 0 of ``x``.
    stage : int or str
        Integer stage (epochs with ``hypnogram == stage``), ``'full'`` (all
        epochs) or ``'overnight'`` (all epochs concatenated into one signal).

    Returns
    -------
    dict
        * ``'overnight'``: ``'overnight_<stat>_amp'`` for the seven statistics
          mean, sd, sdsd, med, range, qth25, qth75.
        * otherwise: ``'<stage>_mean_<stat>_amp'`` and ``'<stage>_std_<stat>_amp'``
          -- mean and standard deviation of each statistic across epochs
          (``'full'`` uses the literal prefix ``'full_'``).

    Notes
    -----
    Fixes relative to the previous implementation:

    * ``sdsd`` is the standard deviation of successive differences (it used
      to duplicate ``sd``).
    * Epochs with four or more NaN statistics are dropped, and the mean is
      taken over the *remaining* epochs (it used to divide by the count
      before dropping).
    * ``range`` ignores NaN samples like the other statistics.
    * If no epoch remains, every value is ``nan``.
    """
    n_stats = len(_AMP_STAT_NAMES)

    if stage == 'overnight':
        x_overnight = np.ravel(x).astype(np.float64)
        print(x_overnight.shape)
        stats = _amp_stats(x_overnight)
        return({"overnight_" + name + "_amp" : value
                for name, value in zip(_AMP_STAT_NAMES, stats)})

    # Only build the boolean mask for real stage labels.
    x_stage = x if stage == 'full' else x[hypnogram == stage]

    per_epoch = np.zeros((x_stage.shape[0], n_stats))
    for i in range(x_stage.shape[0]):
        per_epoch[i] = _amp_stats(x_stage[i])

    valid = per_epoch[np.sum(np.isnan(per_epoch), axis=1) < 4]
    if valid.shape[0] == 0:
        average = np.full(n_stats, np.nan)
        std = np.full(n_stats, np.nan)
    else:
        average = np.mean(valid, axis=0)
        std = np.std(valid, axis=0)

    features = {}
    for k, name in enumerate(_AMP_STAT_NAMES):
        features[str(stage) + "_mean_" + name + "_amp"] = average[k]
    for k, name in enumerate(_AMP_STAT_NAMES):
        features[str(stage) + "_std_" + name + "_amp"] = std[k]
    return(features)

_RESP_STAT_NAMES = ("mean", "sd", "med", "range", "n",
                    "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8")
# Indices into the Welch frequency vector that delimit the eight bands.
# With fs=3 and nfft=256 the bin spacing is 3/256 Hz and there are 129 bins.
_RESP_BAND_EDGES = (0, 9, 18, 27, 36, 45, 54, 80, 129)


def _resp_stats(signal):
    """Return the 13 respiration statistics (5 time-domain, 8 band powers) of one signal."""
    # scipy.integrate.trapz was removed in recent SciPy releases in favour of
    # trapezoid; fall back to the old name on installations that lack the new one.
    try:
        from scipy.integrate import trapezoid as integrate
    except ImportError:
        from scipy.integrate import trapz as integrate

    sig = np.ravel(signal).astype(np.float64)
    stats = np.zeros(len(_RESP_STAT_NAMES))
    stats[:5] = [
        np.nanmean(sig),
        np.nanstd(sig),
        np.nanmedian(sig),
        np.abs(np.max(sig) - np.min(sig)),
        # Each sign flip changes sign(sig) by 2, so this counts zero crossings.
        np.sum(np.abs(np.diff(np.sign(sig)) / 2)),
    ]
    freqs, psd = welch(sig, fs=3, window='hann', nfft=256)
    for j in range(len(_RESP_BAND_EDGES) - 1):
        lo, hi = _RESP_BAND_EDGES[j], _RESP_BAND_EDGES[j + 1]
        stats[j + 5] = integrate(psd[lo:hi], freqs[lo:hi])
    return stats


def resp_features(x, hypnogram, stage):
    """Aggregate respiration statistics of a signal for one stage.

    Parameters
    ----------
    x : numpy.ndarray
        One entry per epoch along axis 0.
    hypnogram : numpy.ndarray
        Stage label per epoch, aligned with axis 0 of ``x``.
    stage : int or str
        Integer stage (epochs with ``hypnogram == stage``), ``'full'`` (all
        epochs) or ``'overnight'`` (all epochs concatenated into one signal).

    Returns
    -------
    dict
        * ``'overnight'``: ``'overnight_<stat>_resp'`` for mean, sd, med,
          range, n (zero crossings) and f1..f8 (Welch band powers).
        * otherwise: ``'<stage>_mean_<stat>_resp'`` and
          ``'<stage>_std_<stat>_resp'`` -- mean and standard deviation of each
          statistic across epochs (``'full'`` uses the literal prefix ``'full_'``).

    Notes
    -----
    Fixes relative to the previous implementation:

    * Epochs with four or more NaN statistics are dropped, and the mean is
      taken over the *remaining* epochs (it used to divide by the count
      before dropping). If no epoch remains, every value is ``nan``.
    * Overnight values are scalars; they used to be 1-element arrays.
    * The trapezoidal integration no longer depends on the removed
      ``scipy.integrate.trapz`` name.
    """
    n_stats = len(_RESP_STAT_NAMES)

    if stage == 'overnight':
        x_overnight = np.ravel(x).astype(np.float64)
        print(x_overnight.shape)
        stats = _resp_stats(x_overnight)
        return({"overnight_" + name + "_resp" : value
                for name, value in zip(_RESP_STAT_NAMES, stats)})

    # Only build the boolean mask for real stage labels.
    x_stage = x if stage == 'full' else x[hypnogram == stage]

    per_epoch = np.zeros((x_stage.shape[0], n_stats))
    for i in range(x_stage.shape[0]):
        per_epoch[i] = _resp_stats(x_stage[i])

    valid = per_epoch[np.sum(np.isnan(per_epoch), axis=1) < 4]
    if valid.shape[0] == 0:
        average = np.full(n_stats, np.nan)
        std = np.full(n_stats, np.nan)
    else:
        average = np.mean(valid, axis=0)
        std = np.std(valid, axis=0)

    features = {}
    for k, name in enumerate(_RESP_STAT_NAMES):
        features[str(stage) + "_mean_" + name + "_resp"] = average[k]
    for k, name in enumerate(_RESP_STAT_NAMES):
        features[str(stage) + "_std_" + name + "_resp"] = std[k]
    return(features)

def get_full_features(groups_cl, sleep_stages, mn_long, X=None):
    """Build the per-patient feature table.

    Patients whose hypnogram lacks any of the stages 0-4 are skipped. For each
    remaining patient the hypnogram features and the signal features are
    merged into one record.

    Parameters
    ----------
    groups_cl : array-like
        Patient identifier per epoch.
    sleep_stages : array-like of int
        Stage label per epoch, aligned with ``groups_cl``.
    mn_long : bool
        When false no features are computed and an empty frame is returned.
    X : numpy.ndarray, optional
        Epoch signals aligned with ``groups_cl``. Defaults to the notebook
        global ``X_clean`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per retained patient, one column per feature. The layout is the
        one produced by transposing the column-wise concatenation of the
        per-patient frames (every row label is 0).

    Notes
    -----
    Fixes relative to the previous implementation:

    * The ``'id'`` column holds the patient identifier. It used to be
      overwritten with the loop index because that index was passed to
      ``extract_features`` as ``patient_id``.
    * ``pd.concat`` is called once with ``axis=1`` as a keyword instead of
      growing the frame inside the loop with a positional axis.
    * Warnings are silenced only while this function runs, not for the rest
      of the session.
    """
    import warnings

    groups_cl = np.asarray(groups_cl)
    sleep_stages = np.asarray(sleep_stages)
    required_stages = (0, 1, 2, 3, 4)

    # Keep only patients whose hypnogram contains every required stage.
    patient_list = []
    for patient in np.unique(groups_cl):
        hypnogram = sleep_stages[groups_cl == patient]
        if all(np.any(hypnogram == stage) for stage in required_stages):
            patient_list.append(patient)

    frames = []
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for patient in patient_list:
            if not mn_long:
                continue
            patient_idxs = np.where(groups_cl == patient)[0]
            hypnogram = sleep_stages[patient_idxs]

            features = {'id' : patient}
            print(patient)
            features.update(extract_hypnogram_features(hypnogram, size_segment=60))
            # Resolved lazily so the global is only required when actually used.
            signals = X_clean if X is None else X
            x = signals[patient_idxs]
            print(x.shape)
            features.update(extract_features(x, hypnogram, patient))
            frames.append(pd.DataFrame.from_dict(features, orient='index'))

    if not frames:
        return(pd.DataFrame([]))
    df = pd.concat(frames, axis=1)
    return(df.transpose())
    

In [None]:
# Collapse the one-hot stage matrix back to integer labels (argmax over axis 1),
# build the per-patient feature table and write it to disk as CSV.
df = get_full_features(
    groups_cl=groups_clean,
    sleep_stages=np.argmax(sleep_stages_cnn_clean, axis=1),
    mn_long=True,
)
df.to_csv('panda_cap_60_sec_overnight.csv')