In [18]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm
from scipy.stats import kstest # lower mean higher similar
from tqdm import tqdm
# Dataset selector: 'WESAD' picks the WESAD wrist files in the loading cell;
# any other value falls through to that cell's AffectiveROAD branch.
NAME_DATASET = 'WESAD'
# Subject held out as the test subject; its rows are excluded from the
# per-subject similarity scan further down.
SUBJECT_ID_TEST = 'S14'

In [19]:
# https://medium.com/geekculture/techniques-to-measure-probability-distribution-similarity-9145678d68a6
# A smaller divergence value means the two distributions are more similar.
# Create two demo samples (half-normal and log-normal). A dedicated, seeded
# generator makes them identical on every Restart & Run All and leaves
# NumPy's global random state untouched.
_demo_rng = np.random.default_rng(0)
data_1 = np.abs(_demo_rng.standard_normal(1000))
data_2 = _demo_rng.lognormal(size=1000)
#compute KL Divergence
def KL_div(p_probs, q_probs):
    """KL Divergence(P|Q): sum over i of p_i * log(p_i / q_i).

    Parameters
    ----------
    p_probs, q_probs : array_like
        Probability vectors of broadcast-compatible shape. They are used as
        given; no normalisation is performed here.

    Returns
    -------
    numpy.float64
        The divergence in nats. The result is ``inf`` when ``q`` is zero at a
        position where ``p`` is positive.
    """
    p, q = np.broadcast_arrays(
        np.asarray(p_probs, dtype=float),
        np.asarray(q_probs, dtype=float),
    )
    # By convention 0 * log(0 / q) contributes 0. Evaluating it directly gives
    # 0 * -inf = nan, which would poison the whole sum, so those entries are
    # left out before taking the logarithm.
    support = p != 0
    return np.sum(p[support] * np.log(p[support] / q[support]))
def JS_Div(p, q):
    """Jensen-Shannon divergence between two weight vectors.

    Both inputs are first rescaled to sum to 1, then compared against their
    element-wise midpoint ``m``: ``(KL(p|m) + KL(q|m)) / 2``.

    Parameters
    ----------
    p, q : array_like
        Weight vectors of the same shape. They are normalised element by
        element, so position ``i`` of ``p`` is compared with position ``i``
        of ``q``.

    Returns
    -------
    numpy.float64
        The divergence in nats; swapping ``p`` and ``q`` gives the same value.
    """
    # np.asarray hands back the caller's own array when it already is an
    # ndarray, so normalising with `/=` would overwrite the caller's data (and
    # fail outright for integer arrays). Convert to float and divide
    # out-of-place instead.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # normalize
    p = p / p.sum()
    q = q / q.sum()
    m = (p + q) / 2
    return (KL_div(p, m) + KL_div(q, m)) / 2
# JS divergence is symmetric, so it is evaluated in both argument orders
# (first data_1 vs data_2, then data_2 vs data_1).
result_JSD12, result_JSD21 = JS_Div(data_1, data_2), JS_Div(data_2, data_1)

def similar_distribution(dt1, dt2):
    """Mean two-sample Kolmogorov-Smirnov statistic across feature columns.

    Each column of ``dt1`` is compared with the same column of ``dt2`` using
    ``kstest``; the per-column KS statistics are averaged. A lower value
    means the two samples are more similarly distributed.

    Parameters
    ----------
    dt1, dt2 : array_like
        Samples laid out as (n_samples, n_features). The number of rows may
        differ between the two; the number of columns must match. A 1-D
        input is treated as a single feature column.

    Returns
    -------
    numpy.float64
        Average KS statistic over all columns.

    Raises
    ------
    ValueError
        If there are no feature columns or the column counts differ.
    """
    sample_a = np.asarray(dt1)
    sample_b = np.asarray(dt2)
    # Promote a single 1-D sample to one column so the column loop applies.
    if sample_a.ndim == 1:
        sample_a = sample_a.reshape(-1, 1)
    if sample_b.ndim == 1:
        sample_b = sample_b.reshape(-1, 1)

    n_features = sample_a.shape[-1]
    if n_features == 0:
        raise ValueError("similar_distribution needs at least one feature column")
    if sample_b.shape[-1] != n_features:
        # Comparing column i against an unrelated column would be meaningless,
        # so fail loudly instead of truncating or raising an IndexError later.
        raise ValueError(
            f"feature count mismatch: {n_features} vs {sample_b.shape[-1]}"
        )

    # Passing an array as the second argument makes kstest run the two-sample test.
    ks_statistics = [
        kstest(sample_a[:, col], sample_b[:, col]).statistic
        for col in range(n_features)
    ]
    return sum(ks_statistics) / n_features

In [20]:
##### READ DATASET #####
# Load three arrays for the selected dataset: subject groups, ground-truth
# labels and the statistical feature matrix.
if NAME_DATASET != 'WESAD':
    # Every non-WESAD value selects AffectiveROAD; the name is overwritten so
    # the file names below are built from it.
    DATA_DIR = '/home/nvtu/PhD_Work/StressDetection/DATA/MyDataset/AffectiveROAD_Data/Database'
    NAME_DATASET = 'AffectiveROAD'
    data_group = np.load(f'{DATA_DIR}/{NAME_DATASET}_groups_1.npy')
    data_gt = np.load(f'{DATA_DIR}/{NAME_DATASET}_ground_truth_1.npy')
    data_ft = np.load(f'{DATA_DIR}/{NAME_DATASET}_stats_feats_1.npy')
    # Keep only the rows whose ground truth is non-negative.
    indices = np.nonzero(data_gt >= 0)[0]
    data_ft, data_group, data_gt = data_ft[indices], data_group[indices], data_gt[indices]
else:
    DATA_DIR = '/home/nvtu/PhD_Work/StressDetection/DATA/MyDataset/WESAD'
    data_group = np.load(f'{DATA_DIR}/{NAME_DATASET}_WRIST_groups_1_60.npy')
    data_gt = np.load(f'{DATA_DIR}/{NAME_DATASET}_WRIST_ground_truth_1_60.npy')
    data_ft = np.load(f'{DATA_DIR}/{NAME_DATASET}_WRIST_stats_feats_1_60.npy')

# Create dataframe for dataset: one column per feature (f0, f1, ...), then
# the subject id and the label as the last two columns.
column_values = [f'f{x}' for x in range(data_ft.shape[1])]
data_full = pd.DataFrame(data=data_ft, columns=column_values).assign(
    subject_id=data_group,
    label=data_gt,
)
list_subject_id = np.unique(data_full.subject_id).tolist()

In [21]:
# For every non-test subject, measure how closely its feature distribution
# matches that of all remaining non-test subjects. The score is the
# class-wise similar_distribution value (lower = more similar), averaged over
# labels 0 and 1 and weighted by the subject's number of rows per label.
subject_id_test = SUBJECT_ID_TEST
data_train_val = data_full[data_full.subject_id != subject_id_test]
data_test = data_full[data_full.subject_id == subject_id_test]
list_id = sorted(set(data_train_val.subject_id))
score_dict = {}
for subject_id_validate in tqdm(list_id):
    data_train = data_train_val[data_train_val.subject_id != subject_id_validate]
    data_validate = data_train_val[data_train_val.subject_id == subject_id_validate]

    weighted_score = 0.0
    n_validate_rows = 0
    for label in (0, 1):
        # The last two columns are subject_id and label; everything before
        # them is a feature.
        X_train = data_train[data_train.label == label].iloc[:, :-2].to_numpy()
        X_validate = data_validate[data_validate.label == label].iloc[:, :-2].to_numpy()
        if len(X_validate) == 0:
            # A class the subject never exhibits carries weight 0 in the
            # average, so skip it rather than run kstest on an empty sample.
            continue
        weighted_score += similar_distribution(X_validate, X_train) * len(X_validate)
        n_validate_rows += len(X_validate)

    # nan marks a subject that has neither label 0 nor label 1 rows.
    score_dict[subject_id_validate] = (
        weighted_score / n_validate_rows if n_validate_rows else float('nan')
    )

100%|███████████████████████████████████████████| 14/14 [00:55<00:00,  3.99s/it]


In [22]:
# Display the per-subject scores from the run with S14 held out as the test
# subject (S14 is therefore absent from the keys below).
score_dict # S14

{'S10': 0.2459866134857221,
 'S11': 0.33541790504651153,
 'S13': 0.42526189970302175,
 'S15': 0.22464156676636743,
 'S16': 0.25591138555050685,
 'S17': 0.27247231107971837,
 'S2': 0.2773977047535787,
 'S3': 0.28797789479702246,
 'S4': 0.3894865834976222,
 'S5': 0.2361966190212268,
 'S6': 0.2567577263052203,
 'S7': 0.37740772985482113,
 'S8': 0.32538538092690333,
 'S9': 0.3436536274600524}

In [17]:
# NOTE(review): this output carries an earlier execution count (In [17]) and
# lists S14 but not S9, so it appears to come from a previous run with S9 as
# the held-out test subject. Running the notebook top to bottom with
# SUBJECT_ID_TEST = 'S14' will not reproduce it.
score_dict # S9

{'S10': 0.24868616390965503,
 'S11': 0.3353142124391459,
 'S13': 0.43174628676198884,
 'S14': 0.4410605015340969,
 'S15': 0.22683161384309725,
 'S16': 0.25205764070812436,
 'S17': 0.27177808844143997,
 'S2': 0.2799986889650542,
 'S3': 0.2878490482531951,
 'S4': 0.3796193591138874,
 'S5': 0.244991436602247,
 'S6': 0.2555274678107994,
 'S7': 0.38101008313383616,
 'S8': 0.3172726849709103}