In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

%matplotlib inline

In [None]:
def compute_performance(subject_data, proba_thr, distance_thr):
    """Approximate recall and precision of the detections using centers only.

    This is a simple proxy of the real performance (not accurate): a true
    event counts as detected when at least one kept prediction has its center
    within ``distance_thr`` of the center of the event.

    Args:
        subject_data: mapping holding the arrays 'y_pred_center',
            'y_pred_proba' and 'y_true_center'.
        proba_thr: predictions whose probability is below this value are
            discarded before matching.
        distance_thr: largest absolute distance between centers accepted as
            a match, in the same units as the centers.

    Returns:
        Tuple ``(recall, precision)``. When no prediction survives the
        threshold the result is ``(0, 1)``. When there are no true events
        the recall is 0 instead of the NaN produced by a 0 / 0 division.
    """
    centers_pred = subject_data['y_pred_center'][subject_data['y_pred_proba'] >= proba_thr]
    centers_true = subject_data['y_true_center']
    n_detections = centers_pred.size
    n_events = centers_true.size
    if n_detections == 0:
        # Nothing was detected: nothing recalled, and no wrong detection made
        return 0, 1
    # Pairwise |true - pred| matrix: one row per true event, one column per detection
    centers_distance = np.abs(centers_true.reshape(-1, 1) - centers_pred.reshape(1, -1))
    # A true event is a hit when any detection lies within the tolerance
    n_tp = np.sum((centers_distance <= distance_thr).any(axis=1))
    # Guard the division: a recording without events would give 0 / 0 = NaN
    recall = n_tp / n_events if n_events > 0 else 0
    precision = n_tp / n_detections
    return recall, precision

In [None]:
# Load the validation embeddings saved for each seed into `embeddings[seed]`.
# NOTE(review): absolute, machine-specific path; adjust it to run elsewhere.
parent_dir = '/home/ntapia/projects/sleep-rnn/results/embeddings/20191227_bsf_10runs_e1_n2_train_mass_ss/v19'
seeds = [0, 1, 2, 3]
seed_thr_list = [0.3] * 4 #  [0.44, 0.56, 0.58, 0.46] #  [0.3] * 4
embeddings = {}
for seed_thr, seed in zip(seed_thr_list, seeds):
    # The threshold is part of the file name, formatted with two decimals
    seed_dir = 'seed%d' % seed
    file_name = 'embeddings_n2_thr%1.2f_val.pkl' % seed_thr
    file_path = os.path.join(parent_dir, seed_dir, file_name)
    print("loading %s" % file_path)
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files
    with open(file_path, 'rb') as handle:
        seed_embeddings = pickle.load(handle)
    embeddings[seed] = seed_embeddings

In [None]:
# Pick the seed / subject pair inspected by the following cells
seed, subject_id = 2, 14

subject_data = embeddings[seed][subject_id]
# Show which entries are stored for this subject
subject_data.keys()

In [None]:
# Print the shape and dtype of every entry stored for the selected subject
for key, value in subject_data.items():
    print(key, value.shape, value.dtype)

In [None]:
# Check dist of proba: true spindles on the left, predictions on the right
proba_bins = np.linspace(0, 1, 11)  # ten equal-width bins over [0, 1]
proba_panels = [
    ('y_true_proba', "Proba of True Spindles"),
    ('y_pred_proba', "Proba of Predictions"),
]
fig, ax = plt.subplots(1, 2, figsize=(8, 4), dpi=80)
for panel_ax, (proba_key, panel_title) in zip(ax, proba_panels):
    panel_ax.hist(subject_data[proba_key], bins=proba_bins)
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# Proxy performance of the selected subject at one probability threshold
fs = 200  # presumably the sampling frequency in Hz -- TODO confirm
proba_thr = 0.58
# Matching tolerance between centers: half of `fs`
distance_thr = 0.5 * fs
recall, precision = compute_performance(subject_data, proba_thr, distance_thr)
header_msg = "Subject %s (seed %d)" % (subject_id, seed)
performance_msg = "Performance at thr %s: Recall %1.4f - Precision %1.4f" % (proba_thr, recall, precision)
print(header_msg)
print(performance_msg)

In [None]:
# approximate PR curve: sweep the probability threshold and collect the
# proxy recall / precision given by compute_performance at each value
distance_thr = 0.5 * fs
recall_l = []
precision_l = []
thr_list = np.linspace(0.3, 0.95, 30)
for tmp_thr in thr_list:
    recall, precision = compute_performance(subject_data, tmp_thr, distance_thr)
    recall_l.append(recall)
    precision_l.append(precision)

fig, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=140)
ax.plot(recall_l, precision_l, markersize=3, linewidth=1)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Subject %d (Seed %d)" % (subject_id, seed))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# show some thr: evenly spaced positions from the first to the last threshold.
# Rounding a linspace keeps every index inside [0, thr_list.size - 1]; a fixed
# stride of floor(size / (n - 1)) can step past the end of the list (e.g. 32
# thresholds and 5 markers) and never reaches the last threshold.
n_thr_to_plot = 5
locs = np.unique(np.linspace(0, thr_list.size - 1, n_thr_to_plot).round().astype(int))
for loc in locs:
    ax.plot(recall_l[loc], precision_l[loc], marker='o', markersize=4, label="thr %1.2f" % thr_list[loc])
# Reference lines: diagonal recall == precision, plus the 0.5 levels of each axis
ax.plot([0, 1], [0, 1], linewidth=1, color='k', alpha=0.5)
ax.plot([.5, .5], [0, 1], linewidth=1, color='k', alpha=0.5)
ax.plot([0, 1], [.5, .5], linewidth=1, color='k', alpha=0.5)
ax.legend(loc='lower left')
plt.show()

In [None]:
# check ranges of embedding: fetch the embeddings of the true events and of
# the predictions, then display both shapes
em_pred = subject_data['y_pred_tensor']
em_true = subject_data['y_true_tensor']
(em_true.shape, em_pred.shape)

In [None]:
# Max, min, mean and std over every entry of the true-event embeddings
(np.max(em_true), np.min(em_true), np.mean(em_true), np.std(em_true))

In [None]:
# Max, min, mean and std over every entry of the prediction embeddings
(np.max(em_pred), np.min(em_pred), np.mean(em_pred), np.std(em_pred))

In [None]:
# Check dist of activations: histogram of all embedding entries, per group
activation_panels = (
    (em_true, "Activations True Spindles"),
    (em_pred, "Activations of Predictions"),
)
fig, ax = plt.subplots(1, 2, figsize=(8, 4), dpi=80)
for panel_ax, (activations, panel_title) in zip(ax, activation_panels):
    panel_ax.hist(activations.flatten())
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# check distance intra and inter cluster (squared Euclidean distances)

# Width of the embedding taken from the data instead of a hard-coded 128:
# reshaping with a wrong width can silently produce a mis-shaped tensor
em_dim = em_true.shape[-1]

# True spindles

# Broadcasting (1, N, D) - (N, 1, D) gives all pairwise differences
distance_true_true = em_true.reshape(1, -1, em_dim) - em_true.reshape(-1, 1, em_dim)
distance_true_true = (distance_true_true ** 2).sum(axis=-1)
print(distance_true_true.shape)
# Keep the strict upper triangle so each unordered pair is counted once
iu = np.triu_indices(distance_true_true.shape[0], k=1)
distance_true_true = distance_true_true[iu]
print(distance_true_true.shape)

# False detections
distance_thr = 0.5 * fs
centers_distance = subject_data['y_true_center'].reshape(-1, 1) - subject_data['y_pred_center'].reshape(1, -1)
centers_distance = np.abs(centers_distance)
matching = (centers_distance <= distance_thr).astype(np.float32)
# axis=0 reduces over the true events: one flag per prediction
exists_match = (matching.sum(axis=0) > 0).astype(np.int32)
# A prediction without any true center within the tolerance is a false positive
em_fp = em_pred[np.where(exists_match == 0)[0]]

distance_pred_pred = em_fp.reshape(1, -1, em_dim) - em_fp.reshape(-1, 1, em_dim)
distance_pred_pred = (distance_pred_pred ** 2).sum(axis=-1)
print(distance_pred_pred.shape)
iu = np.triu_indices(distance_pred_pred.shape[0], k=1)
distance_pred_pred = distance_pred_pred[iu]
print(distance_pred_pred.shape)

# true spindles vs FP

# Every (true, FP) pair is distinct, so the full matrix is kept and flattened
distance_true_pred = em_fp.reshape(1, -1, em_dim) - em_true.reshape(-1, 1, em_dim)
print(distance_true_pred.shape)
distance_true_pred = (distance_true_pred ** 2).sum(axis=-1).flatten()
print(distance_true_pred.shape)

In [None]:
# Histograms of Euclidean distances (square root of the squared distances),
# top to bottom: true vs true, FP vs FP, true vs FP
squared_distance_groups = (distance_true_true, distance_pred_pred, distance_true_pred)
fig, ax = plt.subplots(3, 1, figsize=(4, 6), dpi=80, sharex=True)
for panel_ax, squared_distances in zip(ax, squared_distance_groups):
    panel_ax.hist(np.sqrt(squared_distances))
plt.tight_layout()
plt.show()

In [None]:
# Prototype: mean of the predicted embeddings weighted by their probability
z = subject_data['y_pred_tensor']
# Column vector so each weight multiplies a whole embedding row
weights = subject_data['y_pred_proba'].reshape(-1, 1)
weighted_sum = np.sum(weights * z, axis=0)
proto = weighted_sum / np.sum(weights)

In [None]:
# compute scores: dot product of every embedding with the prototype
scores_fp = em_fp.dot(proto)
scores_true = em_true.dot(proto)

In [None]:
# Distribution of prototype scores: true spindles (left) vs false positives (right)
score_panels = (
    (scores_true, "Scores True Spindles"),
    (scores_fp, "Scores of FP"),
)
fig, ax = plt.subplots(1, 2, figsize=(8, 4), dpi=80, sharex=True)
for panel_ax, (panel_scores, panel_title) in zip(ax, score_panels):
    panel_ax.hist(panel_scores)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
def compute_performance_with_scores(scores_true, scores_fp, n_thr=50):
    """Sweep a score threshold and report recall and precision at each value.

    Every entry of ``scores_true`` is treated as a true event and every entry
    of ``scores_fp`` as a false positive; an item is kept when its score is
    greater than or equal to the threshold.

    Args:
        scores_true: array with the scores of the true events.
        scores_fp: array with the scores of the false positives.
        n_thr: number of evenly spaced thresholds between the smallest and
            the largest score of both groups.

    Returns:
        Tuple ``(thr_list, recall_l, precision_l)``: the thresholds as an
        array and two lists with the recall and precision at each threshold.
        Either input may be empty; when both are empty the thresholds are an
        empty array and both lists are empty.
    """
    scores_true = np.asarray(scores_true)
    scores_fp = np.asarray(scores_fp)
    # Pool both groups so the range is defined even when one of them is
    # empty (calling .min() on an empty array raises ValueError)
    all_scores = np.concatenate([scores_true.ravel(), scores_fp.ravel()])
    if all_scores.size == 0:
        return np.array([]), [], []
    thr_list = np.linspace(all_scores.min(), all_scores.max(), n_thr)
    recall_l = []
    precision_l = []
    for thr in thr_list:
        n_tp = np.sum(scores_true >= thr)
        n_fp = np.sum(scores_fp >= thr)
        n_fn = np.sum(scores_true < thr)
        n_events = n_tp + n_fn
        if (n_tp + n_fp) == 0:
            # Nothing kept at this threshold
            recall = 0
            precision = 1
        else:
            # Without true events there is nothing to recall: use 0, not 0 / 0
            recall = n_tp / n_events if n_events > 0 else 0
            precision = n_tp / (n_tp + n_fp)
        recall_l.append(recall)
        precision_l.append(precision)
    return thr_list, recall_l, precision_l

In [None]:
# Compare two ways of thresholding the detections: by prototype score (left)
# and by predicted probability (right)
scores_true = np.dot(em_true, proto)
scores_fp = np.dot(em_fp, proto)
thr_list_score, recall_l_score, precision_l_score = compute_performance_with_scores(
    scores_true, scores_fp, n_thr=50)

# Probability sweep using the center-matching proxy
distance_thr = 0.5 * fs
thr_list_proba = np.linspace(0.3, 0.95, 30)
proba_curve = [
    compute_performance(subject_data, proba_value, distance_thr)
    for proba_value in thr_list_proba
]
recall_l_proba = np.array([point[0] for point in proba_curve])
precision_l_proba = np.array([point[1] for point in proba_curve])

curve_panels = [
    (thr_list_score, recall_l_score, precision_l_score, "Score Performance"),
    (thr_list_proba, recall_l_proba, precision_l_proba, "Proba Performance"),
]
fig, ax = plt.subplots(1, 2, figsize=(8, 4), dpi=120, sharey=True)
for panel_ax, (thr_values, recall_values, precision_values, panel_title) in zip(ax, curve_panels):
    panel_ax.plot(thr_values, recall_values, label="Recall")
    panel_ax.plot(thr_values, precision_values, label="Precision")
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
def compute_score_curve(subject_data, prototype, distance_thr, num=50):
    """Recall and precision as a function of a prototype-score threshold.

    Each prediction is scored with the dot product between its embedding and
    ``prototype``. For ``num`` evenly spaced thresholds between the smallest
    and largest score, the predictions scoring at least the threshold are
    matched to the true events by center distance: a true event is detected
    when some kept prediction center lies within ``distance_thr`` of it.

    Args:
        subject_data: mapping holding the arrays 'y_pred_tensor',
            'y_pred_center' and 'y_true_center'.
        prototype: vector multiplied with every row of 'y_pred_tensor'.
        distance_thr: largest absolute distance between centers accepted as
            a match, in the same units as the centers.
        num: number of score thresholds to evaluate.

    Returns:
        Tuple ``(scores_range, recall, precision)`` of arrays. Without any
        prediction the three arrays are empty. Without true events the recall
        is 0 at every threshold instead of NaN.
    """
    centers_true = subject_data['y_true_center']
    centers_pred_all = subject_data['y_pred_center']
    if centers_pred_all.size == 0:
        # No prediction: there is no score range to sweep
        return np.array([]), np.array([]), np.array([])
    scores_pred = np.dot(subject_data['y_pred_tensor'], prototype)
    scores_range = np.linspace(scores_pred.min(), scores_pred.max(), num)

    n_events = centers_true.size
    # Column of true centers, reused for every threshold
    centers_true_col = centers_true.reshape(-1, 1)
    recall_l = []
    precision_l = []
    for score_thr in scores_range:
        centers_pred = centers_pred_all[scores_pred >= score_thr]
        n_detections = centers_pred.size
        # tp with centers distance
        if n_detections > 0:
            centers_distance = np.abs(centers_true_col - centers_pred.reshape(1, -1))
            # A true event is a hit when any kept prediction is close enough
            n_tp = np.sum((centers_distance <= distance_thr).any(axis=1))
            # Guard the division: no true events would give 0 / 0 = NaN
            recall = n_tp / n_events if n_events > 0 else 0
            precision = n_tp / n_detections
        else:
            recall = 0
            precision = 1
        recall_l.append(recall)
        precision_l.append(precision)
    return scores_range, np.array(recall_l), np.array(precision_l)

In [None]:
# Simple weighted average prototype: predicted embeddings averaged with their
# predicted probabilities as weights
z = subject_data['y_pred_tensor']
# Column vector so each weight multiplies a whole embedding row
weights = subject_data['y_pred_proba'].reshape(-1, 1)
weighted_sum = np.sum(weights * z, axis=0)
proto = weighted_sum / np.sum(weights)

In [None]:
# Only top weighted embeddings contribute equally: plain mean of the
# embeddings whose weight reaches the (100 - top_pctl)-th percentile
top_pctl = 5
flat_weights = weights.flatten()
thr_weights = np.percentile(flat_weights, 100 - top_pctl)
weights_top_indicator = np.flatnonzero(flat_weights >= thr_weights)
proto_top_indicator = z[weights_top_indicator, :].mean(axis=0)

In [None]:
# Recall, precision and F1 when thresholding by the score of the selected
# prototype (left) and by the predicted probability (right)
selected_prototype = proto_top_indicator

scores_range, score_recall, score_precision = compute_score_curve(
    subject_data, selected_prototype, 0.5 * fs, num=50)

curve_panels = [
    (scores_range, score_recall, score_precision, "Score Performance"),
    (thr_list_proba, recall_l_proba, precision_l_proba, "Proba Performance"),
]
fig, ax = plt.subplots(1, 2, figsize=(8, 4), dpi=120, sharey=True)
for panel_ax, (thr_values, recall_values, precision_values, panel_title) in zip(ax, curve_panels):
    # The small epsilon keeps the ratio defined when recall + precision is 0
    f1 = 2 * recall_values * precision_values / (recall_values + precision_values + 1e-6)
    panel_ax.plot(thr_values, recall_values, label="Recall")
    panel_ax.plot(thr_values, precision_values, label="Precision")
    panel_ax.plot(thr_values, f1, label="F1-score")
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the prediction probabilities used as prototype weights
plt.hist(weights.ravel())
plt.show()

In [None]:
# tsne of em true and em fp: stack both groups and keep a label per row
# (1 for true events, 0 for false positives)
n_true, n_fp = em_true.shape[0], em_fp.shape[0]
label = np.array([1] * n_true + [0] * n_fp)
em = np.concatenate([em_true, em_fp], axis=0)

In [None]:
# Normalize: standardize every embedding dimension, then drop the dimensions
# whose standard deviation is zero
em_mean = em.mean(axis=0)
em_std = em.std(axis=0)
varying_dims = em_std > 0
# Divide by 1 where the std is 0 so constant dimensions give no division by zero
safe_std = np.where(em_std == 0, 1, em_std)
em_norm = ((em - em_mean) / safe_std)[:, varying_dims]
print(em_norm.shape)

In [None]:
# project with pca first
from sklearn.decomposition import PCA
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 30 is capped to keep this cell working on small inputs
n_pca_components = min(30, em_norm.shape[0], em_norm.shape[1])
pca = PCA(n_components=n_pca_components)
em_norm_pca = pca.fit_transform(em_norm)
em_norm_pca.shape

In [None]:
# now tsne
from sklearn.manifold import TSNE

In [None]:
# t-SNE of the PCA-reduced embeddings. random_state is fixed so the layout is
# the same on every run; the title records the perplexity of this plot.
tsne_perplexity = 20
em_norm_tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0).fit_transform(em_norm_pca)

fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=120)
# Points are colored by `label`
ax.scatter(em_norm_tsne[:, 0], em_norm_tsne[:, 1], s=5, c=label)
ax.set_title("t-SNE (perplexity %d)" % tsne_perplexity)
plt.show()

In [None]:
# t-SNE of the PCA-reduced embeddings. random_state is fixed so the layout is
# the same on every run; the title records the perplexity of this plot.
tsne_perplexity = 30
em_norm_tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0).fit_transform(em_norm_pca)

fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=120)
# Points are colored by `label`
ax.scatter(em_norm_tsne[:, 0], em_norm_tsne[:, 1], s=5, c=label)
ax.set_title("t-SNE (perplexity %d)" % tsne_perplexity)
plt.show()

In [None]:
# t-SNE of the PCA-reduced embeddings. random_state is fixed so the layout is
# the same on every run; the title records the perplexity of this plot.
tsne_perplexity = 50
em_norm_tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0).fit_transform(em_norm_pca)

fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=120)
# Points are colored by `label`
ax.scatter(em_norm_tsne[:, 0], em_norm_tsne[:, 1], s=5, c=label)
ax.set_title("t-SNE (perplexity %d)" % tsne_perplexity)
plt.show()

In [None]:
# t-SNE of the PCA-reduced embeddings. random_state is fixed so the layout is
# the same on every run; the title records the perplexity of this plot.
tsne_perplexity = 100
em_norm_tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0).fit_transform(em_norm_pca)

fig, ax = plt.subplots(1, 1, figsize=(5, 5), dpi=120)
# Points are colored by `label`
ax.scatter(em_norm_tsne[:, 0], em_norm_tsne[:, 1], s=5, c=label)
ax.set_title("t-SNE (perplexity %d)" % tsne_perplexity)
plt.show()

# Comparison of mean embedding between subjects

In [None]:
def find_best_thr(subject_data, distance_thr, num=100):
    """Return the probability threshold with the highest proxy F1-score.

    ``num`` evenly spaced thresholds between the smallest and the largest
    value of ``subject_data['y_pred_proba']`` are evaluated with
    ``compute_performance``; the first threshold reaching the maximum F1 is
    returned.
    """
    probas = subject_data['y_pred_proba']
    candidate_thrs = np.linspace(probas.min(), probas.max(), num)

    def proxy_f1(candidate):
        # The small epsilon keeps the ratio defined when recall + precision is 0
        recall, precision = compute_performance(subject_data, candidate, distance_thr)
        return 2 * recall * precision / (recall + precision + 1e-6)

    f1_values = np.array([proxy_f1(candidate) for candidate in candidate_thrs])
    return candidate_thrs[np.argmax(f1_values)]

In [None]:
# Load the validation embeddings saved for each seed into `embeddings[seed]`.
# NOTE(review): absolute, machine-specific path; adjust it to run elsewhere.
parent_dir = '/home/ntapia/projects/sleep-rnn/results/embeddings/20191227_bsf_10runs_e1_n2_train_mass_ss/v19'
seeds = [0, 1, 2, 3]
seed_thr_list = [0.3] * 4
embeddings = {}
for seed_thr, seed in zip(seed_thr_list, seeds):
    # The threshold is part of the file name, formatted with two decimals
    seed_dir = 'seed%d' % seed
    file_name = 'embeddings_n2_thr%1.2f_val.pkl' % seed_thr
    file_path = os.path.join(parent_dir, seed_dir, file_name)
    print("loading %s" % file_path)
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files
    with open(file_path, 'rb') as handle:
        seed_embeddings = pickle.load(handle)
    embeddings[seed] = seed_embeddings

In [None]:
# Merge the subjects of every seed into a single dict keyed by subject id,
# attaching to each subject the threshold of its seed under 'opt_thr'
embeddings_all = {}
opt_thrs = [0.44, 0.56, 0.58, 0.46]
# Thresholds are paired with seeds by position, so the seed value itself is
# never used as a list index
assert len(opt_thrs) == len(seeds), "one opt_thr per seed is required"
for seed, seed_opt_thr in zip(seeds, opt_thrs):
    seed_data = embeddings[seed]
    for subject_id in seed_data.keys():
        # Shallow copy: the dicts loaded in `embeddings` are left untouched,
        # which keeps earlier cells and re-runs of this cell consistent
        embeddings_all[subject_id] = dict(seed_data[subject_id], opt_thr=seed_opt_thr)
subject_ids = sorted(embeddings_all.keys())

In [None]:
# Per-subject threshold maximizing the proxy F1. It is a "cheat" threshold
# because find_best_thr selects it using the true centers of that same subject.
fs = 200
distance_thr = 0.5 * fs
approx_cheat_thr_dict = {}
for subject_id in subject_ids:
    approx_cheat_thr = find_best_thr(embeddings_all[subject_id], distance_thr)
    print("Subject %02d. Cheat threshold %1.3f" % (subject_id, approx_cheat_thr))
    approx_cheat_thr_dict[subject_id] = approx_cheat_thr

In [None]:
# Recall / precision of every subject (one dot each) under three threshold
# policies: a fixed 0.5, the 'opt_thr' stored for the subject, and the
# per-subject cheat threshold. Each title reports mean +- std of F1 in percent.
fig, axis = plt.subplots(1, 3, figsize=(12, 4), dpi=140)

# One entry per panel: (title prefix, threshold to use for each subject)
thr_policies = [
    ("Thr 0.5", {subject_id: 0.5 for subject_id in subject_ids}),
    ("Thr Opt", {subject_id: embeddings_all[subject_id]['opt_thr'] for subject_id in subject_ids}),
    ("Thr Cheat", approx_cheat_thr_dict),
]

for ax, (policy_name, thr_by_subject) in zip(axis, thr_policies):
    f1_l = []
    for subject_id in subject_ids:
        r, p = compute_performance(embeddings_all[subject_id], thr_by_subject[subject_id], distance_thr)
        # The small epsilon keeps the ratio defined when r + p is 0
        f1 = 2 * r * p / (r + p + 1e-6)
        f1_l.append(f1)
        ax.plot(r, p, marker='o', markersize=6, color='tab:blue')
    ax.set_title("%s (F1 %1.1f +- %1.1f)" % (policy_name, 100 * np.mean(f1_l), 100 * np.std(f1_l)))

    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    # Reference lines: diagonal recall == precision, plus the 0.5 levels
    ax.plot([0, 1], [0, 1], linewidth=1, color='k', alpha=0.5)
    ax.plot([.5, .5], [0, 1], linewidth=1, color='k', alpha=0.5)
    ax.plot([0, 1], [.5, .5], linewidth=1, color='k', alpha=0.5)
    # No legend here: none of the plotted artists carries a label, so a
    # legend call would only emit a warning
    ax.set_aspect('equal')
plt.tight_layout()
plt.show()

In [None]:
# compute prototypes: for every subject, the mean of its predicted embeddings
# weighted by the predicted probabilities
prototype_dict = {}
for subject_id in subject_ids:
    # Loop-specific names are used on purpose: assigning to `em_pred` or
    # `proto` here would overwrite the single-subject variables that earlier
    # cells of this notebook rely on
    subject_proba = embeddings_all[subject_id]['y_pred_proba'].reshape(-1, 1)
    subject_em_pred = embeddings_all[subject_id]['y_pred_tensor']
    prototype_dict[subject_id] = np.sum(subject_proba * subject_em_pred, axis=0) / np.sum(subject_proba)

In [None]:
# find ranking of components according to std: standard deviation of every
# prototype component across subjects. The prototypes are stacked into a
# (n_subjects, n_components) matrix so the number of components comes from
# the data instead of a hard-coded 128.
prototype_matrix = np.stack([prototype_dict[subject_id] for subject_id in subject_ids], axis=0)
component_std = prototype_matrix.std(axis=0)

In [None]:
# Distribution of the per-component standard deviations
hist_ax = plt.gca()
hist_ax.hist(component_std)
plt.show()

In [None]:
# Component indices sorted by decreasing std (argsort of the negated values),
# then a bar per rank. The number of bars follows the size of component_std
# instead of a hard-coded 128.
largest_std = np.argsort(-component_std)
plt.bar(np.arange(component_std.size), component_std[largest_std])
plt.show()

In [None]:
# Scatter of the cheat threshold against each of the `top_size` prototype
# components with the largest std across subjects (one figure per component)
top_size = 120
cheat_thr_list = [approx_cheat_thr_dict[subject_id] for subject_id in subject_ids]
# Slicing caps the loop at the number of available components, so a
# top_size larger than the embedding width no longer raises IndexError
for k, component_loc in enumerate(largest_std[:top_size]):
    component_value = [prototype_dict[subject_id][component_loc] for subject_id in subject_ids]
    plt.scatter(component_value, cheat_thr_list)
    plt.ylabel("Cheat Thr")
    # k is the rank in the std ordering, not the index inside the embedding
    plt.xlabel("Component %d" % k)
    plt.show()

In [None]:
# pca first
# project with pca first
from sklearn.decomposition import PCA

In [None]:
# Standardize the per-subject prototypes and project them with PCA
all_prototypes = np.stack([prototype_dict[subject_id] for subject_id in subject_ids], axis=0)
std = all_prototypes.std(axis=0)
# Keep only the components that vary across subjects, so the division by the
# std below is never a division by zero
varying_components = std > 0
all_prototypes = all_prototypes[:, varying_components]
centered_prototypes = all_prototypes - all_prototypes.mean(axis=0)
all_prototypes = centered_prototypes / all_prototypes.std(axis=0)
pca = PCA(n_components=10)
proto_pca = pca.fit_transform(all_prototypes)

In [None]:
# Scatter of the cheat threshold against each of the first three PCA
# components of the prototypes (one figure per component)
cheat_thr_list = [approx_cheat_thr_dict[subject_id] for subject_id in subject_ids]
n_components_to_show = 3
for k in range(n_components_to_show):
    plt.scatter(proto_pca[:, k], cheat_thr_list)
    plt.xlabel("Component %d" % k)
    plt.ylabel("Cheat Thr")
    plt.show()