In [1]:
import numpy as np
from spectral_clustering import spectral_clustering
import functions_for_plotting
from asymmetric_laplacian_distribution import get_index_per_class, get_labels, labels_to_layout_mapping
from sklearn.cluster import KMeans
import training_set_split
import seaborn as sns
import prediction_strength
import importlib
import matplotlib.pyplot as plt
from prediction_strength import get_F1_score_per_k



# Data

In [2]:
# ------------------------------- DATA ---------------------------------------
# Loads the ambiguous data set, builds the per-class index dictionary, the
# labels returned by get_labels and the label -> layout mapping.
# The commented-out `clear_*` lines are the matching setup for the clearly
# separated data set and are kept so the data set can be switched by hand.
data_dir = "data/"

#clear_data = np.load(data_dir + "clearly_separated_data_equal_noise=[0,1]_F_signal_noise.npy")
ambig_data = np.load(data_dir + "ambiguous_data_equal_noise=[0,0.2]_F_signal_noise.npy")
# Other data files used previously:
#   "ambiguous_data_tau_amplitude_F_signal_noise.npy"
#   "clearly_separated_data_F_signal_noise.npy"

#clear_amplitude_conditions = ["S", "M", "L"]
ambig_amplitude_conditions = ["S", "S/M", "M", "M/L", "L"]

#clear_time_constant_conditions = ["equal_sharp", "equal_wide", "wide_sharp_negative_skew", "sharp_wide_positive_skew"]
ambig_time_constant_conditions = [
    "equal_sharp",
    "equal_medium",
    "equal_wide",
    "wide_sharp_negative_skew",
    "wide_medium_negative_skew",
    "medium_sharp_negative_skew",
    "sharp_wide_positive_skew",
    "medium_wide_positive_skew",
    "sharp_medium_positive_skew",
]

# Conditions passed to get_index_per_class as the "ambiguous" ones.
ambiguous_conditions = [
    "S/M",
    "M/L",
    "equal_medium",
    "wide_medium_negative_skew",
    "medium_sharp_negative_skew",
    "medium_wide_positive_skew",
    "sharp_medium_positive_skew",
]

samples_per_condition = 1000
samples_per_ambiguous_condition = 400

ambig_cluster_dict = get_index_per_class(
    ambig_amplitude_conditions,
    ambig_time_constant_conditions,
    ambiguous_conditions,
    samples_per_condition,
    samples_per_ambiguous_condition,
)
#clear_cluster_dict = get_index_per_class(clear_amplitude_conditions,clear_time_constant_conditions, [], samples_per_condition, samples_per_ambiguous_condition)

#clear_true_labels = get_labels(clear_data, clear_cluster_dict)
ambig_true_labels = get_labels(ambig_data, ambig_cluster_dict)

#clear_clusters_ordered = list(range(0,len(clear_cluster_dict)+1))
#clear_layout_label_mapping = labels_to_layout_mapping(clear_clusters_ordered, 4, (1,4))

# Labels 0 .. len(ambig_cluster_dict), inclusive.
ambig_clusters_ordered = list(range(len(ambig_cluster_dict) + 1))
# NOTE(review): 9 and (2, 5) are passed straight to labels_to_layout_mapping;
# presumably a per-group count and a subplot grid shape - confirm in that module.
ambig_layout_label_mapping = labels_to_layout_mapping(ambig_clusters_ordered, 9, (2, 5))

# Clustering

In [3]:
def SSIM(data,alpha,beta,gamma):
    """Pairwise SSIM-style similarity between the rows of ``data``.

    For every pair of rows (x, y) three components are combined:

    * mean term:        ``2*mu_x*mu_y / (mu_x**2 + mu_y**2)``
    * spread term:      ``2*sd_x*sd_y / (sd_x**2 + sd_y**2)``
    * correlation term: ``(corr(x, y) + 1) / 2`` with ``corr`` rounded to
      three decimals

    Parameters
    ----------
    data : 2-D array, one signal per row (statistics are taken along axis 1).
    alpha, beta, gamma : exponents applied to the mean, spread and
        correlation term respectively.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_rows):
        ``mean_term**alpha * spread_term**beta * corr_term**gamma``.

    Notes
    -----
    Rows with zero mean or zero standard deviation lead to a division by
    zero (nan/inf entries); this is not handled here.
    """
    row_mean = np.mean(data, axis=1)
    row_std = np.std(data, axis=1)

    counter1 = np.outer(row_mean, row_mean)
    counter2 = np.outer(row_std, row_std)
    # bias=True gives the population covariance (ddof=0), the same
    # normalisation np.std uses above. With the default ddof=1 the ratio
    # below is n/(n-1) times the Pearson correlation, so self-similarity
    # exceeds 1 and anti-correlated rows yield a negative base, which turns
    # into NaN under a fractional gamma.
    counter3 = np.cov(data, rowvar=True, bias=True)

    # Squared per-row statistics, broadcast to the pairwise sums x^2 + y^2.
    sq_mean = counter1.diagonal()
    sq_std = counter2.diagonal()
    denom1 = sq_mean.reshape((len(sq_mean), 1)) + sq_mean
    denom2 = sq_std.reshape((len(sq_std), 1)) + sq_std

    sim_matrix1 = 2 * counter1 / denom1
    sim_matrix2 = 2 * counter2 / denom2
    # Rounding snaps floating-point noise around +/-1 back into [-1, 1]
    # before the correlation is mapped to [0, 1].
    sim_matrix3 = (np.round(counter3 / counter2, 3) + 1) / 2

    sim_matrix_mu_var = sim_matrix1**alpha * sim_matrix2**beta * sim_matrix3**gamma

    return sim_matrix_mu_var

In [4]:
def SSIM_RAW(data):
    """Raw SSIM (Dice-style) similarity between all pairs of rows of ``data``.

    Entry (i, j) is ``2 * <x_i, x_j> / (<x_i, x_i> + <x_j, x_j>)``,
    i.e. 2xy / (x^2 + y^2) evaluated on whole rows.

    Returns an (n_rows, n_rows) array.
    """
    gram = np.matmul(data, data.T)
    # Diagonal of the Gram matrix = squared norm of every row.
    squared_norms = gram.diagonal()
    pairwise_norm_sum = squared_norms[:, np.newaxis] + squared_norms
    return 2 * gram / pairwise_norm_sum

In [5]:
# Select the data set, its labels and its layout mapping for the clustering
# cells that follow (they all read `data`).
data = ambig_data
true_labels = ambig_true_labels
layout = ambig_layout_label_mapping
# NOTE(review): the clustering loops in this notebook pass reg_lambda = None
# literally and do not read `reg`; it is only used later as a dictionary key.
reg = None

## Clustering SSIM_RAW

In [11]:
# Pairwise raw-SSIM similarity between all rows of `data`.
ssim_raw_matrix = SSIM_RAW(data)

In [12]:
# Matrix handed to spectral_clustering as `precomputed_matrix` by the next loop.
precomputed_matrix = ssim_raw_matrix

In [13]:
# Values of `k` to try; the final entry len(data) is saved under the tag "Full"
# by the clustering loop.
ks = [3,5,7,10,50,100,len(data)]

In [None]:
# Run spectral clustering on the precomputed raw-SSIM similarity matrix once
# per entry of `ks` and store each run's labels in its own .npy file.
# NOTE(review): `k` is presumably the KNN-graph neighbourhood size (the printed
# log mentions a KNN graph) and range(1, 50) the cluster counts - confirm in
# spectral_clustering.
for k in ks:
    labels, eigvec, eigval = spectral_clustering(
        data,
        "precomputed",
        "similarity",
        range(1, 50),
        k=k,
        precomputed_matrix=precomputed_matrix,
        mutual=False,
        weighting=True,
        normalize=True,
        reg_lambda=None,
        save_laplacian=False,
        save_eigenvalues_and_vectors=False,
    )
    # The k == len(data) run is tagged "Full" in the file name.
    k = "Full" if k == len(data) else k
    np.save("labels_ambig_equal_noise=[0,0.2]_SSIM_RAW_k=%s_reg=None_weighting=True" % str(k), labels)

Use precomputed matrix for constructing KNN-Graph
Build symmetric KNN-Graph based on Similarity of data points!
Weighting: True
Calculate Normalized Laplacians
Normalization: symmetric
Calculate Eigenvalues and Vectors of Laplacian


## Clustering SSIM

In [None]:
# SSIM similarity with exponents alpha=1, beta=1, gamma=0.1.
ssim_matrix_1_1_01 = SSIM(data,1,1,0.1)

In [None]:
# SSIM similarity with all three exponents equal to 1.
ssim_matrix_1_1_1 = SSIM(data,1,1,1)

In [None]:
# Values of `k` for the SSIM runs; len(data) is saved under the tag "Full".
ks = [3,5,7,10,50,100,len(data)]

In [None]:
# Use the alpha=1, beta=1, gamma=0.1 SSIM matrix for the next clustering loop.
precomputed_matrix = ssim_matrix_1_1_01

In [None]:
# Spectral clustering on the currently selected precomputed similarity matrix,
# once per entry of `ks`; labels are written to files tagged "SSIM_1101".
for k in ks:
    labels, eigvec, eigval = spectral_clustering(
        data,
        "precomputed",
        "similarity",
        range(1, 50),
        k=k,
        precomputed_matrix=precomputed_matrix,
        mutual=False,
        weighting=True,
        normalize=True,
        reg_lambda=None,
        save_laplacian=False,
        save_eigenvalues_and_vectors=False,
    )
    # The k == len(data) run is tagged "Full" in the file name.
    k = "Full" if k == len(data) else k
    np.save("labels_ambig_equal_noise=[0,0.2]_SSIM_1101_k=%s_reg=None_weighting=True" % str(k), labels)

In [None]:
# Use the alpha=beta=gamma=1 SSIM matrix for the next clustering loop.
precomputed_matrix = ssim_matrix_1_1_1

In [None]:
# Spectral clustering on the currently selected precomputed similarity matrix,
# once per entry of `ks`; labels are written to files tagged "SSIM_111".
for k in ks:
    labels, eigvec, eigval = spectral_clustering(
        data,
        "precomputed",
        "similarity",
        range(1, 50),
        k=k,
        precomputed_matrix=precomputed_matrix,
        mutual=False,
        weighting=True,
        normalize=True,
        reg_lambda=None,
        save_laplacian=False,
        save_eigenvalues_and_vectors=False,
    )
    # The k == len(data) run is tagged "Full" in the file name.
    k = "Full" if k == len(data) else k
    np.save("labels_ambig_equal_noise=[0,0.2]_SSIM_111_k=%s_reg=None_weighting=True" % str(k), labels)

## Euclidean

In [6]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so dependencies are visible in one place.
from spectral_clustering import calculate_dist_matrix
# Distance matrix for `data` with the "euclidean" option; the second return
# value of calculate_dist_matrix is discarded.
dist_matrix_euclidean, _ = calculate_dist_matrix(data, "euclidean")

In [7]:
# Values of `k` for the Euclidean runs (no len(data) entry, unlike the SSIM runs).
ks = [3,5,7,10,50,100]

In [8]:
# Use the Euclidean distance matrix for the next clustering loop.
precomputed_matrix = dist_matrix_euclidean

In [None]:
# Spectral clustering on the precomputed Euclidean distance matrix (passed as
# "distance", with weighting switched off), once per entry of `ks`.
for k in ks:
    labels, eigvec, eigval = spectral_clustering(
        data,
        "precomputed",
        "distance",
        range(1, 50),
        k=k,
        precomputed_matrix=precomputed_matrix,
        mutual=False,
        weighting=False,
        normalize=True,
        reg_lambda=None,
        save_laplacian=False,
        save_eigenvalues_and_vectors=False,
    )
    # Kept for parity with the SSIM loops; with a `ks` list that has no
    # len(data) entry this never selects "Full".
    k = "Full" if k == len(data) else k
    np.save("labels_ambig_equal_noise=[0,0.2]_Euclidean_k=%s_reg=None_weighting=False" % str(k), labels)

Use precomputed matrix for constructing KNN-Graph
Build symmetric KNN-Graph based on Distance of data points!
Weighting: False
Calculate Normalized Laplacians
Normalization: symmetric
Calculate Eigenvalues and Vectors of Laplacian


# Configurations

In [42]:
# Evaluation configuration: ambiguous data set.
data = ambig_data
true_labels = ambig_true_labels

# The evaluation cell writes the all-cluster F1 scores to `save_file` and then
# the clear-cluster-only scores to `save_file_clear_clusters`. The two targets
# must differ, otherwise the second np.save overwrites the first.
# (Naming pattern of an earlier run: "F1_k=Full_clear_Noise=[0,1]_SSIM_RAW".)
save_file = "F1_k=10_ambig_SSIM_RAW"
save_file_clear_clusters = "F1_k=10_ambig_SSIM_RAW_clear_clusters"
# Also score the clusters with 1000 samples separately in the evaluation cell.
calculate_F1_for_clear_clusters_in_ambig_data = True
#save_file_clear_clusters = "F1_clear_clusters_k=%d_reg=%s_ambig_balanced_true_SSIM_EUCLIDEAN"

In [49]:
# Evaluation configuration: clearly separated data set.
# NOTE(review): in the data cell of this notebook the assignments of
# `clear_data` and `clear_true_labels` are commented out, so this cell depends
# on leftover kernel state (and the evaluation cell itself rebinds
# `clear_true_labels` to a subset of the ambiguous labels). Re-enable the
# clear-data loading before running this on a fresh kernel.
data = clear_data
true_labels = clear_true_labels

# Output file for the F1 scores written by the evaluation cell.
save_file = "F1_k=10_clear_Noise=[0,1]_SSIM_Raw_weighting=True" 
#save_file_clear_clusters = "F1_k=Full_ambig_SSIM_clear_clusters" 
# Clear-cluster-only scoring is switched off for this configuration.
calculate_F1_for_clear_clusters_in_ambig_data = False
#save_file_clear_clusters = "F1_clear_clusters_k=%d_reg=%s_ambig_balanced_true_SSIM_EUCLIDEAN" 

In [50]:
# Score a stored clustering against the true labels and save the result.
# NOTE(review): the label file name is hard-coded and does not follow the
# active configuration (`data`, `save_file`); make sure it matches before
# running.
clustered_labels = np.load("label_noise=[0,1]_SSIM_RAW_k=10_reg=None_weighted=True.npy")

# Row i of the loaded array is stored under key i + 1.
clustered_labels_dict = {
    key: row for key, row in enumerate(clustered_labels, start=1)
}

F1_score_per_k = get_F1_score_per_k(
    data,
    range(len(data)),
    range(len(data)),
    None,
    clustered_labels_dict,
    combination_type="true",
    true_train_labels=true_labels,
)

np.save(save_file, F1_score_per_k)

if calculate_F1_for_clear_clusters_in_ambig_data:
    # "Clear" clusters are the true classes that contain exactly 1000 samples.
    clusters_from_ambig_dataset, counts = np.unique(true_labels, return_counts=True)
    clear_clusters_from_ambig = clusters_from_ambig_dataset[counts == 1000]

    # Positions of all samples that belong to one of those classes.
    clear_clusters_from_ambig_idx = np.nonzero(np.isin(true_labels, clear_clusters_from_ambig))[0]
    clear_inidices = np.arange(len(data))[clear_clusters_from_ambig_idx]

    # Same key layout as above, restricted to the clear samples.
    clear_clustered_labels_dict = {
        key: row[clear_clusters_from_ambig_idx]
        for key, row in enumerate(clustered_labels, start=1)
    }

    # NOTE(review): this rebinds the notebook-level name `clear_true_labels`,
    # which the clear-data configuration cell also reads.
    clear_true_labels = true_labels[clear_clusters_from_ambig_idx]

    F1_score_per_k_clear_clusters = get_F1_score_per_k(
        data,
        clear_inidices,
        clear_inidices,
        None,
        clear_clustered_labels_dict,
        combination_type="true",
        true_train_labels=clear_true_labels,
    )

    np.save(save_file_clear_clusters, F1_score_per_k_clear_clusters)

Calculate F1 score based on true training centroids!


In [44]:
# Display the scores returned by get_F1_score_per_k for the full data set.
F1_score_per_k

{1: 0.1539520974236562,
 2: 0.2793395475806112,
 3: 0.34800058485997576,
 4: 0.4062630010387748,
 5: 0.49845682803827784,
 6: 0.5870787336271097,
 7: 0.6005523475951335,
 8: 0.7057481739600017,
 9: 0.7503794305927199,
 10: 0.7741173118293209,
 11: 0.7718687998517706,
 12: 0.8141573719715776,
 13: 0.7777232131611541,
 14: 0.7672674929245016,
 15: 0.7661165138017607,
 16: 0.7405745348094017,
 17: 0.7609373026212791,
 18: 0.7349683519864573,
 19: 0.7032886652871626,
 20: 0.6619521776807519,
 21: 0.647291580965391,
 22: 0.5851346842635546,
 23: 0.5865151628794486,
 24: 0.5848442073021184,
 25: 0.5648700467030915,
 26: 0.5368231846558567,
 27: 0.5391132551217305,
 28: 0.5362109764517032,
 29: 0.5201907184964494,
 30: 0.506401469473674,
 31: 0.4986165833434918,
 32: 0.48690712211689735,
 33: 0.4605545272032836,
 34: 0.46980670950809095,
 35: 0.4561349081264966,
 36: 0.43122673937772826,
 37: 0.4350657506814467,
 38: 0.42293790272600146,
 39: 0.42468187909668975,
 40: 0.4026516521941282,
 41:

In [24]:
# Display the scores computed on the clear (1000-sample) clusters only.
F1_score_per_k_clear_clusters

{1: 0.15402960652515688,
 2: 0.25908921476939833,
 3: 0.3505291025909605,
 4: 0.39726436740921234,
 5: 0.45834110919806464,
 6: 0.47645313342532875,
 7: 0.4757990434079785,
 8: 0.6632345401611653,
 9: 0.6630907565173416,
 10: 0.7010324122618977,
 11: 0.7944956057331485,
 12: 0.794200273105369,
 13: 0.7941217642307014,
 14: 0.8518802827668048,
 15: 0.8514694712257833,
 16: 0.9187238910797932,
 17: 0.9187175748870922,
 18: 0.9120526939579391,
 19: 0.9118801077921017,
 20: 0.8980971374956452,
 21: 0.8952377796537089,
 22: 0.87395071016424,
 23: 0.8710560545121807,
 24: 0.8673593966150471,
 25: 0.8645145761393829,
 26: 0.8679720892397811,
 27: 0.8699686119868788,
 28: 0.8661758248491223,
 29: 0.8662564512955203,
 30: 0.8651004246300973,
 31: 0.8648204556995084,
 32: 0.8148657577490128,
 33: 0.8131567672231682,
 34: 0.8142136193986921,
 35: 0.7864742191345094,
 36: 0.7571249182991883,
 37: 0.785257020452987,
 38: 0.7847372035095789,
 39: 0.7536084669510672,
 40: 0.7533574562502139,
 41: 0.7

# Plot F1 for different Configurations

In [None]:
# Saved F1 dictionaries (written with np.save) to load for comparison.
# File names must not carry stray whitespace, otherwise np.load looks for a
# file that does not exist.
files = [
    "F1_k=Full_ambig_SSIM_clear_clusters.npy",
    "F1_k=10_ambig_SSIM_clear_clusters.npy",
    "F1_k=10_ambig_Euclidean_clear_clusters.npy",
    "F1_k=Full_ambig_SSIM_RAW_clear_clusters.npy",
    "F1_k=10_ambig_SSIM_RAW_clear_clusters.npy",
]

In [None]:
# Load every stored F1 dictionary and keep its scores, in key order, under the
# name of the file they came from.
F1_scores_clear_clusters = {}
for file in files:
    # The .npy file wraps a Python dict, hence allow_pickle=True and .item().
    # Unpickling can execute arbitrary code: only load files you created.
    f1_dict = np.load(file, allow_pickle=True).item()
    k_clusters = list(f1_dict.keys())
    # Store into the dictionary initialised above, one entry per file, so that
    # results from different files do not overwrite each other.
    F1_scores_clear_clusters[file] = [f1_dict[n_clusters] for n_clusters in k_clusters]

In [None]:
# Figure size passed to plt.subplots.
figsize=(20,10)
# Subplot margins, read by the plotting cell as [left, bottom, right, top].
plot_adjustments = [0.05,0.08,0.95, 0.91]
# NOTE(review): `save_file` is also the name used for the F1 output file in the
# evaluation configuration cells; this assignment replaces that value.
save_file = "Ambig_F1_clear_clusters_regularization_comparison_None_to_100_k=10_SSIM_EUCLIDEAN.pdf"

In [None]:
# Draw one F1-vs-number-of-clusters curve per entry of `regs`, mark each
# curve's best score and write the figure to `save_file`.
# NOTE(review): `regs`, `configuration` and `F1_scores_clear_clusters_reg` are
# not defined in the visible cells of this notebook; they must already exist in
# the kernel for this cell to run.
fig, ax = plt.subplots(figsize=figsize)
k_clusters = range(1, 50)

for i, reg in enumerate(regs):
    F1_scores = F1_scores_clear_clusters_reg[reg]
    curve_color = "C0%d" % i
    ax.plot(
        k_clusters,
        F1_scores,
        "o-",
        label="reg=%s" % str(reg),
        color=curve_color,
        linewidth=3,
    )

    # Best score while ignoring the first entry; +1 maps the position in the
    # sliced list back to a position in F1_scores.
    argmax_f1 = np.argmax(F1_scores[1:]) + 1
    best_score = F1_scores[argmax_f1]

    # Each curve's annotation is shifted down by 0.03 per curve index.
    ax.annotate(
        "#%d|Score=%.3f" % (argmax_f1 + 1, best_score),
        (k_clusters[argmax_f1] - 1, best_score + 0.03 - i * 0.03),
        fontsize=16,
        color=curve_color,
    )

title = "F1-Score of Clear Clusters for Clustering with k Clusters \n" + configuration

ax.set_title(title, fontsize=22, pad=20)
ax.set_xlabel("# Number of clusters", fontsize=18, labelpad=10)
ax.set_ylabel("F1-Score", fontsize=18, labelpad=10)
ax.set_xticks(k_clusters)
ax.set_ylim((0, 1.1))
for axis_name in ("y", "x"):
    ax.tick_params(axis=axis_name, labelsize=14)
ax.set_yticks(np.arange(0, 1.1, 0.1))

# plot_adjustments holds [left, bottom, right, top].
left, bottom, right, top = plot_adjustments[:4]
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top)

ax.legend(fontsize=14, loc="lower right")

plt.savefig(save_file)
plt.close()