In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from asymmetric_laplacian_distribution import generate_ALF_data, get_index_per_class, get_labels, labels_to_layout_mapping
import functions_for_plotting 
from prediction_strength import get_statistic_score_per_k
from training_set_split import get_training_folds
from spectral_clustering import spectral_clustering
import prediction_strength
from matplotlib.legend import Legend
import wagenaar_dataset



# Data
- load dataset
- load the culture dict, which specifies for each culture its start and end point within the dataset (used for indexing)

In [10]:
# Directory prefix of the day-20 recordings. The trailing slash matters because
# the file names below are appended directly to this string.
data_dir = "data/raw_data/daily_spontanous_dense/day20/"

# Burst data, transposed right after loading.
# NOTE(review): presumably this puts one burst per row, since rows are selected
# with the fold indices further down — confirm.
data = np.load(f"{data_dir}data_burst_by_time_day_20.npy").transpose()

# .item() unwraps the loaded array into the Python object it stores
# (the culture dict described in the markdown above).
culture_dict = np.load(
    f"{data_dir}culture_dict_day_20.npy", allow_pickle=True
).item()

## Specify Data Splitting
split styles: 
- 'balanced': balanced with respect to cultures (5-fold random split within each culture) --> culture_dict must be provided
- 'random': 5-fold random split --> no culture_dict needed

In [None]:
# Number of cross-validation folds.
folds = 5
# Splitting strategy handed to the fold generator: "balanced" or "random".
split_style = "balanced"

In [18]:
# Compute the train / validation index sets for every fold.
# `get_training_folds` is imported by name at the top of the notebook; the
# module `training_set_split` itself is never bound, so calling it through the
# module prefix raised a NameError on a fresh kernel.
# Both results are indexed by fold further down (e.g. valid_fold_indices[0]).
train_fold_indices, valid_fold_indices = get_training_folds(
    data,
    culture_dict,
    cluster_split=split_style,
    folds=folds,
)

# Labels

In [5]:
# Labels of the clustering stored under the "total" file name.
# NOTE(review): judging by how it is indexed in the re-keying cell below,
# entry i presumably holds the labelling for k = i + 1 clusters — confirm.
labels_total = np.load(
    "labels_day20_Euclidean_k=10_reg=None_100clusters.npy"
)

In [7]:
# NOTE(review): both arrays below are read from the *same* "..._train_..." file,
# so labels_one_fifth ends up identical to labels_four_fifth. This looks like a
# copy-paste slip — the held-out labels presumably live in a separate file;
# confirm the intended file name. The file name also says "random" while
# split_style is set to "balanced" above — verify that the labels match the
# folds they are plotted against.
labels_four_fifth = np.load(
    "labels_day20_Euclidean_k=10_reg=None_5_fold_random_train_100clusters.npy",
    allow_pickle=True,
)
labels_one_fifth = np.load(
    "labels_day20_Euclidean_k=10_reg=None_5_fold_random_train_100clusters.npy",
    allow_pickle=True,
)

In [8]:
# Re-key the loaded label arrays by number of clusters k (1-based), so that
# e.g. clustered_labels_four_fifth[k] can be looked up with k_clusters later.
clustered_labels_total = {}
clustered_labels_four_fifth = {}
# This dict is the one the plotting cells read; it was previously written to
# without ever being created, which raised a NameError.
clustered_labels_one_fifth_in_total = {}
# Kept so that any code relying on this name still finds it; it is not
# populated in this cell.
clustered_labels_one_fifth_centroid_predicted = {}

# Convert once instead of on every loop iteration.
labels_total_arr = np.asarray(labels_total)
labels_four_fifth_arr = np.asarray(labels_four_fifth)
# The validation labels are loaded as `labels_one_fifth`; the former reference
# to `labels_one_fifth_in_total` pointed at a name that is never defined.
labels_one_fifth_arr = np.asarray(labels_one_fifth)

for i in range(len(labels_total)):
    k = i + 1  # dict keys are 1-based
    clustered_labels_total[k] = labels_total_arr[i]
    # NOTE(review): for the per-fold arrays the first axis presumably indexes
    # the fold and the second axis the clustering — confirm.
    clustered_labels_one_fifth_in_total[k] = labels_one_fifth_arr[:, i]
    clustered_labels_four_fifth[k] = labels_four_fifth_arr[:, i]

# Clusterwise F1-Score for validated clusters 

In [None]:
# Cluster-wise F1 scores; .item() unwraps the loaded array into the Python
# object it stores.
# NOTE(review): used below as F1_scores[k_clusters][0] — presumably keyed by
# number of clusters and then indexed by fold; confirm.
F1_scores = np.load(
    "F1_day20_Euclidean_k=10_reg=None_5_fold_random_train_100clusters_jackknife_one_fifth_centroid_vs_total_clusterwise.npy",
    allow_pickle=True,
).item()

# Plot Clusters 

In [9]:
# Number of clusters whose labelling is plotted.
k_clusters = 10
# Cluster count of another clustering of the same data to compare against;
# only relevant when it differs from k_clusters.
reference_clusters = 10

# Title shown on the figure (left empty here).
title = ""
# File the figure is stored in.
save_file_clusters = "test.pdf"

In [10]:
# Grid layout of the panels.
rows = 3
columns = 4

# Figure parameters.
figsize = (20, 20)

# Spacing between the subplots and towards the figure borders.
subplot_adjustments = [
    0.05,  # left
    0.95,  # right
    0.03,  # bottom
    0.9,   # top
    0.4,   # hspace
    0.15,  # wspace
]

In [None]:
# Validation: plot the bursts of the validation part of one fold, grouped by
# cluster, and show the F1 score of every cluster.
fold_index = 0  # the same fold is used for the data, the labels and the scores
validation_data = data[valid_fold_indices[fold_index]]
validation_labels = clustered_labels_one_fifth_in_total[k_clusters][fold_index]

functions_for_plotting.plot_clusters(
    validation_data,    # the dataset
    validation_labels,  # reference labels (here the same as the clustered labels)
    validation_labels,  # the clustered labels
    rows,               # number of rows in the grid
    columns,            # number of columns in the grid
    None,               # layout mapping giving the position of each plot (none)
    figsize=figsize,
    reference_clustering="F1-Score",  # show the F1 score for each cluster
    scores=F1_scores[k_clusters][fold_index],
    n_bursts=100,       # number of bursts plotted per cluster
    y_lim=(0, 16),      # (0, 16) is the normal range; use (0, 1) for a zoomed plot
    save_file=save_file_clusters,  # file the plot is saved to
    subplot_adjustments=subplot_adjustments,  # subplot and border spacing (tricky)
    plot_mean=False,    # whether to plot the mean of each cluster
    title=title,        # title of the plot
)

In [None]:
# Training: plot the bursts of the training part of one fold, grouped by
# cluster.
# NOTE(review): this saves to save_file_clusters, the same path the validation
# plot uses — presumably one figure replaces the other; confirm or use a
# separate file name.
fold_index = 0  # the same fold is used for the data and the labels
training_data = data[train_fold_indices[fold_index]]
training_labels = clustered_labels_four_fifth[k_clusters][fold_index]

functions_for_plotting.plot_clusters(
    training_data,    # the dataset
    training_labels,  # reference labels (here the same as the clustered labels)
    training_labels,  # the clustered labels
    rows,             # number of rows in the grid
    columns,          # number of columns in the grid
    None,             # layout mapping giving the position of each plot (none)
    figsize=figsize,
    reference_clustering="True",  # show the "true" reference cluster
    scores=None,
    n_bursts=100,     # number of bursts plotted per cluster
    y_lim=(0, 16),    # (0, 16) is the normal range; use (0, 1) for a zoomed plot
    save_file=save_file_clusters,  # file the plot is saved to
    subplot_adjustments=subplot_adjustments,  # subplot and border spacing (tricky)
    plot_mean=False,  # whether to plot the mean of each cluster
    title=title,      # title of the plot
)