In [None]:
import os
import numpy as np
from spectral_clustering import spectral_clustering
import seaborn as sns
import functions_for_plotting
from asymmetric_laplacian_distribution import get_index_per_class, get_labels, labels_to_layout_mapping
from sklearn.cluster import KMeans
import training_set_split
import seaborn as sns
import prediction_strength
import importlib
import matplotlib.pyplot as plt
from prediction_strength import get_F1_score_per_k
from matplotlib.legend import Legend
from training_set_split import get_training_folds
import wagenaar_dataset

# Toy Dataset

To validate spectral clustering, we generated toy datasets and checked whether the method is able to recover known true clusters. To obtain burst-like toy data, we used asymmetric Laplace distributions for which we adjusted the amplitude and the time constants. 

$\begin{align}f(x,\mu,\lambda, \tau_1, \tau_2):\\   
\text{if } x < \mu:\hspace{1mm} &f = \lambda \cdot exp(\tau_1(x-\mu)) \\  
\text{else }: \hspace{1mm} &f = \lambda \cdot exp(-\tau_2(x-\mu)) \\ 
\text{return }:\hspace{1mm} & f\end{align}$

We came up with three different toy datasets, each with different characteristics. 
The values used are inspired by the daily spontaneous activation of cultured neurons after 20 days in vitro taken from Wagenaar, Pine, and Potter's dataset (Wagenaar, Pine & Potter 2006). 

## Amplitudes: 
- Small (S) : [1, 2]
- Small/Medium (S_M): [3,4]
- Medium (M): [5,7]
- Medium/Large (M_L): [8,11]
- Large (L): [12,14]  

## Time constants: 
- $\tau_1 \ll \tau_2$ (slow rise - fast fall)
- $\tau_1 < \tau_2$ (slow rise - medium fall / medium rise - fast fall)
- $\tau_1 \gg \tau_2$ (fast rise - slow fall) 
- $\tau_1 > \tau_2$ (medium rise - slow fall / fast rise - medium fall)
- $\tau_1  ≈  \tau_2$ (sharp)
- $\tau_1  ≈  \tau_2$ (wide)
- $\tau_1  ≈  \tau_2$ (medium)
- $small_\tau = [0.001, 0.003]$
- $medium_\tau = [0.004, 0.019]$
- $large_\tau = [0.02, 0.5]$


The first and simplest dataset we created is called 'clearly separable'. It was built from a 3x4 combination of amplitudes and time-constant conditions and contains 12 clusters in total. We used a time range of [0, 3500] and aligned the peaks at $\mu = 1750$.

### Clearly Separable Dataset: 
- Amplitudes: S,M,L
- Time Constants: $small_\tau$,$large_\tau$ 
    1. $\tau_1  ≈  \tau_2$ (sharp)
    2. $\tau_1  ≈  \tau_2$ (wide)
    3. $\tau_1 \ll \tau_2$ (slow rise - fast fall)
    4. $\tau_1 \gg \tau_2$ (fast rise - slow fall)


For each combination we generated 1000 samples. For each sample we drew the parameters uniformly at random from the corresponding range. Afterwards we added Gaussian noise with a standard deviation of 0.2. Negative values after adding the noise were set to zero.

The second dataset, called 'ambiguous', was created in order to introduce some small ambiguity between clusters and bring the synthetic data closer to reality. We therefore introduced the $medium_\tau$ time constant and the amplitudes Small/Medium and Medium/Large.    
With these additional parameters and resulting additional conditions we tried to create datapoints right between the previous clusters. We calculated 400 additional datapoints for each of these conditions.
The new ambiguous toy dataset contained 25200 examples from 45 different conditions (for each amplitude 9 time constant combinations).

### Ambiguous Dataset:
- Amplitudes: S,S_M,M,M_L,L
- Time Constants: $small_\tau$,$medium_\tau$,$large_\tau$ 
    1. $\tau_1  ≈  \tau_2$ (sharp)
    2. $\tau_1  ≈  \tau_2$ (medium)
    3. $\tau_1  ≈  \tau_2$ (wide)
    4. $\tau_1 \ll \tau_2$ (slow rise - fast fall)
    5. $\tau_1 < \tau_2$ (slow rise - medium fall)
    6. $\tau_1 < \tau_2$ (medium rise - fast fall)
    7. $\tau_1 \gg \tau_2$ (fast rise - slow fall) 
    8. $\tau_1 > \tau_2$ (medium rise - slow fall)
    9. $\tau_1 > \tau_2$ (fast rise - medium fall)



With respect to the imbalance in amplitudes observed in Wagenaar et al.'s (Wagenaar, Pine & Potter 2006) real-world data, we created a third dataset. 
We adapted the distribution of clusters in the clearly separable toy dataset by looking at the amplitudes of bursts occurring in Wagenaar et al.'s data at day 20. We considered every burst with an amplitude <= 1 as "small" or "tiny". This resulted in 13383 'tiny' and 523 'non-tiny' bursts. 
We distributed the 13383 'tiny' bursts uniformly at random over the 4 small clusters (small amplitude in all time-constant conditions). The same was done for the remaining 523 bursts and the remaining 8 classes.
This resulted in a toy dataset with 13906 bursts. For the time range we used [0, 3411].

### Tiny burst informed toy-dataset:
- Amplitudes: S,M,L
- Time Constants: $small_\tau$,$large_\tau$ --> Samples: [S,M,L]
    1. $\tau_1  ≈  \tau_2$ (sharp) -->  [3298, 56, 67]
    2. $\tau_1  ≈  \tau_2$ (wide) --> [3401, 54, 73]
    3. $\tau_1 \ll \tau_2$ (slow rise - fast fall) --> [3347, 80, 60]
    4. $\tau_1 \gg \tau_2$ (fast rise - slow fall) --> [3337, 68, 65]

## Tiny Burst Overloaded Wagenaar Data inspired 

In [None]:
# Load the recorded day-20 bursts and transpose the stored array. Later cells
# treat axis 0 as the burst index (len(day20_data)) and axis 1 as the time
# axis (day20_data.shape[1]).
data_dir = "data/raw_data/daily_spontanous_dense/day20/"
day20_data = np.transpose(np.load(data_dir + "data_burst_by_time_day_20.npy"))

### Parameter

In [None]:
# --- Parameters of the tiny-burst-informed toy dataset ----------------------

# The time axis has as many points as one recorded day-20 burst, and the
# peak position MU sits at its midpoint.
TIME_RANGE = [0, day20_data.shape[1]]
MU = day20_data.shape[1] / 2

# Noise configuration.
# presumably EQUAL_NOISE is [mean, std] of the added noise — TODO confirm
# against generate_ALF_data.
NOISE_TYPE = "gaussian"
USE_EQUAL_NOISE = True
EQUAL_NOISE = [0, 0.2]

# 3 amplitude classes x 4 time-constant shapes.
AMPLITUDE_CONDITIONS = ["S", "M", "L"]
TIME_CONSTANT_CONDITIONS = [
    "equal_sharp",
    "equal_wide",
    "wide_sharp_negative_skew",
    "sharp_wide_positive_skew",
]

# This dataset has no ambiguous in-between conditions.
AMBIGUOUS_CONDITIONS = []
SAMPLES_PER_AMBIGUOUS_CONDITION = 0

In [None]:
# A recorded burst is "tiny" when its peak (maximum along axis 1) is <= 1;
# every other burst is "non-tiny".
n_tiny_bursts_in_real_data = int(np.count_nonzero(day20_data.max(axis=1) <= 1))
n_non_tiny_bursts_in_real_data = day20_data.shape[0] - n_tiny_bursts_in_real_data

In [None]:
# Report how many day-20 bursts were counted as tiny and as non-tiny.
print("Tiny Bursts: ", n_tiny_bursts_in_real_data)
print("Non-Tiny Bursts: ", n_non_tiny_bursts_in_real_data)

### Uniformly sample the tiny burst conditions

In [None]:
# Seed NumPy's global random state so the split below is reproducible.
np.random.seed(42)

# One multinomial draw each, with equal probabilities: the tiny bursts are
# spread over 4 conditions and the non-tiny bursts over 8 conditions.
# size=1 yields a (1, k) array, hence the trailing [0].
tiny_burst_conditions = np.random.multinomial(
    n_tiny_bursts_in_real_data, [0.25] * 4, size=1
)[0]
non_tiny_burst_conditions = np.random.multinomial(
    n_non_tiny_bursts_in_real_data, [0.125] * 8, size=1
)[0]

In [None]:
# Show the sampled number of bursts assigned to each condition.
print("Tiny burst #/condition:", tiny_burst_conditions)
print("Non-Tiny burst #/condition:", non_tiny_burst_conditions)

In [None]:
# Per-condition sample counts as one flat list: the 4 tiny-burst counts
# first, followed by the 8 non-tiny counts.
SAMPLES_PER_CONDITION = [*tiny_burst_conditions, *non_tiny_burst_conditions]
print(SAMPLES_PER_CONDITION)

In [None]:
# Generate the toy dataset: clean signals, noisy signals, the noise itself
# and the per-sample parameters.
# NOTE(review): neither `X` nor `generate_ALF_data` is defined or imported in
# the visible cells, so this cell raises NameError when the notebook is run
# top to bottom. Presumably `X` is the time axis built from TIME_RANGE (which
# is otherwise unused) and `generate_ALF_data` comes from
# asymmetric_laplacian_distribution — confirm and add both.
F_signal, F_signal_noise, noises, param_data = generate_ALF_data(
    X,
    AMPLITUDE_CONDITIONS,
    TIME_CONSTANT_CONDITIONS,
    AMBIGUOUS_CONDITIONS,
    SAMPLES_PER_CONDITION,
    SAMPLES_PER_AMBIGUOUS_CONDITION,
    MU,
    noise_type=NOISE_TYPE,
    equal_noise=USE_EQUAL_NOISE,
)

In [None]:
# Disabled persistence of the generated dataset (parameters, clean signals,
# noisy signals). Uncomment to write the files.
# NOTE(review): these names contain "_non_tiny_upsampled", whereas the load
# cell below reads "..._tiny_burst_overload_F_signal_noise.npy" — confirm
# which file is the intended one.
#param_data.to_csv( "clear_data_equal_noise=[0,0.2]_tiny_burst_overload_non_tiny_upsampled" + "_parameter" + ".csv",index=False)
#np.save("clear_data_equal_noise=[0,0.2]_tiny_burst_overload_non_tiny_upsampled" + "_F_signal",F_signal)
#np.save("clear_data_equal_noise=[0,0.2]_tiny_burst_overload_non_tiny_upsampled" + "_F_signal_noise", F_signal_noise)

In [None]:
# Read the stored noisy toy dataset from disk.
data = np.load("clear_data_equal_noise=[0,0.2]_tiny_burst_overload_F_signal_noise.npy")

# Build the class -> sample-index mapping from the condition lists and the
# per-condition counts, then derive one label per sample from it.
# NOTE(review): the counts come from SAMPLES_PER_CONDITION computed in this
# session; presumably the file on disk was generated with the same counts —
# confirm.
class_dict = get_index_per_class(
    AMPLITUDE_CONDITIONS,
    TIME_CONSTANT_CONDITIONS,
    AMBIGUOUS_CONDITIONS,
    SAMPLES_PER_CONDITION,
    SAMPLES_PER_AMBIGUOUS_CONDITION,
)
true_labels = get_labels(data, class_dict)

# Plot examples

In [None]:
# Cluster labels in plotting order: 0 .. len(class_dict) inclusive.
# NOTE(review): this is one more label than there are classes — confirm that
# labels_to_layout_mapping expects the extra entry.
clear_clusters_ordered = list(range(len(class_dict) + 1))
layout_mapping = labels_to_layout_mapping(clear_clusters_ordered, 4, (1, 4))

In [None]:
# Settings for the cluster overview figure: a 3 x 4 grid of panels.
rows, columns = 3, 4
figsize = (20, 20)

# Spacing values handed to the plotting helper as `subplot_adjustments`.
subplot_adjustments = [0.05, 0.95, 0.03, 0.9, 0.4, 0.15]

# No figure title; the plot is written to this file.
title = ""
save_file_clusters = "test.pdf"

In [None]:
# Plot example bursts per class. The true labels are passed both as the
# reference labels and as the "clustered" labels, so each panel shows one
# ground-truth class.
functions_for_plotting.plot_clusters(
    data,            # dataset
    true_labels,     # true labels of the dataset
    true_labels,     # clustered labels (the true labels again)
    rows,            # number of grid rows
    columns,         # number of grid columns
    layout_mapping,  # label -> grid position mapping
    figsize=figsize,
    n_bursts=100,    # number of bursts drawn per cluster
    y_lim=(0, 16),
    save_file=save_file_clusters,             # output file of the plot
    subplot_adjustments=subplot_adjustments,  # subplot and overall spacing
    plot_mean=True,  # also draw the mean of each cluster
    title=title,
)