# Denoise Data Based on Prediction Strength per Sample
Calculate the prediction strength of each sample and discard the samples whose prediction strength is low.
Reduce the number of clusters if all samples of a cluster are discarded.

In [16]:
import os
os.environ['OPENBLAS_NUM_THREADS'] ='40'
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import scipy
from scipy.io import loadmat
from scipy.io import savemat
import matplotlib.pyplot as plt 
import pandas as pd
import struct
import json
import sys
import seaborn as sns 
from scipy import signal, stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM
import functools

In [13]:
# Widen the notebook container to 90 % of the browser width.
# `display` and `HTML` are part of the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [17]:
import ipynb.fs.defs.prediction_strength

In [18]:
# Root folder for every file read or written by this notebook.
data_dir = '../Data/'

# The stored array is transposed right after loading (see the printed shapes).
burst_file = data_dir + 'data_burst_by_time.npy'
data_burst_by_time = np.transpose(np.load(burst_file))

# Copy of the data with its rows in random order.
data_burst_by_time_shuffled = np.random.permutation(data_burst_by_time)

print("Averaged over channels: ", data_burst_by_time.shape)
for loaded_array in (data_burst_by_time, data_burst_by_time_shuffled):
    print(loaded_array.shape)

Averaged over channels:  (13092, 3410)
(13092, 3410)
(13092, 3410)


In [19]:
# Use the unshuffled burst matrix as the working data set for the cells below.
data = data_burst_by_time

In [20]:
# Load the index arrays that define the training / validation splits.
# Alternative (culture-balanced) split files that can be loaded instead:
#   culture_balanced/culture_balanced_training_split.npy
#   culture_balanced/culture_balanced_validation_split.npy
train_folds = np.load(data_dir + "50_50_split/train_folds_50_50.npy")
valid_folds = np.load(data_dir + "50_50_split/valid_folds_50_50.npy")

# A 1-D index array describes a single split: wrap it so the rest of the
# notebook can always iterate over folds.
if train_folds.ndim < 2:
    train_folds = [train_folds]
    valid_folds = [valid_folds]

assert len(train_folds) == len(valid_folds), "train/validation fold counts differ"

# Extract the bursts of every fold by indexing with that fold's index array.
# (Indexing `data` with the wrapped list itself would be interpreted as a
# (1, n) index array and return a 3-D array instead of the selected rows.)
training_sets = [data[fold_indices] for fold_indices in train_folds]
validation_sets = [data[fold_indices] for fold_indices in valid_folds]

In [21]:
# Report how many bursts each split holds and which share of the data that is.
n_total_bursts = len(data)
for split_idx, train_set in enumerate(training_sets):
    print("Split %d :" % (split_idx + 1))
    train_share = np.round(len(train_set) / n_total_bursts, 4) * 100
    print("%d Bursts in Training Set equal to %.2f %% of the total data. " % (len(train_set), train_share))
    valid_set = validation_sets[split_idx]
    valid_share = np.round(len(valid_set) / n_total_bursts, 4) * 100
    print("%d Bursts in Validation Set equal to %.2f %% of the total data. " % (len(valid_set), valid_share))

Split 1 :
6546 Bursts in Training Set equal to 50.00 % of the total data. 
6546 Bursts in Validation Set equal to 50.00 % of the total data. 
Split 2 :
6546 Bursts in Training Set equal to 50.00 % of the total data. 
6546 Bursts in Validation Set equal to 50.00 % of the total data. 


In [22]:
def _load_split_result(file_name):
    """Load one stored result array from the 50/50-split folder and return it as a list.

    NOTE: `allow_pickle=True` unpickles arbitrary Python objects, so only files
    produced by this project should be loaded this way.
    """
    return list(np.load(data_dir + '50_50_split/' + file_name, allow_pickle=True))


# BIC scores for training and validation set (with the 50/50 split one file holds the full information).
bics = _load_split_result('bics_1.npy')

# Scores for training and validation set.
scores_1 = _load_split_result('scores_1.npy')
# Fixed: this previously re-loaded 'scores_1.npy', making scores_2 a copy of scores_1.
# NOTE(review): confirm that 'scores_2.npy' exists next to the other *_2 files.
scores_2 = _load_split_result('scores_2.npy')

# Predictions for the validation set.
predictions_valid_1 = _load_split_result('predictions_valid_1.npy')
predictions_valid_2 = _load_split_result('predictions_valid_2.npy')

# Predictions for the training set.
predictions_train_1 = _load_split_result('predictions_train_1.npy')
predictions_train_2 = _load_split_result('predictions_train_2.npy')

# Predictions for the validation set after fitting a GMM on the validation set itself.
predictions_valid_fitted_1 = _load_split_result('predictions_valid_fitted_1.npy')
predictions_valid_fitted_2 = _load_split_result('predictions_valid_fitted_2.npy')

In [23]:
# Fold count and cluster count for this analysis.
n_folds = 2
n_clusters = 20

# Put the labels of both folds along a new leading axis (one entry per fold).
# The validation labels are the ones obtained by fitting a GMM on the validation set.
train_fold_labels_gmm = np.stack([predictions_train_1, predictions_train_2])
valid_fold_labels_gmm = np.stack([predictions_valid_fitted_1, predictions_valid_fitted_2])

In [24]:
# Stored prediction-strength results of the GMM clustering (object arrays, hence allow_pickle).
split_dir = data_dir + '50_50_split/'
predictions_strengths_cv_gmm_per_sample = list(
    np.load(split_dir + 'prediction_strength_gmm_per_sample.npy', allow_pickle=True)
)
predictions_strengths_cv_gmm = list(
    np.load(split_dir + 'prediction_strength_gmm.npy', allow_pickle=True)
)
valid_fold_labels_predicted_gmm = list(
    np.load(split_dir + 'valid_fold_labels_predicted_gmm.npy', allow_pickle=True)
)

In [25]:
from ipynb.fs.defs.prediction_strength import calculate_prediction_strength_per_k
# Run the helper from the prediction_strength notebook on the stored GMM results;
# it returns five collections, unpacked here in the order it returns them.
(
    k_predictions_strength_cv_gmm,
    k_valid_fold_labels_predicted_gmm,
    k_valid_fold_labels_gmm,
    valid_cluster_size_gmm,
    valid_cluster_size_predicted_gmm,
) = calculate_prediction_strength_per_k(
    predictions_strengths_cv_gmm,
    valid_fold_labels_gmm,
    valid_fold_labels_predicted_gmm,
    strength_sorted=True,
)

In [11]:
# --- Configuration of the iterative denoising ---------------------------------
init_k_clusters = [8, 9]               # number of clusters to start with in each fold
counter = 0                            # iteration counter; used as suffix of the saved files
threshold = 0.8                        # samples with prediction strength below this count as "low"
n_low_ps_bursts_per_fold = [100, 100]  # placeholder; recomputed inside the denoising loop
n_folds = 2

# Parameters passed to every GMM fit.
n_init = 100
max_iter = 100

# Output folder (relative to data_dir). It is derived from the settings above so
# that the folder name cannot drift away from the values actually used.
sub_dir = '50_50_split/denoising/GMM/k_init=[%s]_denoising_per_sample/threshold=%s/' % (
    ','.join(str(k) for k in init_k_clusters),
    threshold,
)

In [12]:
from ipynb.fs.defs.Spectral_clustering_pipeline import spectral_clustering
from ipynb.fs.defs.prediction_strength import cross_valdation_prediction_strength
from ipynb.fs.defs.prediction_strength import get_low_individual_ps_bursts
from ipynb.fs.defs.prediction_strength import get_low_and_high_ps_bursts_fold_with_labels

In [14]:
def get_low_individual_ps_bursts(data, train_folds, valid_folds, train_fold_labels, valid_fold_labels ,predictions_strengths_cv_per_samples, n_clusters = range(1,21), threshold = 0.8):
    """Collect, per clustering and cluster, the validation bursts with low individual prediction strength.

    For every fold ``f`` and every entry ``j`` of that fold (the clustering with
    ``k = j + 1`` clusters), the bursts of each cluster whose individual prediction
    strength is strictly below ``threshold`` are looked up in ``valid_folds[f]``.

    Args:
        data (nd.array): Array containing the data (n x m). Not used by this function;
            kept for interface compatibility.
        train_folds (list of nd.arrays): training set indices per fold. Not used here.
        valid_folds (list of nd.arrays): validation set indices per fold; the returned
            burst indices are taken from these arrays.
        train_fold_labels (list of lists): cluster labels of the training sets. Not used here.
        valid_fold_labels (list of lists): for each fold and each clustering, the cluster
            label of every point in the validation set (same order as ``valid_folds[f]``).
        predictions_strengths_cv_per_samples (list of lists): individual prediction strengths,
            indexed as [fold][clustering j][cluster i] -> strengths of the bursts of cluster i,
            in the order in which these bursts appear in the validation set.
        n_clusters (iterable of int): numbers of clusters for which (possibly empty) entries
            are created in the returned dicts.
        threshold (float): cutoff below which an individual prediction strength counts as low.

    Returns:
        tuple of three dicts, each keyed by the number of clusters k and holding one list
        per fold with one entry per cluster:
            k_low_individual_ps_cv: burst indices (relative to the overall data) with low strength
            k_low_individual_ps_cv_sizes: number of such bursts
            k_low_individual_ps_cv_sizes_prop: share of such bursts within the cluster
                (0.0 for a cluster without any validation bursts)
    """
    k_low_individual_ps_cv = {}
    k_low_individual_ps_cv_sizes = {}
    k_low_individual_ps_cv_sizes_prop = {}

    # Pre-create an entry for every requested clustering so callers can index any k.
    for k in n_clusters:
        k_low_individual_ps_cv[k] = []
        k_low_individual_ps_cv_sizes[k] = []
        k_low_individual_ps_cv_sizes_prop[k] = []

    for f, fold_strengths in enumerate(predictions_strengths_cv_per_samples):
        valid_fold = np.asarray(valid_folds[f])   # validation indices of fold f
        valid_labels = valid_fold_labels[f]       # validation labels of fold f, one entry per clustering

        for j, cluster_strengths in enumerate(fold_strengths):
            k = j + 1  # entry j belongs to the clustering with j + 1 clusters
            # Coerce to an array so that `== cluster` is an element-wise comparison.
            valid_labels_k = np.asarray(valid_labels[j])

            bursts_k = []
            sizes_k = []
            props_k = []

            for cluster in range(k):
                # positions (within the validation set) of the members of this cluster
                members = np.flatnonzero(valid_labels_k == cluster)
                # positions (within the cluster) of the members with low strength
                low_in_cluster = np.flatnonzero(np.asarray(cluster_strengths[cluster]) < threshold)
                # burst indices relative to the overall data
                low_predictive_bursts = valid_fold[members[low_in_cluster]]

                bursts_k.append(low_predictive_bursts)
                sizes_k.append(len(low_predictive_bursts))
                # An empty cluster has no low bursts; report 0.0 instead of dividing by zero.
                props_k.append(len(low_predictive_bursts) / len(members) if len(members) else 0.0)

            # setdefault: tolerate clusterings that were not listed in n_clusters.
            k_low_individual_ps_cv.setdefault(k, []).append(bursts_k)
            k_low_individual_ps_cv_sizes.setdefault(k, []).append(sizes_k)
            k_low_individual_ps_cv_sizes_prop.setdefault(k, []).append(props_k)

    return  k_low_individual_ps_cv,k_low_individual_ps_cv_sizes,k_low_individual_ps_cv_sizes_prop

In [None]:
# Iterative denoising.
# Each pass (1) finds the validation bursts whose individual prediction strength is
# below `threshold` for the current number of clusters of each fold, (2) drops them,
# (3) re-fits GMMs on the remaining ("high ps") bursts and (4) recomputes the per-sample
# prediction strength. The loop stops once no fold contains low-strength bursts.
# `threshold`, `counter`, `n_folds`, `n_init`, `max_iter` and `sub_dir` come from the
# configuration cell.
# NOTE: the GMM fits are not seeded (no random_state), so repeated runs can differ.

# np.save does not create missing folders.
os.makedirs(data_dir + sub_dir, exist_ok=True)

k_clusters = np.asarray(init_k_clusters)
while True:
    print("Look [%s] Clusters in each fold!" % " ".join("%d" % k for k in k_clusters))
    np.save(data_dir + sub_dir + 'k_clusters_per_fold_%d' % counter , k_clusters)

    # First pass: use the stored clustering of the full data.
    # Later passes: use the folds / labels / strengths produced by the previous pass.
    if counter == 0:
        current_train_folds, current_valid_folds = train_folds, valid_folds
        current_train_labels, current_valid_labels = train_fold_labels_gmm, valid_fold_labels_gmm
        current_ps_per_sample = predictions_strengths_cv_gmm_per_sample
    else:
        current_train_folds, current_valid_folds = high_ps_train_folds, high_ps_valid_folds
        current_train_labels, current_valid_labels = high_ps_train_fold_labels, high_ps_valid_fold_labels
        current_ps_per_sample = predictions_strengths_cv_per_sample_without_low_samples

    k_low_individual_ps_bursts, k_low_individual_ps_cv_sizes, k_low_individual_ps_cv_sizes_prop = get_low_individual_ps_bursts(
        data, current_train_folds, current_valid_folds, current_train_labels, current_valid_labels,
        current_ps_per_sample, threshold = threshold)
    # number of low-strength bursts of each fold for the clustering with k_clusters[fold] clusters
    n_low_ps_bursts_per_fold = [np.sum(k_low_individual_ps_cv_sizes[k_clusters[fold_idx]][fold_idx]) for fold_idx in range(n_folds)]

    k_high_ps_bursts_folds, high_ps_bursts_fold_labels, k_low_ps_bursts_folds, low_ps_bursts_fold_labels = get_low_and_high_ps_bursts_fold_with_labels(
        current_valid_folds, current_valid_labels, k_low_individual_ps_bursts, k_low_individual_ps_cv_sizes,
        n_folds = n_folds, n_clusters = range(1, max(k_clusters) + 1))

    print("[%s] Bursts found in each fold with Prediction Strength below threshold = %.2f!"
          % (" ".join("%d" % n for n in n_low_ps_bursts_per_fold), threshold))

    if np.sum(n_low_ps_bursts_per_fold) == 0:
        print("Converged!")
        break

    print("Save Number of Bursts with low Prediction Strength for each fold!")
    np.save(data_dir + sub_dir + 'n_low_ps_bursts_per_fold_%d' % counter , n_low_ps_bursts_per_fold)

    # Indices of the validation bursts with high prediction strength, one array per fold.
    # The folds generally differ in length, so they are kept in an explicit object array
    # (np.asarray on a ragged list raises in recent NumPy versions).
    high_ps_valid_folds = np.empty(n_folds, dtype=object)
    new_k_clusters = []

    for fold_idx in range(n_folds):
        k_fold = k_clusters[fold_idx]
        # burst indices with high ps for the clustering with k_fold clusters
        high_ps_valid_folds[fold_idx] = np.asarray(k_high_ps_bursts_folds[k_fold][fold_idx])
        # number of classes that still contain at least one burst
        new_k_clusters.append(len(np.unique(high_ps_bursts_fold_labels[fold_idx][k_fold - 1])))

    max_k = np.amax(new_k_clusters)
    clusters = list(range(1, max_k + 1))

    high_ps_train_folds = np.empty(n_folds, dtype=object)
    high_ps_train_fold_labels = []
    high_ps_valid_fold_labels = []

    for fold_idx in range(n_folds):
        # The training set of a fold consists of the validation bursts of all other folds.
        high_ps_bursts_train_i = np.concatenate(
            [high_ps_valid_folds[other] for other in range(n_folds) if other != fold_idx], axis = 0)
        high_ps_train_folds[fold_idx] = high_ps_bursts_train_i

        high_ps_bursts_valid_i = high_ps_valid_folds[fold_idx]

        # Extract the data once per fold instead of once per GMM fit.
        train_data_i = data[high_ps_bursts_train_i]
        valid_data_i = data[high_ps_bursts_valid_i]

        # One label vector per number of clusters, stored in object arrays.
        high_ps_train_labels_i = np.empty((max_k,), dtype=object)
        high_ps_valid_labels_i = np.empty((max_k,), dtype=object)

        # A separate index name keeps the fold index of the outer loop intact.
        for c_idx, c in enumerate(clusters):
            print("Fitting GMM with %d clusters:" % c)

            gmm = GMM(c, n_init=n_init, max_iter=max_iter).fit(train_data_i)
            gmm_valid = GMM(c, n_init=n_init, max_iter=max_iter).fit(valid_data_i)

            high_ps_train_labels_i[c_idx] = np.asarray(gmm.predict(train_data_i))
            high_ps_valid_labels_i[c_idx] = np.asarray(gmm_valid.predict(valid_data_i))

        high_ps_train_fold_labels.append(high_ps_train_labels_i)
        high_ps_valid_fold_labels.append(high_ps_valid_labels_i)

    print("Save denoised folds!")
    np.save(data_dir + sub_dir + 'high_ps_valid_folds_%d' % (counter + 1), high_ps_valid_folds)
    # The training folds are not saved: they can be derived from the validation folds.

    print("Save labels!")
    np.save(data_dir + sub_dir + 'high_ps_train_fold_labels_%d' % (counter + 1), high_ps_train_fold_labels)
    np.save(data_dir + sub_dir + 'high_ps_valid_fold_labels_%d' % (counter + 1), high_ps_valid_fold_labels)

    print("Calculate Prediction Strength per sample for each fold! ")
    predictions_strengths_cv_per_sample_without_low_samples, _ = cross_valdation_prediction_strength(data, high_ps_train_folds, high_ps_valid_folds, high_ps_train_fold_labels, high_ps_valid_fold_labels, per_sample = True)
    print("Done!")

    print("Save Prediction Strength!")
    np.save(data_dir + sub_dir + 'predictions_strengths_cv_per_sample_without_low_samples_%d' % (counter + 1), predictions_strengths_cv_per_sample_without_low_samples)

    # Continue with the number of non-empty clusters of each fold.
    k_clusters = new_k_clusters
    counter += 1

Look [8 9] Clusters in each fold!
[105 96] Bursts found in each fold with Prediction Strength below threshold = 0.80!
Save Number of Bursts with low Prediction Strength for each fold!
Fitting GMM with 1 clusters:


In [28]:
# Scratch cell: repeats the validation-label save of the denoising loop.
# The recorded run of this cell raised the ValueError shown in its output.
np.save(data_dir + sub_dir + 'high_ps_valid_fold_labels_%d' % (counter + 1), high_ps_valid_fold_labels)

ValueError: could not broadcast input array from shape (7,6441) into shape (7)

In [51]:
# Scratch cell: tries to save the training-fold labels under the throw-away name 'test'.
# The recorded run of this cell raised the ValueError shown in its output.
np.save(data_dir + sub_dir + 'test', high_ps_train_fold_labels)

ValueError: could not broadcast input array from shape (7,6450) into shape (7)

In [54]:
# Scratch cell: load the training-fold labels written by the spectral-clustering
# denoising run (presumably as a reference for a label structure that saves correctly).
spectral_labels = list(np.load(data_dir + '50_50_split/denoising/spectral_clustering/k_init=11_denoising_per_sample/threshold=0.5/' + 'high_ps_train_fold_labels_3.npy' ,allow_pickle=True))

In [107]:
# Scratch cell: save the loaded spectral labels under the throw-away name 'test'
# (no error output is recorded for this cell).
np.save(data_dir + sub_dir + 'test', spectral_labels)

In [113]:
# Scratch cell: shape of the first fold's label container from the spectral run.
spectral_labels[0].shape

(10,)

In [120]:
# Scratch cell: show the first fold's labels from the spectral run (an object array of label arrays).
spectral_labels[0]

array([array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
       array([0, 0, 0, ..., 0, 0, 0], dtype=int32)], dtype=object)

In [134]:
# Scratch cell: print the shape of every label array of the first fold.
for label_array in spectral_labels[0]:
    print(label_array.shape)

(6480,)
(6480,)
(6480,)
(6480,)
(6480,)
(6480,)
(6480,)
(6480,)
(6480,)
(6480,)


In [119]:
# Scratch cell: shape of the first fold's training labels (2-D in the recorded run).
high_ps_train_fold_labels[0].shape

(7, 6450)

In [124]:
# Scratch cell: show the first fold's training labels.
high_ps_train_fold_labels[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [3, 0, 3, ..., 0, 3, 3],
       [5, 0, 5, ..., 0, 5, 5],
       [6, 0, 6, ..., 0, 6, 6]])

In [147]:
# Scratch cell: copy the rows of the first fold's training labels into a 1-D
# object array, one row per element.
test = np.empty((7,), dtype=object)

for row_idx, label_row in enumerate(high_ps_train_fold_labels[0]):
    test[row_idx] = label_row

(7,)