In [57]:
import os
os.environ['OPENBLAS_NUM_THREADS'] ='40'
import numpy as np

import matplotlib.pyplot as plt
import sklearn
import scipy
from scipy.io import loadmat
from scipy.io import savemat
import matplotlib.pyplot as plt 
import pandas as pd
import struct
import os
import json
import sys
import seaborn as sns 
from scipy import signal, stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

In [2]:
# Print NumPy's build configuration to check which BLAS/LAPACK libraries it was built against.
np.show_config()

blas_mkl_info:
  NOT AVAILABLE
blis_info:
  NOT AVAILABLE
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/opt/conda/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/opt/conda/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_mkl_info:
  NOT AVAILABLE
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/opt/conda/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/opt/conda/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]


In [58]:
# Base folder for input/output files and the sub-folder holding the 50/50 split artifacts.
# Both are joined to file names by plain string concatenation, so keep the trailing slashes.
data_dir = '../Data/'
sub_dir = '50_50_split/'

In [59]:
def fit_and_validate_gmm_models(X_train_sets, X_valid_sets, clusters, n_folds=1, n_init=100, max_iter=100, init_from_checkpoint=False, random_state=None):
    """Fit Gaussian mixture models per split and per cluster count, checkpointing after each fit.

    For every split ``i`` in ``range(n_folds)`` and every value ``c`` in ``clusters``
    two models are fitted: one on ``X_train_sets[i]`` and one on ``X_valid_sets[i]``.
    After each cluster count the accumulated results of the current split are written
    to ``data_dir + sub_dir + '<name>_<split>.npy'`` (``data_dir`` and ``sub_dir`` are
    notebook-level globals; splits are numbered from 1 in the file names).

    Parameters
    ----------
    X_train_sets, X_valid_sets : sequence
        Training / validation data per split, indexed by split number.
    clusters : iterable of int
        Numbers of mixture components to try, in the order they should be fitted.
    n_folds : int
        Number of splits to process (the first ``n_folds`` entries of the data sets).
    n_init, max_iter : int
        Passed through to the mixture model constructor.
    init_from_checkpoint : bool
        If True, previously saved results of a split are loaded and only the cluster
        counts that are not yet covered are fitted. This assumes the checkpoint was
        written with the same ``clusters`` sequence in the same order. A split without
        checkpoint files simply starts from scratch.
    random_state : int, RandomState instance or None
        Passed through to the mixture model constructor; ``None`` keeps the
        previous (unseeded) behaviour.

    Returns
    -------
    tuple of five lists, each with one entry per split:
        ``bics_per_fold``  - per cluster count ``(train_model.bic(X_train), valid_model.bic(X_valid))``
        ``scores_per_fold`` - per cluster count ``(train_model.score(X_train), train_model.score(X_valid))``
        ``predictions_train_per_fold`` - train-fitted model's labels for ``X_train``
        ``predictions_valid_per_fold`` - train-fitted model's labels for ``X_valid``
        ``predictions_valid_fitted_per_fold`` - valid-fitted model's labels for ``X_valid``
    """
    # Order matters: it is the order in which the checkpoint files are written.
    result_names = ('bics', 'scores', 'predictions_valid', 'predictions_train', 'predictions_valid_fitted')
    clusters = list(clusters)  # accept any iterable and make it sliceable for resuming

    def checkpoint_path(name, fold_number):
        return data_dir + sub_dir + '%s_%d.npy' % (name, fold_number)

    def load_checkpoint(name, fold_number):
        # NOTE(review): allow_pickle=True executes pickled content on load; only use
        # checkpoint files this notebook wrote itself.
        try:
            return list(np.load(checkpoint_path(name, fold_number), allow_pickle=True))
        except FileNotFoundError:
            return []

    per_fold = {name: [] for name in result_names}

    for i in range(n_folds):
        fold_number = i + 1
        print("Start Process for %d. split!" % fold_number)

        if init_from_checkpoint:
            print("Initialize from stored Checkpoint!")
            results = {name: load_checkpoint(name, fold_number) for name in result_names}
            # An interrupted run may have saved some files one entry further than others;
            # keep only the entries present in every file so the lists stay aligned.
            n_done = min(len(clusters), min(len(values) for values in results.values()))
            results = {name: values[:n_done] for name, values in results.items()}
            if n_done:
                print("Resuming after %d already fitted cluster settings." % n_done)
        else:
            results = {name: [] for name in result_names}
            n_done = 0

        X_train = X_train_sets[i]
        X_valid = X_valid_sets[i]

        # Skip the cluster counts already covered by the checkpoint instead of
        # refitting them and appending duplicate entries.
        for c in clusters[n_done:]:
            print("Fitting GMM with %d clusters:" % c)

            gmm = GMM(n_components=c, n_init=n_init, max_iter=max_iter, random_state=random_state).fit(X_train)
            gmm_valid = GMM(n_components=c, n_init=n_init, max_iter=max_iter, random_state=random_state).fit(X_valid)

            results['bics'].append((gmm.bic(X_train), gmm_valid.bic(X_valid)))
            results['scores'].append((gmm.score(X_train), gmm.score(X_valid)))

            print("Predicting Data Set!")
            results['predictions_train'].append(gmm.predict(X_train))

            print("Predicting Validation Set!")
            results['predictions_valid'].append(gmm.predict(X_valid))

            print("Prediction Vlaidation Set after fitting separately!")
            results['predictions_valid_fitted'].append(gmm_valid.predict(X_valid))

            # Checkpoint after every cluster count so a long run can be resumed.
            for name in result_names:
                np.save(checkpoint_path(name, fold_number), results[name])

            print("Saved GMM data with %d clusters!" % c)

        for name in result_names:
            per_fold[name].append(results[name])

    return (per_fold['bics'], per_fold['scores'], per_fold['predictions_train'],
            per_fold['predictions_valid'], per_fold['predictions_valid_fitted'])

In [60]:
# Load the stored array and transpose it, then create a copy whose rows are in random
# order (the permutation is unseeded, so the order differs between runs).
burst_file = data_dir + 'data_burst_by_time.npy'
data_burst_by_time = np.load(burst_file).T
data_burst_by_time_shuffled = np.random.permutation(data_burst_by_time)

# Report the shapes of both arrays.
print("Averaged over channels: ", data_burst_by_time.shape)
for loaded in (data_burst_by_time, data_burst_by_time_shuffled):
    print(loaded.shape)

Averaged over channels:  (13092, 3410)
(13092, 3410)
(13092, 3410)


In [61]:
# Two narrower column windows of the full array: columns 1000-2499 and 1200-2199.
dataset_cutted = data_burst_by_time[:, slice(1000, 2500)]
dataset_cutted2 = data_burst_by_time[:, slice(1200, 2200)]

for cut_label, cut_data in (("First Cut: ", dataset_cutted), ("Second Cut: ", dataset_cutted2)):
    print(cut_label, cut_data.shape)

First Cut:  (13092, 1500)
Second Cut:  (13092, 1000)


In [62]:
# Choose the array used by the following cells: here the full, uncut array.
data = data_burst_by_time

In [63]:
# Load the row indices that define the training / validation part of each split.
# Alternative split files: data_dir + "culture_balanced/culture_balanced_training_split.npy"
# and data_dir + "culture_balanced/culture_balanced_validation_split.npy".
train_folds = np.load(data_dir + sub_dir + "train_folds_50_50.npy")
valid_folds = np.load(data_dir + sub_dir + "valid_folds_50_50.npy")

if train_folds.ndim > 1:
    # Several splits: one row of indices per split.
    training_sets = [data[train_idx] for train_idx in train_folds]
    validation_sets = [data[valid_idx] for valid_idx in valid_folds[:len(train_folds)]]
else:
    # Single split: index `data` with the index arrays BEFORE wrapping them in lists.
    # Indexing with a list that holds one array (`data[[idx]]`) is read as an array
    # index by current NumPy and would add a leading axis of length 1.
    training_sets = [data[train_folds]]
    validation_sets = [data[valid_folds]]
    train_folds = [train_folds]
    valid_folds = [valid_folds]

In [64]:
# For every split, report how many rows went to training and validation and what
# percentage of the full data set each part represents.
n_total = len(data)
for split_number, train_part in enumerate(training_sets, start=1):
    valid_part = validation_sets[split_number - 1]
    train_share = np.round((len(train_part)/n_total), 4) * 100
    valid_share = np.round((len(valid_part)/n_total), 4) * 100
    print("Split %d :" % split_number)
    print("%d Bursts in Training Set equal to %.2f %% of the total data. " % (len(train_part), train_share))
    print("%d Bursts in Validation Set equal to %.2f %% of the total data. " % (len(valid_part), valid_share))

Split 1 :
6546 Bursts in Training Set equal to 50.00 % of the total data. 
6546 Bursts in Validation Set equal to 50.00 % of the total data. 
Split 2 :
6546 Bursts in Training Set equal to 50.00 % of the total data. 
6546 Bursts in Validation Set equal to 50.00 % of the total data. 


In [68]:
# Candidate numbers of mixture components to evaluate: 1 through 20.
n_clusters = range(1, 21)
print("Number of clusters to look at: ", list(n_clusters))

Number of clusters to look at:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [66]:
# Tiny one-split samples (first two rows of the first training / validation set),
# e.g. for a quick trial run of the fitting routine.
test1 = [training_sets[0][:2]]
test2 = [validation_sets[0][:2]]
print(len(test1))
print(test1)

1
[array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])]


In [None]:
# Fit and evaluate the mixture models for both splits and all cluster counts
# (long-running; results are checkpointed to disk by the function itself).
gmm_run = fit_and_validate_gmm_models(
    training_sets,
    validation_sets,
    n_clusters,
    n_folds=2,
    n_init=100,
    max_iter=100,
    init_from_checkpoint=False,
)
(bics_per_fold,
 scores_per_fold,
 predictions_train_per_fold,
 predictions_valid_per_fold,
 predictions_valid_fitted_per_fold) = gmm_run

Start Process for 1. split!
Fitting GMM with 1 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 1 clusters!
Fitting GMM with 2 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 2 clusters!
Fitting GMM with 3 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 3 clusters!
Fitting GMM with 4 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 4 clusters!
Fitting GMM with 5 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 5 clusters!
Fitting GMM with 6 clusters:
Predicting Data Set!
Predicting Validation Set!
Prediction Vlaidation Set after fitting separately!
Saved GMM data with 6 clusters!
Fittin