In [1]:
# Standard-library modules used for log configuration and sys.path handling
import logging
import sys
# Configure the root logger before importing the project module: records of
# level DEBUG and above are written to stdout (so they show up in the cell
# output), prefixed with a timestamp, the logger name and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

import numpy as np
from scipy.spatial.distance import squareform
# Add the 'modules/' directory to the import search path so that
# Compute_Relevance can be imported. The entry is a relative path, so it is
# resolved against the current working directory of the kernel.
sys.path.append('modules/')
import Compute_Relevance
import matplotlib.pyplot as plt

# Handle to the root logger (an empty name selects the root logger)
logger = logging.getLogger("")

In [2]:
'''
INPUT DATA and INPUT PARAMETERS
'''

# Directory holding the input array; the 'analysis/' output files are written below it
home_dir = '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/ALL_SC/'

# ---- Tunable parameters ----

# How many iterations of the procedure to run
n_iter = 10

# How many halves are used for the relevance estimation
n_halves = 2

# Neural network settings
# (presumably one hidden layer with 100 units -- confirm in Compute_Relevance)
hidden_layer_sizes = (100,)

# [start, end] ranges of points retained for the analysis; points outside
# these ranges are discarded (useful when the clustering is not clean).
# Leave the list empty to retain every point.
# NOTE(review): in the recorded run 9003 of 10704 points were kept, which is
# one fewer than these ranges would give if the last one ran to the end, so
# the -1 bound appears to exclude the final point -- confirm this is intended.
points_to_keep = [
    [0,1450],
    [1750,3650],
    [4150,5900],
    [6800,-1],
]

# ---- Input arrays ----

# Collective variables and the cluster index assigned to each point
data = np.load(home_dir+'frame_i_j_contacts_dt1.npy')
clustering = np.loadtxt('/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/BIAS.CVs/cluster_indices_mean.0.sigma.0.67.2.txt')

In [3]:
'''
Pre-processing the data
'''

# NOTE: every step below rebinds `data` (and `clustering`) to its processed
# version, so this cell is not idempotent. Re-run the cell that loads the
# input arrays before executing this one a second time.

# Vectorize data if it is given in the form of matrices
data = Compute_Relevance.vectorize(data)

# Discard some points if needed
# (in the recorded run this reduced the dataset from 10704 to 9003 points)
data, clustering = Compute_Relevance.keep_datapoints(data,clustering,points_to_keep)

# Convert inverse distances to contacts using a pre-defined cutoff
# (optional step, currently disabled)
#data = Compute_Relevance.convert_to_contact(1/data)

# Kullback-Leibler divergence based data filtering (also works with contacts)
# A copy of the unfiltered feature matrix is kept because its number of
# columns is needed later to map the relevance back onto all features.
data_init = np.copy(data)
# Presumably returns the per-feature DKL values, the column indices of the
# retained features and the reduced data -- confirm in Compute_Relevance.
# In the recorded run the number of features dropped from 121278 to 4675.
DKL, DKL_filtered_ind, data = Compute_Relevance.filter_by_DKL(data,clustering)

2018-10-17 13:06:27 Compute Relevance-INFO: Discarding points ...
2018-10-17 13:06:27 Compute Relevance-INFO: Number of points before discarding is 10704
2018-10-17 13:06:33 Compute Relevance-INFO: Number of points after discarding is 9003
2018-10-17 13:06:36 Compute Relevance-INFO: Number of features before DKL based filtering is 121278
2018-10-17 13:06:57 Compute Relevance-INFO: Bin size for probability calculation is 0.0410311371868
2018-10-17 13:07:34 Compute Relevance-INFO: Number of features after DKL based filtering is 4675


In [4]:
'''
RUN the CODE
'''

# Two objects come back from the call:
#   relevance -- indexed as [cluster, feature, av/std]
#   error     -- n_iter*n_halves entries
relevance, error = Compute_Relevance.perform_relevance_propagation(
    data,
    clustering,
    hidden_layer_sizes,
    n_iter,
    n_halves,
    scaling=True)

2018-10-17 13:07:43 Compute Relevance-INFO: Performing relevance propagation for the dataset with 4675 features and 9003 samples
2018-10-17 13:07:43 Compute Relevance-INFO: Number of clusters is 2
2018-10-17 13:07:43 Compute Relevance-INFO: Running iteration 1 ...
2018-10-17 13:07:43 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-17 13:07:45 Compute Relevance-INFO: 			Training the neural network ...
2018-10-17 13:07:47 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-17 13:07:47 Compute Relevance-INFO: 			Overfit error is 0.0444345700955
2018-10-17 13:07:47 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-17 13:07:48 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-17 13:07:48 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-17 13:07:48 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-17 13:07:50 Compute Relevance-INFO: 			Tra

2018-10-17 13:08:46 Compute Relevance-INFO: 			Overfit error is 0.0444345700955
2018-10-17 13:08:46 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-17 13:08:47 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-17 13:08:47 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-17 13:08:47 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-17 13:08:48 Compute Relevance-INFO: 			Training the neural network ...
2018-10-17 13:08:50 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-17 13:08:51 Compute Relevance-INFO: 			Overfit error is 0.0444247001333
2018-10-17 13:08:51 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-17 13:08:52 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-17 13:08:52 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-17 1

In [5]:
'''
WRITE RESULTS
'''

number_of_clusters = relevance.shape[0]

# When DKL based filtering was applied, `relevance` only covers the retained
# features. Scatter it back into an array that spans every original feature
# (discarded features keep a relevance of zero) before writing the results.
try:
    # Only the name lookups are guarded, so that an unrelated error in the
    # expansion below is not mistaken for "filtering was skipped".
    n_features_init = data_init.shape[1]
    kept_features = DKL_filtered_ind
except NameError:
    # The filtering step did not run; presumably `relevance` already refers
    # to the full feature set, so it is written as is.
    pass
else:
    # Expand only while the feature axis still has the filtered size. Without
    # this guard, running the cell a second time would try to scatter the
    # already expanded array and fail with a shape-mismatch ValueError.
    if relevance.shape[1] != n_features_init:
        # Last axis has length 2 (av/std)
        relevance_init = np.zeros((number_of_clusters,n_features_init,2))
        relevance_init[:,kept_features,:] = relevance
        relevance = np.copy(relevance_init)

Compute_Relevance.write_results_input_matrix(relevance,home_dir+'analysis/','results.dat')

2018-10-17 13:13:17 Compute Relevance-INFO: Writing the results ...
2018-10-17 13:13:17 Compute Relevance-INFO: Sigma based filtering of relevances is False
2018-10-17 13:13:17 Compute Relevance-INFO: Done!


In [6]:
# Expand the condensed DKL vector into a square matrix and collect the
# statistics of the condensed values
DKL_square = squareform(DKL)
DKL_mean = np.mean(DKL)
DKL_std = np.std(DKL)

# Entries at or above mean + 2*std are the ones kept in the filtered sum
DKL_cutoff = DKL_mean+2*DKL_std

# Per-row total of all DKL values ...
out = np.asarray([np.sum(row) for row in DKL_square])
# ... and per-row total restricted to the entries that reach the cutoff
out_filt = np.asarray([np.sum(row[row>=DKL_cutoff]) for row in DKL_square])

# Both results are 1-D arrays and are written one value per line
np.savetxt(home_dir+'analysis/'+'DKL.no_filter.dat',out)
np.savetxt(home_dir+'analysis/'+'DKL.filter.dat',out_filt)