In [1]:
# Notebook setup: logging configuration, third-party imports and the local
# Compute_Relevance module.
import logging
import sys
# Send every record of level DEBUG and above to stdout so it appears as cell
# output, formatted as "<timestamp> <logger name>-<level>: <message>".
# NOTE(review): this is configured before the remaining imports, presumably so
# that import-time log records are captured as well - confirm before reordering.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

import numpy as np
from scipy.spatial.distance import squareform
# Make the local 'modules/' directory importable. The path is relative, so the
# import below only works when the notebook runs from the directory containing it.
sys.path.append('modules/')
import Compute_Relevance
import matplotlib.pyplot as plt

# An empty name returns the root logger.
logger = logging.getLogger("")

In [2]:
# ----------------------------------------------------------------------
# INPUT DATA and INPUT PARAMETERS
# ----------------------------------------------------------------------

# Directory holding the input array; the results file is written below it
home_dir = '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/VSD_SC/'

# Cluster indices and the array with the collective variables
clustering = np.loadtxt(
    '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/BIAS.CVs/cluster_indices_mean.0.sigma.0.67.2.txt')
data = np.load(home_dir + 'frame_i_j_contacts_dt1.npy')

# Number of iterations
n_iter = 10

# Number of halves used for the relevance estimation
n_halves = 2

# Neural-network parameters
hidden_layer_sizes = (100,)

# [start, end] index pairs of the points kept for further analysis
# (some points should be discarded if the clustering is not clean).
# NOTE(review): whether the trailing -1 includes the very last point depends on
# how Compute_Relevance.keep_datapoints interprets it - confirm.
points_to_keep = [
    [0, 1450],
    [1750, 3650],
    [4150, 5900],
    [6800, -1],
]

In [3]:
# ----------------------------------------------------------------------
# RUN the CODE
# ----------------------------------------------------------------------

# Restrict the input data and the cluster labels to the chosen points
data_keep, clustering_keep = Compute_Relevance.keep_datapoints(
    data, clustering, points_to_keep)

# Vectorize the retained data
data_vect = Compute_Relevance.vectorize(data_keep)

# Convert distances to contacts
# NOTE(review): data_cont is not consumed anywhere in this cell - the
# propagation below is fed data_vect. Confirm which of the two is intended.
data_cont = Compute_Relevance.convert_to_contact(1 / data_vect, 0.5)

# Run NN and relevance propagation
#   relevance -> [number of clusters, number of features, av/std]
#   error     -> [n_iter*n_halves]
relevance, error = Compute_Relevance.perform_relevance_propagation(
    data_vect,
    clustering_keep,
    hidden_layer_sizes,
    n_iter,
    n_halves,
    scaling=True)

2018-10-12 16:26:44 Compute Relevance-INFO: Performing relevance propagation for the dataset with 18915 features and 9003 samples
2018-10-12 16:26:44 Compute Relevance-INFO: Number of clusters is 2
2018-10-12 16:26:44 Compute Relevance-INFO: Running iteration 1 ...
2018-10-12 16:26:45 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-12 16:26:51 Compute Relevance-INFO: 			Training the neural network ...
2018-10-12 16:27:00 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-12 16:27:02 Compute Relevance-INFO: 			Overfit error is 0.0444345700955
2018-10-12 16:27:02 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-12 16:27:05 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-12 16:27:05 Compute Relevance-INFO: 			... and averaging it over each cluster
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
2018-10-12 16:27:07 C

2018-10-12 16:30:40 Compute Relevance-INFO: 			... and averaging it over each cluster
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
2018-10-12 16:30:41 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-12 16:30:47 Compute Relevance-INFO: 			Training the neural network ...
2018-10-12 16:30:55 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-12 16:30:58 Compute Relevance-INFO: 			Overfit error is 0.0
2018-10-12 16:30:58 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-12 16:31:01 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-12 16:31:01 Compute Relevance-INFO: 			... and averaging it over each cluster
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
('not av', array([], dtype=int64))
2018-10-12 16:31:02 Compute Relevance-INFO: Running iteration 7 ...
2018-10-12 16:31:03 Compute Relevance-IN

In [5]:
# This part of the code works only if the input data was an NxN matrix or the
# squareform (condensed vector) of it.

# ----------------------------------------------------------------------
# WRITE RESULTS
# ----------------------------------------------------------------------
# For every cluster the condensed relevance vector is expanded into a symmetric
# matrix, and each row is reduced to two numbers:
#   * the sum of the average relevances that lie at least `sigma` standard
#     deviations above the cluster-wide mean, and
#   * the uncertainty of that sum, obtained by adding the standard deviations
#     of the same entries in quadrature.
# The output file has one row per matrix row and two columns per cluster
# (sum, uncertainty), interleaved cluster by cluster.

# Significance cut-off, in standard deviations above the cluster-wide mean
sigma = 3

number_of_clusters = relevance.shape[0]
# Side length of the square matrix recovered from the condensed vector
number_of_features = squareform(relevance[0, :, 0]).shape[0]

relevance_ave_per_residue = np.zeros((number_of_features, number_of_clusters))
relevance_std_per_residue = np.zeros((number_of_features, number_of_clusters))

for i in range(number_of_clusters):
    # Mean and spread are taken over the condensed vector, so the zero diagonal
    # that squareform() inserts does not bias the threshold.
    relevance_global_mean = np.mean(relevance[i, :, 0])
    relevance_global_sigma = np.std(relevance[i, :, 0])
    relevance_threshold = relevance_global_mean + sigma * relevance_global_sigma

    relevance_ave_matrix = squareform(relevance[i, :, 0])
    relevance_std_matrix = squareform(relevance[i, :, 1])

    # Boolean mask of the entries that pass the significance cut-off
    above_sigma = relevance_ave_matrix >= relevance_threshold

    # Row-wise sum over the selected entries only; np.where zeroes the rest
    # so that unselected NaNs cannot leak into the sum.
    relevance_ave_per_residue[:, i] = np.where(
        above_sigma, relevance_ave_matrix, 0.0).sum(axis=1)

    # Propagate the uncertainty from the *std* matrix (not the averages):
    # sqrt of the sum of squared standard deviations of the selected entries.
    relevance_std_per_residue[:, i] = np.sqrt(np.where(
        above_sigma, relevance_std_matrix ** 2, 0.0).sum(axis=1))

# Interleave the columns: [ave_0, std_0, ave_1, std_1, ...]
results = np.zeros((number_of_features, number_of_clusters * 2))
results[:, 0::2] = relevance_ave_per_residue
results[:, 1::2] = relevance_std_per_residue

np.savetxt(home_dir+'analysis/results.2.clean.negative_relevance.dat',results)