In [1]:
# Standard-library modules needed to configure logging before the remaining
# imports are executed.
import logging
import sys
# Configure the root logger: records of level DEBUG and above are written to
# stdout (so they appear as cell output), formatted as
# "<timestamp> <logger name>-<LEVEL>: <message>".
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

import numpy as np
from scipy.spatial.distance import squareform
# Extend the module search path with the relative 'modules/' directory (resolved
# against the current working directory). This has to happen before the import
# on the next line, which presumably lives in that directory - confirm.
sys.path.append('modules/')
import Compute_Relevance
import matplotlib.pyplot as plt

# The empty name selects the root logger.
logger = logging.getLogger("")

In [2]:
'''
INPUT DATA and INPUT PARAMETERS
'''

# ---- Input files ----------------------------------------------------------

# Directory holding the input array; the results are later written below it
home_dir = '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/ALL_SC/'

contacts_path = home_dir + 'frame_i_j_contacts_dt1.npy'
cluster_indices_path = '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/BIAS.CVs/cluster_indices_mean.0.sigma.0.67.2.txt'

# Collective variables (binary .npy array) and cluster indices (text file)
data = np.load(contacts_path)
clustering = np.loadtxt(cluster_indices_path)

# ---- Analysis parameters --------------------------------------------------

# [start, end] ranges of points kept for the analysis; points outside these
# ranges are discarded (useful when the clustering is not clean).
# Leave the list empty to keep every point.
# NOTE(review): the run log reports 10704 -> 9003 points, which matches the -1
# end being treated as an exclusive bound (the very last point is dropped) -
# confirm that this is intended.
points_to_keep = [
    [0, 1450],
    [1750, 3650],
    [4150, 5900],
    [6800, -1],
]

# Number of iterations
n_iter = 10

# Number of halves used for relevance estimation
n_halves = 2

# Neural network parameters (hidden layer sizes)
hidden_layer_sizes = (100,)

In [3]:
'''
Pre-processing the data
'''

# NOTE: every step below rebinds `data` (and `clustering`) to its own output.
# Re-running this cell without first re-running the cell that loads the inputs
# therefore applies the steps to already processed arrays.

# Vectorize data if it is given in the form of matrices
data = Compute_Relevance.vectorize(data)

# Discard some points if needed
# (`points_to_keep` is defined together with the other input parameters; the
# matching entries of `clustering` are returned alongside the reduced data)
data, clustering = Compute_Relevance.keep_datapoints(data,clustering,points_to_keep)

# Convert inverse distances to contacts using a pre-defined cutoff
# (optional step, currently disabled)
#data = Compute_Relevance.convert_to_contact(data)

'''
Filtering the data
'''

# Snapshot of the data before any feature filtering. Its second dimension is
# used later as the full number of features when relevance is mapped back.
data_init = np.copy(data)

# Contact cutoff based data filtering
# Returns the indices of the retained features together with the reduced data.
# These indices refer to positions in `data_init`, because this is the first
# filter applied.
contact_filtered_ind, data = Compute_Relevance.filter_by_contact_cutoff(data)

# Kullback-Leibler divergence based data filtering (also works with contacts)
# Runs on the already contact-filtered data, so `DKL_filtered_ind` refers to
# positions within the contact-filtered feature set, not within `data_init`.
# `DKL` is presumably one value per feature passed in - TODO confirm.
DKL, DKL_filtered_ind, data = Compute_Relevance.filter_by_DKL(data,clustering)

2018-10-19 12:00:59 Compute Relevance-INFO: Discarding points ...
2018-10-19 12:00:59 Compute Relevance-INFO: Number of points before discarding is 10704
2018-10-19 12:01:04 Compute Relevance-INFO: Number of points after discarding is 9003
2018-10-19 12:01:08 Compute Relevance-INFO: Number of features before contact cutoff based filtering is 121278
2018-10-19 12:01:21 Compute Relevance-INFO: Number of features after contact cutoff based filtering is 6529
2018-10-19 12:01:22 Compute Relevance-INFO: Number of features before DKL based filtering is 6529
2018-10-19 12:01:23 Compute Relevance-INFO: Bin size for probability calculation is 0.300485012528
2018-10-19 12:01:24 Compute Relevance-INFO: Number of features after DKL based filtering is 245


In [4]:
'''
RUN the CODE
'''

# Returned values:
#   relevance[number of clusters, number of features, av/std]
#   error[n_iter*n_halves]
relevance, error = Compute_Relevance.perform_relevance_propagation(
    data, clustering, hidden_layer_sizes, n_iter, n_halves, scaling=True)

2018-10-19 12:01:29 Compute Relevance-INFO: Performing relevance propagation for the dataset with 245 features and 9003 samples
2018-10-19 12:01:29 Compute Relevance-INFO: Number of clusters is 2
2018-10-19 12:01:29 Compute Relevance-INFO: Running iteration 1 ...
2018-10-19 12:01:29 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-19 12:01:29 Compute Relevance-INFO: 			Training the neural network ...
2018-10-19 12:01:30 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-19 12:01:30 Compute Relevance-INFO: 			Overfit error is 0.0444345700955
2018-10-19 12:01:30 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-19 12:01:30 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-19 12:01:30 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-19 12:01:30 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-19 12:01:30 Compute Relevance-INFO: 			Trai

2018-10-19 12:01:34 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-19 12:01:34 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			Training the neural network ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			Overfit error is 0.0222123500666
2018-10-19 12:01:34 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-19 12:01:34 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-19 12:01:34 Compute Relevance-INFO: Running iteration 8 ...
2018-10-19 12:01:34 Comp

In [5]:
'''
WRITE RESULTS
'''

# Remapping relevance to all input features (if filtering was applied).
#
# `relevance` is computed on the filtered feature set. Before writing, it is
# expanded to the full feature axis of `data_init`; features that were filtered
# out keep a relevance of zero. `DKL` is expanded in the same way when it was
# computed on contact-filtered data.
#
# Every remapping step is guarded by a shape check, so re-running this cell
# after `relevance` / `DKL` have already been expanded is a no-op instead of a
# shape-mismatch error.

number_of_clusters = relevance.shape[0]

# The pre-processing outputs are optional: detect which ones exist in the
# notebook namespace.
try:
    data_init
except NameError:
    has_data_init = False
else:
    has_data_init = True

try:
    contact_filtered_ind
except NameError:
    has_contact_filter = False
else:
    has_contact_filter = True

try:
    DKL_filtered_ind
except NameError:
    has_DKL_filter = False
else:
    has_DKL_filter = True

if has_data_init:
    n_features_init = data_init.shape[1]
    # Last axis has size 2: average and standard deviation of the relevance
    relevance_init = np.zeros((number_of_clusters, n_features_init, 2))

    if has_contact_filter and has_DKL_filter:
        # Two filters were chained. The longer index list is taken to belong to
        # the filter applied first (ind_1, indexing the full feature axis); the
        # shorter one (ind_2) indexes the output of the first filter.
        if len(contact_filtered_ind) >= len(DKL_filtered_ind):
            ind_1 = contact_filtered_ind
            ind_2 = DKL_filtered_ind
            # DKL was computed on the contact-filtered features: expand it to
            # the full feature axis, unless that has already been done.
            if len(DKL) != n_features_init:
                DKL_init = np.zeros((n_features_init))
                DKL_init[ind_1] = DKL
                DKL = np.copy(DKL_init)
        else:
            ind_2 = contact_filtered_ind
            ind_1 = DKL_filtered_ind

        if relevance.shape[1] != n_features_init:
            # Undo the second filter first, then the first one
            relevance_1 = np.zeros((number_of_clusters, len(ind_1), 2))
            relevance_1[:, ind_2, :] = relevance
            relevance_init[:, ind_1, :] = relevance_1
            relevance = np.copy(relevance_init)

    # Single-filter cases; skipped when relevance already spans all features
    if has_contact_filter and relevance.shape[1] != n_features_init:
        relevance_init[:, contact_filtered_ind, :] = relevance
        relevance = np.copy(relevance_init)

    if has_DKL_filter and relevance.shape[1] != n_features_init:
        relevance_init[:, DKL_filtered_ind, :] = relevance
        relevance = np.copy(relevance_init)

# NOTE(review): `DKL` only exists when the DKL based filtering was run; without
# it this call raises NameError - decide what should be passed in that case.
Compute_Relevance.write_results_input_matrix(relevance,home_dir+'analysis/','results.0.5nm.cutoff.dat',DKL)

2018-10-19 12:01:48 Compute Relevance-INFO: Writing the results ...
2018-10-19 12:01:48 Compute Relevance-INFO: Sigma based filtering of relevances is False
2018-10-19 12:01:48 Compute Relevance-INFO: Done!
