# Init

In [2]:
# Send every log record (DEBUG and above) to stdout, one line per record with
# a timestamp, the logger name and the level.
# NOTE(review): basicConfig is called before the remaining imports, presumably
# so that this configuration is already installed when
# modules.Compute_Relevance is imported -- confirm before reordering.
import logging
import sys
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
import os
import numpy as np
from scipy.spatial.distance import squareform
import modules.Compute_Relevance as Compute_Relevance
import matplotlib.pyplot as plt

# Logger used by the cells of this notebook.
logger = logging.getLogger("relProp")

# INPUT DATA and INPUT PARAMETERS

In [3]:
# Points to keep for further analysis (some points should be discarded if
# clustering is not clean). Leave empty if all points should be kept.
# Example that was used with the "marina" data set:
#points_to_keep = [[0,1450],\
#                 [1750,3650],\
#                 [4150,5900],\
#                 [6800,-1]]
points_to_keep = []

# Set number of iterations
n_iter = 10

# Set number of halves to use for relevance estimation
n_halves = 2

# Set parameters of the neural network
hidden_layer_sizes = (100,)

# Name of the data set configuration to use. It can be overridden without
# editing the notebook by setting the RELPROP_ENV environment variable; the
# default keeps the previous hard-coded choice.
env = os.environ.get("RELPROP_ENV", "oliver_gpcr")

# One entry per known environment:
#   home_dir        -- directory holding the inputs. It keeps its trailing
#                      separator because the results cell builds its output
#                      path as home_dir + 'analysis/'.
#   data_file       -- .npy array with the collective variables
#   clustering_file -- text file with the cluster index of every sample
dataset_configs = {
    "oliver_gpcr": {
        "home_dir": os.path.expanduser("~/projects/gpcr/mega/Result_Data/beta2-dror/clustering/"),
        "data_file": "training_samples_CA_inv.npy",
        "clustering_file": "cluster_indices_.txt",
    },
    "marina": {
        "home_dir": '/media/mkasimova/Data2/calmodulin_from_Annie/',
        "data_file": 'frame_i_j_contacts_dt1.npy',
        "clustering_file": 'labels.dat',
    },
}
# Previously used "marina" locations, kept for reference:
#home_dir = '/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/ALL_SC/'
#clustering = np.loadtxt('/media/mkasimova/Data2/Anton.PRODUCTION_RUN/COMBINED/analysis/CLUSTERING.CHOSEN.CV/NEURAL.NETWORK/marina-relevance-prop/BIAS.CVs/cluster_indices_mean.0.sigma.0.67.2.txt')

# Fail early, before touching the disk, when the environment is unknown.
# ValueError is a subclass of Exception, so existing handlers still catch it.
if env not in dataset_configs:
    raise ValueError("No configuration for {}".format(env))

dataset = dataset_configs[env]
home_dir = dataset["home_dir"]

# Announce the load before it starts so the message is also present in the
# output when one of the files cannot be read.
logger.info("Loading data from %s", env)

# Load the array with collective variables and the cluster indices
data = np.load(os.path.join(home_dir, dataset["data_file"]))
clustering = np.loadtxt(os.path.join(home_dir, dataset["clustering_file"]))

2018-10-24 17:34:42 relProp-INFO: Loading data from oliver_gpcr


# Pre-processing the data

In [4]:
# Step 1: if the samples were given as matrices, turn each one into a vector.
data = Compute_Relevance.vectorize(data)

# Step 2: drop the samples that fall outside points_to_keep
# (an empty list means nothing is discarded).
data, clustering = Compute_Relevance.keep_datapoints(
    data,
    clustering,
    points_to_keep,
)

# Optional step: convert inverse distances to contacts using a pre-defined cutoff.
#data = Compute_Relevance.convert_to_contact(data)

logger.info(
    "Loaded data of shape %s and %s clusters",
    data.shape,
    len(set(clustering)),
)

2018-10-24 17:34:46 relProp-INFO: Loaded data of shape (857, 40186) and 3 clusters


## Filtering the data

In [None]:
# Snapshot of the unfiltered data; the results cell uses its shape to map the
# relevance back onto all input features.
data_init = np.array(data, copy=True)

# First filter: contact cutoff. Returns the indices it reports as filtered
# together with the reduced data set.
contact_filtered_ind, data = Compute_Relevance.filter_by_contact_cutoff(
    data,
)

# Second filter: Kullback-Leibler divergence between clusters (also works with
# contacts). It runs on the output of the first filter.
DKL, DKL_filtered_ind, data = Compute_Relevance.filter_by_DKL(
    data,
    clustering,
)
logger.info("Done")

# RUN the CODE

In [None]:
# Per the original notes for this call:
#   relevance is indexed as [number of clusters, number of features, av/std]
#   error     is indexed as [n_iter*n_halves]
relevance, error = Compute_Relevance.perform_relevance_propagation(
    data,
    clustering,
    hidden_layer_sizes,
    n_iter,
    n_halves,
    scaling=True,
)

logger.info("Done")

2018-10-24 17:09:50 Compute Relevance-INFO: Performing relevance propagation for the dataset with 40186 features and 857 samples
2018-10-24 17:09:50 Compute Relevance-INFO: Number of clusters is 3
2018-10-24 17:09:50 Compute Relevance-INFO: Running iteration 1 ...
2018-10-24 17:09:50 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-24 17:09:53 Compute Relevance-INFO: 			Training the neural network ...
2018-10-24 17:10:05 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-24 17:10:05 Compute Relevance-INFO: 			Overfit error is 0.0
2018-10-24 17:10:05 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-24 17:10:06 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-24 17:10:06 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-24 17:10:06 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-24 17:10:10 Compute Relevance-INFO: 			Training the ne

2018-10-24 17:13:21 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-24 17:13:21 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-24 17:13:22 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-24 17:13:25 Compute Relevance-INFO: 			Training the neural network ...
2018-10-24 17:13:37 Compute Relevance-INFO: 			Checking for overfit ...
2018-10-24 17:13:37 Compute Relevance-INFO: 			Overfit error is 0.0
2018-10-24 17:13:37 Compute Relevance-INFO: 			Error is less than 5%, therefore computing relevance ...
2018-10-24 17:13:38 Compute Relevance-INFO: 			Rescaling relevance according to min and max in each frame ...
2018-10-24 17:13:38 Compute Relevance-INFO: 			... and averaging it over each cluster
2018-10-24 17:13:38 Compute Relevance-INFO: Running iteration 8 ...
2018-10-24 17:13:38 Compute Relevance-INFO: 			Scaling the input dataset ...
2018-10-24 17:13:41 Compute Relevance-INFO: 			Training the neu

# WRITE RESULTS

In [15]:
# Remapping relevance to all input features (if filtering was applied).
#
# Relevance is computed on the filtered features only. This cell scatters it
# back into an array that has one column per original input feature; features
# removed by filtering keep a relevance of zero. The cell is safe to re-run:
# once relevance already has one column per original feature it is left as is.

number_of_clusters = relevance.shape[0]

# Filtering is optional, so its outputs may be absent from the notebook
# namespace. Look the names up instead of relying on NameError.
has_data_init = 'data_init' in globals()
has_contact_filter = 'contact_filtered_ind' in globals()
has_DKL_filter = 'DKL_filtered_ind' in globals()

if has_data_init:
    n_features_init = data_init.shape[1]

    # A column count different from the unfiltered data means relevance is
    # still expressed in the filtered feature space and has to be remapped.
    if relevance.shape[1] != n_features_init:
        relevance_init = np.zeros((number_of_clusters, n_features_init, 2))

        if has_contact_filter and has_DKL_filter:
            # Two filters were chained. The one that reports more indices is
            # taken to be the one that ran first (its indices refer to the
            # original features); the other one indexes into its output.
            if len(contact_filtered_ind) >= len(DKL_filtered_ind):
                first_ind = contact_filtered_ind
                second_ind = DKL_filtered_ind
                # DKL was then computed on the contact-filtered features, so
                # expand it as well -- unless a previous run already did.
                if len(DKL) != n_features_init:
                    DKL_init = np.zeros((n_features_init))
                    DKL_init[first_ind] = DKL
                    DKL = np.copy(DKL_init)
            else:
                first_ind = DKL_filtered_ind
                second_ind = contact_filtered_ind

            # Undo the second filter, then the first one.
            relevance_first = np.zeros((number_of_clusters, len(first_ind), 2))
            relevance_first[:, second_ind, :] = relevance
            relevance_init[:, first_ind, :] = relevance_first
            relevance = np.copy(relevance_init)
        elif has_contact_filter:
            relevance_init[:, contact_filtered_ind, :] = relevance
            relevance = np.copy(relevance_init)
        elif has_DKL_filter:
            relevance_init[:, DKL_filtered_ind, :] = relevance
            relevance = np.copy(relevance_init)
        # Without any filter indices there is nothing to remap with, so
        # relevance is written out unchanged.

Compute_Relevance.write_results_input_matrix(relevance,home_dir+'analysis/','results.dat')
logger.info("Done")

2018-10-24 17:26:06 Compute Relevance-INFO: Writing the results ...
2018-10-24 17:26:06 Compute Relevance-INFO: Sigma based filtering of relevances is False
2018-10-24 17:26:06 Compute Relevance-INFO: Done!
2018-10-24 17:26:06 relProp-INFO: Done
