# Init: imports and logging setup

In [1]:
from __future__ import absolute_import, division, print_function

import logging
import sys
# Configure the root logger before the remaining imports are executed.
# NOTE(review): presumably done this early so that anything logged while
# importing `modules` already goes through this handler - confirm before
# moving the imports above this call.
logging.basicConfig(
    # Send log records to stdout instead of the default stderr.
    stream=sys.stdout,
    # Root level DEBUG: every logger that does not set its own level
    # (third-party libraries included) will emit debug records.
    level=logging.DEBUG,
    # Produces lines such as "2018-10-26 15:14:01 beta2-INFO: <message>".
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
import os
import numpy as np
# Project-local helpers: data utilities, the feature extractors (aliased `fe`)
# and the postprocessing step used in the cells below.
from modules import utils, feature_extraction as fe, postprocessing
import matplotlib.pyplot as plt

# Notebook-level logger; its name is the "beta2" prefix seen in the log output.
logger = logging.getLogger("beta2")

## Load data 

In [2]:
# Directory containing the input files read below ("~" is expanded to the user's home).
working_dir = os.path.expanduser("~/projects/gpcr/mega/Result_Data/beta2-dror/clustering/")
# os.path.join keeps the file paths valid even if the trailing separator is
# ever dropped from working_dir (plain string concatenation would not).
raw_data = np.load(os.path.join(working_dir, "frame_distances_CA_inv.npy"))
# Flag for feature scaling; presumably meant for the extractors defined further down - confirm.
scale_data = True
# NOTE: an earlier variant of this cell loaded "training_samples_CA_inv.npy"
# instead of the frame distances.
cluster_indices = np.loadtxt(os.path.join(working_dir, 'cluster_indices_.txt'))
data = utils.vectorize(raw_data)  # Our training data to classifiers
labels = utils.transform_to_matrix(cluster_indices)  # Our training labels to classifiers
# Mapping from residue Id to the index of that residue in the data
# (for proteins with missing residues); None means no mapping is supplied.
index_to_residue_mapping = None
# np.unique counts the distinct cluster ids directly on the array and also
# copes with a 0-d result from loadtxt, which set() cannot iterate.
logger.info("Loaded data of shape %s and %s clusters", data.shape, np.unique(cluster_indices).size)

2018-10-26 15:14:01 beta2-INFO: Loaded data of shape (857, 40186) and 3 clusters


## Define the different methods to use

In [5]:
# Relevance-estimation methods to compare; every extractor receives the same data and labels.
feature_extractors = [
    # Scaling follows the notebook-wide scale_data flag rather than a second hard-coded True,
    # so the flag set in the data-loading cell actually takes effect.
    fe.MlpFeatureExtractor(data, labels, n_splits=4, scaling=scale_data, hidden_layer_sizes=(100,)),
    fe.ElmFeatureExtractor(data, labels),
    # KL extractor is disabled. NOTE(review): the saved output of the relevance cell shows a
    # KL run that ends in a ValueError - presumably the reason; confirm before re-enabling.
    #fe.KLFeatureExtractor(data, labels),
    fe.PCA_feature_extract(data, labels),
    fe.RF_feature_extract(data, labels)
]


# Run the relevance analysis

In [8]:
# Start from an empty list on every run so re-executing this cell never
# accumulates duplicate entries.
results = list()
for extractor in feature_extractors:
    logger.info("Computing relevance for extractors %s", extractor.name)
    # extract_features() returns three values; only the relevance is carried
    # forward, paired with the extractor that produced it.
    feats, relevance, errors = extractor.extract_features()
    results += [(extractor, relevance)]
logger.info("Done")

2018-10-26 15:15:20 beta2-INFO: Computing relevance for extractors KL
2018-10-26 15:15:20 Extracting feature-INFO: Iteration 1 of 12
2018-10-26 15:15:24 Extracting feature-DEBUG: Error below 5% - computing feature importance.


ValueError: all the input array dimensions except for the concatenation axis must match exactly

# Remap and persist results 

In [5]:
# Post-process each (extractor, relevance) pair collected by the relevance
# cell; visualize=True asks the helper to produce figures as well.
for extractor, relevance in results:
    postprocessing.average_and_persist(
        extractor,
        relevance,
        cluster_indices,
        visualize=True)

# Render whatever figures are pending once all extractors have been handled.
plt.show()
logger.info("Done")

2018-10-26 14:59:14 beta2-INFO: Done
