# Init

In [1]:
from __future__ import absolute_import, division, print_function

import logging
import sys
# Configure the root logger once for the whole notebook: records of level
# DEBUG and above are written to stdout, formatted as
# "<timestamp> <logger name>-<level>: <message>".
logging.basicConfig(
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
    stream=sys.stdout,
)
import os
import numpy as np
from modules import utils, feature_extraction as fe, postprocessing
import matplotlib.pyplot as plt

# Logger used by every cell below; "beta2" is what the %(name)s field of the
# log format shows for records emitted through it.
logger = logging.getLogger(name="beta2")

## Load data 

Data should already be in an appropriate format and filtered before we load it here. It does not have to be scaled.

In [2]:
# Directory containing the input files read below. The same value is later
# handed to postprocessing.average_and_persist, so it is left unchanged
# (including its trailing separator).
working_dir = os.path.expanduser("~/projects/gpcr/mega/Result_Data/beta2-dror/clustering/")
# Paths are built with os.path.join so that loading keeps working even if
# working_dir is edited to a value without a trailing path separator.
raw_data = np.load(os.path.join(working_dir, "frame_distances_CA_inv.npy"))
# NOTE(review): scale_data is not read by any cell visible in this notebook - confirm it is still needed.
scale_data = True
cluster_indices = np.loadtxt(os.path.join(working_dir, 'cluster_indices_.txt'))
data = utils.vectorize(raw_data)  # Our training data to classifiers
labels = utils.create_class_labels(cluster_indices)  # Our training labels to classifiers
# Mapping from residue Id to the index of that residue in the data (for proteins with missing residues)
index_to_residue_mapping = None
logger.info("Loaded data of shape %s and %s clusters", data.shape, len(set(cluster_indices)))

2018-10-29 16:53:00 beta2-INFO: Loaded data of shape (857, 40186) and 3 clusters


## Define the different methods to use

Every method is encapsulated in a so-called FeatureExtractor class; all of these classes follow the same interface.

In [7]:
# Settings passed unchanged to every extractor constructor.
n_iterations = 1
n_splits = 4

# Instantiate each enabled extractor class on the same data and labels.
# Commented-out entries are alternatives that are currently disabled.
feature_extractors = [
    extractor_class(data, labels, n_splits=n_splits, n_iterations=n_iterations)
    for extractor_class in (
        fe.MlpFeatureExtractor,
        # fe.ElmFeatureExtractor,
        fe.KLFeatureExtractor,
        # fe.PCA_feature_extract,
        # fe.RF_feature_extract,
    )
]
logger.info("Done. using %s feature extractors", len(feature_extractors))

2018-10-29 16:53:30 beta2-INFO: Done. using 1 feature extractors


# Run the relevance analysis

In [None]:
# Collects one (extractor, feature_importance, std_feature_importance) tuple
# per extractor, in the order of feature_extractors.
results = []
for extractor in feature_extractors:
    logger.info("Computing relevance for extractors %s", extractor.name)
    # extract_features() returns three values; the third (errors) is bound
    # here but not stored in results.
    feature_importance, std_feature_importance, errors = extractor.extract_features()
    logger.info(
        "Get feature_importance and std of shapes %s, %s",
        feature_importance.shape,
        std_feature_importance.shape,
    )
    results += [(extractor, feature_importance, std_feature_importance)]
logger.info("Done")

2018-10-29 16:53:31 beta2-INFO: Computing relevance for extractors KL
2018-10-29 16:53:31 Extracting feature-DEBUG: Iteration 1 of 2
2018-10-29 16:53:35 Extracting feature-DEBUG: Computing feature importance on all data.


# Remap and persist results 

In [None]:
# Hand each extractor's result to the postprocessing step, together with the
# cluster assignment and the working directory.
for extractor, feature_importance, std_feature_importance in results:
    postprocessing.average_and_persist(
        extractor,
        feature_importance,
        std_feature_importance,
        cluster_indices,
        working_dir,
        visualize=True,  # presumably draws figures that plt.show() below displays - confirm
    )

plt.show()
logger.info("Done")