# Init

In [1]:
from __future__ import absolute_import, division, print_function

import logging
import sys

# Configure logging before the project modules are imported: every record at
# DEBUG level and above is written to stdout using a timestamped
# "<time> <logger name>-<level>: <message>" layout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
import os
import numpy as np
from modules import utils, feature_extraction as fe, postprocessing, visualization
from modules.data_generation import DataGenerator
from modules import filtering, data_projection as dp
import matplotlib.pyplot as plt

# Logger used by the cells of this notebook; its records are tagged "beta2".
logger = logging.getLogger("beta2")


## Load data 

Data should already be in an appropriate format and filtered before we load it here. It does not have to be scaled.

In [2]:
# Feature representation to analyse; it selects the sub-directory that the
# input files below are read from.
feature_type = "CA_inv" # "contacts_5_cutoff", "closest_heavy_inv" or "CA_inv"
working_dir = os.path.expanduser("~/projects/gpcr/mega/Result_Data/beta2-dror/clustering/{}/".format(feature_type))
raw_data = np.load(working_dir + "samples.npy")
# NOTE(review): scale_data is not read by any cell visible here -- confirm it is still needed.
scale_data = True
cluster_indices = np.loadtxt(working_dir + 'cluster_indices_.txt')
data = utils.vectorize(raw_data)  # Our training data to classifiers
labels = utils.create_class_labels(cluster_indices)  # Our training labels to classifiers
# Mapping from residue Id to the index of that residue in the data
# (for proteins with missing residues)
feature_to_resids = np.load(working_dir + "feature_to_resids.npy")

# Fail early, and say what is wrong, if the loaded inputs are inconsistent:
# there must be one label per sample and one residue mapping entry per feature.
if len(data) != len(labels):
    raise ValueError(
        "Number of samples ({}) does not match number of labels ({})".format(len(data), len(labels)))
if data.shape[1] != len(feature_to_resids):
    raise ValueError(
        "Number of features ({}) does not match length of feature_to_resids ({})".format(
            data.shape[1], len(feature_to_resids)))
logger.info("Loaded data of shape %s and %s clusters for feature_type %s", data.shape, len(set(cluster_indices)), feature_type)


2018-12-03 18:21:48 beta2-INFO: Loaded data of shape (857, 40186) and 3 clusters for feature_type CA_inv


## Define the different methods to use

Every method is encapsulated in a so-called FeatureExtractor class; all of these classes follow the same interface.

In [5]:
# Settings shared by the extractors below; both values are also reported in the
# final log message of the notebook.
n_iterations, n_splits = 1, 1
# Shuffled copy of the data, referenced only by the (currently disabled) RBM
# extractor below.
# NOTE(review): the shuffle is unseeded and reorders the samples while
# cluster_indices keeps its original order -- confirm this is intended before
# re-enabling the RBM extractor.
rbm_data = np.copy(data)
np.random.shuffle(rbm_data)
filter_by_distance_cutoff = False
# Extractors to run. Commented-out entries are alternative methods that can be
# toggled on by uncommenting them.
feature_extractors = [
#      fe.MlpFeatureExtractor(data, cluster_indices, n_splits=n_splits, n_iterations=n_iterations, 
#                             hidden_layer_sizes=(100,), #, 50, 25),
#                             activation="relu",
#                             filter_by_distance_cutoff=filter_by_distance_cutoff),
    #fe.RbmFeatureExtractor(rbm_data, cluster_indices, n_splits=n_splits, n_iterations=n_iterations, 
    #                       n_components=8,
    #                       filter_by_distance_cutoff=filter_by_distance_cutoff),                           
     fe.ElmFeatureExtractor(data, cluster_indices, n_splits=n_splits, n_iterations=n_iterations,
                            filter_by_distance_cutoff=filter_by_distance_cutoff,
                            alpha=1, activation="relu"),
#      fe.KLFeatureExtractor(data, cluster_indices, n_splits=n_splits,
#                             filter_by_distance_cutoff=filter_by_distance_cutoff),
     fe.PCAFeatureExtractor(data, cluster_indices, n_splits=n_splits,
                            filter_by_distance_cutoff=filter_by_distance_cutoff),
#      fe.RandomForestFeatureExtractor(data, cluster_indices, n_splits=n_splits, n_iterations=n_iterations,
#                             filter_by_distance_cutoff=filter_by_distance_cutoff),
]
logger.info("Done. using %s feature extractors", len(feature_extractors))


2018-12-03 18:23:06 beta2-INFO: Done. using 2 feature extractors


# Run the relevance analysis

In [None]:
# Run every configured extractor and collect one tuple per extractor:
# (extractor, feature_importance, std_feature_importance, errors).
results = []
for extractor in feature_extractors:
    logger.info("Computing relevance for extractors %s", extractor.name)
    # extract_features() must yield exactly three values; the unpacking below
    # fails loudly otherwise.
    relevance = extractor.extract_features()
    feature_importance, std_feature_importance, errors = relevance
    results.append(
        (extractor, feature_importance, std_feature_importance, errors))
logger.info("Done")


2018-12-03 18:23:06 beta2-INFO: Computing relevance for extractors ELM
2018-12-03 18:23:06 Extracting features-INFO: Performing feature extraction with ELM on data of shape (857, 40186)
2018-12-03 18:23:16 Extracting features-INFO: Using all data in training and validation sets


# Remap and persist results 

In [None]:
# Path passed as pdb_file to every PostProcessor; it does not change between
# iterations, so it is built once up front.
pdb_path = working_dir + "analysis/all_orig.pdb"

# Wrap each extractor result in a PostProcessor, call average() and then
# persist() on it, and keep it for the visualization step.
postprocessors = []
for extractor, feature_importance, std_feature_importance, errors in results:
    p = postprocessing.PostProcessor(
        extractor, feature_importance, std_feature_importance, errors,
        cluster_indices, working_dir,
        pdb_file=pdb_path,
        feature_to_resids=feature_to_resids,
        filter_results=True)
    p.average()
    p.persist()
    postprocessors.append(p)

logger.info("Done")


# Visualize results

In [None]:
# Build one DataProjector per post-processor, project and score the data with
# it, then hand post-processors and projectors to the visualization module.
# Both arguments to visualize() are lists of single-element lists, one inner
# list per post-processor.
data_projectors = []
for p in postprocessors:
    # Optional rank-based filtering, disabled. NOTE(review): filter_by_rank_cutoff
    # is not defined in the visible cells; define it before re-enabling.
    #p.filter_feature_importance_by_rank(filter_by_rank_cutoff=filter_by_rank_cutoff)
    #p.average().persist()
    projector = dp.DataProjector(p, data)
    projector.project().score_projection()
    data_projectors.append([projector])
visualization.visualize([[p] for p in postprocessors], data_projectors)
# Only settings that are actually defined in this notebook are reported. The
# previous message also referenced filter_by_DKL and filter_by_KS_test, which
# are never assigned in the visible cells and raised a NameError here.
logger.info("Done. The settings were n_iterations, n_splits = %s, %s.\nFiltering (filter_by_distance_cutoff = %s)",
            n_iterations, n_splits, filter_by_distance_cutoff)