In [1]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820
from data import load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873 
from data import load_data_gse96058 as gse96058

from pipeline import MuLT

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix

from constants import N_FOLDS, RANDOM_STATE
from util import join_values
from pymfe.complexity import MFEComplexity
from pymfe.mfe import MFE

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os

# Create an analyser object to compute classification metrics,
# grouped by training and validation dataset
# and by experiment id.
# analyser = Analyser()

# Identifiers of the datasets handled by this cell; entry k names the
# dataset produced by the k-th loader enumerated in the main loop.
dataset_id = 'GSE94873 GSE68465 GSE135820'.split()

# Row labels of the complexity result tables, one per computed measure.
# The order must match the order in which the measures are computed.
complexity_keys = ['ft_' + measure for measure in (
    'f1', 'f1v', 'f2', 'f3', 'f4',
    'l1', 'l2', 'l3',
    'n1', 'n2', 'n3', 'n4',
    'c1', 'c2',
    't1', 't2', 't3', 't4',
    'lsc', 'density', 'cls_coef', 'hubs',
)]

# Output sub-folder for each feature-set variant evaluated below. The keys are
# the variant codes used by the inner loop: 0 keeps only the columns selected
# by the trained model, 7 uses the output of the model's `transform`.
_VARIANT_DIRS = {0: 'no-feature-extraction', 7: 'feature-extraction'}


def _complexity_profile(features, target):
    """Compute the pymfe complexity measures for one feature matrix.

    Parameters
    ----------
    features : array taken from ``DataFrame.values`` (one row per sample).
    target : array taken from ``Series.values`` holding the class labels.

    Returns
    -------
    list
        One value per entry of ``complexity_keys``, in the same order.
        Measures whose call is wrapped in ``np.mean`` are reduced to their
        mean; the remaining ones are stored exactly as pymfe returns them.
    """
    mfe_cmp = MFEComplexity()

    return [
        np.mean(mfe_cmp.ft_f1(features, target)),
        np.mean(mfe_cmp.ft_f1v(features, target)),
        np.mean(mfe_cmp.ft_f2(features, target)),
        np.mean(mfe_cmp.ft_f3(features, target)),
        np.mean(mfe_cmp.ft_f4(features, target)),
        np.mean(mfe_cmp.ft_l1(features, target)),
        np.mean(mfe_cmp.ft_l2(features, target)),
        np.mean(mfe_cmp.ft_l3(features, target)),
        mfe_cmp.ft_n1(features, target),
        np.mean(mfe_cmp.ft_n2(features, target)),
        np.mean(mfe_cmp.ft_n3(features, target)),
        np.mean(mfe_cmp.ft_n4(features, target)),
        # c1/c2 are computed from the labels only
        mfe_cmp.ft_c1(target),
        mfe_cmp.ft_c2(target),
        np.mean(mfe_cmp.ft_t1(features, target)),
        # t2/t3/t4 are computed from the feature matrix only
        mfe_cmp.ft_t2(features),
        mfe_cmp.ft_t3(features),
        mfe_cmp.ft_t4(features),
        mfe_cmp.ft_lsc(features, target),
        mfe_cmp.ft_density(features, target),
        mfe_cmp.ft_cls_coef(features, target),
        np.mean(mfe_cmp.ft_hubs(features, target)),
    ]


# For every dataset and every CV round: load the model trained for that round,
# build both feature-set variants and store their complexity measures as CSV.
for i, func in enumerate([gse94873, gse68465, gse135820]):

    print('=============================================================================')
    print('Dataset {}'.format(dataset_id[i]))
    print('=============================================================================\n')

    BASE_PATH = os.path.join('output/mult/', dataset_id[i])
    path = os.path.join(BASE_PATH, 'complexity')

    print('LOADING DATASET')

    # NOTE(review): c / g / o look like clinical, gene-expression and outcome
    # tables (c and g are passed to the model below) -- confirm with the loaders.
    c, g, o = func()
    y = o.iloc[:, 0]

    # The labels never change inside the loops, so convert them only once.
    y_values = y.values

    # N_FOLDS-fold stratified split seeded with RANDOM_STATE. Only the round
    # number is used below (to pick the matching pickled model).
    # NOTE(review): presumably this mirrors the split used when the models
    # were trained -- confirm against the training script.
    kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    split = kfold.split(np.zeros(o.shape[0]), o)

    # One result table per variant: rows are complexity_keys, one column per round.
    cmp_result = {0: None, 7: None}

    for experiment, (train_index, valid_index) in enumerate(split):

        print('LOADING MODEL')

        # NOTE: pickle can execute arbitrary code while loading; these files
        # must be trusted artefacts written by the training run.
        with open(os.path.join(BASE_PATH, 'trained_model_{}.pkl'.format(experiment)), 'rb') as file:
            mult = pkl.load(file)

        label = 'round_{}'.format(experiment)

        for j in (0, 7):

            if j == 7:
                print('TRANSFORMING DATASET')
                # '{0:>03b}1'.format(7) == '1111', i.e. four True flags.
                # NOTE(review): the meaning of each flag is defined by
                # MuLT.transform -- confirm there.
                X_ = mult.transform(g, c, None, *[bool(int(v)) for v in '{0:>03b}1'.format(j)])
            else:
                # Only the clinical and gene columns selected by the model.
                X_ = c[mult.selected_clinical[0]].join(g[mult.selected_genes[0]])

            path_ = os.path.join(path, _VARIANT_DIRS[j])

            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(path_, exist_ok=True)

            filepath = os.path.join(path_, 'dataset.csv')

            # The dataset is dumped only once per variant: later rounds find
            # the file already present and do not overwrite it.
            if not os.path.exists(filepath):
                X_.to_csv(filepath, sep=',', index=True)

            print('COMPUTING METRICS')
            # Materialise the feature matrix once instead of once per measure.
            complexity = _complexity_profile(X_.values, y_values)

            if cmp_result[j] is None:
                cmp_result[j] = pd.DataFrame({label: complexity}, index=complexity_keys)
            else:
                cmp_result[j][label] = complexity

        # Rewritten after every round, so the files always contain all the
        # rounds processed so far.
        cmp_result[0].to_csv(os.path.join(path, 'cmp_no_feature_extraction_{}.csv'.format(dataset_id[i])),
                             sep=',', index=True)

        cmp_result[7].to_csv(os.path.join(path, 'cmp_feature_extraction_{}.csv'.format(dataset_id[i])),
                             sep=',', index=True)

    # Drop the large per-dataset objects before loading the next dataset.
    del mult, c, g, o, y, y_values, X_




Using TensorFlow backend.


Dataset GSE94873

LOADING DATASET
LOADING MODEL
COMPUTING METRICS
TRANSFORMING DATASET

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\Venezian\git\multiple-myeloma\experiments\exp-geo\output\mult\GSE94873\dae\data_augmentation_adadelta_000\graph\data_augmentation_adadelta_000


COMPUTING METRICS
LOADING MODEL
COMPUTING METRICS
TRANSFORMING DATASET
INFO:tensorflow:Restoring parameters from C:\Users\Venezian\git\multiple-myeloma\experiments\exp-geo\output\mult\GSE94873\dae\data_augmentation_adadelta_001\graph\data_augmentation_adadelta_001
COMPUTING METRICS
LOADING MODEL
COMPUTING METRICS
TRANSFORMING DATASET
INFO:tensorflow:Restoring parameters from C:\Users\Venezian\git\multiple-myeloma\experiments\exp-geo\output\mult\GSE94873\dae\data_augmentation_adadelta_002\graph\data_augmentation_adadelta_002
COMPUTING METRICS
LOADING MODEL
COMPUTING METRICS
TRANSFORMING DATASET
INFO:tensorflow:Restoring pa