In [245]:
%load_ext autoreload
%autoreload 2

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import *    
import os 
import pyreadr
import numpy as np
import pyreadr
import pandas as pd
import multiprocessing
from joblib import Parallel, delayed
import butterfly.album
import butterfly.Models
from itertools import combinations 
from joblib import parallel_backend
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import r2_score
import pickle
from sklearn.model_selection import GroupKFold
from random import sample
from scipy import stats
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from cython.parallel import prange

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [256]:
n_samples = 5 #features to sample from each dataset

#Model
nruns = 1 #number of runs
folds = 10 #number of folds
features = 1 #number of features to predict
epochs = 180 #number of epochs
optimiser = 'adam' #model optimiser
loss = 'mse' #model loss
ntrees = 100

In [247]:
#Import your data
#DF = pyreadr.read_r('/Users/mxenoc/Desktop/workspace/butterfly/data/omics.RData')
DF = pyreadr.read_r('/home/mxenoc/workspace/butterfly/data/omics.RData')
DF = DF["DF"]

In [248]:
groups = DF['patientID']

In [249]:
#Define the predictor datasets
omics = ['rna', 'plasma_l', 'serum_l', 'microb', 'immune', 'metabol', 'plasma_s']

In [250]:
os.chdir('/home/mxenoc/workspace/butterfly/src/butterfly')

In [251]:
with open('albums_all.pkl', 'rb') as f:
    albums_all = pickle.load(f)

In [252]:
with open('RF_predictor.pkl', 'rb') as f:
    RF_predictor = pickle.load(f)

In [253]:
with open('albums.pkl', 'rb') as f:
    albums = pickle.load(f)

In [None]:
#Initialise lists
fCNN = defaultdict(list)
fRF = defaultdict(list)
fMCNN = defaultdict(list)

for predictor_index in tqdm(range(len(omics))):
    
    CNN = defaultdict(list)
    RF = defaultdict(list)
    MCNN = defaultdict(list)

    #Get your response dataset
    DFB = DF.copy()
    response = sample([col for col in DFB if col.startswith(omics[predictor_index])], n_samples)
    response_df = DFB[response]
    
    y = response_df.values
    
    #Get your predictor dataset

    #CNN
    X_CNN = np.asarray(albums_all[predictor_index])
    
    #Multi-layered CNN 
    X_MCNN = [albums[0], albums[1], albums[2], albums[3], albums[4], albums[5], albums[6]]
    del X_MCNN[predictor_index]
    X_MCNN = np.array(X_MCNN, dtype = float)
    
    #RF
    #Get your predictor dataset
    X_RF = RF_predictor[predictor_index]
    X_RF = X_RF.values
    X_RF = StandardScaler().fit_transform(X_RF)
    
    for feat in prange(y.shape[1], nogil=True):

        prediction_train, observed_train, prediction_test, observed_test = zip(*Parallel(n_jobs=nruns)
                               (delayed(butterfly.Models.CNN)
                                (X_CNN, y[:,feat], groups, 128, features, 
                                 folds, epochs, optimiser, loss, 'CNN') 
                                for cv in range(nruns)))                

        CNN['prediction_train'].append(prediction_train)
        CNN['observed_train'].append(observed_train)
        CNN['prediction_test'].append(prediction_test)
        CNN['observed_test'].append(observed_test)
        
        prediction_train, observed_train, prediction_test, observed_test = zip(*Parallel(n_jobs=nruns)
                               (delayed(butterfly.Models.RF)
                                (X_RF, y[:,feat], groups, folds, ntrees, 'RF_regressor') 
                                for cv in range(nruns)))                

        RF['prediction_train'].append(prediction_train)
        RF['observed_train'].append(observed_train)
        RF['prediction_test'].append(prediction_test)
        RF['observed_test'].append(observed_test)
        
        prediction_train, observed_train, prediction_test, observed_test = zip(*Parallel(n_jobs=nruns)
                                (delayed(butterfly.Models.CNN)
                        (X_MCNN, y[:,feat], groups, 40, features, 
                         folds, epochs, optimiser, loss, 'MCNN') 
                        for cv in range(nruns)))
        
        MCNN['prediction_train'].append(prediction_train)
        MCNN['observed_train'].append(observed_train)
        MCNN['prediction_test'].append(prediction_test)
        MCNN['observed_test'].append(observed_test)


    fCNN['prediction_train'].append(CNN['prediction_train'])
    fCNN['observed_train'].append(CNN['observed_train'])
    fCNN['prediction_test'].append(CNN['prediction_test'])
    fCNN['observed_test'].append(CNN['observed_test'])
    
    fRF['prediction_train'].append(RF['prediction_train'])
    fRF['observed_train'].append(RF['observed_train'])
    fRF['prediction_test'].append(RF['prediction_test'])
    fRF['observed_test'].append(RF['observed_test'])
    
    fMCNN['prediction_train'].append(MCNN['prediction_train'])
    fMCNN['observed_train'].append(MCNN['observed_train'])
    fMCNN['prediction_test'].append(MCNN['prediction_test'])
    fMCNN['observed_test'].append(MCNN['observed_test'])






  0%|          | 0/7 [00:00<?, ?it/s][A[A[A[A[A

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.







 14%|█▍        | 1/7 [30:15<3:01:32, 1815.49s/it][A[A[A[A[A




 29%|██▊       | 2/7 [2:06:19<4:09:59, 2999.92s/it][A[A[A[A[A




 43%|████▎     | 3/7 [4:50:40<5:37:12, 5058.24s/it][A[A[A[A[A

In [151]:
each_omic_CNN = []
for l in range(len(omics)):
    each_feature_CNN = []
    for k in range(n_samples):
        each_run_CNN = []
        for i in range(nruns):
            each_run_CNN.append(stats.spearmanr(fCNN['prediction_test'][l][k][i], 
                                                          fCNN['observed_test'][l][k][i]))
        each_feature_CNN.append(np.mean(each_run_CNN))
    each_omic_CNN.append(np.mean(each_feature_CNN))    

In [152]:
each_omic_RF = []

for l in range(len(omics)):
    each_feature_RF = []
    for k in range(n_samples):
        each_run_RF = []
        for i in range(nruns):
            each_run_RF.append(stats.spearmanr(fRF['prediction_test'][l][k][i], 
                                                          fRF['observed_test'][l][k][i]))
        each_feature_RF.append(np.mean(each_run_RF))
    each_omic_RF.append(np.mean(each_feature_RF))    

In [153]:
each_omic_MCNN = []

for l in range(len(omics)):
    each_feature_MCNN = []
    for k in range(n_samples):
        each_run_MCNN = []
        for i in range(nruns):
            each_run_MCNN.append(stats.spearmanr(fMCNN['prediction_test'][l][k][i], 
                                                          fMCNN['observed_test'][l][k][i]))
        each_feature_MCNN.append(np.mean(each_run_MCNN))
    each_omic_MCNN.append(np.mean(each_feature_MCNN))    

In [164]:
np.mean(each_omic_RF)

-0.08011759541851689

In [165]:
np.mean(each_omic_CNN)

0.16343609903575282

In [166]:
np.mean(each_omic_MCNN)

0.21882940586129962