In [None]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
from tqdm.notebook import tqdm
from typing import List
import os

## SGD weight optimization

In [None]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedShuffleSplit
import torch.nn.functional as F
import copy
import matplotlib.pyplot as plt
torch.manual_seed(0)

class model_together(nn.Module):
    """Per-class linear fusion layer.

    Wraps a single ``nn.Linear(num_dtypes, 3)``. In ``forward`` only the
    diagonal of the layer output is kept, so weight row ``c`` (plus bias
    ``c``) is effectively applied to row ``c`` of each input sample only.
    """

    def __init__(self, num_dtypes):
        """Create the layer for ``num_dtypes`` input features per class row."""
        super().__init__()
        self.weights = nn.Linear(num_dtypes, 3)

    def forward(self, xb):
        """Apply the linear layer and return the diagonal over the last two dims.

        For an input whose trailing dimensions are ``(3, num_dtypes)`` the
        layer produces a 3x3 block per sample; entry ``(c, c)`` of that block
        is the score of class ``c``.
        """
        projected = self.weights(xb)
        return projected.diagonal(offset=0, dim1=-2, dim2=-1)
def init_params(size, std=1.0):
    """Return a tensor of shape ``size`` drawn uniformly from [0, std) that requires grad."""
    values = torch.rand(size) * std
    values.requires_grad_()
    return values
def init_model(model, num_dtypes):
    """Initialise ``model.weights`` in place with random rows that each sum to 1.

    A ``(3, num_dtypes)`` matrix is drawn with ``init_params``, every row is
    divided by its own sum, and the result is copied into the layer weight.
    The bias is set to zero. Nothing is returned.
    """
    with torch.no_grad():
        raw = init_params((3, num_dtypes))
        row_totals = raw.sum(dim=1, keepdim=True)
        model.weights.weight.copy_(raw / row_totals)
        model.weights.bias.fill_(0)

def norm_params(model):
    """Replace each weight row of ``model.weights`` by its softmax and zero the bias.

    Operates in place under ``torch.no_grad()``; after the call every row of
    the weight matrix is non-negative and sums to 1. Returns ``None``.
    """
    with torch.no_grad():
        layer = model.weights
        layer.weight.copy_(F.softmax(layer.weight, dim=1))
        layer.bias.fill_(0)
        
# Works on NumPy arrays; the caller is responsible for converting the results to tensors.
def get_val_set(x, y, classes, percentage = 0.1):
    """Split ``x``/``y`` into train and validation parts, class by class.

    The global NumPy RNG is re-seeded with 42 on every call. For each entry
    of ``classes`` the rows whose ``y.argmax(axis=1)`` equals that class are
    shuffled, and the last ``int(percentage * count)`` of them go to the
    validation part. Both parts are shuffled once more before being returned.

    Returns:
        ``(x_train, y_train, x_val, y_val)`` as NumPy arrays.
    """
    np.random.seed(42)

    # Every pile starts with an empty float array so the result dtype and the
    # "no classes" case match an incremental concatenation from an empty seed.
    train_x_parts = [np.array([]).reshape(0, x.shape[1])]
    train_y_parts = [np.array([]).reshape(0, y.shape[1])]
    val_x_parts = [np.array([]).reshape(0, x.shape[1])]
    val_y_parts = [np.array([]).reshape(0, y.shape[1])]

    for c in classes:
        members = np.where(y.argmax(axis=1) == c)[0]
        np.random.shuffle(members)
        n_val = int(percentage * len(members))
        cut = len(members) - n_val
        kept, held_out = members[:cut], members[cut:]
        train_x_parts.append(x[kept, ...])
        train_y_parts.append(y[kept])
        val_x_parts.append(x[held_out, ...])
        val_y_parts.append(y[held_out])

    x_train = np.concatenate(train_x_parts, axis=0)
    y_train = np.concatenate(train_y_parts, axis=0)
    x_val = np.concatenate(val_x_parts, axis=0)
    y_val = np.concatenate(val_y_parts, axis=0)

    # Mix the classes again: the piles above are grouped class after class.
    train_order = list(range(x_train.shape[0]))
    val_order = list(range(x_val.shape[0]))
    np.random.shuffle(train_order)
    np.random.shuffle(val_order)

    return (x_train[train_order, ...], y_train[train_order],
            x_val[val_order, ...], y_val[val_order])

def train_model(model, dls, optimizer, criterion, num_epochs):
    """Train ``model`` and return it with the best-validation-loss weights, normalised.

    Each epoch runs a ``'train'`` pass (gradients and optimizer steps) followed
    by a ``'val'`` pass (no gradients) and prints the sample-weighted mean loss
    and the accuracy of both. The state dict of the epoch with the lowest
    validation loss is kept and reloaded at the end, after which
    ``norm_params`` is applied (softmax over each weight row, zero bias).

    Args:
        model: module called as ``model(inputs)``; its output is passed to
            ``criterion`` and arg-maxed over dim 1 for the accuracy.
        dls: mapping with ``'train'`` and ``'val'`` iterables yielding
            ``(inputs, labels)`` batches.
        optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object.

    Raises:
        ValueError: if a phase yields no samples (previously an opaque
            ``ZeroDivisionError``/``AttributeError``).
    """
    best_epoch = 0
    # Lowest validation loss seen so far. Starting from +inf (rather than a
    # fixed 100) guarantees that the first epoch is always checkpointed.
    best_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            n_samples = 0
            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in dls[phase]:
                inputs = inputs.float()
                labels = labels.type(torch.LongTensor)

                optimizer.zero_grad()
                # Only build the autograd graph while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                # Weight the batch loss by its size so the epoch loss is a
                # per-sample mean even when the last batch is smaller.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels).item()
                n_samples += batch_size

                if is_train:
                    loss.backward()
                    optimizer.step()

            if n_samples == 0:
                raise ValueError("no samples in the '{}' data loader".format(phase))

            epoch_loss = running_loss / n_samples
            epoch_acc = running_corrects / n_samples
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch_loss, epoch_acc))

            if phase == 'val' and epoch_loss < best_loss:
                best_epoch = epoch
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

    print('Best epoch {}'.format(best_epoch))

    model.load_state_dict(best_model_wts)
    # Turn the learned weights into per-class convex combinations.
    norm_params(model)
    return model

def get_alphas_SGD(df, classes, data_types):
    """Learn fusion weights for ``data_types`` by gradient descent.

    For every row of ``df`` a ``3 x len(data_types)`` matrix is built from the
    ``'<source> Prob LUAD'``, ``'<source> Prob HLT'`` and ``'<source> Prob LUSC'``
    columns (in that row order); a source whose ``'Has <source>'`` value is -1
    contributes zeros. The samples are split 70/30 (stratified on ``'Real'``,
    ``random_state=0``), a ``model_together`` is trained for 5 epochs with Adam
    (lr 0.01) and cross-entropy, and the resulting weight matrix is read back.

    Returns:
        dict mapping each source name to a list with one weight per entry of
        ``classes`` (weight row ``i``, column of that source).
    """
    prob_suffixes = (' Prob LUAD', ' Prob HLT', ' Prob LUSC')

    samples = []
    for _, row in tqdm(df.iterrows()):
        samples.append([
            [row[d_type + suffix] if row['Has ' + d_type] != -1 else 0
             for d_type in data_types]
            for suffix in prob_suffixes
        ])
    x_train = np.asarray(samples)
    real = df['Real'].values

    # n_splits=1, so the splitter yields exactly one (train, validation) pair.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    train_index, test_index = next(splitter.split(x_train, real))
    train_x, train_y = x_train[train_index, :, :], real[train_index]
    val_x, val_y = x_train[test_index, :, :], real[test_index]

    dls = {
        'train': DataLoader(list(zip(train_x, train_y)), batch_size=32, shuffle=True),
        'val': DataLoader(list(zip(val_x, val_y)), batch_size=32, shuffle=False),
    }

    n_sources = len(data_types)
    model = model_together(n_sources)
    init_model(model, n_sources)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()
    model = train_model(model, dls, optimizer, criterion, num_epochs=5)

    # Rows of the weight matrix are classes, columns are sources.
    weights = model.weights.weight.detach().numpy()
    alphas = {}
    for column, d_type in enumerate(data_types):
        alphas[d_type] = [weights[class_idx, column] for class_idx in range(len(classes))]
    return alphas

In [None]:
import pickle
def integration_model(data_types: List[str], datasets: List[str], name: str, path: str, 
                      fusion_type='probs', use_alphas=False, m_alphas=None) -> None:
    for d in datasets:
        writer = pd.ExcelWriter(path+'data_integration_model_'+d+'_'+fusion_type+'_'+name+'.xlsx', engine='openpyxl')
        data = pd.read_excel('data_integration_noBDN

In [None]:
import pickle
def integration_model(data_types: List[str], datasets: List[str], name: str, path: str, 
                      fusion_type='probs', use_alphas=False, m_alphas=None) -> None:
    """Fuse the per-source predictions of every split and write them to Excel.

    For each ``d`` in ``datasets`` the workbook ``'data_integration_<d>.xlsx'``
    (sheets 0-9) is read, the columns ``'Integration Prob LUAD'``,
    ``'Integration Prob HLT'``, ``'Integration Prob LUSC'`` and
    ``'Integration Pred'`` are added to every sheet, and the result is written
    to ``path + 'data_integration_model_<d>_<fusion_type>_<name>.xlsx'`` with
    one sheet ``'split_<k>'`` per input sheet.

    Args:
        data_types: source names; the sheets must contain ``'Has <source>'``,
            ``'<source> Prob LUAD|HLT|LUSC'`` and ``'<source> Pred'`` columns.
        datasets: dataset names used to build the input/output file names.
        name: suffix of the output file names.
        path: prefix (directory) of the output files.
        fusion_type: ``'probs'`` fuses class probabilities; ``'preds'`` fuses
            predicted labels, falling back to probability fusion when exactly
            two predictions are available.
        use_alphas: when true, one set of weights per split is trained with
            ``get_alphas_SGD`` on ``'data_integration_train.xlsx'`` and pickled
            next to the output workbook (``..._alphas.pkl``).
        m_alphas: optional manual weights; when truthy they are used instead
            of the trained ones (training still runs, as before).

    NOTE(review): ``integrate_probs`` and ``integrate_preds`` are not defined
    in the visible part of the notebook; they must exist before this is called.
    """
    split_sheets = [0,1,2,3,4,5,6,7,8,9]
    class_names = ('LUAD', 'HLT', 'LUSC')

    for d in datasets:
        out_prefix = path+'data_integration_model_'+d+'_'+fusion_type+'_'+name
        # Context manager: the workbook is saved and its handle released even
        # if something below raises.
        with pd.ExcelWriter(out_prefix+'.xlsx', engine='openpyxl') as writer:
            data = pd.read_excel('data_integration_'+d+'.xlsx',
                  sheet_name=split_sheets,engine='openpyxl')

            if use_alphas:
                print('Getting alphas from training set...')
                data_train = pd.read_excel('data_integration_train.xlsx',
                  sheet_name=split_sheets,engine='openpyxl')
                splits_alphas = {}
                for df_name, df in data_train.items():
                    alphas = get_alphas_SGD(df, [0,1,2], data_types)
                    splits_alphas[df_name] = alphas
                    print(alphas)
                print('Saving alphas...')
                # 'with' closes the pickle file even when dump() fails.
                with open(out_prefix+'_alphas.pkl',"wb") as f:
                    pickle.dump(splits_alphas,f)

            for df_name, df in data.items():
                integration_probs = {cls: [] for cls in class_names}
                integration_preds = []
                for _, row in tqdm(df.iterrows()):
                    local_probs = {cls: [] for cls in class_names}
                    local_preds = []
                    for d_type in data_types:
                        if row['Has ' + d_type] != -1:
                            luad = row[d_type+ ' Prob LUAD']
                            hlt = row[d_type+ ' Prob HLT']
                            lusc = row[d_type+ ' Prob LUSC']
                            # Rescale so the three scores of this source sum to 1.
                            total = luad + hlt + lusc
                            local_probs['LUAD'].append(luad / total)
                            local_probs['HLT'].append(hlt / total)
                            local_probs['LUSC'].append(lusc / total)
                            local_preds.append(row[d_type + ' Pred'])
                        elif use_alphas:
                            # Missing source: pad with zeros, presumably so list
                            # positions stay aligned with data_types for the
                            # weighted fusion -- confirm against integrate_probs.
                            for cls in class_names:
                                local_probs[cls].append(0)

                    if fusion_type == 'probs':
                        if use_alphas:
                            if m_alphas:
                                alphas_manual = m_alphas
                                luad_prob = integrate_probs(local_probs['LUAD'], 0, alphas_manual)
                                hlt_prob = integrate_probs(local_probs['HLT'], 1, alphas_manual)
                                lusc_prob = integrate_probs(local_probs['LUSC'], 2, alphas_manual)
                            else:
                                luad_prob_new = integrate_probs(local_probs['LUAD'], 0, splits_alphas[df_name])
                                hlt_prob_new = integrate_probs(local_probs['HLT'], 1, splits_alphas[df_name])
                                lusc_prob_new = integrate_probs(local_probs['LUSC'], 2, splits_alphas[df_name])
                                # Rescale the fused scores so they sum to 1.
                                fused_total = luad_prob_new + hlt_prob_new + lusc_prob_new
                                luad_prob = luad_prob_new / fused_total
                                lusc_prob = lusc_prob_new / fused_total
                                hlt_prob = hlt_prob_new / fused_total
                        else:
                            luad_prob = integrate_probs(local_probs['LUAD'])
                            hlt_prob = integrate_probs(local_probs['HLT'])
                            lusc_prob = integrate_probs(local_probs['LUSC'])
                        integration_probs['LUAD'].append(luad_prob)
                        integration_probs['HLT'].append(hlt_prob)
                        integration_probs['LUSC'].append(lusc_prob)

                        pred = np.argmax([luad_prob,hlt_prob,lusc_prob], axis=0)
                        integration_preds.append(pred)

                    elif fusion_type == 'preds':
                        if len(local_preds) == 2:
                            # With exactly two predictions the probabilities
                            # are fused instead of the labels.
                            luad_prob = integrate_probs(local_probs['LUAD'])
                            hlt_prob = integrate_probs(local_probs['HLT'])
                            lusc_prob = integrate_probs(local_probs['LUSC'])

                            pred = np.argmax([luad_prob,hlt_prob,lusc_prob], axis=0)
                        else:
                            pred = integrate_preds(local_preds)
                            # Encode the fused label as a one-hot "probability".
                            if pred == 0:
                                luad_prob, hlt_prob, lusc_prob = 1, 0, 0
                            elif pred == 1:
                                luad_prob, hlt_prob, lusc_prob = 0, 1, 0
                            else:
                                luad_prob, hlt_prob, lusc_prob = 0, 0, 1

                        integration_probs['LUAD'].append(luad_prob)
                        integration_probs['HLT'].append(hlt_prob)
                        integration_probs['LUSC'].append(lusc_prob)
                        integration_preds.append(pred)

                for cls in integration_probs.keys():
                    df['Integration Prob '+ cls] = integration_probs[cls]

                df['Integration Pred'] = integration_preds

                # save to sheet
                df.to_excel(writer, sheet_name='split_'+str(df_name), index=False)

In [None]:
# Fuse all five sources with SGD-trained weights, for the test and train datasets.
# The output directory is created first so the run also works on a fresh checkout.
os.makedirs('results_SGD/', exist_ok=True)
integration_model(data_types=["WSI", "RNA", "miRNA", "CNV", 'DNA'],
                  datasets=['test', 'train'], name="SGD-all_sources",
                  path='results_SGD/', fusion_type='probs', use_alphas=True)

In [None]:
# average_precision_score and label_binarize are imported here because this cell
# uses them; relying on an import in a later cell breaks Restart & Run All.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import label_binarize

data_model = pd.read_excel('../result_files/data_integration_model_test_probs_SGD-all_sources.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')


def _split_metrics(frame, prefix):
    """Score the ``'<prefix> Pred'`` / ``'<prefix> Prob *'`` columns of ``frame`` against ``'Real'``.

    Returns ``(accuracy %, weighted F1 %, one-vs-rest ROC AUC, mean of the
    three per-class average-precision scores)``.
    """
    probs = frame[[prefix + ' Prob LUAD', prefix + ' Prob HLT', prefix + ' Prob LUSC']].to_numpy()
    preds = frame[prefix + ' Pred'].values
    real = frame['Real'].values
    real_binarized = label_binarize(real, classes=[*range(3)])
    acc = accuracy_score(real, preds)*100
    f1 = f1_score(real, preds, average='weighted')*100
    auc = roc_auc_score(real, probs, multi_class='ovr')
    auprc = np.mean([average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)])
    return acc, f1, auc, auprc


def _empty_results(sources):
    """One list per source, plus an ``'Integration'`` sub-dict with one list per source."""
    results = {source: [] for source in sources}
    results['Integration'] = {key: [] for key in list(sources) + ['Integration']}
    return results


_sources = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']
accs = _empty_results(_sources)
f1_scores = _empty_results(_sources)
aucs = _empty_results(_sources)
auprcs = _empty_results(_sources)

for d_type in _sources:
    for df_name, df in data_model.items():
        # Only the samples for which this source is available.
        df_only = df.loc[df['Has '+ d_type] != -1]

        # The source on its own ...
        acc, f1, auc, auprc = _split_metrics(df_only, d_type)
        accs[d_type].append(acc)
        f1_scores[d_type].append(f1)
        aucs[d_type].append(auc)
        auprcs[d_type].append(auprc)

        # ... versus the fused prediction on the very same samples.
        acc_int, f1_int, auc_int, auprc_int = _split_metrics(df_only, 'Integration')
        accs['Integration'][d_type].append(acc_int)
        f1_scores['Integration'][d_type].append(f1_int)
        aucs['Integration'][d_type].append(auc_int)
        auprcs['Integration'][d_type].append(auprc_int)

In [None]:
# Report mean+-std over the splits for every metric: for each source, first the
# source on its own, then the fused prediction on the same samples.
# Expects accs / f1_scores / aucs / auprcs to be the per-source dictionaries
# (one list per source plus an 'Integration' sub-dict keyed by source).
for metric_label, table in (('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', auprcs)):
    for d_type in ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']:
        alone = table[d_type]
        fused = table['Integration'][d_type]
        print('{} {}: {}+-{}'.format(d_type, metric_label, np.mean(alone), np.std(alone)))
        print('{} {}: {}+-{}'.format(d_type, metric_label, np.mean(fused), np.std(fused)))
        print(5*'-')
    # A longer rule separates the metric sections; none after the last one.
    if metric_label != 'AUPRC':
        print(10*'-')

In [None]:
import numpy as np
# Imported here because this cell uses them (a later cell's import is not
# available on a fresh Restart & Run All).
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import label_binarize

# Dedicated names: accs / f1_scores / aucs / auprcs are per-source dictionaries
# that the report cells index by source name, so they must not be rebound to lists.
integration_accs = []
integration_f1_scores = []
integration_aucs = []
integration_auprcs = []
data_model = pd.read_excel('../result_files/data_integration_model_test_probs_SGD-all_sources.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')

# Score the fused prediction on all samples of every split.
d_type = 'Integration'
for df_name, df in data_model.items():

    df_only = df
    probs = [[x,y,z] for x,y,z in zip(df_only[d_type + ' Prob LUAD'], df_only[d_type + ' Prob HLT'], df_only[d_type + ' Prob LUSC'])]
    probs = np.asarray(probs)
    preds = df_only[d_type + ' Pred'].values

    real = df_only['Real'].values
    real_binarized = label_binarize(real, classes=[*range(3)])

    # One average-precision score per class, averaged below.
    aucpr1 = average_precision_score(real_binarized[:, 0], probs[:, 0])
    aucpr2 = average_precision_score(real_binarized[:, 1], probs[:, 1])
    aucpr3 = average_precision_score(real_binarized[:, 2], probs[:, 2])

    acc = accuracy_score(real, preds)*100
    f1 = f1_score(real, preds, average='weighted')*100
    auc = roc_auc_score(real, probs, multi_class='ovr')
    integration_accs.append(acc)
    integration_f1_scores.append(f1)
    integration_aucs.append(auc)
    integration_auprcs.append(np.mean([aucpr1, aucpr2, aucpr3]))

print(d_type + ' ACC: {}+-{}'.format(np.mean(integration_accs),np.std(integration_accs)))
print(d_type + ' F1: {}+-{}'.format(np.mean(integration_f1_scores),np.std(integration_f1_scores)))
print(d_type + ' AUC: {}+-{}'.format(np.mean(integration_aucs),np.std(integration_aucs)))
print(d_type + ' AUPRC: {}+-{}'.format(np.mean(integration_auprcs),np.std(integration_auprcs)))

In [None]:
# Report mean+-std over the splits for accuracy, F1 and AUC: for each source,
# first the source on its own, then the fused prediction on the same samples.
# Expects accs / f1_scores / aucs to be the per-source dictionaries
# (one list per source plus an 'Integration' sub-dict keyed by source).
for metric_label, table in (('ACC', accs), ('F1', f1_scores), ('AUC', aucs)):
    for d_type in ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']:
        alone = table[d_type]
        fused = table['Integration'][d_type]
        print('{} {}: {}+-{}'.format(d_type, metric_label, np.mean(alone), np.std(alone)))
        print('{} {}: {}+-{}'.format(d_type, metric_label, np.mean(fused), np.std(fused)))
        print(5*'-')
    # A longer rule separates the metric sections; none after the last one.
    if metric_label != 'AUC':
        print(10*'-')

## Check the best integration of two sources

In [None]:
# makedirs(exist_ok=True) lets the cell be re-run and also creates 'results_SGD' if it is missing.
os.makedirs('results_SGD/two-sources-integration', exist_ok=True)

In [None]:
data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

# Run the fusion once for every unordered pair of sources, pairing each source
# only with the ones that come after it in data_types.
for first_pos, d_type1 in enumerate(data_types):
    for d_type2 in data_types[first_pos + 1:]:
        name = d_type1 + '-' + d_type2
        integration_model(
            data_types=[d_type1, d_type2],
            datasets=['test'],
            name=name,
            path='results_SGD/two-sources-integration/',
            fusion_type='probs',
            use_alphas=True,
        )

In [None]:
from glob import glob
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import label_binarize

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

# metric -> '<source1>-<source2>' -> sample group -> one value per split
accs = {}
f1_scores = {}
aucs = {}
aucprcs = {}


def _integration_metrics(frame):
    """Score the ``'Integration *'`` columns of ``frame`` against ``'Real'``.

    Returns ``(accuracy %, weighted F1 %, one-vs-rest ROC AUC, mean of the
    three per-class average-precision scores)``.
    """
    probs = frame[['Integration Prob LUAD', 'Integration Prob HLT', 'Integration Prob LUSC']].to_numpy()
    preds = frame['Integration Pred'].values
    real = frame['Real'].values
    real_binarized = label_binarize(real, classes=[*range(3)])
    acc = accuracy_score(real, preds)*100
    f1 = f1_score(real, preds, average='weighted')*100
    auc = roc_auc_score(real, probs, multi_class='ovr')
    auprc = np.mean([average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)])
    return acc, f1, auc, auprc


# Every unordered pair of sources, in the order the result files were produced.
for first_pos, d_type1 in enumerate(data_types):
    for d_type2 in data_types[first_pos + 1:]:
        name = d_type1 + '-' + d_type2
        data_model = pd.read_excel('../result_files/two-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')

        group_keys = ['Integration', d_type1+'Int', d_type2+'Int']
        for table in (accs, f1_scores, aucs, aucprcs):
            table[name] = {key: [] for key in group_keys}

        for df_name, df in data_model.items():
            has_first = df['Has '+ d_type1] != -1
            has_second = df['Has ' + d_type2] != -1
            subsets = {
                # samples for which at least one of the two sources is available
                'Integration': df.loc[has_first | has_second],
                # samples for which the first / second source is available
                d_type1+'Int': df.loc[has_first],
                d_type2+'Int': df.loc[has_second],
            }
            for key, subset in subsets.items():
                acc, f1, auc, auprc = _integration_metrics(subset)
                accs[name][key].append(acc)
                f1_scores[name][key].append(f1)
                aucs[name][key].append(auc)
                # Always the mean over the three classes, so every group holds
                # one value per split and the reported std is comparable.
                aucprcs[name][key].append(auprc)

In [None]:
# Report mean+-std over the splits for every pair of sources: the fused result
# on the pair's samples, then on the samples of each of the two sources.
source_pairs = [
    (first, second)
    for first_pos, first in enumerate(data_types)
    for second in data_types[first_pos + 1:]
]

# (label, results, whether a long rule follows the section)
report_sections = (
    ('ACC', accs, True),
    ('F1', f1_scores, True),
    ('AUC', aucs, False),
    ('AUPRC', aucprcs, False),
)

for metric_label, table, rule_after in report_sections:
    for d_type1, d_type2 in source_pairs:
        name = d_type1 + '-' + d_type2
        print(5*'-')
        for heading, key in ((name, 'Integration'),
                             (d_type1+'Int', d_type1+'Int'),
                             (d_type2+'Int', d_type2+'Int')):
            values = table[name][key]
            print('{} {}: {}+-{}'.format(heading, metric_label, np.mean(values), np.std(values)))
    if rule_after:
        print(10*'-')

## Check the best integration of three sources

In [None]:
# makedirs(exist_ok=True) lets the cell be re-run and also creates 'results_SGD' if it is missing.
os.makedirs('results_SGD/three-sources-integration', exist_ok=True)

In [None]:
data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

accs = {}
f1_scores = {}
aucs = {}

# Run the fusion once for every unordered triple of sources, with strictly
# increasing positions in data_types so that no combination is repeated.
n_sources = len(data_types)
for first_pos in range(n_sources):
    for second_pos in range(first_pos + 1, n_sources):
        for third_pos in range(second_pos + 1, n_sources):
            d_type1 = data_types[first_pos]
            d_type2 = data_types[second_pos]
            d_type3 = data_types[third_pos]
            name = d_type1 + '-' + d_type2 + '-' + d_type3
            integration_model(
                data_types=[d_type1, d_type2, d_type3],
                datasets=['test'],
                name=name,
                path='results_SGD/three-sources-integration/',
                fusion_type='probs',
                use_alphas=True,
            )

In [None]:

# Evaluate every three-source integration model on its 10 test sheets.
# For each combination we score the integrated prediction on
#   * 'Integration'  : rows where at least one of the three sources is present
#   * '<source>Int'  : rows where that particular source is present
# and keep one value per sheet for accuracy, weighted F1, OvR AUC and mean AUPRC.
from itertools import combinations

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']
# Not used in this cell; kept so any other cell that references them still runs.
data_types2 = copy.copy(data_types)
data_types3 = copy.copy(data_types)

# metric[name][subset_key] -> list with one entry per sheet
accs = {}
f1_scores = {}
aucs = {}
aucprcs = {}

integration_prob_cols = ['Integration Prob LUAD', 'Integration Prob HLT', 'Integration Prob LUSC']

# combinations() yields the same triples, in the same order, as the former
# hand-maintained k/i/j slice offsets.
for combo in combinations(data_types, 3):
    name = '-'.join(combo)
    data_model = pd.read_excel('../result_files/three-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
                               sheet_name=[0,1,2,3,4,5,6,7,8,9], engine='openpyxl')

    subset_keys = ['Integration'] + [d_type + 'Int' for d_type in combo]
    for table in (accs, f1_scores, aucs, aucprcs):
        table[name] = {key: [] for key in subset_keys}

    for df_name, df in data_model.items():
        # 'Has <source>' == -1 marks a sample for which that source is missing.
        has_source = {d_type: df['Has ' + d_type] != -1 for d_type in combo}
        # Rows where ANY of the three sources has data (logical OR of the masks).
        has_any = pd.concat(list(has_source.values()), axis=1).any(axis=1)

        subsets = [('Integration', df.loc[has_any])]
        subsets += [(d_type + 'Int', df.loc[has_source[d_type]]) for d_type in combo]

        for key, subset in subsets:
            probs = subset[integration_prob_cols].to_numpy()
            preds = subset['Integration Pred'].values
            real = subset['Real'].values
            real_binarized = label_binarize(real, classes=[*range(3)])

            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, average='weighted')*100
            auc = roc_auc_score(real, probs, multi_class='ovr')

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)
            aucs[name][key].append(auc)

            # Macro AUPRC: unweighted mean of the three one-vs-rest average precisions.
            per_class_ap = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
            aucprcs[name][key].append(np.mean(per_class_ap))

In [None]:
# Print mean +- std over the test sheets for every three-source combination.
# The result dicts preserve insertion order, so walking them reproduces the
# order in which the combinations (and their per-source subsets) were evaluated.
report_tables = [('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', aucprcs)]

for position, (metric_label, table) in enumerate(report_tables):
    for name, per_subset in table.items():
        print(5*'-')
        for subset_key, values in per_subset.items():
            # The integrated score is reported under the combination name,
            # the per-source scores under their '<source>Int' key.
            shown = name if subset_key == 'Integration' else subset_key
            print(shown + ' ' + metric_label + ': {}+-{}'.format(np.mean(values), np.std(values)))
    # A long separator follows the ACC and F1 sections only.
    if position < 2:
        print(10*'-')

## check best integration four sources

In [None]:
# Create the output folder for the four-source results.
# exist_ok=True keeps the cell re-runnable (os.mkdir raised FileExistsError on
# the second run); makedirs also creates '../result_files' if it is missing.
os.makedirs('../result_files/four-sources-integration', exist_ok=True)

In [None]:
# Fit/evaluate the integration model for every four-source combination.
from itertools import combinations

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

accs = {}
f1_scores = {}
aucs = {}

# The former nested loops tracked slice offsets by hand and only reset the
# innermost offset when the first source changed, so 'WSI-miRNA-CNV-DNA' was
# never generated. combinations() enumerates all 5 four-source subsets in the
# same lexicographic order.
# NOTE(review): results are written under 'results_SGD/four-sources-integration/'
# while the evaluation cell reads '../result_files/four-sources-integration/' --
# confirm how integration_model resolves `path`.
for combo in combinations(data_types, 4):
    name = '-'.join(combo)
    integration_model(data_types=list(combo),
                      datasets=['test'], name=name,
                      path='results_SGD/four-sources-integration/',
                      fusion_type='probs', use_alphas=True)

In [None]:
# Evaluate every four-source integration model on its 10 test sheets.
# For each combination we score the integrated prediction on
#   * 'Integration'  : rows where at least one of the four sources is present
#   * '<source>Int'  : rows where that particular source is present
# and keep one value per sheet for accuracy, weighted F1, OvR AUC and mean AUPRC.
from itertools import combinations

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

# metric[name][subset_key] -> list with one entry per sheet
accs = {}
f1_scores = {}
aucs = {}
aucprcs = {}

integration_prob_cols = ['Integration Prob LUAD', 'Integration Prob HLT', 'Integration Prob LUSC']

# combinations() covers all 5 four-source subsets. The former hand-maintained
# k/i/j/z offsets never reset z when the second source advanced and therefore
# skipped 'WSI-miRNA-CNV-DNA'.
for combo in combinations(data_types, 4):
    name = '-'.join(combo)
    workbook_path = '../result_files/four-sources-integration/data_integration_model_test_probs_'+name+'.xlsx'
    if not os.path.exists(workbook_path):
        # Results for this combination were not generated; report and move on
        # instead of aborting the whole evaluation.
        print('Skipping ' + name + ': ' + workbook_path + ' not found')
        continue
    data_model = pd.read_excel(workbook_path,
                               sheet_name=[0,1,2,3,4,5,6,7,8,9], engine='openpyxl')

    subset_keys = ['Integration'] + [d_type + 'Int' for d_type in combo]
    for table in (accs, f1_scores, aucs, aucprcs):
        table[name] = {key: [] for key in subset_keys}

    for df_name, df in data_model.items():
        # 'Has <source>' == -1 marks a sample for which that source is missing.
        has_source = {d_type: df['Has ' + d_type] != -1 for d_type in combo}
        # Rows where ANY of the four sources has data (logical OR of the masks).
        has_any = pd.concat(list(has_source.values()), axis=1).any(axis=1)

        subsets = [('Integration', df.loc[has_any])]
        subsets += [(d_type + 'Int', df.loc[has_source[d_type]]) for d_type in combo]

        for key, subset in subsets:
            probs = subset[integration_prob_cols].to_numpy()
            preds = subset['Integration Pred'].values
            real = subset['Real'].values
            real_binarized = label_binarize(real, classes=[*range(3)])

            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, average='weighted')*100
            auc = roc_auc_score(real, probs, multi_class='ovr')

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)
            aucs[name][key].append(auc)

            # Macro AUPRC: unweighted mean of the three one-vs-rest average precisions.
            per_class_ap = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
            aucprcs[name][key].append(np.mean(per_class_ap))
    

In [None]:
# Print mean +- std over the test sheets for every evaluated four-source
# combination. Iterating over the stored results (dicts keep insertion order)
# replaces the hand-maintained k/i/j/z slice offsets, which could not enumerate
# every four-source combination, and guarantees that each combination that was
# evaluated is also reported.
report_tables = [('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', aucprcs)]

for position, (metric_label, table) in enumerate(report_tables):
    for name, per_subset in table.items():
        print(5*'-')
        for subset_key, values in per_subset.items():
            # The integrated score is reported under the combination name,
            # the per-source scores under their '<source>Int' key.
            shown = name if subset_key == 'Integration' else subset_key
            print(shown + ' ' + metric_label + ': {}+-{}'.format(np.mean(values), np.std(values)))
    # A long separator follows the ACC and F1 sections only (as before).
    if position < 2:
        print(10*'-')

## check best integration five sources

In [None]:
# Evaluate the all-sources integration model and each single-source model on
# the 10 test sheets: accuracy, weighted F1, OvR AUC and mean AUPRC per sheet.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize

data_model = pd.read_excel('../result_files/data_integration_model_test_probs_SGD-all_sources.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')

result_keys = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA', 'Integration']
# metric[key] -> list with (at most) one entry per sheet
accs = {key: [] for key in result_keys}
f1_scores = {key: [] for key in result_keys}
aucs = {key: [] for key in result_keys}
auprcs = {key: [] for key in result_keys}

sizes = 0
# `lengths` is shared with other cells; create it if this cell runs first.
try:
    lengths
except NameError:
    lengths = {}
lengths['all'] = {'luad': 0, 'hlt': 0, 'lusc': 0}

for df_name, df in data_model.items():
    # Rows where at least one source is available ('Has <source>' == -1 means missing).
    df_only = df.loc[(df['Has WSI'] != -1) | (df['Has RNA'] != -1) | (df['Has miRNA'] != -1) | (df['Has CNV'] != -1) | (df['Has DNA'] != -1)]
    print(df_only.shape[0])
    sizes += df_only.shape[0]

    # Class counts of the samples evaluated on this sheet. These were previously
    # computed from a stale `real` left over from an earlier cell/iteration.
    real = df_only['Real'].values
    lengths['all']['luad'] += int(np.count_nonzero(real == 0))
    lengths['all']['hlt'] += int(np.count_nonzero(real == 1))
    lengths['all']['lusc'] += int(np.count_nonzero(real == 2))

    # Single-source models are scored on the rows where that source exists;
    # the integrated model on every row with at least one source.
    for source in ['RNA', 'WSI', 'miRNA', 'CNV', 'DNA', 'Integration']:
        if source == 'Integration':
            subset = df_only
        else:
            subset = df.loc[df['Has ' + source] != -1]

        probs = subset[[source + ' Prob LUAD', source + ' Prob HLT', source + ' Prob LUSC']].to_numpy()
        preds = subset[source + ' Pred'].values
        real = subset['Real'].values
        real_binarized = label_binarize(real, classes=[*range(3)])

        acc = accuracy_score(real, preds)*100
        f1 = f1_score(real, preds, average='weighted')*100

        # AUC / AUPRC are best-effort: a sheet where they cannot be computed is
        # left out of that metric, but the reason is reported instead of
        # being swallowed silently.
        try:
            auc = roc_auc_score(real, probs, multi_class='ovr')
            aucs[source].append(auc)
        except Exception as err:
            print('{} (sheet {}): AUC not computed: {}'.format(source, df_name, err))

        try:
            per_class_ap = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
            # Ignore classes whose average precision came back as NaN.
            valid_ap = [value for value in per_class_ap if not np.isnan(value)]
            auprcs[source].append(np.mean(valid_ap))
        except Exception as err:
            print('{} (sheet {}): AUPRC not computed: {}'.format(source, df_name, err))

        accs[source].append(acc)
        f1_scores[source].append(f1)

print(sizes)

In [None]:
# Report mean +- std across the test sheets for the integrated model and each
# single source, one section per metric, with a long separator between sections.
report_sources = ['Integration','WSI', 'RNA', 'miRNA', 'CNV', 'DNA']
report_tables = (('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', auprcs))

for position, (metric_label, table) in enumerate(report_tables):
    if position > 0:
        print(10*'-')
    for d_type in report_sources:
        values = table[d_type]
        print(d_type + ' ' + metric_label + ': {}+-{}'.format(np.mean(values), np.std(values)))
        print(5*'-')

## check best integration two sources without NANs

In [None]:
from glob import glob
from itertools import combinations
# average_precision_score is needed for the AUPRC step below; importing it here
# means that step does not depend on an earlier cell having imported it.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, average_precision_score

# Results for every pair of data types, keyed by '<type1>-<type2>'.
# Each metric dict maps the pair name to
#   {'Integration': [...], '<type1>Int': [...], '<type2>Int': [...]}
# with one value appended per workbook sheet.
accs = {}
f1_scores = {}
aucs = {}
auprcs = {}
# Per-pair class counts (0 -> 'luad', 1 -> 'hlt', 2 -> 'lusc') summed over all sheets.
lengths = {}

for d_type1, d_type2 in combinations(data_types, 2):
    name = d_type1 + '-' + d_type2
    data_model = pd.read_excel('../result_files/two-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
                               sheet_name=list(range(10)), engine='openpyxl')

    # (result key, column prefix) for the integrated model and each single source.
    sources = [('Integration', 'Integration'), (d_type1+'Int', d_type1), (d_type2+'Int', d_type2)]

    accs[name] = {key: [] for key, _ in sources}
    f1_scores[name] = {key: [] for key, _ in sources}
    aucs[name] = {key: [] for key, _ in sources}
    auprcs[name] = {key: [] for key, _ in sources}
    lengths[name] = {'luad': 0, 'hlt': 0, 'lusc': 0}

    for df_name, df in data_model.items():
        # Keep only the rows where both sources have data ('Has <type>' != -1).
        df_only = df.loc[(df['Has '+ d_type1] != -1) & (df['Has ' + d_type2] != -1)]
        real = df_only['Real'].values
        real_binarized = label_binarize(real, classes=[*range(3)])

        lengths[name]['luad'] += int(np.count_nonzero(real == 0))
        lengths[name]['hlt'] += int(np.count_nonzero(real == 1))
        lengths[name]['lusc'] += int(np.count_nonzero(real == 2))

        for key, prefix in sources:
            probs = df_only[[prefix+' Prob LUAD', prefix+' Prob HLT', prefix+' Prob LUSC']].values
            preds = df_only[prefix+' Pred'].values
            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, average='weighted')*100

            # AUC and AUPRC stay best-effort: a sheet on which they cannot be
            # computed is skipped for that metric, but the reason is reported
            # instead of being swallowed silently.
            try:
                aucs[name][key].append(roc_auc_score(real, probs, multi_class='ovr'))
            except Exception as err:
                print('{} / {} (sheet {}): AUC skipped: {}'.format(name, key, df_name, err))

            try:
                class_aps = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
                # Drop NaN per-class scores before averaging over the classes.
                class_aps = [ap for ap in class_aps if not np.isnan(ap)]
                auprcs[name][key].append(np.mean(class_aps))
            except Exception as err:
                print('{} / {} (sheet {}): AUPRC skipped: {}'.format(name, key, df_name, err))

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)

In [None]:
# Print mean+-std per pair of data types, one metric section at a time.
# Each entry: (label, results dict, also print the sample counts?, end the
# section with a ten-dash rule?).
pair_sections = (
    ('ACC', accs, True, True),
    ('F1', f1_scores, False, True),
    ('AUC', aucs, False, False),
    ('AUPRC', auprcs, False, False),
)

for metric_label, metric_values, with_counts, closing_rule in pair_sections:
    for first_pos, d_type1 in enumerate(data_types):
        for d_type2 in data_types[first_pos + 1:]:
            name = d_type1 + '-' + d_type2
            print(5*'-')
            if with_counts:
                print('Number of samples {}'.format(lengths[name]))
            # Integrated model first, then each single source on the same rows.
            for caption, key in ((name, 'Integration'), (d_type1, d_type1+'Int'), (d_type2, d_type2+'Int')):
                collected = metric_values[name][key]
                print('{} {}: {}+-{}'.format(caption, metric_label, np.mean(collected), np.std(collected)))
    if closing_rule:
        print(10*'-')

In [None]:
# List the accumulated class counts for every pair of data types.
for first_pos, d_type1 in enumerate(data_types):
    for d_type2 in data_types[first_pos + 1:]:
        name = '-'.join((d_type1, d_type2))
        print('{}: Number of samples {}'.format(name, lengths[name]))

## Check best integration of three sources without NaNs

In [None]:

from itertools import combinations

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']
# These copies are not read in this cell; kept in case other cells reference them.
data_types2 = copy.copy(data_types)
data_types3 = copy.copy(data_types)

# Results for every triple of data types, keyed by '<type1>-<type2>-<type3>'.
# Each metric dict maps the triple name to {'Integration': [...], '<type>Int': [...]}
# with one value appended per workbook sheet.
accs = {}
f1_scores = {}
aucs = {}
auprcs = {}
# NOTE: `lengths` is not re-created here; it must already exist when this cell
# runs, and the per-triple class counts are added to it.

for d_type1, d_type2, d_type3 in combinations(data_types, 3):
    name = d_type1 + '-' + d_type2 + '-' + d_type3
    data_model = pd.read_excel('../result_files/three-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
                               sheet_name=list(range(10)), engine='openpyxl')

    # (result key, column prefix) for the integrated model and each single source.
    sources = [('Integration', 'Integration'),
               (d_type1+'Int', d_type1), (d_type2+'Int', d_type2), (d_type3+'Int', d_type3)]

    accs[name] = {key: [] for key, _ in sources}
    f1_scores[name] = {key: [] for key, _ in sources}
    aucs[name] = {key: [] for key, _ in sources}
    auprcs[name] = {key: [] for key, _ in sources}
    lengths[name] = {'luad': 0, 'hlt': 0, 'lusc': 0}

    for df_name, df in data_model.items():
        # Keep only the rows where all three sources have data ('Has <type>' != -1).
        df_only = df.loc[(df['Has '+ d_type1] != -1) & (df['Has ' + d_type2] != -1) & (df['Has ' + d_type3] != -1)]
        real = df_only['Real'].values
        real_binarized = label_binarize(real, classes=[*range(3)])

        for key, prefix in sources:
            probs = df_only[[prefix+' Prob LUAD', prefix+' Prob HLT', prefix+' Prob LUSC']].values
            preds = df_only[prefix+' Pred'].values
            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, average='weighted')*100

            # The AUC is appended only when it was actually computed for this
            # sheet. The previous unconditional second append recorded the
            # Integration AUC twice and, when roc_auc_score failed, recorded a
            # stale value left over from an earlier computation.
            try:
                aucs[name][key].append(roc_auc_score(real, probs, multi_class='ovr'))
            except Exception as err:
                print('{} / {} (sheet {}): AUC skipped: {}'.format(name, key, df_name, err))

            try:
                class_aps = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
                # Drop NaN per-class scores before averaging over the classes.
                class_aps = [ap for ap in class_aps if not np.isnan(ap)]
                auprcs[name][key].append(np.mean(class_aps))
            except Exception as err:
                print('{} / {} (sheet {}): AUPRC skipped: {}'.format(name, key, df_name, err))

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)

        # Class counts of the rows evaluated on this sheet (0=LUAD, 1=HLT, 2=LUSC).
        lengths[name]['luad'] += int(np.count_nonzero(real == 0))
        lengths[name]['hlt'] += int(np.count_nonzero(real == 1))
        lengths[name]['lusc'] += int(np.count_nonzero(real == 2))

In [None]:
# Print mean+-std per triple of data types, one metric section at a time.
# A ten-dash rule separates consecutive sections; a five-dash rule precedes
# every triple.
triple_sections = (('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', auprcs))

for section_pos, (metric_label, metric_values) in enumerate(triple_sections):
    if section_pos:
        print(10*'-')
    for first_pos, d_type1 in enumerate(data_types):
        for second_pos in range(first_pos + 1, len(data_types)):
            d_type2 = data_types[second_pos]
            for d_type3 in data_types[second_pos + 1:]:
                print(5*'-')
                name = d_type1 + '-' + d_type2 + '-' + d_type3
                # Integrated model first, then each single source on the same rows.
                rows = ((name, 'Integration'), (d_type1, d_type1+'Int'),
                        (d_type2, d_type2+'Int'), (d_type3, d_type3+'Int'))
                for caption, key in rows:
                    collected = metric_values[name][key]
                    print('{} {}: {}+-{}'.format(caption, metric_label, np.mean(collected), np.std(collected)))

In [None]:
# List the accumulated class counts for every triple of data types.
for first_pos, d_type1 in enumerate(data_types):
    for second_pos in range(first_pos + 1, len(data_types)):
        d_type2 = data_types[second_pos]
        for d_type3 in data_types[second_pos + 1:]:
            name = '-'.join((d_type1, d_type2, d_type3))
            print('{}: Number of samples {}'.format(name, lengths[name]))

## Check best integration of four sources without NaNs

In [None]:
from itertools import combinations

data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

# Results for every 4-combination of data types, keyed by
# '<type1>-<type2>-<type3>-<type4>'. Each metric dict maps that name to
# {'Integration': [...], '<type>Int': [...]} with one value appended per sheet.
accs = {}
f1_scores = {}
aucs = {}
auprcs = {}
# NOTE: `lengths` is not re-created here; it must already exist when this cell
# runs, and the per-combination class counts are added to it.

# combinations() visits every 4-subset in list order. The previous hand-rolled
# index bookkeeping did not reset the innermost start index between iterations
# and therefore never reached WSI-miRNA-CNV-DNA (that combination is also
# evaluated by the explicit section that follows in this cell).
for d_type1, d_type2, d_type3, d_type4 in combinations(data_types, 4):
    name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4
    data_model = pd.read_excel('../result_files/four-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
                               sheet_name=list(range(10)), engine='openpyxl')

    # (result key, column prefix) for the integrated model and each single source.
    sources = [('Integration', 'Integration'),
               (d_type1+'Int', d_type1), (d_type2+'Int', d_type2),
               (d_type3+'Int', d_type3), (d_type4+'Int', d_type4)]

    accs[name] = {key: [] for key, _ in sources}
    f1_scores[name] = {key: [] for key, _ in sources}
    aucs[name] = {key: [] for key, _ in sources}
    auprcs[name] = {key: [] for key, _ in sources}
    lengths[name] = {'luad': 0, 'hlt': 0, 'lusc': 0}

    for df_name, df in data_model.items():
        # Keep only the rows where all four sources have data ('Has <type>' != -1).
        df_only = df.loc[(df['Has '+ d_type1] != -1) & (df['Has ' + d_type2] != -1)
                         & (df['Has ' + d_type3] != -1) & (df['Has ' + d_type4] != -1)]
        real = df_only['Real'].values
        real_binarized = label_binarize(real, classes=[*range(3)])

        for key, prefix in sources:
            probs = df_only[[prefix+' Prob LUAD', prefix+' Prob HLT', prefix+' Prob LUSC']].values
            preds = df_only[prefix+' Pred'].values
            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, average='weighted')*100

            # The AUC is appended only when it was actually computed for this
            # sheet. The previous unconditional second append recorded the
            # Integration AUC twice and, when roc_auc_score failed, recorded a
            # stale value left over from an earlier computation.
            try:
                aucs[name][key].append(roc_auc_score(real, probs, multi_class='ovr'))
            except Exception as err:
                print('{} / {} (sheet {}): AUC skipped: {}'.format(name, key, df_name, err))

            try:
                class_aps = [average_precision_score(real_binarized[:, c], probs[:, c]) for c in range(3)]
                # Drop NaN per-class scores before averaging over the classes.
                class_aps = [ap for ap in class_aps if not np.isnan(ap)]
                auprcs[name][key].append(np.mean(class_aps))
            except Exception as err:
                print('{} / {} (sheet {}): AUPRC skipped: {}'.format(name, key, df_name, err))

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)

        # Class counts of the rows evaluated on this sheet (0=LUAD, 1=HLT, 2=LUSC).
        lengths[name]['luad'] += int(np.count_nonzero(real == 0))
        lengths[name]['hlt'] += int(np.count_nonzero(real == 1))
        lengths[name]['lusc'] += int(np.count_nonzero(real == 2))

# Explicit evaluation of the WSI-miRNA-CNV-DNA combination: load its workbook
# and (re)create empty result containers under its name.
d_type1, d_type2, d_type3, d_type4 = 'WSI', 'miRNA', 'CNV', 'DNA'

name = '-'.join((d_type1, d_type2, d_type3, d_type4))
data_model = pd.read_excel(
    '../result_files/four-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
    sheet_name=[0,1,2,3,4,5,6,7,8,9],
    engine='openpyxl',
)

# One empty list for the integrated model plus one per single source.
result_keys = ['Integration'] + [source+'Int' for source in (d_type1, d_type2, d_type3, d_type4)]
for metric_values in (accs, f1_scores, aucs, auprcs):
    metric_values[name] = {key: [] for key in result_keys}

lengths[name] = dict(luad=0, hlt=0, lusc=0)

# Score every result table in `data_model` for the current combination `name`.
# Only rows for which all four sources are available are evaluated. For the
# fused model ('Integration') and for each single source we append accuracy (%),
# weighted F1 (%), one-vs-rest ROC AUC and the mean per-class average precision.
_sources = (d_type1, d_type2, d_type3, d_type4)
# (column prefix in the result table, key in the metric stores)
_targets = [('Integration', 'Integration')] + [(s, s + 'Int') for s in _sources]

for df_name, df in data_model.items():
    # take those rows where the four sources have data ('Has <source>' != -1)
    df_only = df.loc[(df['Has ' + d_type1] != -1)
                     & (df['Has ' + d_type2] != -1)
                     & (df['Has ' + d_type3] != -1)
                     & (df['Has ' + d_type4] != -1)]

    # The ground truth is shared by every model of this table:
    # 0 = LUAD, 1 = HLT, 2 = LUSC.
    real = df_only['Real'].values
    real_binarized = label_binarize(real, classes=[*range(3)])

    lengths[name]['luad'] += int(np.sum(real == 0))
    lengths[name]['hlt'] += int(np.sum(real == 1))
    lengths[name]['lusc'] += int(np.sum(real == 2))

    for _prefix, _key in _targets:
        probs = df_only[[_prefix + ' Prob LUAD',
                         _prefix + ' Prob HLT',
                         _prefix + ' Prob LUSC']].to_numpy(dtype=float)
        preds = df_only[_prefix + ' Pred'].values

        acc = accuracy_score(real, preds)*100
        f1 = f1_score(real, preds, average='weighted')*100

        # ROC AUC is undefined when a class is missing from this table; such
        # tables are reported and left out of the AUC list. Each value is
        # appended exactly once, and only when it was actually computed.
        try:
            auc = roc_auc_score(real, probs, multi_class='ovr')
        except ValueError as err:
            print('{} [{}] table {}: ROC AUC skipped ({})'.format(name, _key, df_name, err))
        else:
            aucs[name][_key].append(auc)

        # Mean of the per-class average precision, ignoring undefined (NaN) classes.
        try:
            per_class_ap = [average_precision_score(real_binarized[:, c], probs[:, c])
                            for c in range(3)]
        except ValueError as err:
            print('{} [{}] table {}: AUPRC skipped ({})'.format(name, _key, df_name, err))
        else:
            array = [x for x in per_class_ap if not np.isnan(x)]
            auprcs[name][_key].append(np.mean(array))

        accs[name][_key].append(acc)
        f1_scores[name][_key].append(f1)

In [None]:
from itertools import combinations

# Print mean +- std of each metric for every four-source combination: first the
# fused model, then each of its single sources.
# itertools.combinations enumerates all 4-subsets in list order. The previous
# hand-rolled k/i/j/z loops never reset z when the third loop advanced, so they
# skipped combinations such as WSI-miRNA-CNV-DNA.
for metric_label, metric_store in (('ACC', accs), ('F1', f1_scores),
                                   ('AUC', aucs), ('AUPRC', auprcs)):
    if metric_label != 'ACC':
        print(10*'-')  # separator between metric sections
    for d_type1, d_type2, d_type3, d_type4 in combinations(data_types, 4):
        name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4
        print(5*'-')
        if name not in metric_store:
            # Nothing was evaluated for this combination; say so instead of raising KeyError.
            print(name + ' ' + metric_label + ': not computed')
            continue
        rows = [(name, 'Integration')]
        rows += [(d, d + 'Int') for d in (d_type1, d_type2, d_type3, d_type4)]
        for label, key in rows:
            values = metric_store[name][key]
            print(label + ' ' + metric_label + ': {}+-{}'.format(np.mean(values), np.std(values)))

In [None]:
from itertools import combinations

# Print the accumulated per-class sample counts of every four-source combination.
# combinations() visits every 4-subset of data_types in list order; the previous
# k/i/j/z index juggling never reset z and therefore skipped some of them.
for d_type1, d_type2, d_type3, d_type4 in combinations(data_types, 4):
    name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4
    if name in lengths:
        print('{}: Number of samples {}'.format(name, lengths[name]))
    else:
        # No counters exist for this combination; report it rather than raise KeyError.
        print('{}: Number of samples not computed'.format(name))

In [None]:
# Mean +- std of every metric for one combination: the fused model first,
# then each of its four single sources.
# NOTE(review): d_type1..d_type4 are not set in this cell; it reuses whatever
# the last executed cell left in them. Confirm they name the intended
# combination before trusting this output.
name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4

_rows = [(name, 'Integration')]
_rows += [(_d, _d + 'Int') for _d in (d_type1, d_type2, d_type3, d_type4)]

_sections = [('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', auprcs)]
for _pos, (_metric_label, _metric_store) in enumerate(_sections):
    if _pos:
        print(10*'-')  # separator between metric sections
    for _label, _key in _rows:
        _values = _metric_store[name][_key]
        print(_label + ' ' + _metric_label + ': {}+-{}'.format(np.mean(_values), np.std(_values)))

### Omit healthy class

In [None]:
# NOTE(review): this cell stops right after the first per-table filter and never
# advances k/i/j/z, so the slice bounds below stay fixed and the inner loops also
# build names that repeat a source (e.g. 'WSI-RNA-CNV-CNV'). It looks like an
# unfinished draft of a fuller evaluation cell -- confirm whether it can be dropped.
data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

# Lower slice bounds used by the four nested loops.
k = 0
i = 1
j = 2
z = 3

# Metric stores and sample counters, keyed by combination name.
accs = {}
f1_scores = {}
aucs = {}
lengths = {}
for d_type1 in data_types[k:]:
    for d_type2 in data_types[i:]:
        for d_type3 in data_types[j:]:
            for d_type4 in data_types[z:]:
                name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4
                # Load sheets 0-9 of the workbook for this combination as a dict of DataFrames.
                data_model = pd.read_excel('results_SGD/four-sources-integration/data_integration_model_test_probs_'+name+'.xlsx',
                  sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')

                accs[name] = {}
                f1_scores[name] = {}
                aucs[name] = {}

                # One empty list per model: the fused model plus each single source.
                accs[name]['Integration'] = []
                accs[name][d_type1+'Int'] = []
                accs[name][d_type2+'Int'] = []
                accs[name][d_type3+'Int'] = []
                accs[name][d_type4+'Int'] = []

                f1_scores[name]['Integration'] = []
                f1_scores[name][d_type1+'Int'] = []
                f1_scores[name][d_type2+'Int'] = []
                f1_scores[name][d_type3+'Int'] = []
                f1_scores[name][d_type4+'Int'] = []

                aucs[name]['Integration'] = []
                aucs[name][d_type1+'Int'] = []
                aucs[name][d_type2+'Int'] = []
                aucs[name][d_type3+'Int'] = []
                aucs[name][d_type4+'Int'] = []

                lengths[name] = {'luad': 0, 'lusc': 0}

                for df_name, df in data_model.items():
                    # take those rows where the four sources have data
                    df_only = df.loc[(df['Has '+ d_type1] != -1) & (df['Has ' + d_type2] != -1) & (df['Has ' + d_type3] != -1) & (df['Has ' + d_type4] != -1)]
                    # The draft ends here: df_dt1 is bound to the unfiltered table and never used.
                    df_dt1 = df

In [None]:
from itertools import combinations

# LUAD-vs-LUSC evaluation of every four-source combination with the healthy
# class removed. For each combination the workbook is loaded and, for every
# sheet, accuracy (%), weighted F1 (%) and ROC AUC are appended for the fused
# model ('Integration') and for each single source.
data_types = ['WSI', 'RNA', 'miRNA', 'CNV', 'DNA']

accs = {}
f1_scores = {}
aucs = {}
lengths = {}

# combinations() visits every 4-subset in list order. The previous k/i/j/z index
# juggling never reset z when the third loop advanced and skipped WSI-miRNA-CNV-DNA.
for d_type1, d_type2, d_type3, d_type4 in combinations(data_types, 4):
    sources = (d_type1, d_type2, d_type3, d_type4)
    name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4

    results_path = 'results_SGD/four-sources-integration/data_integration_model_test_probs_'+name+'.xlsx'
    if not os.path.exists(results_path):
        # A combination without a results workbook is reported, not fatal.
        print('{}: {} not found, combination skipped'.format(name, results_path))
        continue
    data_model = pd.read_excel(results_path,
                               sheet_name=[0,1,2,3,4,5,6,7,8,9], engine='openpyxl')

    # (column prefix in the result table, key in the metric stores)
    targets = [('Integration', 'Integration')] + [(s, s + 'Int') for s in sources]
    accs[name] = {key: [] for _, key in targets}
    f1_scores[name] = {key: [] for _, key in targets}
    aucs[name] = {key: [] for _, key in targets}
    lengths[name] = {'luad': 0, 'lusc': 0}

    for df_name, df in data_model.items():
        # take those rows where the four sources have data ('Has <source>' != -1)
        df_only = df.loc[(df['Has ' + d_type1] != -1)
                         & (df['Has ' + d_type2] != -1)
                         & (df['Has ' + d_type3] != -1)
                         & (df['Has ' + d_type4] != -1)]
        # omit the healthy class (label 1); the remaining labels are 0 = LUAD, 2 = LUSC
        df_only = df_only[df_only['Real'] != 1]

        real = df_only['Real'].values
        # Binary target for the AUC: 1 = LUSC, 0 = LUAD.
        real_binary = np.where(real == 2, 1, 0)

        # Count on the original labels; counting `== 2` on the remapped 0/1
        # array always gave 0 LUSC samples.
        lengths[name]['luad'] += int(np.sum(real == 0))
        lengths[name]['lusc'] += int(np.sum(real == 2))

        for prefix, key in targets:
            lusc_prob = df_only[prefix + ' Prob LUSC'].to_numpy(dtype=float)
            preds = df_only[prefix + ' Pred'].values

            # The predictions use the 0/1/2 label encoding, so they are scored
            # against the original labels. Comparing them with the 0/1 remapped
            # target counted every correct LUSC prediction (2) as an error.
            # A healthy prediction (1) stays an error; F1 is averaged over the
            # two classes that are actually present.
            acc = accuracy_score(real, preds)*100
            f1 = f1_score(real, preds, labels=[0, 2], average='weighted')*100

            # ROC AUC is undefined when only one class is present in the sheet;
            # such sheets are reported and left out. Each value is appended once.
            try:
                auc = roc_auc_score(real_binary, lusc_prob)
            except ValueError as err:
                print('{} [{}] sheet {}: ROC AUC skipped ({})'.format(name, key, df_name, err))
            else:
                aucs[name][key].append(auc)

            accs[name][key].append(acc)
            f1_scores[name][key].append(f1)

In [None]:
from itertools import combinations

# Print mean +- std of each metric for every four-source combination: first the
# fused model, then each of its single sources.
# itertools.combinations enumerates all 4-subsets in list order. The previous
# hand-rolled k/i/j/z loops never reset z when the third loop advanced, so they
# skipped combinations such as WSI-miRNA-CNV-DNA.
for metric_label, metric_store in (('ACC', accs), ('F1', f1_scores), ('AUC', aucs)):
    if metric_label != 'ACC':
        print(10*'-')  # separator between metric sections
    for d_type1, d_type2, d_type3, d_type4 in combinations(data_types, 4):
        name = d_type1 + '-' + d_type2 + '-' + d_type3 + '-' + d_type4
        print(5*'-')
        if name not in metric_store:
            # Nothing was evaluated for this combination; say so instead of raising KeyError.
            print(name + ' ' + metric_label + ': not computed')
            continue
        rows = [(name, 'Integration')]
        rows += [(d, d + 'Int') for d in (d_type1, d_type2, d_type3, d_type4)]
        for label, key in rows:
            values = metric_store[name][key]
            print(label + ' ' + metric_label + ': {}+-{}'.format(np.mean(values), np.std(values)))

In [None]:
# Display the sample-count dict (keyed by combination name) as the cell output.
lengths

## Check best integration of five sources without NaNs

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score

# Evaluate the five-source model on the samples for which every source is
# available. For each sheet we append accuracy (%), weighted F1 (%),
# one-vs-rest ROC AUC and the mean per-class average precision for the fused
# model ('Integration') and for each single source.
data_model = pd.read_excel('../result_files/data_integration_model_test_probs_SGD-all_sources.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')

_store_keys = ('WSI', 'RNA', 'miRNA', 'CNV', 'DNA', 'Integration')
accs = {key: [] for key in _store_keys}
f1_scores = {key: [] for key in _store_keys}
aucs = {key: [] for key in _store_keys}
auprcs = {key: [] for key in _store_keys}

# The fused model is scored first so that, after the loop, probs/preds/acc/f1
# still hold the values of the last single source, as they did before.
_eval_order = ('Integration', 'RNA', 'WSI', 'miRNA', 'CNV', 'DNA')

sizes = 0
lengths['all'] = {'luad': 0, 'hlt': 0, 'lusc': 0}
for df_name, df in data_model.items():

    # keep only rows where all five sources have data ('Has <source>' != -1)
    df_only = df.loc[(df['Has WSI'] != -1)
                     & (df['Has RNA'] != -1)
                     & (df['Has miRNA'] != -1)
                     & (df['Has CNV'] != -1)
                     & (df['Has DNA'] != -1)]
    print(df_only.shape[0])
    sizes += df_only.shape[0]

    # Read the labels of THIS sheet before counting them. The counts used to be
    # taken from a `real` left over from an earlier cell or the previous sheet.
    # Labels: 0 = LUAD, 1 = HLT, 2 = LUSC.
    real = df_only['Real'].values
    real_binarized = label_binarize(real, classes=[*range(3)])

    lengths['all']['luad'] += int(np.sum(real == 0))
    lengths['all']['hlt'] += int(np.sum(real == 1))
    lengths['all']['lusc'] += int(np.sum(real == 2))

    for _target in _eval_order:
        probs = df_only[[_target + ' Prob LUAD',
                         _target + ' Prob HLT',
                         _target + ' Prob LUSC']].to_numpy(dtype=float)
        preds = df_only[_target + ' Pred'].values

        acc = accuracy_score(real, preds)*100
        f1 = f1_score(real, preds, average='weighted')*100

        # ROC AUC is undefined when a class is missing from this sheet; such
        # sheets are reported and left out of the AUC list.
        try:
            auc = roc_auc_score(real, probs, multi_class='ovr')
        except ValueError as err:
            print('{} sheet {}: ROC AUC skipped ({})'.format(_target, df_name, err))
        else:
            aucs[_target].append(auc)
            if _target == 'Integration':
                auc_int = auc

        # Mean of the per-class average precision, ignoring undefined (NaN) classes.
        try:
            per_class_ap = [average_precision_score(real_binarized[:, c], probs[:, c])
                            for c in range(3)]
        except ValueError as err:
            print('{} sheet {}: AUPRC skipped ({})'.format(_target, df_name, err))
        else:
            array = [x for x in per_class_ap if not np.isnan(x)]
            auprcs[_target].append(np.mean(array))

        accs[_target].append(acc)
        f1_scores[_target].append(f1)

        if _target == 'Integration':
            # Keep the fused model's arrays and scores under the *_int names
            # this cell leaves in the namespace.
            probs_int, preds_int = probs, preds
            acc_int, f1_int = acc, f1

print(sizes)

In [None]:
# Mean +- std of every metric for the fused model and each single source,
# one section per metric.
_report_order = ['Integration', 'WSI', 'RNA', 'miRNA', 'CNV', 'DNA']
_sections = [('ACC', accs), ('F1', f1_scores), ('AUC', aucs), ('AUPRC', auprcs)]

for _pos, (_metric_label, _metric_store) in enumerate(_sections):
    if _pos > 0:
        print(10*'-')  # separator between metric sections
    for d_type in _report_order:
        _values = _metric_store[d_type]
        print(d_type + ' ' + _metric_label + ': {}+-{}'.format(np.mean(_values), np.std(_values)))
        print(5*'-')

In [None]:
# Print the per-class sample counters stored under the 'all' key.
print(lengths['all'])