# Plots and results

In [None]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve,RocCurveDisplay
from sklearn.metrics import confusion_matrix
# Global plotting defaults applied to every figure below:
# seaborn's "dark" theme and a base font size of 16.
sns.set_style("dark")
plt.rcParams['font.size'] = 16

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The global maximum is subtracted before exponentiating so that large
    scores do not overflow; the shift cancels out in the ratio.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    total = exponentials.sum()
    return exponentials / total

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          filename='plot.png'):
    """
    Given a sklearn confusion matrix (cm), draw it as an annotated heat map
    and save the figure to ``filename``.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, write the raw numbers in the cells
                  If True, write the row-wise proportions

    filename:     path the figure is written to (saved at 300 dpi)

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    # NOTE: the heat-map colours come from the matrix as passed in; the row
    # normalisation below only changes the numbers written inside the cells.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Cells above the threshold get white text, the others black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the axis label before tight_layout so the layout pass accounts for it
    # and it is not clipped in the saved figure.
    plt.ylabel('True label')
    plt.tight_layout()
    plt.savefig(filename, dpi=300)

## 10-Fold CV experiment

In [None]:
# Per-fold test results of the k-fold run stored under runs/tcia_kfold_patient.
with open('runs/tcia_kfold_patient/test_results.pkl', mode='rb') as f:
    data_tcia = pickle.load(f)

In [None]:
# Per-fold test results of the k-fold run stored under runs/tcgagtex_kfold_patient.
with open('runs/tcgagtex_kfold_patient/test_results.pkl', mode='rb') as f:
    data_tcga_gtex = pickle.load(f)

In [None]:
# Patient-level evaluation of the TCGA+GTEx k-fold run.
# For every fold the per-sample results are collapsed to one entry per patient,
# then accuracy, weighted F1 and AUC are computed for that fold.
outputs = []
real = []
accs = []
f1s = []
aucs = []
probabilities = []
# Iterate over the folds of the run that is actually read below
# (data_tcga_gtex) instead of borrowing the fold keys of data_tcia.
for split in data_tcga_gtex.keys():
    fold = data_tcga_gtex[split]
    real_test = np.concatenate(fold['real'], axis=0).astype(np.int32)
    preds = np.concatenate(fold['predictions'], axis=0).astype(np.int32)
    patient_ids = np.concatenate(fold['patient_ids'], axis=0)
    probs = np.concatenate(fold['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: keep the stored prediction, but softmax its scores
            # so every patient's scores are on the same scale as the averaged ones.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        # The label of the patient's first sample is used for the patient.
        new_real.append(real_test[index[0]])

    acc = accuracy_score(new_real, new_preds)
    f1 = f1_score(new_real, new_preds, average="weighted")
    # AUC is computed from the continuous score of class 1, not the hard labels.
    auc = roc_auc_score(new_real, np.asarray(new_outputs)[:, 1])
    accs.append(acc)
    f1s.append(f1)
    aucs.append(auc)
    real.append(new_real)
    outputs.append(new_preds)
    probabilities.append(new_outputs)

print(f'TCGA + GTEX')
# Mean over all folds (accs), not just the last fold's accuracy (acc).
print(f'Acc {round(np.mean(accs)*100,3)} +- {round(np.std(accs)*100, 3)}')
print(f'F1-score {round(np.mean(f1s)*100,3)} +- {round(np.std(f1s)*100, 3)}')
print(f'AUC {round(np.mean(aucs),3)}+- {round(np.std(aucs), 3)}')

In [None]:
# Pool the per-fold lists into flat, patient-level lists.
flat_real = []
for fold_labels in real:
    flat_real.extend(fold_labels)
real = flat_real

flat_outputs = []
for fold_predictions in outputs:
    flat_outputs.extend(fold_predictions)
outputs = flat_outputs

probs = []
for fold_scores in probabilities:
    probs.extend(fold_scores)

In [None]:
# Metrics over the patients of all folds pooled together.
# NOTE(review): F1 uses sklearn's default averaging here (the per-fold values
# use average="weighted"), and AUC is computed from hard predictions rather
# than scores - confirm both are intended.
acc = accuracy_score(y_true=real, y_pred=outputs)
f1 = f1_score(y_true=real, y_pred=outputs)
auc = roc_auc_score(y_true=real, y_score=outputs)
print('TCGA-GTEX 10-fold CV results')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

In [None]:
# ROC curve of the pooled TCGA+GTEx k-fold predictions (class-1 score).
new_outputs = np.array(probs)
fpr, tpr, _ = roc_curve(real, new_outputs[:,1])
# The legend AUC is computed from the same scores as the plotted curve, so it
# is the area under the curve shown (hard labels would give a different value).
auc = round(roc_auc_score(real, new_outputs[:,1]), 3)
plt.figure()
plt.plot(fpr,tpr,label="TCGA+GTEx, AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/tcgagtex_10foldcv_roccurve_patient', dpi=300)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the pooled patient-level predictions.
cm = confusion_matrix(y_true=real, y_pred=outputs)

In [None]:
# Save the raw-count confusion matrix of the TCGA+GTEx k-fold run.
plot_confusion_matrix(
    cm,
    target_names=['Control', 'Tumor'],
    normalize=False,
    filename='plots/cm_10fold_tcgagtex_patient.png',
)

In [None]:
import random
from collections import Counter
# Seeds Python's random module; nothing in this cell draws random numbers.
random.seed(99)
# Patient-level evaluation of the CPTAC (data_tcia) k-fold run.
# For every fold the per-sample results are collapsed to one entry per patient,
# then accuracy, weighted F1 and AUC are computed for that fold.
outputs = []
real = []
accs = []
f1s = []
aucs = []
probabilities = []
for split in data_tcia.keys():
    fold = data_tcia[split]
    real_test = np.concatenate(fold['real'], axis=0).astype(np.int32)
    preds = np.concatenate(fold['predictions'], axis=0).astype(np.int32)
    patient_ids = np.concatenate(fold['patient_ids'], axis=0)
    probs = np.concatenate(fold['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: keep the stored prediction, but softmax its scores
            # so every patient's scores are on the same scale as the averaged ones.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        # The label of the patient's first sample is used for the patient.
        new_real.append(real_test[index[0]])

    acc = accuracy_score(new_real, new_preds)
    f1 = f1_score(new_real, new_preds, average="weighted")
    # AUC is computed from the continuous score of class 1, not the hard labels.
    auc = roc_auc_score(new_real, np.asarray(new_outputs)[:, 1])
    accs.append(acc)
    f1s.append(f1)
    aucs.append(auc)
    real.append(new_real)
    outputs.append(new_preds)
    probabilities.append(new_outputs)

print(f'CPTAC')
# Mean over all folds (accs), not just the last fold's accuracy (acc).
print(f'Acc {round(np.mean(accs)*100,3)} +- {round(np.std(accs)*100, 3)}')
print(f'F1-score {round(np.mean(f1s)*100,3)} +- {round(np.std(f1s)*100, 3)}')
print(f'AUC {round(np.mean(aucs),3)}+- {round(np.std(aucs), 3)}')

In [None]:
# Pool the per-fold lists into flat, patient-level lists.
flat_real = []
for fold_labels in real:
    flat_real.extend(fold_labels)
real = flat_real

flat_outputs = []
for fold_predictions in outputs:
    flat_outputs.extend(fold_predictions)
outputs = flat_outputs

new_outputs = []
for fold_scores in probabilities:
    new_outputs.extend(fold_scores)

In [None]:
# Metrics over the patients of all folds pooled together.
# NOTE(review): F1 uses sklearn's default averaging here (the per-fold values
# use average="weighted"), and AUC is computed from hard predictions rather
# than scores - confirm both are intended.
acc = accuracy_score(y_true=real, y_pred=outputs)
f1 = f1_score(y_true=real, y_pred=outputs)
auc = roc_auc_score(y_true=real, y_score=outputs)
print('CPTAC 10-fold CV results')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

In [None]:
# ROC curve of the pooled CPTAC k-fold predictions (class-1 score).
new_outputs = np.array(new_outputs)
fpr, tpr, _ = roc_curve(real, new_outputs[:,1])
# The legend AUC is computed from the same scores as the plotted curve, so it
# is the area under the curve shown (hard labels would give a different value).
auc = round(roc_auc_score(real, new_outputs[:,1]), 3)
plt.figure()
plt.plot(fpr,tpr,label="CPTAC, AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/tcia_kfold_roccurve_patient.png', dpi=300)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the pooled patient-level predictions.
cm = confusion_matrix(y_true=real, y_pred=outputs)

In [None]:
# Save the raw-count confusion matrix of the CPTAC k-fold run.
plot_confusion_matrix(
    cm,
    target_names=['Control', 'Tumor'],
    normalize=False,
    filename='plots/cm_10fold_tcia_patient.png',
)

## K-FOLD Country

In [None]:
# Leave-one-country-out style evaluation: one split per entry of `countries`.
with open('runs/tcia_kfold_country/test_results.pkl', 'rb') as f:
    data_tcia_country = pickle.load(f)
# NOTE(review): the names are matched to the splits purely by position -
# confirm the split order in the pickle follows this list.
countries = ['Bulgaria', 'Canada', 'China', 'Denmark', 'India', 'Iraq', 'Other',
       'Poland', 'Russia', 'Serbia', 'South Wales', 'United States']
outputs = []
real = []
accs = []
f1s = []
aucs = []
probabilities = []
preds_per_country = []
for split in data_tcia_country.keys():
    fold = data_tcia_country[split]
    real_test = np.concatenate(fold['real'], axis=0).astype(np.int32)
    preds = np.concatenate(fold['predictions'], axis=0).astype(np.int32)
    patient_ids = np.concatenate(fold['patient_ids'], axis=0)
    probs = np.concatenate(fold['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: keep the stored prediction, but softmax its scores
            # so every patient's scores are on the same scale as the averaged ones.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        # The label of the patient's first sample is used for the patient.
        new_real.append(real_test[index[0]])

    acc = accuracy_score(new_real, new_preds)
    f1 = f1_score(new_real, new_preds, average="weighted")
    accs.append(acc)
    f1s.append(f1)
    real.append(new_real)
    outputs.append(new_preds)
    probabilities.append(new_outputs)
    # Number of patients evaluated in this split.
    preds_per_country.append(len(new_real))

print(f'CPTAC k-Fold per country')
# Mean over all splits (accs), not just the last split's accuracy (acc).
print(f'Acc {round(np.mean(accs)*100,3)} +- {round(np.std(accs)*100, 3)}')
print(f'F1-score {round(np.mean(f1s)*100,3)} +- {round(np.std(f1s)*100, 3)}')

In [None]:
# Accuracy and F1 restricted to a hand-picked subset of splits
# (positions into `countries`; described as the countries with more samples).
idx_countries = [1,2,6,7,8,11]
subset_accs = np.array(accs)[idx_countries]
subset_f1s = np.array(f1s)[idx_countries]
print(f'TCIA k-Fold countries with more samples')
print(f'Acc {round(np.mean(subset_accs)*100,3)} +- {round(np.std(subset_accs)*100, 3)}')
print(f'F1-score {round(np.mean(subset_f1s)*100,3)} +- {round(np.std(subset_f1s)*100, 3)}')

In [None]:
import pandas as pd
# One row per country split: accuracy, weighted F1 and the country name.
country_df = pd.DataFrame({
    'acc': accs,
    'f1-score': f1s,
    'country': countries,
})

In [None]:
# Bar chart of the per-country accuracy, in percent.
plt.figure(figsize=(7,7))
accuracy_percent = np.array(accs)*100
plt.bar(countries, accuracy_percent)
plt.xticks(rotation=90)
plt.ylabel('Accuracy')
plt.tight_layout()
plt.savefig('plots/kfold_countries_cptac_accs.png', dpi=300)

In [None]:
# Bar chart of the per-country weighted F1-score, in percent.
plt.figure(figsize=(7,7))
f1_percent = np.array(f1s)*100
plt.bar(countries, f1_percent)
tick_positions = list(range(len(countries)))
plt.xticks(tick_positions, countries, alpha=0.8)
plt.xticks(rotation=90)
plt.ylabel('F1-Score')
plt.tight_layout()
plt.savefig('plots/kfold_countries_cptac_f1score.png', dpi=300)

In [None]:
# Horizontal bar chart of the number of patients per country, sorted ascending.
plt.figure(figsize=(7,7))
x = dict(zip(countries, preds_per_country))
new_dict = dict(sorted(x.items(), key=lambda item: item[1]))
country_names = list(new_dict.keys())
patient_counts = list(new_dict.values())
positions = list(range(len(country_names)))
plt.barh(country_names, patient_counts)
plt.yticks(positions, country_names, alpha=0.8)
# x ticks are placed at the same positions with an empty label argument.
plt.xticks(positions, '', alpha=0.8)

plt.xlabel('Number of patients')
plt.tight_layout()
plt.savefig('plots/countries_patients.png', dpi=300)

## Generalization experiments

In [None]:
# Cross-dataset evaluation results, one pickle per direction.
with open('runs/tcga_gtex_on_tcia_reinhard_fast_patient/test_results_evaluation.pkl', mode='rb') as f:
    data_tcga_gtex_on_tcia = pickle.load(f)

with open('runs/tcia_on_tcgagtex_reinhard_fast_40_patient/test_results_evaluation.pkl', mode='rb') as f:
    data_tcia_on_tcga_gtex = pickle.load(f)

In [None]:
# Patient-level evaluation of the CPTAC model on TCGA-GTEx.
real_test = np.concatenate(data_tcia_on_tcga_gtex['real'], axis=0).astype(np.int32)
preds = np.concatenate(data_tcia_on_tcga_gtex['predictions'], axis=0).astype(np.int32)
patient_ids = np.concatenate(data_tcia_on_tcga_gtex['patient_ids'], axis=0)
# np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
probs = np.concatenate(data_tcia_on_tcga_gtex['outputs'],axis=0).astype(np.float64)
new_preds = []
new_real = []
new_outputs = []
for pidx in np.unique(patient_ids):
    index = np.where(patient_ids == pidx)[0]
    if len(index) == 1:
        # Single sample: keep the stored prediction, but softmax its scores
        # so every patient's scores are on the same scale as the averaged ones.
        new_preds.append(preds[index[0]])
        new_outputs.append(softmax(probs[index[0]]))
    else:
        # Several samples: average the scores, softmax, then take the argmax.
        n_ = softmax(np.mean(probs[index], axis=0))
        new_outputs.append(n_)
        new_preds.append(np.argmax(n_))
    # The label of the patient's first sample is used for the patient.
    new_real.append(real_test[index[0]])

acc = accuracy_score(new_real, new_preds)
f1 = f1_score(new_real, new_preds, average="weighted")
# AUC is computed from the continuous score of class 1, not the hard labels.
auc = roc_auc_score(new_real, np.asarray(new_outputs)[:, 1])

print(f'CPTAC on TCGA-GTEX')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

cm = confusion_matrix(new_real, new_preds)
plot_confusion_matrix(cm, target_names = ['Control', 'Tumor'], normalize = False, filename = 'plots/tcia_on_tcga_gtex_patient.png')

In [None]:
# Patient-level evaluation of the TCGA-GTEx model on CPTAC.
# Unlike the other evaluation cells, patients with several samples are
# resolved by majority vote over the per-sample predictions.
real_test = np.concatenate(data_tcga_gtex_on_tcia['real'], axis=0).astype(np.int32)
preds = np.concatenate(data_tcga_gtex_on_tcia['predictions'], axis=0).astype(np.int32)

patient_ids = np.concatenate(data_tcga_gtex_on_tcia['patient_ids'], axis=0)
# np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
probs = np.concatenate(data_tcga_gtex_on_tcia['outputs'],axis=0).astype(np.float64)
new_preds = []
new_real = []
# NOTE: new_outputs only receives the single-sample patients in this cell, so
# it is not aligned with new_real/new_preds and is not used below.
new_outputs = []
for pidx in np.unique(patient_ids):
    index = np.where(patient_ids == pidx)[0]
    if len(index) == 1:
        new_preds.append(preds[index[0]])
        new_real.append(real_test[index[0]])
        new_outputs.append(probs[index[0]])
    else:
        # Majority vote; on a tie np.argmax returns the lowest class index.
        counts = np.bincount(preds[index])
        new_preds.append(np.argmax(counts))
        new_real.append(real_test[index[0]])

acc = accuracy_score(new_real, new_preds)
f1 = f1_score(new_real, new_preds, average="weighted")
# No patient-level score exists for voted patients, so this AUC is computed
# from the hard predictions.
auc = roc_auc_score(new_real, new_preds)

print(f'TCGA-GTEX on CPTAC')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

cm = confusion_matrix(new_real, new_preds)
plot_confusion_matrix(cm, target_names = ['Control', 'Tumor'], normalize = False, filename = 'plots/tcga_gtex_on_tcia_patient.png')

## Results on external dataset

In [None]:
# Results of the three models evaluated on the external MHMC dataset.
with open('runs/tcgagtex_on_mhmc_reinhard_fast_patient/test_results_evaluation.pkl', mode='rb') as f:
    data_tcga_gtex_on_mhmc = pickle.load(f)

with open('runs/tcia_on_mhmc_reinhard_fast_patient/test_results_evaluation.pkl', mode='rb') as f:
    data_tcia_on_mhmc = pickle.load(f)

with open('runs/tcga_gtex_cptac_on_mhmc_reinhard_fast_patient/test_results_evaluation.pkl', mode='rb') as f:
    data_tcga_gtex_tcia_on_mhmc = pickle.load(f)

In [None]:
# Patient-level evaluation of the CPTAC model on MHMC, with confusion matrix
# and ROC curve.
real_test = np.concatenate(data_tcia_on_mhmc['real'], axis=0).astype(np.int32)
preds = np.concatenate(data_tcia_on_mhmc['predictions'], axis=0).astype(np.int32)

patient_ids = np.concatenate(data_tcia_on_mhmc['patient_ids'], axis=0)
# Build a patient key from each id: parent folder + first '_' field of the
# file name + the part of the last '_' field before the first '-'.
# NOTE(review): presumably the ids are '/'-separated file paths - confirm.
types = [patient_id.split('/')[-2] for patient_id in patient_ids]
patient_ids = [type_+'_'+patient_id.split('/')[-1].split('_')[0]+'_'+patient_id.split('/')[-1].split('_')[-1].split('-')[0] for type_,patient_id in zip(types,patient_ids)]
patient_ids = np.array(patient_ids)
# (A leftover pdb.set_trace() breakpoint that halted execution here was removed.)
# np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
probs = np.concatenate(data_tcia_on_mhmc['outputs'],axis=0).astype(np.float64)
new_preds = []
new_real = []
new_outputs = []
for pidx in np.unique(patient_ids):
    index = np.where(patient_ids == pidx)[0]
    if len(index) == 1:
        # Single sample: keep the stored prediction, but softmax its scores
        # so every patient's scores are on the same scale as the averaged ones.
        new_preds.append(preds[index[0]])
        new_outputs.append(softmax(probs[index[0]]))
    else:
        # Several samples: average the scores, softmax, then take the argmax.
        n_ = softmax(np.mean(probs[index], axis=0))
        new_outputs.append(n_)
        new_preds.append(np.argmax(n_))
    # The label of the patient's first sample is used for the patient.
    new_real.append(real_test[index[0]])

new_outputs = np.array(new_outputs)
acc = accuracy_score(new_real, new_preds)
f1 = f1_score(new_real, new_preds, average="weighted")
# AUC is computed from the continuous score of class 1, not the hard labels.
auc = roc_auc_score(new_real, new_outputs[:,1])

print(f'CPTAC on MHMC')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

cm = confusion_matrix(new_real, new_preds)
plot_confusion_matrix(cm, target_names = ['Control', 'Tumor'], normalize = False, filename = 'plots/tcia_on_mhmc_patient.png')
fpr, tpr, _ = roc_curve(new_real, new_outputs[:,1])
# Legend AUC uses the same scores as the plotted curve.
auc = round(roc_auc_score(new_real, new_outputs[:,1]), 3)
plt.figure()
plt.plot(fpr,tpr,label="CPTAC on MHMC, AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/tcia_on_mhmc_roccurve_patient.png')

In [None]:
# Patient-level evaluation of the TCGA-GTEx model on MHMC, with confusion
# matrix and ROC curve.
real_test = np.concatenate(data_tcga_gtex_on_mhmc['real'], axis=0).astype(np.int32)
preds = np.concatenate(data_tcga_gtex_on_mhmc['predictions'], axis=0).astype(np.int32)
patient_ids = np.concatenate(data_tcga_gtex_on_mhmc['patient_ids'], axis=0)
# Build a patient key from each id: parent folder + first '_' field of the
# file name + the part of the last '_' field before the first '-'.
# NOTE(review): presumably the ids are '/'-separated file paths - confirm.
types = [patient_id.split('/')[-2] for patient_id in patient_ids]
patient_ids = [type_+'_'+patient_id.split('/')[-1].split('_')[0]+'_'+patient_id.split('/')[-1].split('_')[-1].split('-')[0] for type_,patient_id in zip(types,patient_ids)]
patient_ids = np.array(patient_ids)
# np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
probs = np.concatenate(data_tcga_gtex_on_mhmc['outputs'],axis=0).astype(np.float64)
new_preds = []
new_real = []
new_outputs = []
for pidx in np.unique(patient_ids):
    index = np.where(patient_ids == pidx)[0]
    if len(index) == 1:
        # Single sample: stored prediction, softmaxed scores.
        new_preds.append(preds[index[0]])
        new_real.append(real_test[index[0]])
        new_outputs.append(softmax(probs[index[0]]))
    else:
        # Several samples: average the scores, softmax, then take the argmax.
        new_probs = np.mean(probs[index], axis=0)
        n_ = softmax(new_probs)
        new_outputs.append(n_)
        new_preds.append(np.argmax(n_))
        new_real.append(real_test[index[0]])

new_outputs = np.array(new_outputs)
acc = accuracy_score(new_real, new_preds)
f1 = f1_score(new_real, new_preds, average="weighted")
# AUC is computed from the continuous score of class 1, not the hard labels.
auc = roc_auc_score(new_real, new_outputs[:,1])
print(f'TCGA-GTEX on MHMC')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

cm = confusion_matrix(new_real, new_preds)
plot_confusion_matrix(cm, target_names = ['Control', 'Tumor'], normalize = False, filename = 'plots/tcga_gtex_on_mhmc_patient.png')

fpr, tpr, _ = roc_curve(new_real, new_outputs[:,1])
# Legend AUC uses the same scores as the plotted curve.
auc = round(roc_auc_score(new_real, new_outputs[:,1]), 3)
plt.figure()
plt.plot(fpr,tpr,label="TCGA+GTEx on MHMC, AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/tcga_gtex_on_mhmc_roccurve_patient.png')

In [None]:
# Patient-level evaluation of the model trained on TCGA-GTEx+CPTAC on MHMC,
# with confusion matrix and ROC curve.
real_test = np.concatenate(data_tcga_gtex_tcia_on_mhmc['real'], axis=0).astype(np.int32)
preds = np.concatenate(data_tcga_gtex_tcia_on_mhmc['predictions'], axis=0).astype(np.int32)
patient_ids = np.concatenate(data_tcga_gtex_tcia_on_mhmc['patient_ids'], axis=0)
# Build a patient key from each id: parent folder + first '_' field of the
# file name + the part of the last '_' field before the first '-'.
# NOTE(review): presumably the ids are '/'-separated file paths - confirm.
types = [patient_id.split('/')[-2] for patient_id in patient_ids]
patient_ids = [type_+'_'+patient_id.split('/')[-1].split('_')[0]+'_'+patient_id.split('/')[-1].split('_')[-1].split('-')[0] for type_,patient_id in zip(types,patient_ids)]
patient_ids = np.array(patient_ids)
probs = np.concatenate(data_tcga_gtex_tcia_on_mhmc['outputs'],axis=0).astype(np.float32)
new_preds = []
new_real = []
new_outputs = []
for pidx in np.unique(patient_ids):
    index = np.where(patient_ids == pidx)[0]
    if len(index) == 1:
        # Single sample: stored prediction, softmaxed scores.
        new_preds.append(preds[index[0]])
        new_real.append(real_test[index[0]])
        new_outputs.append(softmax(probs[index[0]]))
    else:
        # Several samples: average the scores, softmax, then take the argmax.
        new_probs = np.mean(probs[index], axis=0)
        n_ = softmax(new_probs)
        new_outputs.append(n_)
        new_preds.append(np.argmax(n_))
        new_real.append(real_test[index[0]])

new_outputs = np.array(new_outputs)
acc = accuracy_score(new_real, new_preds)
f1 = f1_score(new_real, new_preds, average="weighted")
# AUC is computed from the continuous score of class 1, not the hard labels.
auc = roc_auc_score(new_real, new_outputs[:,1])
print(f'TCGA-GTEX+CPTAC on MHMC')
print(f'Acc {round(acc*100,3)}')
print(f'F1-score {round(f1*100,3)}')
print(f'AUC {round(auc,3)}')

cm = confusion_matrix(new_real, new_preds)
plot_confusion_matrix(cm, target_names = ['Control', 'Tumor'], normalize = False, filename = 'plots/tcga_gtex_tcia_on_mhmc_patient.png')

fpr, tpr, _ = roc_curve(new_real, new_outputs[:,1])
# Legend AUC uses the same scores as the plotted curve.
auc = round(roc_auc_score(new_real, new_outputs[:,1]), 3)
plt.figure()
plt.plot(fpr,tpr,label="CPTAC+TCGA+GTEx, AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/tcga_gtex_tcia_on_mhmc_roccurve_patient.png')

In [None]:
# ROC curves of the three models on MHMC, drawn on one shared figure.

def _mhmc_patient_key(sample_id):
    """Build the patient key: parent folder + first '_' field of the file
    name + the part of the last '_' field before the first '-'."""
    parts = sample_id.split('/')
    name_fields = parts[-1].split('_')
    return parts[-2] + '_' + name_fields[0] + '_' + name_fields[-1].split('-')[0]


def _mhmc_patient_level(results):
    """Collapse per-sample results to one (label, prediction, scores) per patient.

    Returns the patient-level labels, predictions and an array of softmaxed
    scores (one row per patient).
    """
    real_test = np.concatenate(results['real'], axis=0).astype(np.int32)
    # Hard predictions come from 'predictions'; 'outputs' holds the scores.
    preds = np.concatenate(results['predictions'], axis=0).astype(np.int32)
    raw_ids = np.concatenate(results['patient_ids'], axis=0)
    patient_ids = np.array([_mhmc_patient_key(sample_id) for sample_id in raw_ids])
    probs = np.concatenate(results['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: stored prediction, softmaxed scores.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        new_real.append(real_test[index[0]])
    return new_real, new_preds, np.array(new_outputs)


# Start a fresh figure so the curves are not drawn onto a previous plot.
plt.figure()

new_real_tcga, new_preds_tcga, new_outputs_tcga = _mhmc_patient_level(data_tcga_gtex_on_mhmc)
fpr, tpr, _ = roc_curve(new_real_tcga, new_outputs_tcga[:,1])
auc = round(roc_auc_score(new_real_tcga, new_outputs_tcga[:,1]), 3)
plt.plot(fpr,tpr,label="TCGA+GTEX, AUC="+str(auc))

new_real_tcia, new_preds_tcia, new_outputs_tcia = _mhmc_patient_level(data_tcia_on_mhmc)
fpr, tpr, _ = roc_curve(new_real_tcia, new_outputs_tcia[:,1])
auc = round(roc_auc_score(new_real_tcia, new_outputs_tcia[:,1]), 3)
plt.plot(fpr,tpr,label="CPTAC, AUC="+str(auc))

new_real_all, new_preds_all, new_outputs_all = _mhmc_patient_level(data_tcga_gtex_tcia_on_mhmc)
fpr, tpr, _ = roc_curve(new_real_all, new_outputs_all[:,1])
auc = round(roc_auc_score(new_real_all, new_outputs_all[:,1]), 3)
plt.plot(fpr,tpr,label="CPTAC+TCGA+GTEX, AUC="+str(auc))

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.tight_layout()
plt.legend()
plt.savefig('plots/roc_curve_mhmc_three_patient.png', dpi=300)

## Features visualization

In [None]:
# Features saved by the tcia_on_mhmc run.
# NOTE(review): the following cell rebinds features_data to a different run,
# so only one of the two loads is effective when both cells are executed.
with open('runs/tcia_on_mhmc_reinhard_fast/features.pkl', mode='rb') as f:
    features_data = pickle.load(f)

In [None]:
# Features saved by the tcgagtex_fulltrain run (rebinds features_data).
with open('runs/tcgagtex_fulltrain_reinhard_fast/features.pkl', mode='rb') as f:
    features_data = pickle.load(f)

In [None]:
# Unpack the three entries of the features pickle into separate names.
features, labels, patient_id = (
    features_data[key] for key in ('features', 'labels', 'patient_id')
)

In [None]:
# Text before the first '-' of every patient id (compared against
# 'TCGA' / 'GTEX' further down).
project = [identifier.split('-', 1)[0] for identifier in patient_id]

In [None]:
import umap
# Project the features with UMAP using its default settings.
reducer = umap.UMAP()
embedding = reducer.fit_transform(X=features)

In [None]:
import matplotlib

# Scatter plot of the embedding, coloured by class label with a two-colour map.
colors = ['red','blue']
label_cmap = matplotlib.colors.ListedColormap(colors)
scatter = plt.scatter(embedding[:,0], embedding[:,1], c=labels, cmap=label_cmap)
legend_handles = scatter.legend_elements()[0]
plt.legend(handles=legend_handles,
           title="Label",
           labels=['Control', 'Tumor'])
plt.savefig('plots/features_label_tcgagtex.png', dpi=300)

In [None]:
# Notebook inspection cell: display the patient identifiers loaded above.
patient_id

In [None]:
import matplotlib
# Encode the source database of every sample: GTEX -> 0, TCGA -> 1.
# Entries with any other prefix are skipped.
project_codes = {'TCGA': 1, 'GTEX': 0}
project_new = [project_codes[name] for name in project if name in project_codes]
colors = ['red','blue']
database_cmap = matplotlib.colors.ListedColormap(colors)
scatter = plt.scatter(embedding[:,0], embedding[:,1], c=project_new, cmap=database_cmap)
legend_handles = scatter.legend_elements()[0]
plt.legend(handles=legend_handles,
           title="Database",
           labels=['GTEx', 'TCGA'])
plt.savefig('plots/features_project_tcgagtex.png', dpi=300)

## 10-fold all datasets

In [None]:
# Rebinds data_tcia to the results stored under runs/tcia_kfol_tcga_gtex_patient.
with open('runs/tcia_kfol_tcga_gtex_patient/test_results.pkl', mode='rb') as f:
    data_tcia = pickle.load(f)

In [None]:
# Rebinds data_tcga_gtex to the results stored under runs/tcga_gtex_kfol_tcia_patient.
with open('runs/tcga_gtex_kfol_tcia_patient/test_results.pkl', mode='rb') as f:
    data_tcga_gtex = pickle.load(f)

In [None]:
# Patient-level evaluation of the k-fold run held in data_tcga_gtex.
# For every fold the per-sample results are collapsed to one entry per patient,
# then accuracy, weighted F1 and AUC are computed for that fold.
outputs = []
real = []
accs = []
f1s = []
aucs = []
probabilities = []
for split in data_tcga_gtex.keys():
    fold = data_tcga_gtex[split]
    real_test = np.concatenate(fold['real'], axis=0).astype(np.int32)
    preds = np.concatenate(fold['predictions'], axis=0).astype(np.int32)
    patient_ids = np.concatenate(fold['patient_ids'], axis=0)
    probs = np.concatenate(fold['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: keep the stored prediction, but softmax its scores
            # so every patient's scores are on the same scale as the averaged ones.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        # The label of the patient's first sample is used for the patient.
        new_real.append(real_test[index[0]])

    acc = accuracy_score(new_real, new_preds)
    f1 = f1_score(new_real, new_preds, average="weighted")
    # AUC is computed from the continuous score of class 1, not the hard labels.
    auc = roc_auc_score(new_real, np.asarray(new_outputs)[:, 1])
    accs.append(acc)
    f1s.append(f1)
    aucs.append(auc)
    real.append(new_real)
    outputs.append(new_preds)
    probabilities.append(new_outputs)

print(f'TCGA-GTEX + TCIA on train')
# Mean over all folds (accs), not just the last fold's accuracy (acc).
print(f'Acc {round(np.mean(accs)*100,3)} +- {round(np.std(accs)*100, 3)}')
print(f'F1-score {round(np.mean(f1s)*100,3)} +- {round(np.std(f1s)*100, 3)}')
# AUC mean and std are both reported on the 0-1 scale.
print(f'AUC {round(np.mean(aucs),3)}+- {round(np.std(aucs), 3)}')

In [None]:
# Patient-level evaluation of the k-fold run held in data_tcia.
# For every fold the per-sample results are collapsed to one entry per patient,
# then accuracy, weighted F1 and AUC are computed for that fold.
outputs = []
real = []
accs = []
f1s = []
aucs = []
probabilities = []
for split in data_tcia.keys():
    fold = data_tcia[split]
    real_test = np.concatenate(fold['real'], axis=0).astype(np.int32)
    preds = np.concatenate(fold['predictions'], axis=0).astype(np.int32)
    patient_ids = np.concatenate(fold['patient_ids'], axis=0)
    probs = np.concatenate(fold['outputs'], axis=0).astype(np.float32)
    new_preds = []
    new_real = []
    new_outputs = []
    for pidx in np.unique(patient_ids):
        index = np.where(patient_ids == pidx)[0]
        if len(index) == 1:
            # Single sample: keep the stored prediction, but softmax its scores
            # so every patient's scores are on the same scale as the averaged ones.
            new_preds.append(preds[index[0]])
            new_outputs.append(softmax(probs[index[0]]))
        else:
            # Several samples: average the scores, softmax, then take the argmax.
            n_ = softmax(np.mean(probs[index], axis=0))
            new_outputs.append(n_)
            new_preds.append(np.argmax(n_))
        # The label of the patient's first sample is used for the patient.
        new_real.append(real_test[index[0]])

    acc = accuracy_score(new_real, new_preds)
    f1 = f1_score(new_real, new_preds, average="weighted")
    # AUC is computed from the continuous score of class 1, not the hard labels.
    auc = roc_auc_score(new_real, np.asarray(new_outputs)[:, 1])
    accs.append(acc)
    f1s.append(f1)
    aucs.append(auc)
    real.append(new_real)
    outputs.append(new_preds)
    probabilities.append(new_outputs)

print(f'CPTAC + TCGA-GTEX on train')
# Mean over all folds (accs), not just the last fold's accuracy (acc).
print(f'Acc {round(np.mean(accs)*100,3)} +- {round(np.std(accs)*100, 3)}')
print(f'F1-score {round(np.mean(f1s)*100,3)} +- {round(np.std(f1s)*100, 3)}')
print(f'AUC {round(np.mean(aucs),3)}+- {round(np.std(aucs), 3)}')