## Best Result per Fold

In [42]:
from collections import Counter
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import RobustScaler

import random
import matplotlib
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

import plotly.express as px

plt.style.use('default')

## 5-fold Mean Result

In [23]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df = pd.read_csv('output/result_5_fold.csv')[['fold', 'n_genes', 'auc_valid', 
                                               'tp', 'fp', 'tn', 'fn']]

df_best = df.groupby('fold').apply(lambda x: pd.Series({
    'n_genes': x.set_index('n_genes')['auc_valid'].idxmax()
})).reset_index()

df_best['n_genes'] = df_best['n_genes'].astype(int)

df_best.to_csv('output/best_result_per_5_fold.csv', sep=',', index=False)

df1 = pd.read_csv('output/inference_valid_5_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_5_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = df3.merge(df, on=['fold', 'n_genes'], how='inner')

df3['therapy']  = df3['therapy'].apply(lambda x: x.replace('therapy_first_line_', ''))

df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

def classification_metrics(x):
    tn, fp, fn, tp = confusion_matrix(x['y_true'], x['y_hat_hard']).ravel()
    sensitivity, specificity = tp / (tp + fn), tn / (tn + fp)
    precision = tp / (tp + fp)
    return pd.Series({
        'AUC': roc_auc_score(x['y_true'], x['y_hat_adjusted']),
        'Accuracy': accuracy_score(x['y_true'], x['y_hat_hard']),
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Precision': precision,
        'F1': 2 * precision * sensitivity / (precision + sensitivity)
    })

ddd = pd.DataFrame(df3.groupby(['fold']).apply(classification_metrics))

avg = ddd.mean().drop(columns='fold')
avg.name = 'Average'
std = ddd.std().drop(columns='fold')
std.name = 'Standard Deviation'
overall = pd.DataFrame(avg).join(std)
overall

Unnamed: 0,Average,Standard Deviation
AUC,0.695777,0.020671
Accuracy,0.69527,0.024595
Sensitivity,0.496866,0.136516
Specificity,0.75748,0.061731
Precision,0.389744,0.036656
F1,0.430965,0.073802


In [41]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df1 = pd.read_csv('output/inference_valid_5_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_5_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]
therapy_class = pd.read_csv('data/mmrf/therapy/therapy_first_line_class.tsv', sep='\t')

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = pd.merge(df3, therapy_class, on='ID', how='inner')
df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

ddd = pd.DataFrame(df3.groupby(['therapy_first_line_class']).apply(classification_metrics))

ddd.reset_index().set_index('therapy_first_line_class').T


invalid value encountered in longlong_scalars



therapy_first_line_class,Bortezomib-based,Carfilzomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,Combined bortezomib/IMIDs/carfilzomib-based,Combined bortezomib/carfilzomib-based,IMIDs-based
AUC,0.604006,0.62931,0.711783,0.642605,0.638889,0.0,0.632184
Accuracy,0.736434,0.702703,0.696429,0.637681,0.7,0.0,0.714286
Sensitivity,0.454545,0.375,0.414634,0.434343,1.0,0.0,0.166667
Specificity,0.762712,0.793103,0.859155,0.719512,0.666667,0.0,0.827586
Precision,0.151515,0.333333,0.62963,0.383929,0.25,0.0,0.166667
F1,0.227273,0.352941,0.5,0.407583,0.4,0.0,0.166667


## Result per Therapy

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df1 = pd.read_csv('output/inference_valid_10_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = df3.merge(df, on=['fold', 'n_genes'], how='inner')

df3['therapy']  = df3['therapy'].apply(lambda x: x.replace('therapy_first_line_', ''))

df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

ddd = pd.DataFrame(df3.groupby(['therapy']).apply(classification_metrics))

ddd.reset_index().set_index('therapy').T

therapy,bor-cyc-dex,bor-dex,bor-len-dex,len-dex,non-therapy
AUC,0.705405,0.839599,0.720468,0.656146,0.698178
Accuracy,0.62406,0.75,0.627119,0.7,0.689655
Sensitivity,0.517241,0.714286,0.476923,0.714286,0.507937
Specificity,0.653846,0.754386,0.684211,0.697674,0.757396
Precision,0.294118,0.263158,0.364706,0.277778,0.438356
F1,0.375,0.384615,0.413333,0.4,0.470588


## 10-fold Mean Result

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'auc_valid', 
                                               'tp', 'fp', 'tn', 'fn']]

df_best = df.groupby('fold').apply(lambda x: pd.Series({
    'n_genes': x.set_index('n_genes')['auc_valid'].idxmax()
})).reset_index()

df_best['n_genes'] = df_best['n_genes'].astype(int)

df_best.to_csv('output/best_result_per_10_fold.csv', sep=',', index=False)

df1 = pd.read_csv('output/inference_valid_10_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = df3.merge(df, on=['fold', 'n_genes'], how='inner')

df3['therapy']  = df3['therapy'].apply(lambda x: x.replace('therapy_first_line_', ''))

df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

def classification_metrics(x):
    try:
        tn, fp, fn, tp = confusion_matrix(x['y_true'], x['y_hat_hard']).ravel()
        sensitivity, specificity = tp / (tp + fn), tn / (tn + fp)
        precision = tp / (tp + fp)
        return pd.Series({
            'AUC': roc_auc_score(x['y_true'], x['y_hat_adjusted']),
            'Accuracy': accuracy_score(x['y_true'], x['y_hat_hard']),
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'Precision': precision,
            'F1': 2 * precision * sensitivity / (precision + sensitivity)
        })
    except:
        return pd.Series({
            'AUC': 0.0,
            'Accuracy': 0.0,
            'Sensitivity': 0.0,
            'Specificity': 0.0,
            'Precision': 0.0,
            'F1': 0.0
        })

ddd = pd.DataFrame(df3.groupby(['fold']).apply(classification_metrics))

ddd.reset_index().mean()

avg = ddd.mean().drop(columns='fold')
avg.name = 'Average'
std = ddd.std().drop(columns='fold')
std.name = 'Standard Deviation'
overall = pd.DataFrame(avg).join(std)
overall.T

Unnamed: 0,AUC,Accuracy,Sensitivity,Specificity,Precision,F1
Average,0.734637,0.663293,0.512655,0.709272,0.431367,0.385352
Standard Deviation,0.034421,0.157277,0.330831,0.27574,0.229146,0.182189


## Result per Therapy

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df1 = pd.read_csv('output/inference_valid_10_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = df3.merge(df, on=['fold', 'n_genes'], how='inner')

df3['therapy']  = df3['therapy'].apply(lambda x: x.replace('therapy_first_line_', ''))

df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

ddd = pd.DataFrame(df3.groupby(['therapy']).apply(classification_metrics))

ddd.reset_index().set_index('therapy').T

therapy,bor-cyc-dex,bor-dex,bor-len-dex,len-dex,non-therapy
AUC,0.705405,0.839599,0.720468,0.656146,0.698178
Accuracy,0.62406,0.75,0.627119,0.7,0.689655
Sensitivity,0.517241,0.714286,0.476923,0.714286,0.507937
Specificity,0.653846,0.754386,0.684211,0.697674,0.757396
Precision,0.294118,0.263158,0.364706,0.277778,0.438356
F1,0.375,0.384615,0.413333,0.4,0.470588


In [50]:
df3.groupby(by='therapy').count()['ID']

therapy
bor-cyc-dex    133
bor-dex         64
bor-len-dex    236
len-dex         50
non-therapy    232
Name: ID, dtype: int64

In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

df1 = pd.read_csv('output/inference_valid_10_fold.csv')
del df1['Unnamed: 0']
df1 = df1.rename(columns={'n_feats': 'n_genes'})

df2 = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'y_train_hat_min', 'y_train_hat_max']]
therapy_class = pd.read_csv('data/mmrf/therapy/therapy_first_line_class.tsv', sep='\t')

df3 = pd.merge(df1, df2, on=['fold', 'n_genes'])
df3['y_hat_adjusted'] = df3.apply(lambda x: (x['y_hat'] - x['y_train_hat_min']) / (x['y_train_hat_max'] - x['y_train_hat_min']), axis=1)
df3['y_hat_hard'] = df3.apply(lambda x: int(x['y_hat'] >= x['threshold']), axis=1)
df3 = df3.fillna('non-therapy')

df3 = pd.merge(df3, therapy_class, on='ID', how='inner')
df3 = df3.merge(df_best, on=['fold', 'n_genes'], how='inner')

ddd = pd.DataFrame(df3.groupby(['therapy_first_line_class']).apply(classification_metrics))

ddd.reset_index().set_index('therapy_first_line_class').T

therapy_first_line_class,Bortezomib-based,Carfilzomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,Combined bortezomib/IMIDs/carfilzomib-based,Combined bortezomib/carfilzomib-based,IMIDs-based
AUC,0.727658,0.758621,0.672278,0.729572,0.722222,0.0,0.698276
Accuracy,0.72093,0.756757,0.642857,0.623188,0.7,0.0,0.771429
Sensitivity,0.636364,0.625,0.439024,0.494949,0.5,0.0,0.666667
Specificity,0.728814,0.793103,0.760563,0.674797,0.722222,0.0,0.793103
Precision,0.179487,0.454545,0.514286,0.379845,0.166667,0.0,0.4
F1,0.28,0.526316,0.473684,0.429825,0.25,0.0,0.5
