In [1]:
import os

import numpy as np
import pandas as pd
import h5py
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import spearmanr

from tensorflow import keras
from keras import layers

import MMRF_utils

In [66]:
mmrf_rna_dir = '/Users/nalinisingh/datasets/multiple_myeloma/MMRF_CoMMpass_IA15a_E74GTF_Cufflinks_Gene_FPKM.txt'
asct_data_dir = '/Users/nalinisingh/datasets/multiple_myeloma/MMRF_OS_PFS_ASCT.csv'
no_asct_data_dir = '/Users/nalinisingh/datasets/multiple_myeloma/MMRF_OS_PFS_non-ASCT.csv'

In [219]:
train_patient_all = pd.DataFrame()
valid_patient_all = pd.DataFrame()
test_patient_all = pd.DataFrame()

for ind in [1]:
    data_filename = '/Users/nalinisingh/dev/ml_mmrf_nonnormalized/ml_mmrf/ml_mmrf/output/cleaned_mm_fold_2mos_pfs_ind_seed0.pkl'#%(ind)
    train, test, valid = MMRF_utils.get_train_test_valid(data_filename, ind, show_features=True)

    # include the first n clinical visit data for each patient (default 0)
    train_df = MMRF_utils.preprocess_patient_data(train, num_clin_visits=3)  

    valid_df = MMRF_utils.preprocess_patient_data(valid, num_clin_visits=3)  

    test_df = MMRF_utils.preprocess_patient_data(test, num_clin_visits=3)  

    genomic_fn = '/Users/nalinisingh/datasets/multiple_myeloma/MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt'
    genomic_df = MMRF_utils.preprocess_genomic_data(genomic_fn, nPCA=0)

    train_patient_all = train_patient_all.append(train_df.merge(genomic_df, left_on='pids', right_on='pids'))

    valid_patient_all = valid_patient_all.append(valid_df.merge(genomic_df, left_on='pids', right_on='pids'))

    test_patient_all = test_patient_all.append(test_df.merge(genomic_df, left_on='pids', right_on='pids'))
    

loading from: /Users/nalinisingh/dev/ml_mmrf_nonnormalized/ml_mmrf/ml_mmrf/output/cleaned_mm_fold_2mos_pfs_ind_seed0.pkl
pids
(470,)
x
(470, 33, 16)
m
(470, 33, 16)
feature_names_x
(16,)
['cbc_abs_neut' 'chem_albumin' 'chem_bun' 'chem_calcium' 'chem_creatinine'
 'chem_glucose' 'cbc_hemoglobin' 'serum_kappa' 'serum_m_protein'
 'cbc_platelet' 'chem_totprot' 'cbc_wbc' 'serum_iga' 'serum_igg'
 'serum_igm' 'serum_lambda']
ys_seq
(470, 1)
ce
(470, 1)
feature_names_y
(1,)
['progression free survival (all)']
b
(470, 17)
feature_names
(17,)
Index(['iss', 'age', 'gender', 'ecog', 'line1sct', 'serum_beta2_microglobulin',
       'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'heavy_chain', 'igg_type',
       'iga_type', 'igm_type', 'kappa_type', 'lambda_type'],
      dtype='object')
a
(470, 33, 9)
m_a
(470, 33, 6)
feature_names_a
(9,)
['local_clock' 'Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line1' 'line2' 'line3plus']
Keep first 3 clinical visits.
Keep first 3 clinical visits.
Keep first 3 clinical visits.
Reading /User

In [272]:
asct_data = pd.read_csv('/Users/nalinisingh/datasets/multiple_myeloma/MMRF_OS_PFS_ASCT.csv')
no_asct_data = pd.read_csv('/Users/nalinisingh/datasets/multiple_myeloma/MMRF_OS_PFS_non-ASCT.csv')

In [273]:
combined_all_patient = pd.concat([train_df, valid_df, test_df])
combined_all_patient['gender'] = (combined_all_patient['gender']-1)*100
combined_all_rna_seq = pd.concat([train_patient_all,valid_patient_all,test_patient_all])
combined_all_rna_seq['gender'] = (combined_all_rna_seq['gender']-1)*100
bort_rna_seq = combined_all_rna_seq[combined_all_rna_seq['Bor1']==True]
asct_rna_seq = combined_all_rna_seq[combined_all_rna_seq['pids'].isin(asct_data['Patient'].values)]
no_asct_rna_seq = combined_all_rna_seq[combined_all_rna_seq['pids'].isin(no_asct_data['Patient'].values)]

In [320]:
covs = ['age','gender','iss','ecog','igg_type','igm_type','iga_type','kappa_type','lambda_type','serum_beta2_microglobulin','serum_m_protein1','chem_creatinine1','chem_albumin1','pfs']



In [353]:
cohort_mean_df = pd.DataFrame()
cohort_std_df = pd.DataFrame()

for i, df in enumerate([combined_all_patient, combined_all_rna_seq, bort_rna_seq, asct_rna_seq, no_asct_rna_seq]):
    cohort_mean_df = cohort_mean_df.append(pd.DataFrame(df[covs].mean()).T.applymap(lambda x: '{0:.2f}'.format(x)),ignore_index=True)
    cohort_std_df = cohort_std_df.append(pd.DataFrame(df[covs].std()).T.applymap(lambda x: '{0:.2f}'.format(x)),ignore_index=True)

In [354]:
cohort_mean_df

Unnamed: 0,age,gender,iss,ecog,igg_type,igm_type,iga_type,kappa_type,lambda_type,serum_beta2_microglobulin,serum_m_protein1,chem_creatinine1,chem_albumin1,pfs
0,63.31,40.1,1.85,0.86,0.65,0.06,0.17,0.64,0.34,4.85,2.81,113.94,35.96,16.29
1,63.22,41.09,1.85,0.88,0.65,0.05,0.18,0.64,0.35,4.91,2.85,112.9,35.93,16.04
2,63.55,39.65,1.87,0.88,0.62,0.05,0.19,0.64,0.34,5.13,2.8,118.21,35.93,16.91
3,59.49,43.68,1.79,0.76,0.65,0.04,0.18,0.64,0.36,4.45,2.73,105.25,36.93,18.95
4,67.77,37.92,1.94,1.03,0.66,0.06,0.18,0.65,0.33,5.48,2.99,122.24,34.7,12.48


In [355]:
cohort_str_df = pd.DataFrame(index= cohort_mean_df.index,columns=cohort_mean_df.columns)
for i in range(5):

    cohort_str_df.iloc[i] = cohort_mean_df.iloc[i]+r'$\pm$'+cohort_std_df.iloc[i]


In [356]:
cohort_list = [
    r'\makecell{All \\ (n='+str(combined_all_patient.shape[0])+')}',
    r'\makecell{RNA-Seq \\ (n='+str(combined_all_rna_seq.shape[0])+')}',
    r'\makecell{Bortezomib \\ (n='+str(bort_rna_seq.shape[0])+')}',
    r'\makecell{ASCT \\ (n='+str(asct_rna_seq.shape[0])+')}',
    r'\makecell{Non-ASCT \\ (n='+str(no_asct_rna_seq.shape[0])+')}'
]
cohort_str_df = cohort_str_df.rename({i:cohort_list[i] for i in range(5)})

In [357]:
att_map = {
    r'age': r'\makecell{Age \\ (years)}',
    r'gender': r'\makecell{Gender \\ (\% female)}',
    r'iss': r'ISS',
    r'ecog': r'ECOG',
    r'igg_type': r'\makecell{IgG \\ (g/L)}',
    r'iga_type': r'\makecell{IgA \\ (g/L)}',
    r'igm_type': r'\makecell{IgM \\ (g/L)}',
    r'kappa_type': r'\makecell{$\kappa$-Light Chain \\ (mg/dL)}',
    r'lambda_type': r'\makecell{$\lambda$-Light Chain \\ (mg/dL)}',
    r'serum_beta2_microglobulin': r'\makecell{$\beta_2$-Microglobulin \\ (mg/mL)}',
    r'serum_m_protein1': r'\makecell{M-protein \\ (g/dL)}',
    r'chem_creatinine1': r'\makecell{Creatinine \\ ($\mu$mol/L)}',
    r'chem_albumin1': r'\makecell{Albumin \\ (g/L)}',
    r'pfs': r'PFS (months)'
    
}

In [358]:
cohort_str_df = cohort_str_df.rename(att_map,axis=1)

In [359]:
cohort_str_df

Unnamed: 0,\makecell{Age \\ (years)},\makecell{Gender \\ (\% female)},ISS,ECOG,\makecell{IgG \\ (g/L)},\makecell{IgM \\ (g/L)},\makecell{IgA \\ (g/L)},\makecell{$\kappa$-Light Chain \\ (mg/dL)},\makecell{$\lambda$-Light Chain \\ (mg/dL)},\makecell{$\beta_2$-Microglobulin \\ (mg/mL)},\makecell{M-protein \\ (g/dL)},\makecell{Creatinine \\ ($\mu$mol/L)},\makecell{Albumin \\ (g/L)},PFS (months)
\makecell{All \\ (n=995)},63.31$\pm$10.47,40.10$\pm$49.03,1.85$\pm$0.45,0.86$\pm$0.73,0.65$\pm$0.48,0.06$\pm$0.23,0.17$\pm$0.37,0.64$\pm$0.48,0.34$\pm$0.47,4.85$\pm$3.92,2.81$\pm$1.81,113.94$\pm$74.25,35.96$\pm$6.23,16.29$\pm$9.60
\makecell{RNA-Seq \\ (n=662)},63.22$\pm$10.76,41.09$\pm$49.24,1.85$\pm$0.49,0.88$\pm$0.76,0.65$\pm$0.48,0.05$\pm$0.21,0.18$\pm$0.38,0.64$\pm$0.48,0.35$\pm$0.48,4.91$\pm$3.88,2.85$\pm$1.77,112.90$\pm$72.86,35.93$\pm$6.27,16.04$\pm$9.31
\makecell{Bortezomib \\ (n=512)},63.55$\pm$10.62,39.65$\pm$48.96,1.87$\pm$0.49,0.88$\pm$0.79,0.62$\pm$0.49,0.05$\pm$0.22,0.19$\pm$0.40,0.64$\pm$0.48,0.34$\pm$0.48,5.13$\pm$3.98,2.80$\pm$1.80,118.21$\pm$79.43,35.93$\pm$6.37,16.91$\pm$9.02
\makecell{ASCT \\ (n=364)},59.49$\pm$9.18,43.68$\pm$49.67,1.79$\pm$0.52,0.76$\pm$0.67,0.65$\pm$0.48,0.04$\pm$0.19,0.18$\pm$0.38,0.64$\pm$0.48,0.36$\pm$0.48,4.45$\pm$3.68,2.73$\pm$1.79,105.25$\pm$68.38,36.93$\pm$6.18,18.95$\pm$8.89
\makecell{Non-ASCT \\ (n=298)},67.77$\pm$10.81,37.92$\pm$48.60,1.94$\pm$0.44,1.03$\pm$0.83,0.66$\pm$0.48,0.06$\pm$0.24,0.18$\pm$0.39,0.65$\pm$0.48,0.33$\pm$0.47,5.48$\pm$4.04,2.99$\pm$1.75,122.24$\pm$77.09,34.70$\pm$6.18,12.48$\pm$8.56


In [361]:
print(cohort_str_df.T.to_latex(escape=False, column_format='cccccc').replace("\\\n", "\\ \hline\n"))

\begin{tabular}{cccccc}
\toprule
{} & \makecell{All \\ (n=995)} & \makecell{RNA-Seq \\ (n=662)} & \makecell{Bortezomib \\ (n=512)} & \makecell{ASCT \\ (n=364)} & \makecell{Non-ASCT \\ (n=298)} \\ \hline
\midrule
\makecell{Age \\ (years)}                     &           63.31$\pm$10.47 &               63.22$\pm$10.76 &                  63.55$\pm$10.62 &             59.49$\pm$9.18 &                67.77$\pm$10.81 \\ \hline
\makecell{Gender \\ (\% female)}              &           40.10$\pm$49.03 &               41.09$\pm$49.24 &                  39.65$\pm$48.96 &            43.68$\pm$49.67 &                37.92$\pm$48.60 \\ \hline
ISS                                           &             1.85$\pm$0.45 &                 1.85$\pm$0.49 &                    1.87$\pm$0.49 &              1.79$\pm$0.52 &                  1.94$\pm$0.44 \\ \hline
ECOG                                          &             0.86$\pm$0.73 &                 0.88$\pm$0.76 &                    0.88$\pm$0.79 &       