# Import Required Libraries

In [1]:
from sklearn.model_selection import StratifiedKFold
from model_selection import ClusteredStratifiedKFold

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from scipy.stats import ks_2samp

from model import OptimizedKMeans
from model import GeneticProfiling
from model import GeneticClustering
from model import dae_wrapper
from model import Dense
from model import ConvDense

from constants import FISH_VARIABLE_NAMES

from correlation import select_genes, select_genes_mic
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [2]:
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')
clinical.iloc[:4,:2]

Unnamed: 0_level_0,response_best_response_first_line,response_days_to_disease_progression
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
MMRF1007,0.0,0.0
MMRF1011,0.0,1.0
MMRF1013,1.0,1.0
MMRF1014,0.0,


In [3]:
clinical = pd.read_csv('data/clinical.tsv', sep='\t', index_col='ID')

RESP_VAR_NAME = 'response_best_response_first_line'

clinical.dropna(subset=[RESP_VAR_NAME], inplace=True)

# del clinical['response_best_response_first_line']
del clinical['response_days_to_disease_progression']
del clinical['response_days_to_first_response']
del clinical['response_best_response_and_days_to_first_therapy']

#for var in FISH_VARIABLE_NAMES:
#    del clinical[var]

genefpkm = pd.read_csv('data/gene_fpkm.tsv', sep='\t', index_col='ID')

selected_index = clinical.join(genefpkm, how='inner').index

clinical = clinical.loc[selected_index,:]

clinical[RESP_VAR_NAME] = clinical[RESP_VAR_NAME].astype(int)

therapy_class = clinical['therapy_first_line_class']

del clinical['therapy_first_line_class']

# remove patients with more than five missing clinical variables
# clinical = clinical[clinical.isna().T.sum() < 10]

genefpkm = genefpkm.loc[selected_index,:]

for g in genefpkm.loc[:, genefpkm.sum() == 0].columns:
    del genefpkm[g]

genefpkm = genefpkm.dropna(axis=1, how='any')

print("Gene expressions {}".format(genefpkm.shape))
    
clinical.iloc[:6,:6]

Gene expressions (724, 49387)


Unnamed: 0_level_0,response_best_response_first_line,cmmc,ecog_ps,cell_markers,percent_aneuploid,percent_plama_cells_bone_marrow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MMRF1021,0,,PS 1 (Restricted in physically strenuous activ...,CD13,0.0,4.9
MMRF1024,0,,PS 1 (Restricted in physically strenuous activ...,CD117,11.0,6.0
MMRF1029,0,,PS 1 (Restricted in physically strenuous activ...,CD117,0.0,8.4
MMRF1030,1,,PS 1 (Restricted in physically strenuous activ...,CD117,15.4,9.6
MMRF1031,0,,PS 0 (Fully Active),CD117,18.3,10.1
MMRF1032,0,,PS 2 (Ambulatory and capable of all selfcare),CD117,20.7,11.1


In [4]:
for c in clinical.columns:
    print('{} ({}): {}'.format(c, clinical[c].dtype, list(clinical[c].unique())[:4]))

response_best_response_first_line (int32): [0, 1]
cmmc (float64): [nan, 5913.0, 22169.0, 3864.0]
ecog_ps (object): ['PS 1 (Restricted in physically strenuous activity)', 'PS 0 (Fully Active)', 'PS 2 (Ambulatory and capable of all selfcare)', nan]
cell_markers (object): ['CD13', 'CD117', 'CD138', nan]
percent_aneuploid (float64): [0.0, 11.0, 15.4, 18.3]
percent_plama_cells_bone_marrow (float64): [4.9, 6.0, 8.4, 9.6]
percent_plama_cells_peripherical_blood (float64): [0.0, 0.1, 0.6, 0.03]
creatinine (float64): [88.4, 123.76, 106.08, 55.692]
iss (float64): [1.0, 2.0, 3.0, nan]
absolute_neutrophil (float64): [2.4, 2.3, 2.6, 2.5]
platelet (float64): [216.0, 188.0, 219.0, 215.0]
wbc_x10_10_9_l (float64): [5.2, 4.3, 4.0, 4.7]
bun (float64): [8.925, 11.424, 5.355, nan]
glucose (float64): [4.675, 4.785, 5.995, 6.27]
total_protein (float64): [11.5, 8.7, 9.4, 9.8]
albumin (float64): [39.0, 40.0, 36.0, 37.0]
beta_2_microglobulin (float64): [2.1, 3.61, 1.9, 1.98]
calcium (float64): [2.4, 2.45, 2.25,

In [5]:
from collections import Counter
print(dict(Counter(clinical[RESP_VAR_NAME])))
clinical.head()

{0: 553, 1: 171}


Unnamed: 0_level_0,response_best_response_first_line,cmmc,ecog_ps,cell_markers,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,...,t_8_14_mafa,t_8_14_myc,lga,lgg,lgl_kappa,lgl_lambda,lgm,m_protein,therapy_first_line,first_line_transplant
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF1021,0,,PS 1 (Restricted in physically strenuous activ...,CD13,0.0,4.9,0.0,88.4,1.0,2.4,...,Not Detected,Not Detected,0.66,70.5,48.57,3.28,0.4,3.05,Bor-Len-Dex,Yes
MMRF1024,0,,PS 1 (Restricted in physically strenuous activ...,CD117,11.0,6.0,0.0,123.76,2.0,2.3,...,,,,,9.66,0.87,,2.6,,No
MMRF1029,0,,PS 1 (Restricted in physically strenuous activ...,CD117,0.0,8.4,0.0,106.08,1.0,2.6,...,Not Detected,Not Detected,0.69,27.99,27.04,0.74,0.43,1.8,Bor-Len-Dex,No
MMRF1030,1,,PS 1 (Restricted in physically strenuous activ...,CD117,15.4,9.6,0.0,55.692,1.0,2.5,...,Not Detected,Detected,0.24,41.63,,7.3,0.23,3.55,Bor-Len-Dex,Yes
MMRF1031,0,,PS 0 (Fully Active),CD117,18.3,10.1,0.0,81.328,1.0,10.29,...,Not Detected,Not Detected,15.2,6.47,23.59,1.166,0.76,1.52,Bor-Len-Dex,No


# Transforming Qualitative Variables into Dummy Ones

In [6]:
clinical['first_line_transplant'] = clinical['first_line_transplant'].replace('Yes', 1).replace('No', 0)
all_therapy = clinical['therapy_first_line']

for column in clinical:
    
    values = clinical[column]
    
    if values.dtype == 'object':
        
        values = pd.get_dummies(values)
        
        n_values = values.shape[1]
        
        values.columns = [column + '_' + str(c).lower().replace(' ', '_') for c in values.columns]
    
        del clinical[column]
        
        if n_values == 2:
            values = values.iloc[:, [0]]
        
        clinical = clinical.join(values, how='inner')

clinical = clinical.fillna(0)

print('Clinical data set with {} samples and {} features'.format(*clinical.shape))
clinical.iloc[:8,:]

Clinical data set with 724 samples and 71 features


Unnamed: 0_level_0,response_best_response_first_line,cmmc,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet,wbc_x10_10_9_l,...,t_4_14_whsc1_detected,t_6_14_ccnd3_detected,t_8_14_mafa_detected,t_8_14_myc_detected,therapy_first_line_bor,therapy_first_line_bor-cyc-dex,therapy_first_line_bor-dex,therapy_first_line_bor-len-dex,therapy_first_line_len,therapy_first_line_len-dex
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF1021,0,0.0,0.0,4.9,0.0,88.4,1.0,2.4,216.0,5.2,...,1,0,0,0,0,0,0,1,0,0
MMRF1024,0,0.0,11.0,6.0,0.0,123.76,2.0,2.3,188.0,4.3,...,0,0,0,0,0,0,0,0,0,0
MMRF1029,0,0.0,0.0,8.4,0.0,106.08,1.0,2.6,219.0,4.0,...,0,0,0,0,0,0,0,1,0,0
MMRF1030,1,0.0,15.4,9.6,0.0,55.692,1.0,2.5,215.0,4.7,...,0,0,0,1,0,0,0,1,0,0
MMRF1031,0,0.0,18.3,10.1,0.0,81.328,1.0,10.29,385.0,12.4,...,0,0,0,0,0,0,0,1,0,0
MMRF1032,0,0.0,20.7,11.1,0.0,70.72,2.0,1.3,166.0,2.5,...,0,0,0,0,0,1,0,0,0,0
MMRF1033,0,0.0,18.5,12.0,0.0,79.56,1.0,3.99,307.0,7.4,...,0,0,0,0,0,0,0,0,0,1
MMRF1037,0,0.0,20.7,17.0,0.0,70.72,1.0,3.2,361.0,5.4,...,0,0,0,0,0,0,0,0,0,1


In [7]:
genefpkm.iloc[:8,:8]

Unnamed: 0_level_0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1021,17.9842,0.0,36.6665,6.2914,4.54056,0.401168,43.2985,25.6721
MMRF1024,11.9624,0.0,22.0121,5.38688,1.0656,4.0726,0.09242,21.3765
MMRF1029,17.6518,0.0,38.2082,6.61449,2.55321,0.237359,0.158369,55.2849
MMRF1030,1.59596,0.0,35.0286,4.62864,1.31648,0.642315,0.177913,40.7695
MMRF1031,0.030286,0.0,42.3708,5.34199,5.05216,0.439662,1.60464,63.2392
MMRF1032,0.681066,0.0,30.2787,3.01076,1.62869,0.587525,1.09192,20.0476
MMRF1033,0.595213,0.0,29.5319,6.10217,1.29061,2.33192,0.364109,25.0889
MMRF1037,0.513027,0.0,29.3762,4.32193,1.62923,0.131227,0.401753,26.2385


# Removing Bias from Therapy

In [8]:
therapy_columns = []

for c in clinical.columns:
    if 'therapy_first_line' in c:
        therapy_columns.append(c)

to_delete = []

for a in list(clinical[therapy_columns].loc[:,clinical[therapy_columns].sum() < 10].columns):
    to_delete += list(clinical.loc[clinical[a] == 1,:].index)
    del clinical[a]
    all_therapy = all_therapy[all_therapy.str.lower() != a.replace('therapy_first_line_', '')]
    
clinical = clinical.loc[~clinical.index.isin(to_delete),:]

genefpkm = genefpkm.loc[~genefpkm.index.isin(to_delete),:]

therapy_columns = [t for t in therapy_columns if t in clinical.columns]

print('Valid Therapies')
for t in therapy_columns:
    print('* {}'.format(t.replace('therapy_first_line_', '')))

Valid Therapies
* bor-cyc-dex
* bor-dex
* bor-len-dex
* len-dex


In [9]:
%matplotlib inline

from collections import Counter
if False:
    for c in selected_feats:
        print(clinical[c].unique())
        print(Counter(clinical[c]))

        def fff(x):
            vvv = clinical[c].quantile([0, .1,.2,.3,.4,.5,.6,.7,.8,.9,1.]).values
            for i, (a, b) in enumerate(zip(vvv[:-1], vvv[1:])):
                if a <= x < b:
                    return i
            return 10

        clinical['{}_cat'.format(c)] = clinical[c].apply(fff)

        clinical['{}_cat'.format(c)].hist()

# THERAPY SENSITIVITY MODELLING

In [10]:
df = pd.read_csv('output/result_10_fold.csv')[['fold', 'n_genes', 'auc_valid', 
                                               'tp', 'fp', 'tn', 'fn']]

df_best = df.groupby('fold').apply(lambda x: pd.Series({
    'n_genes': x.set_index('n_genes')['auc_valid'].idxmax()
})).reset_index()

df_best['n_genes'] = df_best['n_genes'].astype(int)

df_best = df_best.set_index('fold')

df_best.T

KeyError: 'n_genes'

## 10-fold Experiment

In [None]:
from collections import Counter
from scipy.special import erfinv
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from optimization import lightgbm_optimizer
from evaluation import optimize_threshold, classification_metrics, ks_score

import time

RANDOM_STATE = 10
N_FOLDS = 10

simulation = None

x, y = clinical.values[:, 1:], clinical.values[:, 0]

kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

def therapy_from_dummy(row):
    global therapy_columns
    try:
        return therapy_columns[row.tolist().index(1)]
    except Exception as e:
        return None 
    

label_encode = LabelEncoder()    
sss = pd.DataFrame({'therapy': label_encode.fit_transform(all_therapy.fillna('Non-therapy').tolist())})
sss['y'] = y

for fold, (train_index, valid_index) in enumerate(kfold.split(x, sss.apply(lambda p: str(int(p['therapy'])) + str(int(p['y'])), axis=1))):
    
    n_features = df_best.iloc[fold,:]['n_genes'].tolist()
    
    fold += 1
    
    #######################################################################################################
    # Split train & valid
    #######################################################################################################
    
    start = time.time()

    response_train = clinical.iloc[train_index, [0]]
    response_valid = clinical.iloc[valid_index, [0]]

    clinical_train_ = clinical.iloc[train_index, 1:]
    clinical_valid_ = clinical.iloc[valid_index, 1:]

    genefpkm_scaler_minmax = MinMaxScaler()

    genefpkm_train_ = genefpkm.iloc[train_index, :]
    genefpkm_train_ = pd.DataFrame(genefpkm_scaler_minmax.fit_transform(genefpkm_train_.values), 
        columns=genefpkm_train_.columns, index=genefpkm_train_.index)

    genefpkm_valid_ = genefpkm.iloc[valid_index, :]
    genefpkm_valid_ = pd.DataFrame(genefpkm_scaler_minmax.transform(
        genefpkm_valid_.values), columns=genefpkm_valid_.columns, index=genefpkm_valid_.index)

    gene_normalization_time = time.time() - start

    #######################################################################################################
    # Select gene expressions
    #######################################################################################################
    
    # export feature selection result
    selected_genes = pd.read_csv('output/brfl/selected_genes_{}_of_{}_fold.csv'.format(fold, N_FOLDS))['gene'].to_list()
    selected_feats = pd.read_csv('output/brfl/selected_feats_{}_of_{}_fold.csv'.format(fold, N_FOLDS))['gene'].to_list() # just a bad name, it shold be feat
    
    # force therapy columns to be selected
    selected_feats = list(set(selected_feats + therapy_columns))

    #######################################################################################################
    # Remove unselected features
    #######################################################################################################

    clinical_train = clinical_train_.loc[:,selected_feats].copy()
    clinical_valid = clinical_valid_.loc[:,selected_feats].copy()

    genefpkm_train = genefpkm_train_.loc[:,selected_genes[:n_features]].copy()
    genefpkm_valid = genefpkm_valid_.loc[:,selected_genes[:n_features]].copy()

    #######################################################################################################
    # Genetic Profiling
    #######################################################################################################

    start = time.time()

    file_name = 'output/brfl/kmeans_genetic_profiling_{}_of_{}_fold_{}_genes.pkl'.format(
        fold, N_FOLDS, n_features)
    
    with open(file_name, 'rb') as file:
        genetic_profiling = pickle.load(file)

    profiling_train = to_data_frame(genetic_profiling.transform(genefpkm_train), 
                                    prefix='PV', index=genefpkm_train.index)    
    clinical_train = pd.concat([clinical_train, profiling_train], axis=1)

    profiling_valid = to_data_frame(genetic_profiling.transform(genefpkm_valid), 
                                    prefix='PV', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, profiling_valid], axis=1)    

    #######################################################################################################
    # Gene Clustering
    #######################################################################################################

    file_name= 'output/brfl/kmeans_genetic_clustering_{}_of_{}_fold_{}_genes.pkl'.format(
        fold, N_FOLDS, n_features)       
    
    with open(file_name, 'rb') as file:
        genetic_clustering = pickle.load(file)

    gene_cluster_train = to_data_frame(genetic_clustering.transform(genefpkm_train), 
                                       prefix='GC', index=genefpkm_train.index)
    gene_cluster_valid = to_data_frame(genetic_clustering.transform(genefpkm_valid), 
                                       prefix='GC', index=genefpkm_valid.index)        

    clinical_train = pd.concat([clinical_train, gene_cluster_train], axis=1)
    clinical_valid = pd.concat([clinical_valid, gene_cluster_valid], axis=1)

    #######################################################################################################
    # Normalizing Clinical Data
    #######################################################################################################

    clinical_scaler_minmax = MinMaxScaler()

    clinical_train__ = clinical_scaler_minmax.fit_transform(clinical_train)
    clinical_train = pd.DataFrame(clinical_train__, index=clinical_train.index, columns=clinical_train.columns)
    clinical_train = clinical_train.fillna(0)

    clinical_valid__ = clinical_scaler_minmax.transform(clinical_valid)
    clinical_valid = pd.DataFrame(clinical_valid__, index=clinical_valid.index, columns=clinical_valid.columns)
    clinical_valid = clinical_valid.fillna(0)

    #######################################################################################################
    # Denoising Autoencoder
    #######################################################################################################

    file_name = '{}_of_{}_fold_{}_genes'.format(fold, N_FOLDS, n_features)
    
    dda_train, dda_valid = dae_wrapper(genefpkm_train, genefpkm_valid, RANDOM_STATE, file_name, predict=True)

    dda_scaler_minmax = MinMaxScaler()

    dda_train = dda_scaler_minmax.fit_transform(dda_train)
    dda_train = pd.DataFrame(dda_train, index=genefpkm_train.index)
    dda_train.columns = [str(col) + '_DDA' for col in genefpkm_train.columns]

    dda_valid = dda_scaler_minmax.transform(dda_valid)
    dda_valid = pd.DataFrame(dda_valid, index=genefpkm_valid.index)
    dda_valid.columns = [str(col) + '_DDA' for col in genefpkm_valid.columns]

    dda_time = time.time() - start

    #######################################################################################################
    # Joining all features
    #######################################################################################################

    x_train = clinical_train.join(genefpkm_train, how='inner').join(dda_train, how='inner')
    x_valid = clinical_valid.join(genefpkm_valid, how='inner').join(dda_valid, how='inner')

    # x_train = x_train.loc[x_train[therapy_columns].astype(int)
    # .apply(lambda x: np.sum(x.tolist()), axis=1) != 0]
    # response_train = response_train.loc[(x_train[therapy_columns]
    # .astype(int).apply(lambda x: np.sum(x.tolist()), axis=1) != 0).values]

    # x_valid = x_valid.loc[x_valid[therapy_columns].astype(int)
    # .apply(lambda x: np.sum(x.tolist()), axis=1) != 0]
    # response_valid = response_valid.loc[(x_valid[therapy_columns]
    # .astype(int).apply(lambda x: np.sum(x.tolist()), axis=1) != 0).values]

    #######################################################################################################
    # Light GBM Load
    #######################################################################################################

    model_name = 'output/brfl/classifier_{}_of_{}_fold_with_{}_genes.lgbm'.format(fold, N_FOLDS, n_features)

    with open(model_name, 'rb') as file:
        gbm = pickle.load(file)

    #######################################################################################################
    # Light GBM Inference
    #######################################################################################################
    
    local_simulation = pd.DataFrame({'ID': x_valid.index})
    
    local_simulation['ACTUAL_THERAPY'] = x_valid[therapy_columns].idxmax(axis=1).fillna('non-therapy').tolist()
    local_simulation['ACTUAL_THERAPY_INDEX'] = x_valid[therapy_columns].max(axis=1).tolist()
    
    local_simulation['ACTUAL_THERAPY'] = local_simulation[['ACTUAL_THERAPY', 'ACTUAL_THERAPY_INDEX']].apply(
        lambda x: x['ACTUAL_THERAPY'] if x['ACTUAL_THERAPY_INDEX'] != 0 else 'non-therapy', axis=1)
    
    del local_simulation['ACTUAL_THERAPY_INDEX']
    
    for t1 in therapy_columns + ['non-therapy']:
        for t2 in therapy_columns:
            x_valid[t2] = int(t1 == t2)
        local_simulation[t1] = gbm.predict(x_valid.values)
        
    local_simulation['NEW_THERAPY'] = local_simulation[therapy_columns  + ['non-therapy']].idxmax(axis=1)

    # local_simulation['NEW_THERAPY'] = local_simulation.apply(lambda x: x['ACTUAL_THERAPY'] if x[x['ACTUAL_THERAPY']] == x[therapy_columns + ['non-therapy']].max() else x['NEW_THERAPY'])
    
    simulation = local_simulation if simulation is None else pd.concat([simulation, local_simulation])
    
simulation['NEW_THERAPY'] = simulation.apply(
    lambda x: x['ACTUAL_THERAPY'] if x[x['ACTUAL_THERAPY']] == x[therapy_columns + ['non-therapy']].max() else x['NEW_THERAPY'], axis=1)

simulation.head()

In [11]:
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

def change_proportion(x):
    vvv = x['ACTUAL_THERAPY'].tolist()

    return x.shape[0] / (simulation['ACTUAL_THERAPY'] == np.unique(vvv)[0]).sum()

h = pd.DataFrame(simulation[['NEW_THERAPY', 'ACTUAL_THERAPY']].groupby(
    ['ACTUAL_THERAPY', 'NEW_THERAPY']).apply(change_proportion).unstack())

# h['non-therapy'] = 0

h.columns = [c.replace('therapy_first_line_', '') for c in h.columns]
h.index = [c.replace('therapy_first_line_', '') for c in h.index]
h = h.sort_index()[sorted(h.columns)].fillna(0)

fig, ax = plt.subplots(1, 1, figsize = (5, 5), dpi=80)

sns.heatmap(h, vmin=0.0, vmax=1, square=True, linewidths=.5, annot=True, cmap='gray')

ax.set_ylabel('Actual Therapy')    
ax.set_xlabel('Simulated Therapy')

plt.show()

NameError: name 'simulation' is not defined

In [None]:
(simulation['ACTUAL_THERAPY'] != simulation['NEW_THERAPY']).sum() / simulation.shape[0]

In [None]:
simulation['new_therapy'] = (simulation['ACTUAL_THERAPY'] != simulation['NEW_THERAPY']).astype(int)
simulation.to_csv('output/simulation_result.csv', index=False, sep=',')
simulation.head()