In [None]:
%%capture
# %%capture (must stay the first line of the cell) suppresses this cell's output,
# e.g. the directory path that %cd prints.
# Move two directory levels up from the notebook's location so that the relative
# './data/...' and './result/...' paths and the `src` / `common` imports used below resolve
# (presumably this lands on the project root - confirm against the repository layout).
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# moves the working directory up another two levels.
%cd ../../
# Reload imported modules before each cell execution so edits to the project's
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from common.src.engineer import get_change_since_prev_session
from common.src.prep import Splitter

from src.prepare.filter import drop_samples_outside_study_date, keep_only_one_per_week
from src.label import get_symptom_labels
from src.summarize import get_patient_characteristics, pre_and_post_treatment_missingness_summary

# Missingness

In [None]:
# Symptom survey table; passed to the labelling step at the end of the pipeline below.
symp = pd.read_parquet('./data/external/symptom.parquet.gzip')

# Load the treatment-centered dataset and run the preparation steps in order.
# Each step takes the frame produced by the previous one and returns a new frame.
data = (
    pd.read_parquet('./data/treatment_centered_clinical_dataset.parquet.gzip')
    .pipe(get_change_since_prev_session)
    .pipe(keep_only_one_per_week)
    .pipe(drop_samples_outside_study_date)
    .pipe(get_symptom_labels, symp)
)

Getting change since last session...: 100%|██████████| 9297/9297 [00:02<00:00, 3728.90it/s]
Getting the first sessions of a given week...: 100%|██████████| 9297/9297 [00:00<00:00, 22560.71it/s]
Getting symptom labels...: 100%|██████████| 1096/1096 [00:06<00:00, 180.96it/s]
Getting symptom labels...: 100%|██████████| 1097/1097 [00:06<00:00, 169.34it/s]
Getting symptom labels...: 100%|██████████| 1097/1097 [00:06<00:00, 167.57it/s]
Getting symptom labels...: 100%|██████████| 1097/1097 [00:06<00:00, 157.84it/s]


In [None]:
# Compute rates of ESAS symptom score missingness in the cohort
symps = ['nausea', 'appetite', 'pain', 'shortness_of_breath', 'tiredness', 'drowsiness', 'depression', 'anxiety', 'well_being']
# Column names derived from the symptom list:
#   esas_<symptom>                     - the score column passed as the pre-treatment column
#   target_esas_<symptom>_survey_date  - passed as the post-treatment column
#   target_esas_<symptom>_change       - passed as the event column
pretreatment_cols = [f'esas_{symptom}' for symptom in symps]
posttreatment_cols = [f'target_{col}_survey_date' for col in pretreatment_cols]
event_cols = [f'target_{col}_change' for col in pretreatment_cols]

# Split the data into development and test cohorts at the given date.
splitter = Splitter()
dev_cohort, test_cohort = splitter.temporal_split(data, split_date='2017-10-01')

# Summarize each cohort, write one CSV per cohort, and keep every summary keyed by
# cohort name so the table displayed below does not depend on which iteration ran last.
missingness_summaries = {}
for name, cohort in {'test': test_cohort, 'dev': dev_cohort}.items():
    result = pre_and_post_treatment_missingness_summary(cohort, pretreatment_cols, posttreatment_cols, event_cols)
    result.to_csv(f'./result/tables/missingness_summary_{name}_cohort.csv')
    missingness_summaries[name] = result

# Display the development-cohort summary (the same table the loop previously left in `result`).
missingness_summaries['dev'][['any_missingness_trt', 'target_missingness_trt', 'event_rate_trt', 'any_missingness_mrn', 'event_rate_mrn']]

Unnamed: 0,any_missingness_trt,target_missingness_trt,event_rate_trt,any_missingness_mrn,event_rate_mrn
esas_nausea,10392 (41.6),4647 (24.1),1829 (12.5),2062 (70.7),801 (31.8)
esas_appetite,10367 (41.5),4621 (24.0),1949 (13.3),2055 (70.5),891 (35.4)
esas_pain,10356 (41.4),4633 (24.0),1955 (13.4),2062 (70.7),893 (35.5)
esas_shortness_of_breath,10366 (41.5),4627 (24.0),2152 (14.7),2064 (70.8),912 (36.3)
esas_tiredness,10367 (41.5),4632 (24.1),2695 (18.4),2063 (70.8),1111 (44.1)
esas_drowsiness,10401 (41.6),4652 (24.2),2270 (15.6),2063 (70.8),1001 (39.8)
esas_depression,10400 (41.6),4637 (24.1),2101 (14.4),2065 (70.8),923 (36.7)
esas_anxiety,10403 (41.6),4643 (24.1),2103 (14.4),2069 (71.0),934 (37.2)
esas_well_being,10551 (42.2),4666 (24.4),2517 (17.4),2079 (71.3),1079 (43.0)
Mean,10400 (41.6),4639 (24.1),2174 (14.9),2064 (70.8),949 (37.7)


In [None]:
# Compare the samples that are missing the ESAS symptom target with the samples that have it.
# A single symptom is used to decide whether the target is missing.
col = 'esas_nausea'

# The five most common cancer sites and regimens are taken from the development cohort
# and reused for both cohorts.
cancer_cols = [column for column in dev_cohort.columns if column.startswith('cancer_site')]
top_cancers = dev_cohort[cancer_cols].sum().sort_values(ascending=False)[:5].index
top_regimens = dev_cohort['regimen'].value_counts()[:5].index

result = {}
for name, cohort in {'dev': dev_cohort, 'test': test_cohort}.items():
    target_missing = cohort[f'target_{col}_change'].isnull()
    groups = {'No Scores': cohort[target_missing], 'Has Scores': cohort[~target_missing]}

    # Treatment level: every row of the group.
    for label, group in groups.items():
        result[(name, label, 'Treatments')] = get_patient_characteristics(group, top_regimens, top_cancers)

    # Patient level: one row per 'mrn'.
    # NOTE(review): GroupBy.last() returns the last non-null value of each column, not the
    # last row as a whole - confirm this is the intended per-patient aggregation.
    for label, group in groups.items():
        per_patient = group.groupby('mrn').last()
        result[(name, label, 'Patients')] = get_patient_characteristics(per_patient, top_regimens, top_cancers)

# Keys are (cohort, score availability, level) tuples, one column per key.
result = pd.DataFrame(result)
result.to_csv('./result/tables/missingness_cohort_characteristics.csv')