In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Raise pandas' display limits so wide/long summary tables are shown in full
# (up to 100 rows and 100 columns) instead of being truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

In [None]:
# Notebook configuration: both values are interpolated into the parquet paths
# read by the loading cells below.
root_dir = '/cluster/projects/gliugroup/2BLAST'  # project root on the cluster
date = '2025-03-29'  # date tag embedded in the data file/directory names

# Raw

In [None]:
# Load the ESAS parquet data tagged with the configured date.
esas_path = f'{root_dir}/data/processed/ESAS/ESAS_{date}'
df = pd.read_parquet(esas_path)

In [None]:
# Survey names: number of rows recorded under each distinct `obs_name`.
obs_name_counts = df['obs_name'].value_counts()
obs_name_counts

In [None]:
# datetime
"""
Open question: why do ~177k rows have no timestamp?
Most likely their datetime is stored in the column Observations.Observation.meta.lastUpdated
in the {raw_data_dir}/observation_csv/*-observations-meta.csv files.

Because these rows make up only ~2.5% of the cases, they are ignored for now.

TODO: process the meta CSV files and merge them with the main datasets (why were they separated in the first place?)
"""
# Summarise, in a one-row table, how many rows have each combination of the
# two datetime columns populated.
main_date_col = "occurrence_datetime_from_order"
secondary_date_col = "effective_datetime"

# Boolean masks computed once; `~mask` is the "missing" counterpart of `notna()`.
has_main = df[main_date_col].notna()
has_secondary = df[secondary_date_col].notna()

pd.DataFrame(
    data=[[
        (has_main & ~has_secondary).sum(),
        (~has_main & ~has_secondary).sum(),
        (has_main & has_secondary).sum(),
        (~has_main & has_secondary).sum(),
    ]],
    columns=[f'Only {main_date_col}', 'Neither', 'Both', f'Only {secondary_date_col}'],
)

In [None]:
# String entries: one-row table counting rows by which of the numeric / string
# value columns is populated.
num_col, str_col = "obs_val_num", "obs_val_str"
num_missing = df[num_col].isna()
str_missing = df[str_col].isna()

# Label -> row count, in the order the columns should be displayed.
presence_counts = {
    'Only numerical entries': (~num_missing & str_missing).sum(),
    'Neither': (num_missing & str_missing).sum(),
    'Both': (~num_missing & ~str_missing).sum(),
    'Only string entries': (num_missing & ~str_missing).sum(),
}
pd.DataFrame(data=[list(presence_counts.values())], columns=list(presence_counts))

# Processed

In [None]:
# Load the interim symptom table; note this rebinds `df`, replacing the frame
# loaded in the "Raw" section.
symptom_path = f'{root_dir}/data/final/data_{date}/interim/symptom.parquet'
df = pd.read_parquet(symptom_path)

# Calendar year of each observation, used as the grouping key in the cells below.
df['obs_year'] = df['obs_date'].dt.year

# Every remaining column (i.e. not an identifier/date column) is treated as a symptom column.
non_symptom_cols = ['mrn', 'obs_year', 'obs_date']
symp_cols = df.columns.drop(non_symptom_cols)

In [None]:
# Cohort size (distinct `mrn` values) and the date span covered by the observations.
obs_dates = df['obs_date']
N = df['mrn'].nunique()
min_date = obs_dates.min()
max_date = obs_dates.max()
print(f'{N} patients from {min_date} to {max_date}')

In [None]:
# measurement count over time
# NOTE: The completion rate of these surveys dropped from ~70% to ~30% during COVID and has never rebounded.
# We might have to rethink the relevance of these as features if we want to use them for predicting future outcomes.

# Non-null measurements per symptom per year. GroupBy.count() counts non-NA
# cells per column, which is what the previous per-group
# `apply(lambda g: g[symp_cols].notnull().sum())` computed, but it is vectorized
# and does not rely on the `include_groups` keyword (pandas >= 2.2 only).
counts = df.groupby('obs_year')[list(symp_cols)].count()

# Long format (one row per year/symptom pair) so seaborn can facet by symptom.
counts = counts.reset_index().melt('obs_year', var_name='symptom', value_name='count')

# One line plot per symptom; axes are independent because counts differ per symptom.
g = sns.relplot(
    data=counts, x='obs_year', y='count', col='symptom', col_wrap=3, kind='line',
    facet_kws={'sharex': False, 'sharey': False}
)

In [None]:
# score distribution
# One histogram per symptom column, laid out on a two-column grid.
n_grid_cols = 2
n_grid_rows = int(np.ceil(len(symp_cols) / n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(10, 20))
axes = axes.flatten()

# zip stops at the shorter sequence, so each symptom gets exactly one axis.
for ax, col in zip(axes, symp_cols):
    sns.histplot(df[col], ax=ax, discrete=True)

# With an odd number of symptoms the grid has one spare panel; hide it rather
# than leaving an empty framed axis in the figure.
for unused_ax in axes[len(symp_cols):]:
    unused_ax.set_visible(False)

fig.tight_layout()
In [None]:
# Overall missingness: fraction of null cells in each column, sorted ascending
# (least-missing columns first).
missing_fraction = df.isna().mean()
missing_fraction.sort_values()

In [None]:
# Patients over time: number of distinct `mrn` values with at least one row in
# each observation year, shown as a bar chart.
patients_per_year = df.groupby('obs_year')['mrn'].nunique()
patients_per_year.plot(kind='bar')

In [None]:
# surveys per patient
# Average number of rows per patient within each observation year (each row is
# counted as one survey, as the previous `apply(len)` did).
# A single two-key groupby replaces the nested groupby.apply calls: it is
# vectorized and does not need the `include_groups` keyword (pandas >= 2.2 only).
# observed=True only matters if a key is categorical; it keeps the result
# restricted to (year, patient) pairs that actually occur.
rows_per_patient_year = df.groupby(['obs_year', 'mrn'], observed=True).size()
rows_per_patient_year.groupby(level='obs_year').mean().plot(kind='bar')