In [None]:
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns
from make_clinical_dataset.epic.util import plot_count_over_time
from make_clinical_dataset.shared.constants import ROOT_DIR

In [None]:
# Date tag embedded in the name of the data folder to read (data_<DATE>).
DATE = '2025-03-29'
# Folder for that date tag, located under the project's ROOT_DIR.
DATA_DIR = f"{ROOT_DIR}/data/final/data_{DATE}"

In [None]:
# Load the treatment-centered table and derive the calendar year of each
# assessment; the year is the x-axis / grouping key for every plot below.
data = (
    pl.read_parquet(f'{DATA_DIR}/processed/treatment_centered_data.parquet')
    .with_columns(assessment_year=pl.col('assessment_date').dt.year())
)

# Features

In [None]:
# Age of patients over time: one box per assessment year.
# 'sex' is selected but not used by the plot as written; pass hue='sex' to
# sns.boxplot to split each year's box by it.
df = data.select('assessment_year', 'age', 'sex').to_pandas()
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x="assessment_year",
    y="age",
    native_scale=True,  # keep the year axis numeric instead of categorical
    color=".8",
    linecolor="#137",
    linewidth=.75,
    ax=ax,
)

In [None]:
# Line of therapy over time: one box per assessment year.
df = data.select('assessment_year', 'line_of_therapy').to_pandas()
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x="assessment_year",
    y="line_of_therapy",
    native_scale=True,  # keep the year axis numeric instead of categorical
    color=".8",
    linecolor="#137",
    linewidth=.75,
    ax=ax,
)

# Targets

In [None]:
# Target columns examined in the rest of the notebook, grouped by name family.
# The order is significant only in that it fixes the column order of `data`.
targ_cols = [
    # target_ED_* columns
    'target_ED_30d', 'target_ED_60d', 'target_ED_90d',
    # *_grade2plus / *_grade3plus columns
    'target_hemoglobin_grade2plus', 'target_hemoglobin_grade3plus',
    'target_neutrophil_grade2plus', 'target_neutrophil_grade3plus',
    'target_platelet_grade2plus', 'target_platelet_grade3plus',
    'target_bilirubin_grade2plus', 'target_bilirubin_grade3plus',
    'target_AKI_grade2plus', 'target_AKI_grade3plus',
    'target_ALT_grade2plus', 'target_ALT_grade3plus',
    'target_AST_grade2plus', 'target_AST_grade3plus',
    # *_3pt_change columns
    'target_anxiety_3pt_change',
    'target_depression_3pt_change',
    'target_drowsiness_3pt_change',
    'target_lack_of_appetite_3pt_change',
    'target_nausea_3pt_change',
    'target_pain_3pt_change',
    'target_shortness_of_breath_3pt_change',
    'target_tiredness_3pt_change',
    'target_well_being_3pt_change',
    # target_death_in_* columns
    'target_death_in_30d', 'target_death_in_365d',
]

# NOTE: from here on `data` holds only the year and the target columns, so the
# feature cells above can no longer be re-run without reloading `data`.
data = data.select(['assessment_year'] + targ_cols)

In [None]:
%%time
from ydata_profiling import ProfileReport
df = data.with_columns([pl.col(col).cast(pl.String).cast(pl.Categorical) for col in targ_cols]).to_pandas()
profile = ProfileReport(df, minimal=True)
profile.to_file("reports/targets.html")
profile.to_notebook_iframe()

In [None]:
# Target counts over time.
# 1) Reshape to long form: one row per (assessment_year, target, value).
counts = data.unpivot(index=['assessment_year'], variable_name='target', value_name='value')
# 2) Count rows per year / target / value; pl.len() yields the 'len' column
#    that is plotted on the y-axis below.
counts = counts.group_by(['assessment_year', 'target', 'value']).agg(pl.len())
# 3) Order by target name and hand over to pandas for plotting.
counts = counts.sort('target').to_pandas()
plot_count_over_time(counts, x='assessment_year', y='len', hue='value', catcol='target')

In [None]:
# Same plot restricted to rows whose target value equals 1.
plot_count_over_time(counts.loc[counts['value'] == 1], x='assessment_year', y='len', catcol='target')