# Baseline Characteristics of population
Shows descriptive statistics of the discharged patients in the `algo_vs_doc` project.

In [None]:
import os
import pandas as pd

# Project root, relative to the working directory; the data and tables
# folders are resolved against this path.
project_root = "../"

## Settings and Data

In [None]:
# Sub-folder names that are joined onto the project root
data_folder, tables_folder = "data", "tables"

# Merged data file of each hospital, keyed by hospital code
hospitals = dict(
    vumc="merged_data_vumc.tsv",
    lumc="merged_data_lumc.tsv",
)

## Load Data

In [None]:
# Read each hospital's merged, tab-separated file into its own data frame
lumc_path = os.path.join(project_root, data_folder, hospitals["lumc"])
lumc = pd.read_csv(lumc_path, sep="\t")

vumc_path = os.path.join(project_root, data_folder, hospitals["vumc"])
vumc = pd.read_csv(vumc_path, sep="\t")

## LUMC baseline characteristics

In [None]:
# Number of rows in the LUMC data frame
len(lumc.index)

In [None]:
# Length of stay in hours: discharge time minus admission timestamp,
# converted from seconds (3600 seconds per hour).
discharge_ts = pd.to_datetime(lumc["discharge_time"])
admission_ts = pd.to_datetime(lumc["admission_timestamp"])
lumc["length_of_stay_hours"] = (discharge_ts - admission_ts).dt.total_seconds() / 3600

In [None]:
# Summary statistics of the LUMC age column
lumc.loc[:, "age"].describe()

In [None]:
# Frequency of each sex value at LUMC; missing values are counted as "unknown"
(
    lumc["sex"]
    .fillna("unknown")
    .value_counts()
)

In [None]:
# Frequency of each ICU specialty at LUMC; missing values are counted as "unknown"
(
    lumc["icu_specialty"]
    .fillna("unknown")
    .value_counts()
)

In [None]:
# Summary statistics of the LUMC length of stay (hours)
lumc.loc[:, "length_of_stay_hours"].describe()

## VUMC baseline characteristics

In [None]:
# Number of rows in the VUmc data frame
len(vumc.index)

In [None]:
# Summary statistics of the VUmc age column
vumc.loc[:, "age"].describe()

In [None]:
# Frequency of each sex value at VUmc; missing values are counted as "unknown"
(
    vumc["sex"]
    .fillna("unknown")
    .value_counts()
)

In [None]:
# Frequency of each ICU specialty at VUmc; missing values are counted as "unknown"
(
    vumc["icu_specialty"]
    .fillna("unknown")
    .value_counts()
)

In [None]:
# Summary statistics of the VUmc length of stay (hours).
# NOTE(review): no visible cell derives `length_of_stay_hours` for VUmc (it is
# computed from timestamps for LUMC only) — presumably the VUmc file already
# contains this column; confirm.
vumc.loc[:, "length_of_stay_hours"].describe()

## Create Baseline Characteristics table

In [None]:
# Common columns containing baseline data
cols = ['age', 'sex', 'icu_specialty', 'length_of_stay_hours', 'outcome', 'hospital']

# Tag every record with its hospital, then stack both data frames. A fresh
# 0..n-1 index keeps the row labels unique, so the boolean masks below always
# line up with `merged` row by row.
lumc['hospital'] = 'lumc'
vumc['hospital'] = 'vumc'
merged = pd.concat([lumc[cols], vumc[cols]], ignore_index=True)

# Cleaned/derived columns are group-independent, so compute them once up front
merged["sex"] = merged["sex"].fillna("unknown")
merged['length_of_stay_days'] = merged['length_of_stay_hours'] / 24

# One boolean row selection per column of the baseline table
groups = {
    'All': ~merged['hospital'].isna(),  # all records
    'VUmc': merged['hospital'] == 'vumc',
    'LUMC': merged['hospital'] == 'lumc',
}

# `icu_specialty` codes that make up the two top-level specialty categories
select_surgical = ['cardio_surgery', 'surgery', 'urology', 'ent', 'neuro_surgery', 'mka']
select_medical = [
    'internal_medicine', 'internal_medicine_hematology', 'psychiatry', 'neurology',
    'cardiology', 'pulmonary_medicine', 'pulmonary', 'gastroenterology',
]

# (row label, `icu_specialty` codes counted in that row), in table order
specialty_rows = [
    ('Surgical', select_surgical),
    ('Cardiothoracic Surgery', ['cardio_surgery']),
    ('General Surgery', ['surgery']),
    ('Neurosurgery', ['neuro_surgery']),
    ('Otolaryngology', ['ent']),
    ('Other surgery', ['urology', 'mka']),
    ('Medical', select_medical),
    ('Internal Medicine', ['internal_medicine', 'internal_medicine_hematology']),
    ('Gastro-enterology', ['gastroenterology']),
    ('Cardiology', ['cardiology']),
    ('Neurology', ['neurology']),
    ('Pulmonary Medicine', ['pulmonary_medicine', 'pulmonary']),
    ('Other medical', ['psychiatry']),
    ('Unspecified', ['other', 'intensive_care']),
]


def _format_n_pct(mask, total):
    """Format the number of selected rows as ``"n (pct)"``.

    Args:
        mask: Boolean Series; rows that are True are counted.
        total: Denominator for the percentage (number of rows in the group).

    Returns:
        The count followed by its percentage of *total* with one decimal,
        e.g. ``"12 (3.4)"``. The percentage is ``nan`` when *total* is 0.
    """
    n = int(mask.sum())
    pct = 100 * n / total if total else float('nan')
    return f"{n} ({pct:.1f})"


# Create baseline dataframe (table): one row per characteristic, one column per group
baseline = pd.DataFrame()

for group, select in groups.items():
    # ICU discharges, n
    total = int(select.sum())
    baseline.loc['ICU discharges, n', group] = f"{total}"

    # Age, mean (SD)
    age = merged.loc[select, 'age']
    baseline.loc['Age, mean (SD)', group] = f"{age.mean():.0f} ({age.std():.0f})"

    # Female, n (%)
    baseline.loc['Female, n (%)', group] = _format_n_pct(
        select & (merged['sex'] == 'female'), total
    )

    # Admitting specialty, n (%) -- header row; the specialty rows follow
    baseline.loc['Admitting specialty, n (%)', group] = ""
    for label, codes in specialty_rows:
        baseline.loc[label, group] = _format_n_pct(
            select & merged['icu_specialty'].isin(codes), total
        )

    # Length of Stay, days, median (IQR)
    los = merged.loc[select, 'length_of_stay_days']
    baseline.loc['Length of Stay, days, median (IQR)', group] = (
        f"{los.median():.1f} ({los.quantile(0.25):.1f}-{los.quantile(0.75):.1f})"
    )

    # Readmission/death within 7 days after discharge, n (%)
    # Explicit comparison with True so that missing outcomes count as "no event"
    baseline.loc['Readmission/death within 7 days after discharge, n (%)', group] = _format_n_pct(
        select & merged['outcome'].eq(True), total
    )

# Make sure the output folder exists before writing the table
output_dir = os.path.join(project_root, tables_folder, "Combined")
os.makedirs(output_dir, exist_ok=True)
baseline.to_csv(os.path.join(output_dir, "baseline_characteristics.csv"))
baseline