In [1]:
import pandas as pd
import numpy as np
from utility_functions import load_file, pickle_file, starting_run, finished_run
from data_reading_functions import code_lengths
from analysis_variables import procedure_codes, data_enrichment_function

### Imports & File Loading

In [2]:
# Load the pre-filtered core tables produced by earlier steps of the pipeline.
sedd_core_filtered = load_file("sedd_core_filtered.pickle")
sasd_core_filtered = load_file("sasd_core_filtered.pickle")
sid_core_filtered = load_file("sid_core_filtered.pickle")
# Note: the ED-admission table is read from the "sedd_appendix" pickle.
sid_ed_admissions = load_file("sedd_appendix.pickle")

# Blank placeholder for each code type: a run of spaces exactly as wide as that
# code's length. Built by multiplication rather than by slicing a fixed-width
# literal, so code types wider than the literal still get a full-width
# placeholder instead of a silently truncated one.
null_codes = {name: " " * length for name, length in code_lengths.items()}

### Utility Functions

In [3]:
def create_linker_table(sedd, sid_ed, sid):
    """Build a table with one row per ``visit_link`` describing the initial ED visit.

    Parameters
    ----------
    sedd : pandas.DataFrame
        ED visits; must carry ``visit_link``, ``year`` and the attribute
        columns listed below. Assumed to be indexed by ``record_id``
        (``reset_index`` must yield a ``record_id`` column).
    sid_ed : pandas.DataFrame
        ED visits that led to an admission; must carry ``visit_link`` and
        ``year``, with the same ``record_id`` index assumption.
    sid : pandas.DataFrame
        Inpatient records; supplies the attribute columns for ``sid_ed`` rows,
        looked up by ``record_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``visit_link`` with ``initial_record_id``, ``initial_year``,
        ``initial_discharge_quarter``, ``max_year`` and the attribute columns.
        Only rows with ``initial_year`` before "2018" and a positive integer
        ``visit_link`` are kept.

    Raises
    ------
    ValueError / TypeError
        From ``astype`` if ``age``, ``female``, ``homeless`` or
        ``median_zip_income`` contain values that cannot be cast to int
        (missing values included).
    """
    attribute_columns = [
        "age", "female", "homeless", "race", "married",
        "median_zip_income", "payer", "discharge_quarter",
    ]

    def _initial_visits(dataset, attribute_source):
        # NOTE(review): ``min`` is taken independently for record_id and year,
        # so the two values describe the same visit only if record ids grow
        # with time -- confirm against the data.
        earliest = dataset.reset_index().groupby("visit_link")[["record_id", "year"]].min()
        return earliest.join(attribute_source[attribute_columns], on="record_id").rename(columns={
            "record_id": "initial_record_id",
            "year": "initial_year",
            "discharge_quarter": "initial_discharge_quarter"
        })

    # Init linker_table with year and record_id of the initial ED visit.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    candidates = pd.concat([
        _initial_visits(sedd, sedd),
        _initial_visits(sid_ed, sid),
    ])
    # When a visit_link appears in both sources, keep the chronologically first row.
    linker_table = (
        candidates
        .sort_values(["initial_year", "initial_discharge_quarter"])
        .reset_index()
        .drop_duplicates("visit_link", keep="first")
        .set_index("visit_link")
    )
    # Years are compared as strings. Copy so the column assignment below
    # writes to an independent frame rather than a slice.
    linker_table = linker_table[linker_table["initial_year"] < "2018"].copy()

    # Add max_year based on initial_year (assume following for 1 year).
    linker_table["max_year"] = (linker_table["initial_year"].astype(int) + 1).astype(str)

    linker_table = linker_table.astype({
        "age": "int",
        "female": "int",
        "homeless": "int",
        "median_zip_income": "int"
    })

    # Drop rows whose visit_link is not a positive integer.
    return linker_table.loc[linker_table.index.astype("int") > 0, :]

In [4]:
def censor_first_6_mos(linker_table):
    """Drop initial visits from quarters 1-2 of the earliest year in the table.

    A row is kept when its ``initial_year`` is later than the smallest
    ``initial_year`` present, or when its ``initial_discharge_quarter`` sorts
    after '2'. Both comparisons are made on string values.
    """
    earliest_year = linker_table["initial_year"].min()
    after_earliest_year = linker_table["initial_year"] > f"{earliest_year}"
    in_second_half = linker_table["initial_discharge_quarter"] > "2"
    return linker_table[after_earliest_year | in_second_half]

In [5]:
def filter_data_on_year(sedd, sasd, sid, sid_ed, linker_table):
    """Restrict each dataset to visits inside the one-year follow-up window.

    For every ``visit_link`` the window starts at the initial visit
    (``initial_year``, ``initial_discharge_quarter``) and ends at the same
    quarter of ``max_year``. A visit is kept when it falls

    * in the initial year, in the initial quarter or later, or
    * in ``max_year``, in the initial quarter or earlier.

    The previous version had both quarter comparisons inverted: it kept
    visits before the initial visit and visits more than a year after it,
    and dropped later-quarter visits in the initial year.

    Visits whose ``visit_link`` is absent from ``linker_table`` are dropped.

    Parameters
    ----------
    sedd, sasd, sid, sid_ed : pandas.DataFrame
        Each must carry ``visit_link``, ``year`` and ``discharge_quarter``.
    linker_table : pandas.DataFrame
        Indexed by ``visit_link`` with ``initial_year``, ``max_year`` and
        ``initial_discharge_quarter``.

    Returns
    -------
    generator of pandas.DataFrame
        The filtered frames, in the order ``sedd, sasd, sid, sid_ed``.
    """
    window_columns = linker_table[["initial_year", "max_year", "initial_discharge_quarter"]]

    def _within_follow_up(dataset):
        visits = dataset[["visit_link", "year", "discharge_quarter"]].join(
            window_columns, on="visit_link"
        )
        in_initial_year = (visits["year"] == visits["initial_year"]) & (
            visits["discharge_quarter"] >= visits["initial_discharge_quarter"]
        )
        in_following_year = (visits["year"] == visits["max_year"]) & (
            visits["discharge_quarter"] <= visits["initial_discharge_quarter"]
        )
        return dataset.loc[visits[in_initial_year | in_following_year].index]

    return (_within_follow_up(dataset) for dataset in [sedd, sasd, sid, sid_ed])

In [6]:
def count_followed_visits(sedd, sasd, sid, sid_ed, linker_table):
    """Append per-``visit_link`` visit counts to the linker table.

    Adds three columns:

    * ``ED_revisits``      - rows in ``sedd`` plus rows in ``sid_ed``, minus one
    * ``surgery_visits``   - rows in ``sasd``
    * ``inpatient_visits`` - rows in ``sid``

    Rows are counted through their non-null ``year`` value. The closing
    ``fillna(0)`` is applied to the whole joined frame, so a missing value in
    any pre-existing ``linker_table`` column is replaced by 0 as well.
    """
    def visits_per_link(visits, label):
        linked = visits.join(linker_table, on="visit_link", rsuffix="_x")
        per_link = linked.groupby("visit_link").count()
        return per_link["year"].rename(label)

    ed_from_sedd = visits_per_link(sedd, "ED_revisits")
    ed_from_sid = visits_per_link(sid_ed, "ED_revisits")
    # Subtract one from the combined ED total.
    # NOTE(review): presumably this removes the initial visit itself -- confirm.
    ed_revisits = ed_from_sedd.add(ed_from_sid, fill_value=0).sub(1)
    surgery_visits = visits_per_link(sasd, "surgery_visits")
    inpatient_visits = visits_per_link(sid, "inpatient_visits")

    with_counts = linker_table.join(ed_revisits)
    with_counts = with_counts.join(surgery_visits)
    with_counts = with_counts.join(inpatient_visits)
    return with_counts.fillna(0)

In [7]:
def create_code_lookup_table(sedd, sasd, sid, linker_table):
    """Build a long table of codes per ``visit_link``.

    Three kinds of codes are collected and stacked:

    * ``cpt_codes`` from every ``sedd`` and ``sasd`` row whose ``visit_link``
      is in ``linker_table`` (flagged with ``cpt_flag``);
    * ``ICD-10`` codes from the initial record only, i.e. the ``sedd`` / ``sid``
      row matching ``initial_record_id`` (flagged with ``icd_flag``);
    * ``ICD-10-procedures`` from every ``sid`` row whose ``visit_link`` is in
      ``linker_table`` (flagged with ``icd_proc_flag``).

    Values equal to the blank placeholder in the module-level ``null_codes``
    for the given code type are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per code with a string ``codes`` column and the three flag
        columns; a flag is True for rows of its own kind and missing otherwise.
    """
    def preprocess_dataset_on_init_chart(dataset):
        # Match on the initial record only. The linker index is dropped so that
        # the grouping key is the dataset's own "visit_link" column.
        return linker_table.join(dataset, on="initial_record_id", how="inner", rsuffix="_x").reset_index(drop=True).groupby("visit_link")

    def preprocess_dataset(dataset):
        # Keep every visit whose visit_link is present in the linker table.
        return dataset.join(linker_table, on="visit_link", how="inner", rsuffix="_x").groupby("visit_link")

    def postprocess_dataset(grouped, code_type):
        # NOTE(review): presumably each cell holds a list of codes, so summing a
        # group concatenates the lists before explode() -- confirm.
        # The "sum" alias is what agg(np.sum) has been dispatching to; pandas
        # recommends the alias to keep that behaviour.
        codes = (
            grouped[code_type]
            .agg("sum")
            .explode()
            .replace(null_codes[code_type], np.nan)
            .dropna()
            .rename("codes")
            .astype("str")
        )
        return pd.DataFrame(codes)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    cpt_codes = pd.concat([
        postprocess_dataset(preprocess_dataset(sedd), 'cpt_codes'),
        postprocess_dataset(preprocess_dataset(sasd), 'cpt_codes'),
    ])
    cpt_codes['cpt_flag'] = True

    icd_codes = pd.concat([
        postprocess_dataset(preprocess_dataset_on_init_chart(sedd), 'ICD-10'),
        postprocess_dataset(preprocess_dataset_on_init_chart(sid), 'ICD-10'),
    ])
    icd_codes['icd_flag'] = True

    icd_proc_codes = postprocess_dataset(preprocess_dataset(sid), 'ICD-10-procedures')
    icd_proc_codes['icd_proc_flag'] = True

    return pd.concat([cpt_codes, icd_codes, icd_proc_codes])

In [8]:
def process_datasets(sedd, sasd, sid, sid_ed):
    """Run the full cohort-building pipeline.

    Steps, in order: build the linker table, censor it, restrict all four
    datasets to the follow-up window, add visit counts, apply
    ``data_enrichment_function``, and build the code lookup table.

    Returns
    -------
    tuple
        ``(linker_table, codes, sid)`` where ``sid`` is the window-filtered
        inpatient dataset.
    """
    cohort = censor_first_6_mos(create_linker_table(sedd, sid_ed, sid))

    sedd_followed, sasd_followed, sid_followed, sid_ed_followed = filter_data_on_year(
        sedd, sasd, sid, sid_ed, cohort
    )
    followed = (sedd_followed, sasd_followed, sid_followed, sid_ed_followed)

    cohort = count_followed_visits(*followed, cohort)
    cohort = data_enrichment_function(*followed, cohort)

    code_table = create_code_lookup_table(sedd_followed, sasd_followed, sid_followed, cohort)
    return cohort, code_table, sid_followed

### Main Code

In [9]:
# Build the linker table, the code lookup table and the filtered inpatient data.
starting_run("process full datasets")
filtered_dataset, filtered_dataset_codes, sid_filtered = process_datasets(
    sedd=sedd_core_filtered,
    sasd=sasd_core_filtered,
    sid=sid_core_filtered,
    sid_ed=sid_ed_admissions,
)

# Persist the three results for the downstream notebooks.
starting_run("store datasets")
pickle_file("filtered_dataset.pickle", filtered_dataset)
pickle_file("filtered_dataset_codes.pickle", filtered_dataset_codes)
pickle_file("filtered_sid_data.pickle", sid_filtered)
finished_run()

Starting process full datasets 20:57:43.701644
Starting store datasets 20:57:44.935515
Finished  20:57:44.951134
