In [1]:
import pandas as pd
from data_reading_functions import data_dir, read_data, core_reference, code_lengths
from analysis_variables import procedure_codes, diagnosis_codes, dataset_filtering_function
from utility_functions import pickle_file, starting_run, finished_run

### Data Reading

In [2]:
def split_codes(val, col_name):
    """Split a string of back-to-back codes into a list of fixed-width codes.

    Parameters
    ----------
    val : str
        Codes concatenated with no delimiter between them.
    col_name : str
        Key into ``code_lengths``; the mapped value is the width of one code.

    Returns
    -------
    list of str
        Consecutive ``code_lengths[col_name]``-character slices of ``val``, in
        order. The last slice is shorter when ``len(val)`` is not a multiple of
        the width, and an empty ``val`` yields ``[]``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a key of ``code_lengths``.
    """
    # Look the width up once instead of on every slice.
    width = code_lengths[col_name]
    return [val[start:start + width] for start in range(0, len(val), width)]

def process_dataset(dataset, proc_code_type, years=("2018", "2017", "2016")):
    """Build, filter, and store the combined core table for one dataset.

    Steps, each announced through ``starting_run``:

    1. Read one core file per year with ``read_data`` and stack them in the
       order given by ``years``, then index the result by ``record_id``.
    2. Pass the combined table through ``dataset_filtering_function``.
    3. Replace the ``"ICD-10"`` column and the ``proc_code_type`` column with
       the per-row lists produced by ``split_codes``.
    4. Hand the result to ``pickle_file`` under the name
       ``f"{dataset}_core_filtered.pickle"``.

    Parameters
    ----------
    dataset : str
        Key into ``core_reference``; its upper-cased form is also embedded in
        the core file names (``MD_<DATASET>_<year>_CORE.asc``).
    proc_code_type : str
        Name of the procedure-code column. It is forwarded to
        ``dataset_filtering_function`` and used as the ``col_name`` for
        ``split_codes``.
    years : sequence of str, optional
        Years to load, in stacking order. Each must be a key of
        ``core_reference[dataset]``. Defaults to 2018, 2017, 2016.

    Returns
    -------
    None
        The table is stored via ``pickle_file``; nothing is returned.

    Raises
    ------
    ValueError
        From ``pd.concat`` if ``years`` is empty.
    """
    starting_run(f"{dataset} construction")
    # DataFrame.append was removed in pandas 2.0. A single concat is the
    # supported replacement and stacks all years in one pass. The row index is
    # discarded here because set_index("record_id") replaces it anyway.
    dataset_core = pd.concat(
        [
            read_data(core_reference[dataset][year], f"MD_{dataset.upper()}_{year}_CORE.asc")
            for year in years
        ],
        ignore_index=True,
    ).set_index("record_id")

    starting_run(f"{dataset} filtering")
    dataset_core = dataset_filtering_function(dataset_core, proc_code_type)

    starting_run(f"{dataset} code splitting")
    # Both code columns hold concatenated fixed-width codes; turn each cell
    # into a list of individual codes.
    for code_column in ("ICD-10", proc_code_type):
        dataset_core[code_column] = dataset_core[code_column].transform(
            split_codes, col_name=code_column
        )

    starting_run(f"{dataset} storage")
    pickle_file(f"{dataset}_core_filtered.pickle", dataset_core)

    finished_run(f"{dataset} creation")

In [3]:
# Build and store the filtered core table for each dataset in turn. The second
# argument names the procedure-code column that dataset uses.
process_dataset(dataset="sedd", proc_code_type="cpt_codes")
process_dataset(dataset="sasd", proc_code_type="cpt_codes")
process_dataset(dataset="sid", proc_code_type="ICD-10-procedures")

Starting sedd construction 19:58:20.234687
Starting sedd filtering 20:10:12.229199
Starting sedd code splitting 20:15:07.166787
Starting sedd storage 20:15:08.200587
Finished sedd creation 20:15:08.429819
Starting sasd construction 20:15:08.467588
Starting sasd filtering 20:35:17.553379
Starting sasd code splitting 20:41:39.400870
Starting sasd storage 20:41:39.661421
Finished sasd creation 20:41:39.720899
Starting sid construction 20:41:39.735148
Starting sid filtering 20:42:24.865537
Starting sid code splitting 20:42:33.891101
Starting sid storage 20:42:33.906752
Finished sid creation 20:42:33.938005
