In [1]:
import pandas as pd
from data_reading_functions import data_dir, read_data, core_reference, code_lengths
from analysis_variables import procedure_codes, diagnosis_codes, dataset_filtering_function
from utility_functions import pickle_file, starting_run, finished_run

### Data Reading

In [2]:
def split_codes(val, col_name, lengths=None):
    """Split a string of back-to-back codes into fixed-width chunks.

    Parameters
    ----------
    val : str (or any sliceable sequence supporting ``len``)
        Codes concatenated with no separator.
    col_name : hashable
        Key used to look up the chunk width.
    lengths : mapping, optional
        Mapping from ``col_name`` to chunk width. When omitted, the
        module-level ``code_lengths`` mapping is used, which is the
        original behaviour.

    Returns
    -------
    list
        Consecutive slices of ``val`` of the looked-up width, in order.
        The final slice is shorter when ``len(val)`` is not a multiple of
        the width; an empty ``val`` yields ``[]``.

    Raises
    ------
    KeyError
        If ``col_name`` is not present in the mapping.
    ValueError
        If the looked-up width is ``0`` (``range`` rejects a zero step).
    """
    # Look the width up once instead of on every slice.
    width = (code_lengths if lengths is None else lengths)[col_name]
    return [val[start:start + width] for start in range(0, len(val), width)]

def process_dataset(dataset, proc_code_type, years=("2018", "2017", "2016")):
    """Build, filter, code-split and store the combined CORE table for one dataset.

    Steps (each announced through ``starting_run`` / ``finished_run``):

    1. Read ``MD_<DATASET>_<year>_CORE.asc`` for every entry in ``years`` via
       ``read_data``, using ``core_reference[dataset][year]`` as its first
       argument, and stack the results into one frame indexed by ``record_id``.
    2. Pass the frame through ``dataset_filtering_function``.
    3. Replace the ``"ICD-10"`` and ``proc_code_type`` columns with lists of
       fixed-width codes produced by ``split_codes``.
    4. Hand the result to ``pickle_file`` under the name
       ``<dataset>_core_filtered.pickle``.

    Parameters
    ----------
    dataset : str
        Dataset key, used to index ``core_reference`` and, upper-cased, to
        build the input file names.
    proc_code_type : str
        Name of the procedure-code column to split; also forwarded to
        ``dataset_filtering_function``.
    years : iterable of str, optional
        Year keys to read, in stacking order. Defaults to the three years the
        function originally hard-coded.

    Returns
    -------
    None
        The result is only persisted through ``pickle_file``.
    """
    starting_run(f"{dataset} construction")
    yearly_frames = [
        read_data(core_reference[dataset][year], f"MD_{dataset.upper()}_{year}_CORE.asc")
        for year in years
    ]
    # One concat replaces the chained DataFrame.append calls: append was
    # removed in pandas 2.0 and re-copied the accumulated rows on every call.
    dataset_core = pd.concat(yearly_frames, ignore_index=True).set_index("record_id")
    # Release the per-year frames before the remaining steps run.
    del yearly_frames

    starting_run(f"{dataset} filtering")
    dataset_core = dataset_filtering_function(dataset, dataset_core, proc_code_type)

    starting_run(f"{dataset} code splitting")
    for code_column in ("ICD-10", proc_code_type):
        dataset_core[code_column] = dataset_core[code_column].transform(
            split_codes, col_name=code_column
        )

    starting_run(f"{dataset} storage")
    pickle_file(f"{dataset}_core_filtered.pickle", dataset_core)

    finished_run(f"{dataset} creation")

In [3]:
# Build the filtered pickle for each dataset in turn: "sedd" and "sasd" are
# split on the "cpt_codes" column, "sid" on "ICD-10-procedures".
process_dataset(dataset="sedd", proc_code_type="cpt_codes")
process_dataset(dataset="sasd", proc_code_type="cpt_codes")
process_dataset(dataset="sid", proc_code_type="ICD-10-procedures")

Starting sedd construction 13:36:30.937353
Starting sedd filtering 13:59:27.430128
Starting sedd code splitting 14:06:12.038371
Starting sedd storage 14:06:22.745426
Finished sedd creation 14:06:27.354267
Starting sasd construction 14:06:27.359251
Starting sasd filtering 14:29:21.413159
Starting sasd code splitting 14:35:16.428422
Starting sasd storage 14:35:29.561531
Finished sasd creation 14:35:36.835101
Starting sid construction 14:35:36.838907
Starting sid filtering 14:36:24.492628
Starting sid code splitting 14:36:36.431703
Starting sid storage 14:36:38.681687
Finished sid creation 14:36:41.993523
