In [1]:
from utility_functions import load_file, pickle_file, starting_run, finished_run
from data_reading_functions import data_dir, read_data, died_reference
import pandas as pd
import numpy as np

In [2]:
# Load the filtered summary table and its companion codes table.
summary_table, codes_table = map(
    load_file,
    ("filtered_dataset.pickle", "filtered_dataset_codes.pickle"),
)
# max_year as integers, keyed by summary_table's index; joined onto the raw
# CORE files below so each record can be compared with its max_year.
index = summary_table.loc[:, "max_year"].astype("int")
# Collects one frame of death rows per (dataset, year) file.
death_records = list()

In [3]:
# Collect, for every dataset/year CORE file, the rows that record a death
# (died == 1) and whose visit_link matches an entry of `index` with
# max_year >= the file's year.
#
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every per-file frame.
death_records = []
for dataset in ["sedd", "sid", "sasd"]:
    for year in ["2016", "2017", "2018"]:
        starting_run(dataset + year)  # progress log line for this file
        data_file = read_data(died_reference[dataset][year], f"MD_{dataset.upper()}_{year}_CORE.asc")
        # The inner join on visit_link keeps only records whose visit_link is
        # present in `index` (presumably the filtered cohort - confirm against
        # how filtered_dataset.pickle is built).
        # The filtered frame is appended directly rather than bound to a
        # temporary called `deaths`: a later cell uses `deaths` for the joined
        # table, and binding then deleting that name here would destroy it if
        # this cell were re-run.
        death_records.append(
            data_file.astype({"died": "int"})
            .join(index, how="inner", on="visit_link")
            .query(f"max_year >={year} and died == 1")
        )
        # Release the raw file before the next one is read.
        del data_file

Starting sedd2016 20:58:00.276124
Starting sedd2017 20:58:24.438037
Starting sedd2018 20:58:47.738795
Starting sid2016 20:59:10.983326
Starting sid2017 20:59:16.862246
Starting sid2018 20:59:22.774629
Starting sasd2016 20:59:30.842946
Starting sasd2017 21:00:08.060825
Starting sasd2018 21:00:41.291958


In [4]:
# Stack the per-file death rows, key them by visit_link, and right-join onto
# summary_table so that every summary row is kept. Column names present on
# both sides get the "_died" suffix on the death-record side. Summary rows
# with no matching death record come out with NaN in `died`, which is
# replaced by 0.
deaths = (
    pd.concat(death_records)
    .set_index("visit_link")
    .join(summary_table, how="right", lsuffix="_died")
    .assign(died=lambda joined: joined["died"].fillna(0))
)

In [5]:
# Load the three filtered CORE tables (SEDD, SASD, SID), in that order.
sedd_core_filtered, sasd_core_filtered, sid_core_filtered = map(
    load_file,
    (
        "sedd_core_filtered.pickle",
        "sasd_core_filtered.pickle",
        "sid_core_filtered.pickle",
    ),
)

In [6]:
# For rows flagged as deaths, attach the matching row from each filtered CORE
# table. Each table is looked up through its own index using the death row's
# record_id. Left joins keep every death row even when a table has no match.
# Column names that clash with ones already present get the per-table suffix.
enriched_deaths = (
    deaths.query("died == 1")
    .join(sedd_core_filtered, on="record_id", how="left", rsuffix="_sedd")
    .join(sasd_core_filtered, on="record_id", how="left", rsuffix="_sasd")
    .join(sid_core_filtered, on="record_id", how="left", rsuffix="_sid")
)

In [7]:
# A death row counts as valid when at least one of the three ICD-10 columns is
# truthy and the death record is not the same record as initial_record_id.
enriched_valid_deaths = (
    enriched_deaths
    .loc[lambda rows: rows[["ICD-10", "ICD-10_sasd", "ICD-10_sid"]].any(axis=1)]
    .query("record_id != initial_record_id")
)

In [8]:
# Remove from the summary every index label whose death row failed the
# validity check (the enriched death rows minus the valid ones). Then drop the
# two helper columns that came from the death records.
fully_filtered_summary = (
    deaths
    .drop(index=enriched_deaths.drop(index=enriched_valid_deaths.index).index)
    .drop(columns=["record_id", "max_year_died"])
)

In [9]:
# Number of death rows that failed the validity check, i.e. the entries
# dropped from the summary. This equals the number of dropped patients only if
# each patient appears once in the index - TODO confirm.
enriched_deaths.drop(index=enriched_valid_deaths.index).index.shape

(8,)

In [10]:
# Persist the filtered summary first, then the codes-table rows for the same
# index labels, so both outputs describe the same set of entries.
pickle_file("fully_filtered_summary.pickle", fully_filtered_summary)
fully_filtered_codes = codes_table.loc[fully_filtered_summary.index]
pickle_file("fully_filtered_codes.pickle", fully_filtered_codes)