# Diagnosis Filtering

This notebook covers the creation of references for the following Stage 1 filtering criteria:
1. `Diagnosis` `List` Patients with secondary causes recorded within 1 year (+/-) of the Index Date

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import sys
from glob import glob
from tqdm import tqdm
REPO_DIR = "/Users/mdorosan/Desktop/2024/elpha-singhealth-fh"
sys.path.append(REPO_DIR)
from datetime import datetime

import utils.PATHS as PATHS
import utils.utils as utils
# import utils.emr_utils as emr_utils
# import utils.load_utils as load_utils

  from pandas.core import (


In [3]:
# Locate every CSV export in the configured diagnosis directory.
dia_fp_list = glob(os.path.join(PATHS.DIAGNOSIS, "*.csv"))

# Read each diagnosis export (tqdm shows progress) and stack them into one
# diagnosis dataframe with a fresh, contiguous index.
df_list = []
for dia_fp in tqdm(dia_fp_list):
    df_list.append(pd.read_csv(dia_fp, low_memory=False))
dia_df = pd.concat(df_list, ignore_index=True)

100%|█████████████████████████████████████████████| 4/4 [00:00<00:00,  6.64it/s]


In [4]:
# Show the columns available in the combined diagnosis dataframe.
dia_df.columns

Index(['Institution Code', 'Patient ID', 'Gender', 'Race', 'Nationality',
       'Date of Birth', 'Resident Indicator', 'Death Date', 'Diagnosis Code',
       'Diagnosis Description', 'Diagnosis Catalogue Code',
       'Diagnosis Code (ICD10)', 'Diagnosis Description (ICD10)',
       'ILD Indicator (ICD10)', 'Diagnosis Date', 'Case No', 'eHIntS Case Id',
       'Visit No', 'Admit/Visit Date', 'ICD10 Diagnosis Type',
       'ICD10 Diagnosis Type2', 'Diagnosis Creation Date',
       'Diagnosis Catalog Code', 'Diagnosis Catalogue Code.1',
       'Diagnosis Catalogue Text Code', 'Diagnosis Catalogue Text Description',
       'Reference Catalog Code', 'Reference Catalog Description', 'BMI',
       'Height', 'Weight'],
      dtype='object')

In [5]:
# Locate every CSV export in the configured problem-list directory.
prl_fp_list = glob(os.path.join(PATHS.PROBLEM_LIST, "*.csv"))

df_list = []
for prl_fp in tqdm(prl_fp_list):
    prl_raw = pd.read_csv(prl_fp, low_memory=False)

    # Drop rows in which every column other than "Visit Date" is missing.
    # The original note describes these as a "div block" at the tail of the
    # dataframe (presumably a footer/divider in the export -- TODO confirm).
    non_date_cols = list(prl_raw.columns)
    non_date_cols.remove("Visit Date")
    df_list.append(prl_raw.dropna(how="all", subset=non_date_cols))

# Stack the cleaned exports into one problem-list dataframe.
prl_df = pd.concat(df_list, ignore_index=True)

100%|█████████████████████████████████████████████| 4/4 [00:06<00:00,  1.54s/it]


In [6]:
# Show the columns available in the combined problem-list dataframe.
prl_df.columns

Index(['Visit Date', 'Institution', 'Case No', 'Visit No', 'Patient ID',
       'Date of Birth', 'Race', 'Gender', 'Resident Indicator', 'Nationality',
       'Problem Serial No', 'Problem Summary', 'Problem Code (Coded)',
       'Problem Desc (Coded)', 'Coding Scheme (Coded)',
       'Problem Shortname (Coded)', 'Problem Status', 'Problem Onset (Day)',
       'Problem Onset (Month)', 'Problem Onset (Year)', 'Created Date',
       'Entered Date', 'Resolved Date', 'Updated Date', 'Smoking',
       'Smoking History', 'Smoking History.1'],
      dtype='object')

In [7]:
# load ldlc for index date reference
path = os.path.join(REPO_DIR, "results", "ldlc_valid_18_nontg_index.csv")
ldlc_valid_18_nontg_index = pd.read_csv(path)

# prep datetime cols
ldlc_valid_18_nontg_index["Index Date"] = pd.to_datetime(
    ldlc_valid_18_nontg_index["Index Date"]
)

In [9]:
def check_secondary_diagnosis(
    row,
    date_col,
    code_col,
    ref_df,
    days_before=365,
    days_after=0,
):
    """Return whether a patient has a coded secondary cause near the index date.

    Designed for row-wise use, e.g.
    ``df.apply(check_secondary_diagnosis, args=(date_col, code_col, ref_df), axis=1)``.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``"Patient ID"`` and ``"Index Date"`` (a timestamp).
    date_col : str
        Column of ``ref_df`` holding the record date.
    code_col : str
        Column of ``ref_df`` holding the diagnosis / problem code.
    ref_df : pandas.DataFrame
        Reference records with a ``"Patient ID"`` column. Side effect:
        ``ref_df[date_col]`` is converted to datetime in place the first time
        it is seen with a non-datetime dtype.
    days_before : int, default 365
        How many days before the index date a record may lie (inclusive).
    days_after : int, default 0
        How many days after the index date a record may lie (inclusive).
        The default keeps the original look-back-only window. The criterion
        stated at the top of this notebook is "within 1 year (+/-)", so pass
        ``days_after=365`` to cover both sides of the index date.

    Returns
    -------
    bool
        True when at least one record in the window carries a code listed in
        ``utils.secondary_dia_icd10`` or ``utils.secondary_dia_snomed``.
    """
    pid = row["Patient ID"]
    index_date = row["Index Date"]

    # Parse the date column once. This function runs for every cohort row, so
    # re-parsing and re-assigning the whole column on each call is wasted work.
    if not pd.api.types.is_datetime64_any_dtype(ref_df[date_col]):
        ref_df[date_col] = pd.to_datetime(ref_df[date_col])

    pid_dia = ref_df[ref_df["Patient ID"] == pid]

    # Positive elapsed time means the record precedes the index date.
    elapsed = index_date - pid_dia[date_col]
    in_window = (
        (elapsed >= pd.Timedelta(days=-days_after))
        & (elapsed <= pd.Timedelta(days=days_before))
    )
    window_codes = pid_dia.loc[in_window, code_col]
    if window_codes.empty:
        return False

    # TODO: split the SNOMED and ICD10 searches so ICD10 codes can be
    # matched by prefix instead of exact equality.
    secondary_codes = utils.secondary_dia_icd10 + utils.secondary_dia_snomed
    return bool(window_codes.isin(secondary_codes).any())

## SNOMED-coded Problem List: Using Codes

In [11]:
# Register ``progress_apply`` on pandas objects.
tqdm.pandas()

date_col = "Visit Date"
code_col = "Problem Code (Coded)"

# Evaluate every cohort row against the problem-list records, then keep the
# rows for which the code-based check reports a secondary cause.
prl_code_hits = ldlc_valid_18_nontg_index.progress_apply(
    check_secondary_diagnosis,
    axis=1,
    args=(date_col, code_col, prl_df),
)
with_secondary_prl = ldlc_valid_18_nontg_index[prl_code_hits]

100%|███████████████████████████████████| 24122/24122 [9:42:58<00:00,  1.45s/it]


In [22]:
# Number of distinct patients flagged through problem-list codes.
with_secondary_prl["Patient ID"].unique().size

3

## ICD10-coded Diagnosis: Using Codes

In [12]:

date_col = "Admit/Visit Date"
code_col = "Diagnosis Code (ICD10)"

# Evaluate every cohort row against the diagnosis records, then keep the rows
# for which the code-based check reports a secondary cause.
dia_code_hits = ldlc_valid_18_nontg_index.progress_apply(
    check_secondary_diagnosis,
    axis=1,
    args=(date_col, code_col, dia_df),
)
with_secondary_dia = ldlc_valid_18_nontg_index.loc[dia_code_hits]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 24122/24122 [06:12<00:00, 64.68it/s]


In [21]:
# Number of distinct patients flagged through ICD10 diagnosis codes.
with_secondary_dia["Patient ID"].unique().size

0

### SNOMED: Using keywords

In [13]:
def check_secondary_diagnosis(
    row,
    date_col,
    desc_col,
    ref_df,
    days_before=365,
    days_after=0,
):
    """Return whether a patient has a keyword-matched secondary cause near the index date.

    Keyword-based variant: it matches free-text descriptions against
    ``utils.secondary_pregnancy_terms`` instead of comparing codes. It reuses
    the name of the code-based helper defined earlier in this notebook, so
    cells run after this definition use the keyword logic.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``"Patient ID"`` and ``"Index Date"`` (a timestamp).
    date_col : str
        Column of ``ref_df`` holding the record date.
    desc_col : str
        Column of ``ref_df`` holding the textual description to search.
    ref_df : pandas.DataFrame
        Reference records with a ``"Patient ID"`` column. Side effect:
        ``ref_df[date_col]`` is converted to datetime in place the first time
        it is seen with a non-datetime dtype.
    days_before : int, default 365
        How many days before the index date a record may lie (inclusive).
    days_after : int, default 0
        How many days after the index date a record may lie (inclusive).
        The default keeps the original look-back-only window. The criterion
        stated at the top of this notebook is "within 1 year (+/-)", so pass
        ``days_after=365`` to cover both sides of the index date.

    Returns
    -------
    bool
        True when at least one description in the window contains any of the
        terms (case-insensitive; terms are joined with ``|`` and therefore
        interpreted as a regular expression).
    """
    pid = row["Patient ID"]
    index_date = row["Index Date"]

    # Parse the date column once. This function runs for every cohort row, so
    # re-parsing and re-assigning the whole column on each call is wasted work.
    if not pd.api.types.is_datetime64_any_dtype(ref_df[date_col]):
        ref_df[date_col] = pd.to_datetime(ref_df[date_col])

    pid_dia = ref_df[ref_df["Patient ID"] == pid]

    # Positive elapsed time means the record precedes the index date.
    elapsed = index_date - pid_dia[date_col]
    in_window = (
        (elapsed >= pd.Timedelta(days=-days_after))
        & (elapsed <= pd.Timedelta(days=days_before))
    )
    window_desc = pid_dia.loc[in_window, desc_col]
    if window_desc.empty:
        return False

    pattern = "|".join(utils.secondary_pregnancy_terms)
    if not pattern:
        # An empty regex matches every string and would flag every patient.
        return False

    # na=False: a missing description counts as "no match" rather than NaN.
    return bool(window_desc.str.contains(pattern, case=False, na=False).any())

In [14]:

date_col = "Visit Date"
desc_col = "Problem Desc (Coded)"

# Evaluate every cohort row against the problem-list descriptions, then keep
# the rows for which the keyword-based check reports a secondary cause.
prl_keyword_hits = ldlc_valid_18_nontg_index.progress_apply(
    check_secondary_diagnosis,
    axis=1,
    args=(date_col, desc_col, prl_df),
)
with_secondary_prl_keywords = ldlc_valid_18_nontg_index.loc[prl_keyword_hits]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 24122/24122 [34:29<00:00, 11.66it/s]


In [17]:
# Number of distinct patients flagged through problem-list description keywords.
with_secondary_prl_keywords["Patient ID"].unique().size

17

### ICD10: Using keywords

In [15]:

date_col = "Admit/Visit Date"
desc_col = "Diagnosis Description (ICD10)"

# Evaluate every cohort row against the ICD10 diagnosis descriptions, then
# keep the rows for which the keyword-based check reports a secondary cause.
dia_keyword_hits = ldlc_valid_18_nontg_index.progress_apply(
    check_secondary_diagnosis,
    axis=1,
    args=(date_col, desc_col, dia_df),
)
with_secondary_dia_keywords = ldlc_valid_18_nontg_index.loc[dia_keyword_hits]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 24122/24122 [05:02<00:00, 79.63it/s]


In [19]:
# Number of distinct patients flagged through ICD10 description keywords.
with_secondary_dia_keywords["Patient ID"].unique().size

1

## End