In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
from dotenv import load_dotenv
load_dotenv("../.env")  # take environment variables
PROJECT_ROOT = os.environ.get("PROJECT_ROOT")
sys.path.append(PROJECT_ROOT)

import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from datetime import datetime
import random

import utils.PATHS as PATHS
import utils.utils as utils
# import utils.emr_utils as emr_utils
# import utils.load_utils as load_utils

## Construct Date of Birth dictionary

In [None]:
# --- Diagnosis exports: read every CSV under PATHS.DIAGNOSIS and stack them ---
dia_fp_list = glob(os.path.join(PATHS.DIAGNOSIS, "*.csv"))
df_list = []
for path in tqdm(dia_fp_list):
    df_list.append(pd.read_csv(path, low_memory=False))
dia_df = pd.concat(df_list, ignore_index=True)

# --- Problem-list exports: read every CSV under PATHS.PROBLEM_LIST ---
prl_fp_list = glob(os.path.join(PATHS.PROBLEM_LIST, "*.csv"))
df_list = []
for path in tqdm(prl_fp_list):
    temp = pd.read_csv(path, low_memory=False)

    # Drop rows in which every column other than "Visit Date" is missing
    # (original author's note: "address div block at df tail").
    subset = temp.columns.tolist()
    subset.remove("Visit Date")
    temp = temp.dropna(subset=subset, how="all")

    df_list.append(temp)
prl_df = pd.concat(df_list, ignore_index=True)

# --- Lab exports: read every CSV under PATHS.LABS and stack them ---
lab_fp_list = glob(os.path.join(PATHS.LABS, "*.csv"))
df_list = []
for path in tqdm(lab_fp_list):
    df_list.append(pd.read_csv(path, low_memory=False))
lab_df = pd.concat(df_list, ignore_index=True)

100%|█████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.64it/s]
100%|█████████████████████████████████████████████████████████| 8/8 [00:21<00:00,  2.72s/it]
 65%|███████████████████████████████████▌                   | 31/48 [01:05<00:38,  2.24s/it]

In [None]:
# Collect (Patient ID, Date of Birth) pairs from all three sources, then keep
# only distinct pairs in which neither value is missing.
usecols = ["Patient ID", "Date of Birth"]
dob_df = (
    pd.concat(
        [frame[usecols] for frame in (prl_df, dia_df, lab_df)],
        ignore_index=True,
    )
    .drop_duplicates()
    .dropna(how='any')
)

In [None]:
# Sanity check: how many distinct "Date of Birth" values does each patient have?
# Kept so `progress_apply` stays registered on pandas objects for any later use.
tqdm.pandas()

# `nunique` does the per-patient distinct count in one vectorised pass; it replaces
# the former set -> list -> len chain of three Python-level `progress_apply` calls.
# `dropna=False` keeps a missing value counted as its own entry, as `len(set(...))` did.
# Sorted ascending, so any patient with more than one DOB shows up at the tail.
# Original author's recorded observation: NONE, all pids have 1 unique DOB.
(dob_df.groupby("Patient ID")["Date of Birth"]
 .nunique(dropna=False)
).sort_values()


In [None]:
# Patient ID -> Date of Birth lookup. `.first()` keeps one value per patient,
# so a patient with several recorded DOBs would silently keep only the first.
dob_dict = dob_df.groupby('Patient ID')['Date of Birth'].first().to_dict()

In [None]:
# Load the cohort table from <PROJECT_ROOT>/results; the cells below add the
# DOB, history-flag and score columns to this frame.
path = os.path.join(PROJECT_ROOT, "results", "scaled_ldlc_valid_18_nontg_index_nosecondary_final.csv")
ldlc_valid_18_nontg_index_nosecondary_final = pd.read_csv(path, low_memory=False)

In [None]:
# Attach each patient's date of birth; IDs absent from `dob_dict` get None.
ldlc_valid_18_nontg_index_nosecondary_final["Date of Birth"] = (
    ldlc_valid_18_nontg_index_nosecondary_final["Patient ID"].map(dob_dict.get)
)

In [None]:
# Preview the first rows to check that the "Date of Birth" column was attached.
ldlc_valid_18_nontg_index_nosecondary_final.head()

## Premature `CHD`

In [None]:
# Files in <PROJECT_ROOT>/results whose *file name* contains "chd".
# The test is applied to the basename only: checking the full path would select
# every file whenever a parent directory name happens to contain "chd".
paths = [
    path
    for path in glob(os.path.join(PROJECT_ROOT, "results", "*"))
    if "chd" in os.path.basename(path)
]

In [None]:
# Shared schema: source-specific column names -> code / desc / date.
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}

# Read each reference file, normalise its column names and keep the four
# columns used downstream.
df_list = []
for path in paths:
    temp = (
        pd.read_csv(path, low_memory=False)
        .rename(columns=col_rename)
        .loc[:, ["code", "desc", "Score", "Key Term"]]
    )
    df_list.append(temp)

# Stack the reference files and collect the distinct descriptions and codes.
relevant_history = pd.concat(df_list, ignore_index=True)
terms = relevant_history['desc'].unique().tolist()
codes = relevant_history['code'].unique().tolist()

In [None]:
# Columns taken from the problem-list and diagnosis frames.
prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

# Bring both sources onto the shared code / desc / date schema before stacking.
prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

# Both inputs are already renamed, so no second rename is needed after the concat.
history = pd.concat([prl_temp, dia_temp], ignore_index=True)
display(history.head())

# Keep only records whose code is in the reference list. `.copy()` makes the result
# an independent frame, so the later cells that add columns to `history` do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
history = history[history['code'].isin(codes)].copy()
display(history.head())

In [None]:
# Attach date of birth per record; IDs absent from `dob_dict` get None.
# `assign` returns a new frame, so nothing is written into the filtered slice
# that `history` currently refers to (avoids SettingWithCopyWarning).
history = history.assign(dob=history["Patient ID"].map(dob_dict.get))

In [None]:
def get_age(ref, dob, fmt="%Y-%m-%d",):
    """Return the age in completed years at `ref` for someone born on `dob`.

    Parameters
    ----------
    ref, dob : str
        Date strings in the format given by `fmt`.
    fmt : str
        `datetime.strptime` format shared by both dates.

    Returns
    -------
    int or float
        Whole years between the two dates, or `np.nan` when either date is
        missing (None / NaN).

    Raises
    ------
    ValueError
        If a non-missing date string does not match `fmt`.
    """
    # Missing values used to reach `strptime` outside the try block and raise
    # TypeError; they are the case the NaN fallback exists for.
    if pd.isna(ref) or pd.isna(dob):
        return np.nan

    ref = datetime.strptime(ref, fmt)
    dob = datetime.strptime(dob, fmt)

    # Subtract one year when the birthday has not yet occurred in the reference year.
    return ref.year - dob.year - ((ref.month, ref.day) < (dob.month, dob.day))
        
# Age in whole years at each record's date.
# Pairs are read with a plain zip instead of a row-wise DataFrame.apply: an empty
# `history` then yields an empty column (row-wise apply on an empty frame can hand
# back a DataFrame, which cannot be assigned to one column), and `assign` returns a
# new frame instead of writing into a filtered slice.
history = history.assign(
    age=[
        get_age(visit_date, dob, fmt="%Y-%m-%d")
        for visit_date, dob in zip(history['date'], history['dob'])
    ]
)

In [None]:
# Preview the filtered records with the added 'dob' and 'age' columns.
history.head()

In [None]:
# (rows, columns) of the filtered record table.
history.shape

In [None]:
def is_qualified_history(row):
    """Tell whether a record's age is within the cut-off for its gender.

    The cut-off is 55 (inclusive) when `row["Gender"]` equals 'MALE' and
    65 (inclusive) for every other gender value. A NaN age compares as
    False, so such records do not qualify.
    """
    age_limit = 55 if row["Gender"] == 'MALE' else 65
    return row['age'] <= age_limit
        
# Keep only the records whose age passes the gender-specific cut-off.
is_premature = history.apply(is_qualified_history, axis=1)
premature_chd = history[is_premature]

In [None]:
# (rows, columns) of the records that passed the age cut-off.
premature_chd.shape

In [None]:
# Distinct patients having at least one qualifying CHD record.
premature_chd_ids = premature_chd['Patient ID'].unique().tolist()

In [None]:
# Boolean flag on the cohort table: True when the patient is in `premature_chd_ids`.
ldlc_valid_18_nontg_index_nosecondary_final['premature_chd_history'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(premature_chd_ids)

## Premature `CVD`

In [None]:
# Files in <PROJECT_ROOT>/results whose *file name* contains "cvd".
# The test is applied to the basename only: checking the full path would select
# every file whenever a parent directory name happens to contain "cvd".
paths = [
    path
    for path in glob(os.path.join(PROJECT_ROOT, "results", "*"))
    if "cvd" in os.path.basename(path)
]

In [None]:
# Shared schema: source-specific column names -> code / desc / date.
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}

# Read each reference file, normalise its column names and keep the four
# columns used downstream.
df_list = []
for path in paths:
    temp = (
        pd.read_csv(path, low_memory=False)
        .rename(columns=col_rename)
        .loc[:, ["code", "desc", "Score", "Key Term"]]
    )
    df_list.append(temp)

# Stack the reference files and collect the distinct descriptions and codes.
relevant_history = pd.concat(df_list, ignore_index=True)
terms = relevant_history['desc'].unique().tolist()
codes = relevant_history['code'].unique().tolist()

In [None]:
# Columns taken from the problem-list and diagnosis frames.
prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

# Bring both sources onto the shared code / desc / date schema before stacking.
prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

# Both inputs are already renamed, so no second rename is needed after the concat.
history = pd.concat([prl_temp, dia_temp], ignore_index=True)
display(history.head())

# Keep only records whose code is in the reference list. `.copy()` makes the result
# an independent frame, so the later cells that add columns to `history` do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
history = history[history['code'].isin(codes)].copy()
display(history.head())

In [None]:
# Attach date of birth per record; IDs absent from `dob_dict` get None.
# `assign` returns a new frame, so nothing is written into the filtered slice
# that `history` currently refers to (avoids SettingWithCopyWarning).
history = history.assign(dob=history["Patient ID"].map(dob_dict.get))

In [None]:
def get_age(ref, dob, fmt="%Y-%m-%d",):
    """Return the age in completed years at `ref` for someone born on `dob`.

    Parameters
    ----------
    ref, dob : str
        Date strings in the format given by `fmt`.
    fmt : str
        `datetime.strptime` format shared by both dates.

    Returns
    -------
    int or float
        Whole years between the two dates, or `np.nan` when either date is
        missing (None / NaN).

    Raises
    ------
    ValueError
        If a non-missing date string does not match `fmt`.
    """
    # Missing values used to reach `strptime` outside the try block and raise
    # TypeError; they are the case the NaN fallback exists for.
    if pd.isna(ref) or pd.isna(dob):
        return np.nan

    ref = datetime.strptime(ref, fmt)
    dob = datetime.strptime(dob, fmt)

    # Subtract one year when the birthday has not yet occurred in the reference year.
    return ref.year - dob.year - ((ref.month, ref.day) < (dob.month, dob.day))
        
# Age in whole years at each record's date.
# Pairs are read with a plain zip instead of a row-wise DataFrame.apply: an empty
# `history` then yields an empty column (row-wise apply on an empty frame can hand
# back a DataFrame, which cannot be assigned to one column), and `assign` returns a
# new frame instead of writing into a filtered slice.
history = history.assign(
    age=[
        get_age(visit_date, dob, fmt="%Y-%m-%d")
        for visit_date, dob in zip(history['date'], history['dob'])
    ]
)

In [None]:
# Preview the filtered records with the added 'dob' and 'age' columns.
history.head()

In [None]:
# (rows, columns) of the filtered record table.
history.shape

In [None]:
def is_qualified_history(row):
    """Tell whether a record's age is within the cut-off for its gender.

    The cut-off is 55 (inclusive) when `row["Gender"]` equals 'MALE' and
    65 (inclusive) for every other gender value. A NaN age compares as
    False, so such records do not qualify.
    """
    age_limit = 55 if row["Gender"] == 'MALE' else 65
    return row['age'] <= age_limit
        
# Keep only the records whose age passes the gender-specific cut-off.
is_premature = history.apply(is_qualified_history, axis=1)
premature_cvd = history[is_premature]

In [None]:
# (rows, columns) of the records that passed the age cut-off.
premature_cvd.shape

In [None]:
# Distinct patients having at least one qualifying CVD record.
premature_cvd_ids = premature_cvd['Patient ID'].unique().tolist()

In [None]:
# Boolean flag on the cohort table: True when the patient is in `premature_cvd_ids`.
ldlc_valid_18_nontg_index_nosecondary_final['premature_cvd_history'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(premature_cvd_ids)

## `xanthomas` Physical Examination

In [None]:
# Files in <PROJECT_ROOT>/results whose *file name* contains "xanthomas".
# The test is applied to the basename only: checking the full path would select
# every file whenever a parent directory name happens to contain "xanthomas".
paths = [
    path
    for path in glob(os.path.join(PROJECT_ROOT, "results", "*"))
    if "xanthomas" in os.path.basename(path)
]

In [None]:
# Shared schema: source-specific column names -> code / desc / date.
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}

# Read each xanthomas reference file, skipping files that contain no rows.
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    if temp.empty:
        continue
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

if df_list:
    relevant_history = pd.concat(df_list, ignore_index=True)
    terms = relevant_history['desc'].unique().tolist()
    codes = relevant_history['code'].unique().tolist()

    prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
    dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

    prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
    dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

    # Both inputs are already renamed, so no second rename is needed after the concat.
    history = pd.concat([prl_temp, dia_temp], ignore_index=True)
    display(history.head())

    # Keep only the records whose code is in the reference list.
    xanthomas_reference = history[history['code'].isin(codes)]
    # Show the filtered rows; this call previously re-displayed the unfiltered `history`.
    display(xanthomas_reference.head())
    xanthomas_reference_ids = xanthomas_reference['Patient ID'].unique().tolist()
else:
    # No non-empty reference file: no patient is flagged.
    xanthomas_reference_ids = []

In [None]:
# Boolean flag on the cohort table: True when the patient is in `xanthomas_reference_ids`.
ldlc_valid_18_nontg_index_nosecondary_final['has_xanthomas'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(xanthomas_reference_ids)

## `corneal_arcus` Physical Examination

In [None]:
# Files in <PROJECT_ROOT>/results whose *file name* contains "corneal_arcus".
# The test is applied to the basename only: checking the full path would select
# every file whenever a parent directory name happens to contain "corneal_arcus".
paths = [
    path
    for path in glob(os.path.join(PROJECT_ROOT, "results", "*"))
    if "corneal_arcus" in os.path.basename(path)
]

In [None]:
# Shared schema: source-specific column names -> code / desc / date.
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}

# Read each corneal-arcus reference file, skipping files that contain no rows.
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    if temp.empty:
        continue
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

if df_list:
    relevant_history = pd.concat(df_list, ignore_index=True)
    terms = relevant_history['desc'].unique().tolist()
    codes = relevant_history['code'].unique().tolist()

    prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
    dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

    prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
    dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

    # Both inputs are already renamed, so no second rename is needed after the concat.
    history = pd.concat([prl_temp, dia_temp], ignore_index=True)
    display(history.head())

    # Keep only the records whose code is in the reference list.
    corneal_arcus_reference = history[history['code'].isin(codes)]
    # Show the filtered rows; this call previously re-displayed the unfiltered `history`.
    display(corneal_arcus_reference.head())
    corneal_arcus_reference_ids = corneal_arcus_reference['Patient ID'].unique().tolist()
else:
    # No non-empty reference file: no patient is flagged.
    corneal_arcus_reference_ids = []

In [None]:
# Boolean flag on the cohort table: True when the patient is in `corneal_arcus_reference_ids`.
ldlc_valid_18_nontg_index_nosecondary_final['has_corneal_arcus'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(corneal_arcus_reference_ids)

## Scoring

In [None]:
def score(row):
    """Compute the point total for one cohort row.

    LDL-C points come from `row['Scaled LDL-C Max']`:
    8 for >= 8.40, 5 for [6.49, 8.40), 3 for [4.93, 6.49), 1 below 4.93.
    Each truthy flag then adds: premature_chd_history +2,
    premature_cvd_history +1, has_corneal_arcus +4, has_xanthomas +6.

    NOTE(review): every value below 4.93 scores 1, with no lower bound —
    confirm the cohort is pre-filtered if a floor is intended.

    Raises
    ------
    ValueError
        When the LDL-C value satisfies none of the comparisons (e.g. NaN).
    """
    ldl = row['Scaled LDL-C Max']

    # Bands are tested from the highest down, so each branch only needs its lower bound.
    if ldl >= 8.40:
        total = 8
    elif ldl >= 6.49:
        total = 5
    elif ldl >= 4.93:
        total = 3
    elif ldl < 4.93:
        total = 1
    else:
        raise ValueError("Unable to handle value.")

    flag_points = (
        ('premature_chd_history', 2),
        ('premature_cvd_history', 1),
        ('has_corneal_arcus', 4),
        ('has_xanthomas', 6),
    )
    for flag, points in flag_points:
        if row[flag]:
            total += points
    return total

def segment(score):
    """Map a point total to its category label.

    Returns "Control" for score <= 2, "Unknown" for 2 < score < 6 and
    "Cases" for score >= 6.

    Raises
    ------
    ValueError
        When the score satisfies none of the comparisons (e.g. NaN).
    """
    if score <= 2:
        return "Control"
    if score < 6.0:  # reached only when score > 2, i.e. [3, 5] for integer totals
        return "Unknown"
    if score >= 6.0:
        return "Cases"
    raise ValueError("Unable to handle score.")


In [None]:
# Show the True/False distribution of each flag column before scoring.
usecols = ['premature_chd_history', 'premature_cvd_history', 'has_corneal_arcus', 'has_xanthomas']
for col in usecols:
    flag_counts = ldlc_valid_18_nontg_index_nosecondary_final[col].value_counts()
    display(flag_counts)

In [None]:
# Row-wise point total, written to the "Score" column.
ldlc_valid_18_nontg_index_nosecondary_final["Score"] = ldlc_valid_18_nontg_index_nosecondary_final.apply(score, axis=1)
# Category label derived from each total ("Cases" / "Unknown" / "Control").
ldlc_valid_18_nontg_index_nosecondary_final["Stage2_Category"] = ldlc_valid_18_nontg_index_nosecondary_final["Score"].apply(segment)
# Cell output: number of patients per category.
ldlc_valid_18_nontg_index_nosecondary_final["Stage2_Category"].value_counts()

## End.