In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
from dotenv import load_dotenv
load_dotenv("../.env")  # take environment variables
PROJECT_ROOT = os.environ.get("PROJECT_ROOT")
sys.path.append(PROJECT_ROOT)

import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from datetime import datetime
import random

import utils.PATHS as PATHS
import utils.utils as utils
# import utils.emr_utils as emr_utils
# import utils.load_utils as load_utils

## Construct Date of Birth dictionary

In [3]:
dia_fp_list = glob(os.path.join(PATHS.DIAGNOSIS, "*.csv"))

# general labs dataframe
df_list = [pd.read_csv(path, low_memory=False) for path in tqdm(dia_fp_list)]
dia_df = pd.concat(df_list, ignore_index=True)

prl_fp_list = glob(os.path.join(PATHS.PROBLEM_LIST, "*.csv"))

df_list = []
for path in tqdm(prl_fp_list):
    temp = pd.read_csv(path, low_memory=False)

    # address div block at df tail
    subset = temp.columns.tolist()
    subset.remove("Visit Date")
    temp = temp.dropna(subset=subset, how="all")

    df_list.append(temp)
    
prl_df = pd.concat(df_list, ignore_index=True)


lab_fp_list = glob(os.path.join(PATHS.LABS, "*.csv"))

# general labs dataframe
df_list = [pd.read_csv(path, low_memory=False) for path in tqdm(lab_fp_list)]
lab_df = pd.concat(df_list, ignore_index=True)

100%|█████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.64it/s]
100%|█████████████████████████████████████████████████████████| 8/8 [00:21<00:00,  2.72s/it]
100%|███████████████████████████████████████████████████████| 48/48 [01:44<00:00,  2.18s/it]


In [4]:
usecols = ["Patient ID", "Date of Birth"]
dob_df = pd.concat([prl_df[usecols], dia_df[usecols], lab_df[usecols]], ignore_index=True)
dob_df = dob_df.drop_duplicates()
dob_df = dob_df.dropna(how='any')

In [5]:
# sanity checks : nans and ambiguities per patient
tqdm.pandas()

# multiple dob per patient (including nan) : NONE, all pids have 1 unique DOB
(dob_df.groupby("Patient ID")["Date of Birth"]
 .progress_apply(set)
 .progress_apply(list)
 .progress_apply(lambda x: len(x))
).sort_values()


100%|████████████████████████████████████████████| 532284/532284 [00:08<00:00, 60173.08it/s]
100%|███████████████████████████████████████████| 532284/532284 [00:00<00:00, 740809.86it/s]
100%|██████████████████████████████████████████| 532284/532284 [00:00<00:00, 1361628.15it/s]


Patient ID
aab81cff7d5058c22fe8    1
aab6b4e9f47611161755    1
aab7053f4d6faa0b10e7    1
aab71b6a4980cf082a2d    1
aab727071631e1bb66cb    1
                       ..
5515d18bbaf283373cbe    1
5515bf6e46d09d0ba63b    1
55157f9db1ab8b1d5778    1
551531a6e22696a77a07    1
926d6b2cb5a8067430b5    2
Name: Date of Birth, Length: 532284, dtype: int64

In [6]:
dob_dict = dob_df.groupby('Patient ID')['Date of Birth'].first().to_dict()

In [7]:
path = os.path.join(PROJECT_ROOT, "results", "scaled_ldlc_valid_18_nontg_index_nosecondary_final.csv")
ldlc_valid_18_nontg_index_nosecondary_final = pd.read_csv(path, low_memory=False)

In [8]:
# add dob
ldlc_valid_18_nontg_index_nosecondary_final["Date of Birth"] = ldlc_valid_18_nontg_index_nosecondary_final["Patient ID"].apply(lambda x: dob_dict.get(x))

In [9]:
ldlc_valid_18_nontg_index_nosecondary_final.head()

Unnamed: 0,Patient ID,LDL-C Max,Index Date,Scaled LDL-C Max,Category,Date of Birth
0,00031c3262ee7a0c2981,2.46,2015-04-10,2.46,Control,1988-05-01
1,0005f0349ba521e1ecb8,2.33,2017-07-14,2.33,Control,1964-08-01
2,000b2a238717215130d6,3.74,2018-09-20,3.74,Unknown,1956-07-01
3,000e1e9fd6b490cfc00b,4.0,2017-10-28,4.0,Unknown,1990-04-01
4,001eca77df4947cc6b82,0.58,2017-12-12,0.58,Control,1931-06-01


## Premature `CHD`

In [10]:
paths = [path for path in glob(os.path.join(PROJECT_ROOT, "results", "*")) if "chd" in path]

In [11]:
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

relevant_history = pd.concat(df_list, ignore_index=True)
terms = relevant_history['desc'].unique().tolist()
codes = relevant_history['code'].unique().tolist()

In [12]:
prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

history = pd.concat([prl_temp, dia_temp], ignore_index=True)
history = history.rename(columns=col_rename)
display(history.head())

# filter history
history = history[history['code'].isin(codes)]
display(history.head())

Unnamed: 0,Patient ID,code,desc,date,Gender
0,f04cd72dfb45a0929025,35027015,CHRONIC CHOLECYSTITIS,2016-01-01,MALE
1,fd381d17b78192f49b0b,379662011,BREAST CANCER,2016-01-01,FEMALE
2,fd381d17b78192f49b0b,1233066018,UTI - URINARY TRACT INFECTION,2016-01-01,FEMALE
3,0ece0e12fdc7605939c2,2793932016,HISTORY OF RENAL TRANSPLANT*,2016-01-01,MALE
4,0ece0e12fdc7605939c2,1232637011,PARTIAL THICKNESS BURN OF TRUNK,2016-01-01,MALE


Unnamed: 0,Patient ID,code,desc,date,Gender
3281647,b90b6597eaca1dd4493b,I351,Aortic (valve) insufficiency,2016-02-29,MALE
3285320,ba145e42c3ff11db8461,I350,Aortic (valve) stenosis,2016-12-22,FEMALE
3285321,ba145e42c3ff11db8461,I350,Aortic (valve) stenosis,2016-12-22,FEMALE
3287089,94807e1791068a604b66,Q2110,Unspecified atrial septal defect,2016-03-14,MALE
3287092,94807e1791068a604b66,Q250,Patent ductus arteriosus,2016-03-14,MALE


In [13]:
history['dob'] = history["Patient ID"].apply(lambda x: dob_dict.get(x))

In [14]:
def get_age(ref, dob, fmt="%Y-%m-%d",):
    ref = datetime.strptime(ref, fmt)
    dob = datetime.strptime(dob, fmt)
    try:
        return ref.year - dob.year - ((ref.month,  ref.day) < (dob.month, dob.day))
    except:
        return np.nan
        
history['age'] = history[['date', 'dob']].apply(lambda row: get_age(row['date'], row['dob'], fmt="%Y-%m-%d"), axis=1)

In [15]:
history.head()

Unnamed: 0,Patient ID,code,desc,date,Gender,dob,age
3281647,b90b6597eaca1dd4493b,I351,Aortic (valve) insufficiency,2016-02-29,MALE,1972-12-01,43
3285320,ba145e42c3ff11db8461,I350,Aortic (valve) stenosis,2016-12-22,FEMALE,1986-09-01,30
3285321,ba145e42c3ff11db8461,I350,Aortic (valve) stenosis,2016-12-22,FEMALE,1986-09-01,30
3287089,94807e1791068a604b66,Q2110,Unspecified atrial septal defect,2016-03-14,MALE,2016-03-01,0
3287092,94807e1791068a604b66,Q250,Patent ductus arteriosus,2016-03-14,MALE,2016-03-01,0


In [16]:
history.shape

(50, 7)

In [17]:
def is_qualified_history(row):
    if row["Gender"] == 'MALE':
        return row['age'] <= 55
    else:
        return row['age'] <= 65
        
premature_chd = history[history.apply(is_qualified_history, axis=1)]

In [18]:
premature_chd.shape

(43, 7)

In [19]:
premature_chd_ids = premature_chd['Patient ID'].unique().tolist()

In [20]:
ldlc_valid_18_nontg_index_nosecondary_final['premature_chd_history'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(premature_chd_ids)

## Premature `CVD`

In [21]:
paths = [path for path in glob(os.path.join(PROJECT_ROOT, "results", "*")) if "cvd" in path]

In [22]:
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

relevant_history = pd.concat(df_list, ignore_index=True)
terms = relevant_history['desc'].unique().tolist()
codes = relevant_history['code'].unique().tolist()

In [23]:
prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]

prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
dia_temp = dia_df[dia_usecols].rename(columns=col_rename)

history = pd.concat([prl_temp, dia_temp], ignore_index=True)
history = history.rename(columns=col_rename)
display(history.head())

# filter history
history = history[history['code'].isin(codes)]
display(history.head())

Unnamed: 0,Patient ID,code,desc,date,Gender
0,f04cd72dfb45a0929025,35027015,CHRONIC CHOLECYSTITIS,2016-01-01,MALE
1,fd381d17b78192f49b0b,379662011,BREAST CANCER,2016-01-01,FEMALE
2,fd381d17b78192f49b0b,1233066018,UTI - URINARY TRACT INFECTION,2016-01-01,FEMALE
3,0ece0e12fdc7605939c2,2793932016,HISTORY OF RENAL TRANSPLANT*,2016-01-01,MALE
4,0ece0e12fdc7605939c2,1232637011,PARTIAL THICKNESS BURN OF TRUNK,2016-01-01,MALE


Unnamed: 0,Patient ID,code,desc,date,Gender
3249060,8553766829a37c670348,I64,"Stroke, not specified as haemorrhage orinfarction",2016-10-25,MALE
3249172,b20d03fb9e2f87375797,I64,"Stroke, not specified as haemorrhage orinfarction",2016-11-09,MALE
3249173,b20d03fb9e2f87375797,I64,"Stroke, not specified as haemorrhage orinfarction",2016-12-28,MALE
3249325,535f1cc90a9cb626f28d,I64,"Stroke, not specified as haemorrhage orinfarction",2016-08-01,FEMALE
3249698,42a725602ba8e6153efe,I64,"Stroke, not specified as haemorrhage orinfarction",2016-07-19,MALE


In [24]:
history['dob'] = history["Patient ID"].apply(lambda x: dob_dict.get(x))

In [25]:
def get_age(ref, dob, fmt="%Y-%m-%d",):
    ref = datetime.strptime(ref, fmt)
    dob = datetime.strptime(dob, fmt)
    try:
        return ref.year - dob.year - ((ref.month,  ref.day) < (dob.month, dob.day))
    except:
        return np.nan
        
history['age'] = history[['date', 'dob']].apply(lambda row: get_age(row['date'], row['dob'], fmt="%Y-%m-%d"), axis=1)

In [26]:
history.head()

Unnamed: 0,Patient ID,code,desc,date,Gender,dob,age
3249060,8553766829a37c670348,I64,"Stroke, not specified as haemorrhage orinfarction",2016-10-25,MALE,1953-07-01,63
3249172,b20d03fb9e2f87375797,I64,"Stroke, not specified as haemorrhage orinfarction",2016-11-09,MALE,1953-12-01,62
3249173,b20d03fb9e2f87375797,I64,"Stroke, not specified as haemorrhage orinfarction",2016-12-28,MALE,1953-12-01,63
3249325,535f1cc90a9cb626f28d,I64,"Stroke, not specified as haemorrhage orinfarction",2016-08-01,FEMALE,1949-11-01,66
3249698,42a725602ba8e6153efe,I64,"Stroke, not specified as haemorrhage orinfarction",2016-07-19,MALE,1953-08-01,62


In [27]:
history.shape

(601, 7)

In [28]:
def is_qualified_history(row):
    if row["Gender"] == 'MALE':
        return row['age'] <= 55
    else:
        return row['age'] <= 65
        
premature_cvd = history[history.apply(is_qualified_history, axis=1)]

In [29]:
premature_cvd.shape

(172, 7)

In [30]:
premature_cvd_ids = premature_cvd['Patient ID'].unique().tolist()

In [31]:
ldlc_valid_18_nontg_index_nosecondary_final['premature_cvd_history'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(premature_cvd_ids)

## `xanthomas` Physical Examination

In [32]:
paths = [path for path in glob(os.path.join(PROJECT_ROOT, "results", "*")) if "xanthomas" in path]

In [33]:
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    if temp.empty:
        continue
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

if len(df_list):
    relevant_history = pd.concat(df_list, ignore_index=True)
    terms = relevant_history['desc'].unique().tolist()
    codes = relevant_history['code'].unique().tolist()
    
    prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
    dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]
    
    prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
    dia_temp = dia_df[dia_usecols].rename(columns=col_rename)
    
    history = pd.concat([prl_temp, dia_temp], ignore_index=True)
    history = history.rename(columns=col_rename)
    display(history.head())
    
    # filter history
    xanthomas_reference = history[history['code'].isin(codes)]
    display(history.head())
    xanthomas_reference_ids = xanthomas_reference['Patient ID'].unique().tolist()
else:
    xanthomas_reference_ids = []

Unnamed: 0,Patient ID,code,desc,date,Gender
0,f04cd72dfb45a0929025,35027015,CHRONIC CHOLECYSTITIS,2016-01-01,MALE
1,fd381d17b78192f49b0b,379662011,BREAST CANCER,2016-01-01,FEMALE
2,fd381d17b78192f49b0b,1233066018,UTI - URINARY TRACT INFECTION,2016-01-01,FEMALE
3,0ece0e12fdc7605939c2,2793932016,HISTORY OF RENAL TRANSPLANT*,2016-01-01,MALE
4,0ece0e12fdc7605939c2,1232637011,PARTIAL THICKNESS BURN OF TRUNK,2016-01-01,MALE


Unnamed: 0,Patient ID,code,desc,date,Gender
0,f04cd72dfb45a0929025,35027015,CHRONIC CHOLECYSTITIS,2016-01-01,MALE
1,fd381d17b78192f49b0b,379662011,BREAST CANCER,2016-01-01,FEMALE
2,fd381d17b78192f49b0b,1233066018,UTI - URINARY TRACT INFECTION,2016-01-01,FEMALE
3,0ece0e12fdc7605939c2,2793932016,HISTORY OF RENAL TRANSPLANT*,2016-01-01,MALE
4,0ece0e12fdc7605939c2,1232637011,PARTIAL THICKNESS BURN OF TRUNK,2016-01-01,MALE


In [34]:
ldlc_valid_18_nontg_index_nosecondary_final['has_xanthomas'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(xanthomas_reference_ids)

## `corneal_arcus` Physical Examination

In [35]:
paths = [path for path in glob(os.path.join(PROJECT_ROOT, "results", "*")) if "corneal_arcus" in path]

In [36]:
col_rename = {
    "Diagnosis Description (ICD10)": "desc",
    "Diagnosis Code (ICD10)": "code",
    "Problem Code (Coded)": "code",
    "Problem Desc (Coded)": "desc",
    "Visit Date": "date",
    "Admit/Visit Date": "date",
}
df_list = []
for path in paths:
    temp = pd.read_csv(path, low_memory=False)
    if temp.empty:
        continue
    temp = temp.rename(columns=col_rename)
    temp = temp[["code", "desc", "Score", "Key Term"]]
    df_list.append(temp)

if len(df_list):
    relevant_history = pd.concat(df_list, ignore_index=True)
    terms = relevant_history['desc'].unique().tolist()
    codes = relevant_history['code'].unique().tolist()
    
    prl_usecols = ["Patient ID", "Problem Code (Coded)", "Problem Desc (Coded)", "Visit Date", "Gender"]
    dia_usecols = ["Patient ID", "Diagnosis Description (ICD10)", "Diagnosis Code (ICD10)", "Admit/Visit Date", "Gender"]
    
    prl_temp = prl_df[prl_usecols].rename(columns=col_rename)
    dia_temp = dia_df[dia_usecols].rename(columns=col_rename)
    
    history = pd.concat([prl_temp, dia_temp], ignore_index=True)
    history = history.rename(columns=col_rename)
    display(history.head())
    
    # filter history
    corneal_arcus_reference = history[history['code'].isin(codes)]
    display(history.head())
    corneal_arcus_reference_ids = corneal_arcus_reference['Patient ID'].unique().tolist()
else:
    corneal_arcus_reference_ids = []

In [37]:
ldlc_valid_18_nontg_index_nosecondary_final['has_corneal_arcus'] = ldlc_valid_18_nontg_index_nosecondary_final['Patient ID'].isin(corneal_arcus_reference_ids)

## Scoring

In [38]:
def score(row):
    score = 0
    
    if row['Scaled LDL-C Max'] >= 8.40:
        score += 8
    elif 6.49 <= row['Scaled LDL-C Max'] < 8.40: 
        score += 5
    elif 4.93 <= row['Scaled LDL-C Max'] < 6.49: 
        score += 3
    elif row['Scaled LDL-C Max'] < 4.93:
        score += 1
    else:
        raise ValueError("Unable to handle value.")
    
    if row['premature_chd_history']:
        score += 2
    if row['premature_cvd_history']:
        score += 1
    if row['has_corneal_arcus']:
        score += 4
    if row['has_xanthomas']:
        score += 6
    return score

def segment(score):
    if score >= 6.0:
        return "Cases"
    elif 2.0 < score < 6.0: # [3, 5]
        return "Unknown"
    elif score <= 2:
        return "Control"
    else:
        raise ValueError("Unable to handle score.")


In [39]:
usecols = ['premature_chd_history', 'premature_cvd_history', 'has_corneal_arcus', 'has_xanthomas']
for col in usecols:
    display(ldlc_valid_18_nontg_index_nosecondary_final[col].value_counts())

premature_chd_history
False    20785
True         1
Name: count, dtype: int64

premature_cvd_history
False    20745
True        41
Name: count, dtype: int64

has_corneal_arcus
False    20786
Name: count, dtype: int64

has_xanthomas
False    20786
Name: count, dtype: int64

In [40]:
ldlc_valid_18_nontg_index_nosecondary_final["Score"] = ldlc_valid_18_nontg_index_nosecondary_final.apply(score, axis=1)
ldlc_valid_18_nontg_index_nosecondary_final["Stage2_Category"] = ldlc_valid_18_nontg_index_nosecondary_final["Score"].apply(segment)
ldlc_valid_18_nontg_index_nosecondary_final["Stage2_Category"].value_counts()

Stage2_Category
Control    19565
Unknown     1136
Cases         85
Name: count, dtype: int64

## End.