In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import sys
from glob import glob
REPO_DIR = "/Users/mdorosan/Desktop/2024/elpha-singhealth-fh"
# REPO_DIR = "/home/hsrc_michael/2024/elpha-singhealth-fh"
sys.path.append(REPO_DIR)
from tqdm import tqdm
from datetime import datetime
import random

import utils.PATHS as PATHS
import utils.utils as utils
# import utils.emr_utils as emr_utils
# import utils.load_utils as load_utils

  from pandas.core import (


# Lipid-lowering treatment criteria for scaling of LDL-C values

## Notes

* Drug dispensed data for each patient is based on the `Generic Drug Name` column
* Currently the `Drug Prescribed Date From` is used as reference as `Drug Prescribed Date To` may go beyond the `Index Date` from the previous notebook.
* This notebook proceeds as follows:
   1. Creation of a patient list according to lipid-lowering treatment (LLT) criteria
   2. Scaling the LDL-C values accordingly, i.e., dividing LDL-C values 0.7 (confirm if this is divide or multiple like a scaling factor)
   3. Segmentation of cohort according to scaled LDL-C thresholds
      - Cases: LDL-C >= `4.9`
      - Unknown: `3.36` < LDL-C < `4.9`
      - Controls: LDL-C <= `3.36`

## 1. LLT Data Preprocessing

In [3]:
dd_fp_list = glob(os.path.join(PATHS.DRUG_DISPENSED, "*.csv"))

# general labs dataframe
df_list = [pd.read_csv(path, low_memory=False) for path in tqdm(dd_fp_list)]
df = pd.concat(df_list, ignore_index=True)

drug_name_col = "Generic Drug Name"
pid_col = "Patient ID"
date_col = "Drug Prescribed Date From"
backup_date_col = "Drug Dispensed Date From" # has more weird values
ref_date_col = "Drug Reference Date"

# prep datetime cols
df[date_col] = pd.to_datetime(df[date_col])
df[backup_date_col] = pd.to_datetime(df[backup_date_col])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [04:00<00:00,  5.02s/it]


In [4]:
tqdm.pandas()

# RESOLVE OPTION
def resolve_date(row):
    """Use prescribed date when dispensed date is not available."""
    if np.isnat(np.datetime64(str(row[date_col]))):
        return row[backup_date_col]
    return row[date_col]

df[ref_date_col] = df.progress_apply(lambda row: resolve_date(row), axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████| 20132904/20132904 [40:12<00:00, 8344.30it/s]


In [5]:
# DROP OPTION

# df["Drug Reference Date"] = df[date_col]
# df = df.dropna(subset=["Drug Reference Date"])

In [6]:
# see utils.llt_meds for list of relevant prescriptions
llt = df[df[drug_name_col].str.lower().isin(utils.llt_meds)]

In [7]:
# # Inspect weird dates
# grouper = df.groupby(df[backup_date_col].dt.year)
# grouper.size()

## 2. Load the LDL-C values

In [8]:
path = os.path.join(REPO_DIR, "results", "ldlc_valid_18_nontg_index_nosecondary_final.csv")
ldlc_valid_18_nontg_index_nosecondary_final = pd.read_csv(path)

In [9]:
ldlc_valid_18_nontg_index_nosecondary_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20787 entries, 0 to 20786
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Patient ID  20787 non-null  object 
 1   LDL-C Max   20787 non-null  float64
 2   Index Date  20787 non-null  object 
dtypes: float64(1), object(2)
memory usage: 487.3+ KB


In [10]:
# prep datetime cols
ldlc_valid_18_nontg_index_nosecondary_final["Index Date"] = pd.to_datetime(
    ldlc_valid_18_nontg_index_nosecondary_final["Index Date"]
)

## 3. LLT data filtering based on LDL-C Index Date

In [11]:
# excldue drug data that are too old: over 1 year before the minimum LDLC Index Date
mask = df[ref_date_col] < (ldlc_valid_18_nontg_index_nosecondary_final["Index Date"].min() - pd.Timedelta(days=366))
llt = df[~mask]

# check
display(llt[ref_date_col].min())
display(llt[ref_date_col].max())

Timestamp('2014-11-04 00:00:00')

Timestamp('2024-03-27 00:00:00')

# 4. Scaling the LDL-C Max values

In [12]:
ezetimibe_combis = {
    "atorvastatin": {
        "10mg" : 2.0,
        "20mg" : 2.2,
        "40mg" : 2.2,
    },
    "pravastatin sodium": {
        '20MG': 1.6,
    },
    "simvastatin": {
        "10mg": 1.9,
        "20mg": 2.0,
    },
    "rosuvastatin" : {
        "10MG" : 1.9,
        "20MG" : 2.1,
    },
}

special_factors = {
    "pravastatin sodium": {
        '20MG': 1.3,
    }, # 20MG

    "ezetimibe,simvastatin tab" : {
        "10/10": 1.9,
        "10/20": 2.0,
    }, # 10/10, 10/20

    "ezetimibe": {
        "10mg": 1.2,
    }, # 10mg

    "simvastatin": {
        "10mg": 1.4,
        "20mg": 1.6,
    }, # 10mg, 20mg

    "atorvastatin": {
        "10mg" : 1.6,
        "20mg" : 1.8,
        "40mg" : 2.0,
    }, # 10mg, 40mg, 20mg

    "rosuvastatin": {
        "10MG" : 1.9,
        "20MG" : 2.1,
    }, # 10MG, 20MG

    # # --- NOT IN NUHS DOC --- #
    # "lovastatin": {
    #     "20mg" : None,
    # }, # 20mg

    # "metronidazole, nystatin": {
    #     "500mg": None,
    #     "100,000 unit": None,
    # }, # 500mg, 100,000 unit  

    # "nystatin": {
    #     "100,000IU" : None,
    # }, # 100,000IU
} 

In [14]:
def llt_rescale(row):
    pid = row["Patient ID"]
    index_date = row["Index Date"]
    temp_llt = llt[llt[pid_col] == pid]
    
    ## take only those records which are between 365 days and 42 days of index date
    temp_llt = temp_llt[
        (index_date - temp_llt[ref_date_col] >= pd.Timedelta(days=42)) & 
        (index_date - temp_llt[ref_date_col] <= pd.Timedelta(days=365))
    ]
    
    ## if not empty apply correction else do not
    if (temp_llt.empty):
        factor = 1
    else:
        # sort by date, get the most_recent_date, get all dispensed on same most_recent-date         
        most_recent_date = temp_llt[ref_date_col].sort_values(ascending=False).iloc[0]
        temp_llt = temp_llt[temp_llt[ref_date_col] == most_recent_date]

        # intersections
        prescribed_list = temp_llt[drug_name_col].str.lower().unique().tolist()

        # referenced later in elif block
        special_set = set(prescribed_list).intersection(set(special_factors.keys()))
        
        if len(prescribed_list) == 1:
            drug_key =  prescribed_list[0]
            factor_dict = special_factors.get(drug_key)
            if not factor_dict:
                if drug_key in utils.llt_meds:
                    factor = 1/0.7
                else:
                    factor = 1
            else:
                drug_subset = temp_llt.loc[temp_llt[drug_name_col].str.lower() == drug_key]
                dose_key = drug_subset.iloc[0]["Drug Strength"]
                factor = factor_dict.get(dose_key)
            
        elif len(set(prescribed_list).intersection(set(ezetimibe_combis.keys()))) \
            and ("ezetimibe" in prescribed_list):
            prescribed_list.remove("ezetimibe")
            
            combis_set = set(prescribed_list).intersection(set(ezetimibe_combis.keys()))
            
            # get a random special drug
            drug_key = random.sample(list(combis_set), 1)[0]

            # get dose
            drug_subset = temp_llt.loc[temp_llt[drug_name_col].str.lower() == drug_key]
            dose_key = drug_subset.iloc[0]["Drug Strength"]

            # get factor
            factor = ezetimibe_combis.get(drug_key).get(dose_key)

        # next block assumes ezetimibe cases has already been evaluated
        # non-ezetimibe combinations
        elif len(special_set): 
        
            # get a random special drug
            drug_key = random.sample(list(special_set), 1)[0]

            # get dose
            drug_subset = temp_llt.loc[temp_llt[drug_name_col].str.lower() == drug_key]
            dose_key = drug_subset.iloc[0]["Drug Strength"]

            # get factor
            factor = special_factors.get(drug_key).get(dose_key)

        # default scaling factor on master list of llt medications (see utils.py)
        elif len(set(prescribed_list).intersection(set(utils.llt_meds))):
            factor = 1/0.7

        else:
            factor = 1 
    # display(row["Patient ID"])        
    return row["LDL-C Max"] * factor
    

In [15]:
tqdm.pandas()

ldlc_valid_18_nontg_index_nosecondary_final["Scaled LDL-C Max"] = ldlc_valid_18_nontg_index_nosecondary_final.progress_apply(
    llt_rescale, axis=1
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 20787/20787 [3:24:27<00:00,  1.69it/s]


In [16]:
# number of cases scaled
(ldlc_valid_18_nontg_index_nosecondary_final["LDL-C Max"] != ldlc_valid_18_nontg_index_nosecondary_final["Scaled LDL-C Max"]).sum()

806

In [17]:
ldlc_valid_18_nontg_index_nosecondary_final[ldlc_valid_18_nontg_index_nosecondary_final["LDL-C Max"] != ldlc_valid_18_nontg_index_nosecondary_final["Scaled LDL-C Max"]]

Unnamed: 0,Patient ID,LDL-C Max,Index Date,Scaled LDL-C Max
24,005fea938fa8e55e5b82,2.61,2017-12-11,3.728571
94,0127cc42a5f153ced7f9,1.70,2015-09-15,2.380000
126,0183a8b095cbcf928046,1.98,2015-11-17,2.772000
160,01e3259cc164dd825c39,1.67,2017-12-06,3.340000
175,02253c435ea32cc78a34,2.16,2017-11-14,3.456000
...,...,...,...,...
20670,fe7ad02ccf26189ed59a,2.05,2016-07-07,2.870000
20731,ff3a413299e78f9b8598,3.87,2017-02-28,7.740000
20765,ffbc05e08474f03a444e,1.61,2017-09-12,2.576000
20778,ffed72306d9970bafa2a,2.98,2016-08-22,4.172000


In [18]:
##
def segment(ldlc_val):
    if ldlc_val >= 4.9:
        return "Cases"
    elif 3.36 < ldlc_val < 4.9:
        return "Unknown"
    elif ldlc_val <= 3.36:
        return "Control"
    else:
        raise ValueError("Unable to handle ldlc_val.")

ldlc_valid_18_nontg_index_nosecondary_final["Category"] = ldlc_valid_18_nontg_index_nosecondary_final["Scaled LDL-C Max"].apply(segment)

In [19]:
ldlc_valid_18_nontg_index_nosecondary_final["Category"].value_counts()

Category
Control    13949
Unknown     5571
Cases       1267
Name: count, dtype: int64

In [17]:
ldlc_valid_18_nontg_index_nosecondary_final["Category"].value_counts()

Category
Control    14044
Unknown     5575
Cases       1168
Name: count, dtype: int64

In [18]:
# Add race/nationality filter in reporting results
# Add table 1 using general lab data (main source, ldlc is the limiting feature)

## End.