In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import sys
from glob import glob
REPO_DIR = "/Users/mdorosan/Desktop/2024/elpha-singhealth-fh"
sys.path.append(REPO_DIR)
from tqdm import tqdm
from datetime import datetime

import utils.PATHS as PATHS
import utils.utils as utils
# import utils.emr_utils as emr_utils
# import utils.load_utils as load_utils

  from pandas.core import (


# Lipid-lowering treatment criteria for scaling of LDL-C values

## Notes

* Drug dispensed data for each patient is based on the `Generic Drug Name` column
* Currently the `Drug Prescribed Date From` is used as reference as `Drug Prescribed Date To` may go beyond the `Index Date` from the previous notebook.
* This notebook proceeds as follows:
   1. Creation of a patient list according to lipid-lowering treatment (LLT) criteria
   2. Scaling the LDL-C values accordingly, i.e., dividing LDL-C values 0.7 (confirm if this is divide or multiple like a scaling factor)
   3. Segmentation of cohort according to scaled LDL-C thresholds
      - Cases: LDL-C >= `4.9`
      - Unknown: `3.36` < LDL-C < `4.9`
      - Controls: LDL-C <= `3.36`

## 1. LLT criteria patient list

In [3]:
dd_fp_list = glob(os.path.join(PATHS.DRUG_DISPENSED, "*.csv"))

# general labs dataframe
df_list = [pd.read_csv(path, low_memory=False) for path in tqdm(dd_fp_list)]
df = pd.concat(df_list, ignore_index=True)

100%|███████████████████████████████████████████| 48/48 [01:17<00:00,  1.62s/it]


In [4]:
drug_name_col = "Generic Drug Name"
pid_col = "Patient ID"
date_col = "Drug Prescribed Date From"

# see utils.llt_meds for list of relevant prescriptions
llt = df[df[drug_name_col].str.lower().isin(utils.llt_meds)]

In [5]:
# prep datetime cols
llt[date_col] = pd.to_datetime(llt[date_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llt[date_col] = pd.to_datetime(llt[date_col])


## 2. Scaling the LDL-C values

In [6]:
path = os.path.join(REPO_DIR, "results", "ldlc_valid_18_nontg_index.csv")
ldlc_valid_18_nontg_index = pd.read_csv(path)

In [7]:
ldlc_valid_18_nontg_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24122 entries, 0 to 24121
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Patient ID  24122 non-null  object 
 1   LDL-C Max   24122 non-null  float64
 2   Index Date  24122 non-null  object 
dtypes: float64(1), object(2)
memory usage: 565.5+ KB


In [8]:
# prep datetime cols
ldlc_valid_18_nontg_index["Index Date"] = pd.to_datetime(
    ldlc_valid_18_nontg_index["Index Date"]
)

In [9]:
def llt_recal(row):
    pid = row["Patient ID"]
    index_date = row["Index Date"]
    temp_llt = llt[llt[pid_col] == pid]
    
    ## take only those records which are between 365 days and 42 days of index date
    temp_llt = temp_llt[
        (index_date - temp_llt[date_col] >= pd.Timedelta(days=42)) & 
        (index_date - temp_llt[date_col] <= pd.Timedelta(days=365))
    ]
    ## if not empty apply correction else do not
    if (temp_llt.empty):
        return row["LDL-C Max"]
    else:
        return row["LDL-C Max"] / 0.7
    

In [10]:
tqdm.pandas()

ldlc_valid_18_nontg_index["Scaled LDL-C Max"] = ldlc_valid_18_nontg_index.progress_apply(
    llt_recal, axis=1
)

100%|█████████████████████████████████████| 24122/24122 [07:15<00:00, 55.42it/s]


In [11]:
# number of cases scaled
(ldlc_valid_18_nontg_index["LDL-C Max"] != ldlc_valid_18_nontg_index["Scaled LDL-C Max"]).sum()

1457

In [12]:
ldlc_valid_18_nontg_index[ldlc_valid_18_nontg_index["LDL-C Max"] != ldlc_valid_18_nontg_index["Scaled LDL-C Max"]]

Unnamed: 0,Patient ID,LDL-C Max,Index Date,Scaled LDL-C Max
26,0058f60fa347864cdd99,0.95,2016-12-04,1.357143
28,005fea938fa8e55e5b82,2.61,2017-12-11,3.728571
64,00c1792ab91a9dbfa06a,1.86,2015-06-23,2.657143
73,00db0b5981b442793f2b,3.53,2016-11-08,5.042857
99,010deabc7cacd3a7ecb4,1.29,2015-07-15,1.842857
...,...,...,...,...
24007,fea6bb891c7340c41edb,2.37,2016-10-22,3.385714
24054,ff2a9e32968dabeb4cbc,3.84,2016-05-10,5.485714
24056,ff38216f6e43cd9602c5,0.89,2018-10-18,1.271429
24059,ff3a413299e78f9b8598,3.87,2017-02-28,5.528571


In [15]:
##
def segment(ldlc_val):
    if ldlc_val >= 4.9:
        return "Cases"
    elif 3.36 < ldlc_val < 4.9:
        return "Unknown"
    elif ldlc_val <= 3.36:
        return "Control"
    else:
        raise ValueError("Unable to handle ldlc_val.")

ldlc_valid_18_nontg_index["Category"] = ldlc_valid_18_nontg_index["Scaled LDL-C Max"].apply(segment)

In [18]:
ldlc_valid_18_nontg_index["Category"].value_counts()

Category
Control    16569
Unknown     6147
Cases       1406
Name: count, dtype: int64

In [None]:
# Add race/nationality filter in reporting results
# Add table 1 using general lab data (main source, ldlc is the limiting feature)

## End.