In [None]:
import duckdb
import pandas as pd
import numpy as np
import os

import pyrootutils
# Locate the project root starting from the notebook's working directory, using
# the ".git" entry as the marker. This has to run before the `src.*` imports
# below: pythonpath=True is what makes the repository packages importable.
notebook_dir = os.path.abspath('')
root = pyrootutils.setup_root(
    search_from=notebook_dir,
    indicator=[".git"],
    pythonpath=True,  # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True,      # load environment variables from .env if it exists in the root directory
)
from src.utils.query_utils.extractor import Extractor
from src.utils.file_management.config_loader import load_yaml, process_config_values
from src.utils.file_management.file_manager import FileManager

In [None]:
# --- LOAD RELATIONAL DATABASE ---
# TODO: point `datapath` at the directory that holds the parquet exports,
# e.g. 'path_to_your_data/parquet'.
datapath = None
data_asset = 'DEID_CDW'

if not datapath:
    # Fail with an actionable message rather than a SyntaxError or an obscure path error.
    raise ValueError(
        "Set `datapath` to the parquet root directory "
        "(e.g. 'path_to_your_data/parquet') before running this cell."
    )


def _read_table(table_name, asset=data_asset):
    """Return the DuckDB relation over all parquet files of one table.

    Args:
        table_name: Sub-directory (table) name below ``datapath/asset``.
        asset: Data asset directory below ``datapath``; defaults to ``data_asset``.

    Returns:
        The object returned by ``duckdb.read_parquet`` for
        ``<datapath>/<asset>/<table_name>/*.parquet``.
    """
    return duckdb.read_parquet(os.path.join(datapath, asset, table_name, '*.parquet'))


# Enter tables you are planning to query here.
# NOTE(review): the SQL strings in later cells refer to these tables by their
# Python variable names, so the names below must not be changed — confirm
# against Extractor.run_query.
imagingfact             = _read_table('imagingfact')
medicationorderfact     = _read_table('medicationorderfact')
diagnosiseventfact      = _read_table('diagnosiseventfact')
diagnosisterminologydim = _read_table('diagnosisterminologydim')
procedureeventfact      = _read_table('procedureeventfact')

procedureterminologydim = _read_table('procedureterminologydim')
#referralfact           = _read_table('referralfact')
referraleventfact       = _read_table('referraleventfact')
patdurabledim           = _read_table('patdurabledim')
note_metadata           = _read_table('note_metadata')
note_concepts           = _read_table('note_concepts')
note_text               = _read_table('note_text')
imaging_series          = _read_table('series', asset='IMAGING')

# Initialize data extraction tools
check_query_flag = True       # auxiliary checks to see if each query makes sense
PlumsExtractor = Extractor(num_results_flag=True, display_results_flag=True)

In [None]:
# --- Cohort selection ---
# Load file configuration.
# (load_yaml, process_config_values and FileManager are already imported in the
# first cell of the notebook, so they are not re-imported here.)

# TODO: point this at your cohort configuration, e.g.
# '/path_to_your_project/code/config/datasets/cohort03_MriNoninvasive.yaml'
cohort_cfg_path = None
if not cohort_cfg_path:
    # Fail with an actionable message rather than a SyntaxError.
    raise ValueError(
        "Set `cohort_cfg_path` to your cohort YAML file "
        "(e.g. '.../config/datasets/cohort03_MriNoninvasive.yaml') before running this cell."
    )

config = load_yaml(cohort_cfg_path)
config = process_config_values(config)
print(config.keys())

PlumsFiles = FileManager(config.get('file_directory'))

# Identify Patients
patient_keys_df = PlumsFiles.read_file(PlumsFiles.get_datapath('patientdurablekey_csv'))
patientdurablekey_workinglist = patient_keys_df['patientdurablekey'].to_list()
print(len(patientdurablekey_workinglist))

# Identify Imaging ID (accessions)
accessions_df = PlumsFiles.read_file(PlumsFiles.get_datapath('accessionnumber_csv'))
accessionnumber_workinglist = accessions_df['accessionnumber'].to_list()
accessionnumber_workinglist = PlumsExtractor.remove_invalid(accessionnumber_workinglist)
print(len(accessionnumber_workinglist))

# Identify MRIs: accession numbers for which image series were found on disk.
# os.path.join avoids a doubled separator when the configured directory already ends in '/'.
MRI_cohort_filepath = os.path.join(
    config.get('query_output_dir'),
    '20240801_dcm_dirlist_t1sag_t1ax_t2sag_t2ax_all_seqs.csv',
)
mri_series_df = pd.read_csv(MRI_cohort_filepath)
accessionnumber_imageslist = mri_series_df['AccessionNumber'].to_list()
print(len(accessionnumber_imageslist))

## Extraction

### Demographics

In [None]:
# Demographics: one row per distinct demographic record of each cohort patient.
demographicsQuery = f'''
/*
Description: Select relevant demographics (sex, ethnicity, birthday, postal code, ...) for study cohort
*/

SELECT DISTINCT
  patientdurablekey, 
  patientepicid, 
  sex, 
  preferredlanguage, 
  ucsfderivedraceethnicity_x, 
  birthdate, 
  deathdate,
  stateorprovince,
  addresskey,
  postalcode, 
  maritalstatus,
  religion,
  smokingstatus,
  primaryfinancialclass, 
  sexassignedatbirth, 
  genderidentity 
  -- highestlevelofeducation, *Unspecified
  --address, DEID
  --city, DEID
  --county, DEID
FROM 
  patdurabledim
  WHERE 
    patientdurablekey IN {tuple(patientdurablekey_workinglist)}
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    demographicsQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Persist the table in both formats (parquet first, then csv).
for save_table, path_key in (
    (PlumsFiles.save_df_to_parquet, 'patdurabledim_parquet'),
    (PlumsFiles.save_df_to_csv, 'patdurabledim_csv'),
):
    save_table(results_df_pd, PlumsFiles.get_datapath(path_key))

# Optional sanity check on the returned patient keys.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

results_df_pd.head(10)

### Imaging Info

In [None]:
# Imaging: lumbar-spine MR / XR / CT exams of the cohort.
# NOTE(review): the modality patterns are plain substring matches, so e.g. '%CT%'
# would also match a procedure name containing "INJECTION" — confirm this is acceptable.
imagingQuery = f'''
/*
Description: Select relevant patient, visit, and image keys from imagingfact table for study cohort.
*/

SELECT DISTINCT
    accessionnumber, 
    patientdurablekey,
    encounterkey, 
    firstprocedurename, 
    examstartdatekey,
    orderingdepartmentname,
    orderingdepartmentspecialty,
    performingdepartmentname,
    performingdepartmentspecialty,
    CASE WHEN UPPER(firstprocedurename) LIKE '%MR%' THEN 'MR'
      WHEN UPPER(firstprocedurename) LIKE '%XR%' THEN 'XR'
      WHEN UPPER(firstprocedurename) LIKE '%CT%' THEN 'CT'
      END AS imagetype

FROM 
    imagingfact

  WHERE patientdurablekey IN {tuple(patientdurablekey_workinglist)}
  AND examstartdatekey > 0
  AND canceled=0
  
  AND 
    (
      UPPER(firstprocedurename) LIKE '%LUMB%' -- LUMBAR
      OR UPPER(firstprocedurename) LIKE '%L_SPINE%' -- L-SPINE & L SPINE, FETAL SPINE, TOTAL SPINE
    )
    AND 
    (
      UPPER(firstprocedurename) LIKE '%MR%'
      OR UPPER(firstprocedurename) LIKE '%XR%'
      OR UPPER(firstprocedurename) LIKE '%CT%'
    )

    AND UPPER(firstprocedurename) NOT LIKE '%CERV%' -- CERVICAL
    AND UPPER(firstprocedurename) NOT LIKE '%THOR%' -- THORACIC
    AND UPPER(firstprocedurename) NOT LIKE '%FETAL%'
    AND UPPER(firstprocedurename) NOT LIKE '%TOTAL%'
    AND UPPER(firstprocedurename) NOT LIKE '%OUTSIDE%' -- exam at occured outside UCSF
    
  ORDER BY
    patientdurablekey,
    examstartdatekey
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    imagingQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Persist the table in both formats (parquet first, then csv).
for save_table, path_key in (
    (PlumsFiles.save_df_to_parquet, 'imagingfact_parquet'),
    (PlumsFiles.save_df_to_csv, 'imagingfact_csv'),
):
    save_table(results_df_pd, PlumsFiles.get_datapath(path_key))


### Medications

In [None]:
# Medications from medicationorderfact
# WHO guidelines - Analgesic https://www.ncbi.nlm.nih.gov/books/NBK554435/
# NSAID list: https://www.hopkinsmedicine.org/transplant/programs/kidney/living_donors/NSAIDs%20List%20to%20Avoid.pdf


def _like_any(patterns, column='medicationname'):
    """Build an OR-joined chain of ``UPPER(<column>) LIKE '%<pattern>%'`` predicates.

    Blank patterns are skipped: an empty pattern would render ``LIKE '%%'``,
    which matches every non-null value and silently disables the filter.

    Args:
        patterns: Iterable of upper-case substrings (they may contain SQL wildcards).
        column: Column name to test; defaults to ``medicationname``.

    Returns:
        The predicates joined with ``' OR '``.

    Raises:
        ValueError: If no non-blank pattern is supplied (the result would be
            an empty, syntactically invalid SQL condition).
    """
    predicates = [
        f"UPPER({column}) LIKE '%{pattern}%'"
        for pattern in patterns
        if pattern and pattern.strip()
    ]
    if not predicates:
        raise ValueError('at least one non-blank pattern is required')
    return ' OR '.join(predicates)


#STEP 1 - NSAID
# Every entry must be a single name: the original list held empty strings and
# comma-fused names ('INDOCIN,INDOCINSR', ...), which either matched everything
# or could never match.
# NOTE(review): 'DEXDETOPROFEN', 'OXAPOROZIN' and 'MECLOFENEMATE' look like
# misspellings (dexketoprofen, oxaprozin, meclofenamate); names written without
# spaces (e.g. 'MEFENAMICACID') will not match 'MEFENAMIC ACID'; 'TYLENOL' is
# acetaminophen and 'RAPID' is a very broad substring — confirm with a clinician.
nsaid_list = [
    'ACTRON', 'ADVIL', 'ALEVE', 'ALGIX', 'ANAPROX', 'ANSAID', 'ASPIRIN', 'ACETYLSALICYLICACID',
    'BRUFEN', 'BUTAZOLIDIN', 'CATAFLAM', 'CEEOXX', 'CELEBREX', 'CELECOXIB', 'CEOXX',
    'CHOLINEMAGNESIUMTRISALICYLATE', 'CLINORIL', 'CLOTAM',
    'DAYPRO', 'DAYRUN', 'DEXDETOPROFEN', 'DICLOFENAC', 'DIFLUNISAL', 'DISALCID', 'DOLOBID',
    'DURAPROX', 'DYNASTAT', 'ETODOLAC', 'ETORICOXIB', 'EQUIOXX',
    'FELDENE', 'FENOPROFEN', 'FENOPRON', 'FIROCOXIB', 'FLURBIPROFEN', 'FLURWOOD', 'FROBEN',
    'IBUPROFEN', 'INDOCIN', 'INDOCINSR', 'INDOMETHACIN',
    'KERAL', 'KETOFLAM', 'KETOPROFEN', 'KETOROLAC', 'LICOFELONE', 'LODINE', 'LODINEXL',
    'LORNOXICAM', 'LOXOPROFEN', 'LOXONIN', 'LOXOMAC', 'LUMIRACOXIB',
    'MECLOMEN', 'MECLOFENAMICACID', 'MECLOFENEMATE', 'MEDIPREN', 'MEFENAMICACID', 'MELOX',
    'MELOXICAM', 'MESULID', 'MIDOL', 'MOBIC', 'MOBIFLEX', 'MONO-GESIC', 'MOTRIN', 'MOVALIS',
    'NABUMETONE', 'NALFON', 'NAPRELAN', 'NAPROSYN', 'NAPROXEN', 'NIMALOX', 'NIMESULIDE',
    'NUPRIN', 'NUROFEN', 'ORUDIS', 'ORUVAIL', 'OXAPOROZIN', 'OXENO',
    'PARECOXIB', 'PHENYLBUTAZONE', 'PIROXICAM', 'PONSTEL', 'PREVICOX', 'PREXIGE',
    'RAPID', 'RECOXA', 'RELAFEN', 'ROFECOXIB',
    'SALFLEX', 'SALICYLATE', 'SALSALATE', 'SALSITAB', 'SPRIX', 'SULIDE', 'SULINDAC',
    'TENOXICAM', 'TOLECTIN', 'TOLFENAMICACID', 'TORADOL', 'TRILISATEDISACLID', 'TUFNIL', 'TYLENOL',
    'URBIFEN', 'VALDECOXIB', 'BEXTRA', 'VIOXX', 'VOLTAREN', 'VOLTAREN-XR', 'XEFO',
]
nsaid_step1_query_txt = _like_any(nsaid_list)

#STEP 2 - Weak Opioids (less addictive)
opioid_step2_list = ['TRAMADOL', 'TILIDIN', 'TILIDINE', 'CODEINE','%CODONE%','HYDROCODONE']
opioid_step2_query_txt = _like_any(opioid_step2_list)

#STEP 3  - Strong Opioids
opioid_step3_list = ['OXYCONTIN','VICODIN','OXYCODONE','HYDROCODONE','FENTANYL','FENTANIL','%FENTAN_L%',
               'MORPHINE','MORPHONE','%MORPH_NE%','MEPERIDINE','BUPRENORPHINE','DILAUDID']
opioid_step3_query_txt = _like_any(opioid_step3_list)

# ALTERNATIVE STEP -  other pain meds
alt_med_list = ['MARIJUANA','CANABIS','CANNABIS','CANNABIDIOL','%CAN%ABI%', 'METHOCARBAMOL']
alt_med_query_txt = _like_any(alt_med_list)

# Outpatient medication orders of the cohort, restricted to opioid analgesics,
# NSAID keyword matches and alternative pain medication; each row is labelled
# with the first matching analgesic step (strong opioid > weak opioid > NSAID > alternative).
medsQuery = f'''
/*
Description: Select relevant meds (opioid vs nsaid meds) from medication table.
*/

SELECT DISTINCT 
  patientdurablekey,
  medicationorderkey, -- 
  encounterkey, -- 
  medicationkey, -- 
  medicationname, 
  ordername, -- 
  medicationtherapeuticclass,
  medicationpharmaceuticalclass, 
  medicationpharmaceuticalsubclass, -- 
  medicationstrength, -- 
  medicationform, -- 
  medicationroute, -- 
  route, -- 
  frequency, -- *
  doseunit, -- 
  durationkey, -- *
  startdatekey, -- *
  enddatekey, -- *
  orderedbyprovidertype,
  orderedbyproviderprimaryspecialty,
  authorizedbyprovidertype,
  authorizedbyproviderprimaryspecialty,
  class,
  mode,
  
  CASE
  WHEN {opioid_step3_query_txt} THEN 'STEP 3 OPIOID'
  WHEN {opioid_step2_query_txt} THEN 'STEP 2 OPIOID'
  WHEN {nsaid_step1_query_txt} THEN 'STEP 1 NSAID'
  WHEN {alt_med_query_txt} THEN 'ALTERNATIVE'
  ELSE 'Unspecified'
  END AS medicationtype
  
FROM 
  medicationorderfact
  WHERE patientdurablekey IN {tuple(patientdurablekey_workinglist)}
    AND startdatekey > 0
    -- for perscribed medication
    AND UPPER(mode) LIKE 'OUTPATIENT'
    
    AND
    (
        --FOR OPIOIDS
        (
          (
          UPPER(medicationtherapeuticclass) LIKE 'ANALGESICS'
          AND UPPER(medicationpharmaceuticalsubclass) LIKE '%OPIOID%'
          AND UPPER(medicationpharmaceuticalsubclass) NOT LIKE '%NON-OPIOID%'
          AND UPPER(medicationpharmaceuticalsubclass) NOT LIKE '%DIETARY SUPPLEMENT%'
          AND 
          (
            UPPER(medicationpharmaceuticalsubclass) LIKE '% AGONISTS%' --not antagonists
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%CODEINE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%FENTANYL%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%CODONE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%HYDROMORPHONE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%MEPERIDINE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%PENTAZOCINE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%PROPOXYPHENE%'
            OR UPPER(medicationpharmaceuticalsubclass) LIKE '%TRAMADOL%'
          )
          )
        )
        OR
        -- FOR NASIDs (non-steroidal anti-inflammatory drugs)
        (
          {nsaid_step1_query_txt}
        )
        OR
        -- FOR Alternative Medicine
        (
          {alt_med_query_txt}
        )
    )
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    medsQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Persist the table in both formats (parquet first, then csv).
for save_table, path_key in (
    (PlumsFiles.save_df_to_parquet, 'medicationorderfact_parquet'),
    (PlumsFiles.save_df_to_csv, 'medicationorderfact_csv'),
):
    save_table(results_df_pd, PlumsFiles.get_datapath(path_key))

if check_query_flag:
    # Sanity checks on the returned keys and medication names.
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    print('MedicationOrderFact -  total medication names: ')
    PlumsExtractor.col_to_list(results_df_pd, 'medicationname')

    # Medication distribution: total number of orders per medication type.
    print(results_df_pd['medicationtype'].value_counts())

    # Combinations of medication types per patient: one-hot encode the type,
    # keep one row per (patient, type), sum to one indicator vector per patient,
    # then count how many patients share each distinct vector.
    type_indicators = pd.get_dummies(results_df_pd['medicationtype'])
    df_plot = (
        pd.concat([results_df_pd['patientdurablekey'], type_indicators], axis=1)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    df_onehot = df_plot.groupby('patientdurablekey').sum()
    combos, combo_counts = np.unique(df_onehot.to_numpy(), axis=0, return_counts=True)
    for combo, n_patients in zip(combos, combo_counts):
        print('one hot key=', combo, '\tn=', n_patients)

results_df_pd.head(10)

### Clinical Notes

In [None]:
# Clinical notes whose metadata references one of the cohort accession numbers,
# joined to their text via deid_note_key (LEFT JOIN: metadata rows without text are kept).
textQuery = f'''
/*
Description: Select notes associated with imaging.
*/

SELECT 
  a.patientepicid,
  a.patientdurablekey,
  a.deid_note_key,
  a.deid_note_id,
  a.procedureorderfactid,
  a.accessionnumber,
  a.accessionnumber2,
  a.accessionnumber3,
  a.encounterfactid,
  a.encounterkey,
  a.note_type,
  a.proc_note_type,
  a.ref_note_type,
  a.ip_note_type_c,
  a.note_type_noadd_c,
  a.encounter_type,
  a.enc_dept_name,
  a.enc_dept_specialty,
  a.employeeepicid,
  a.providerepicid,
  a.auth_prov_type,
  a.prov_specialty,
  a.deid_service_date,
  b.note_text,

FROM (
  SELECT 
    * 
  FROM
    note_metadata
    WHERE
      accessionnumber IN {tuple(accessionnumber_workinglist)}
) AS a
LEFT JOIN note_text as b 
ON a.deid_note_key = b.deid_note_key
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    textQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Persist the table in both formats (parquet first, then csv).
for save_table, path_key in (
    (PlumsFiles.save_df_to_parquet, 'note_text_parquet'),
    (PlumsFiles.save_df_to_csv, 'note_text_csv'),
):
    save_table(results_df_pd, PlumsFiles.get_datapath(path_key))

# Optional sanity checks on patients, accessions and provider specialties.
if check_query_flag:
    for checked_column in ('patientdurablekey', 'accessionnumber'):
        PlumsExtractor.col_to_list(results_df_pd, checked_column)
    print('Accession Num Note Metadata: ')
    PlumsExtractor.col_to_list(results_df_pd, 'prov_specialty')

results_df_pd.head(10)

In [None]:
# # Notes associated with imaging
# textQuery = f'''
# /*
# Description: Select notes associated with imaging in the radiology department.
# */

# SELECT 
#   a.patientepicid,
#   a.patientdurablekey,
#   a.deid_note_key,
#   a.deid_note_id,
#   a.procedureorderfactid,
#   a.accessionnumber,
#   a.accessionnumber2,
#   a.accessionnumber3,
#   a.encounterfactid,
#   a.encounterkey,
#   a.note_type,
#   a.proc_note_type,
#   a.ref_note_type,
#   a.ip_note_type_c,
#   a.note_type_noadd_c,
#   a.encounter_type,
#   a.enc_dept_name,
#   a.enc_dept_specialty,
#   a.employeeepicid,
#   a.providerepicid,
#   a.auth_prov_type,
#   a.prov_specialty,
#   a.deid_service_date,
#   b.note_text,
# FROM (
#   SELECT 
#     * 
#   FROM
#     note_metadata
#     WHERE
#       accessionnumber IN {tuple(accessionnumber_imageslist)}
#       AND 
#       (
#       UPPER(prov_specialty) LIKE '%RADIOLOGY%'
#       OR UPPER(enc_dept_specialty) LIKE '%RADIOLOGY%'
#       )
# ) AS a
# LEFT JOIN note_text as b ON a.deid_note_key = b.deid_note_key
# '''

# # Run query and update relevant keys
# results_df_pd = PlumsExtractor.run_query(textQuery,runtime_flag=True,df_type='pandas')

# # Save table
# PlumsFiles.save_df_to_csv(results_df_pd,PlumsFiles.get_datapath('note_text_csv').replace('.csv','_imageExists.csv'))

# #Check whether query makes sense
# if check_query_flag==True:
#     PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
#     PlumsExtractor.col_to_list(results_df_pd, 'accessionnumber')
#     print('Accession Num Note Metadata: ')
#     PlumsExtractor.col_to_list(results_df_pd, 'prov_specialty')
    
# results_df_pd[0:10]

### Referrals

In [None]:
# # Physical Therapy

# physicalTherapyQuery = f'''
# /*
# Description: Identify patients who had physical therapy.
# */

# SELECT 
#   patientdurablekey,
#   encounterkey, 
#   startdatekey,
#   startdatekeyvalue,
#   enddatekey,
#   enddatekeyvalue,
#   eventlengthinminutes,
#   type,
#   class,
#   referredtoproviderspecialty,
#   referredtodepartmentspecialty,
#   --*,
#   CASE
#     WHEN UPPER(type) LIKE '%CONSULT%' THEN 'PHYSICAL THERAPY CONSULTATION'
#     WHEN UPPER(type) LIKE '%PHYSICAL%THERAPY%' THEN 'PHYSICAL THERAPY'
#     END AS proceduretype
# FROM referraleventfact
#   WHERE patientdurablekey IN {tuple(patientdurablekey_workinglist)}
#   AND 
#   ( LOWER(referredtoproviderspecialty) LIKE '%physical%therapy%'
#     OR LOWER(type) LIKE '%physical%therapy%'
#   )
  
#   AND startdatekey > 0
#   AND LOWER(currentactiononnextappointment) LIKE '%checked%in%'

# ORDER BY
#   patientdurablekey,
#   startdatekey
# '''

# # Run query and update relevant keys
# results_df_pd = PlumsExtractor.run_query(physicalTherapyQuery,runtime_flag=True,df_type='pandas')

# # Save table
# PlumsFiles.save_df_to_parquet(results_df_pd,PlumsFiles.get_datapath('referraleventfact_parquet'))
# PlumsFiles.save_df_to_csv(results_df_pd,PlumsFiles.get_datapath('referraleventfact_csv'))

# #Check whether query makes sense
# if check_query_flag==True:
#     PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
#     PlumsExtractor.col_to_list(results_df_pd, 'encounterkey')


# '''
# MADE ON and CHECKED IN
# total query results:  9185
# total unique patientdurablekey:  574

# CHECKED IN (only)
# total query results:  4144
# total unique patientdurablekey:  514

# MADE ON (only)
# total query results:  5041
# total unique patientdurablekey:  573

# #type
# total query results:  2825
# total unique patientdurablekey:  376

# #referredtoproviderspecialty
# total query results:  3916
# total unique patientdurablekey:  602

# #type or referredtoproviderspecialty
# total query results:  5651
# total unique patientdurablekey:  773
# '''

# print(results_df_pd['proceduretype'].value_counts())

# results_df_pd[0:10]

### Refine Diagnoses

In [None]:
# Look up the ICD-10-CM terminology rows that describe low back pain, either by
# name (contains "back ... pain" plus "lumb" or "low") or by code (M51.1, M54.4, M54.9).
# NOTE(review): '%low%' is a plain substring match and would also hit names
# containing e.g. "following" or "below" — confirm the returned names.
# The query has no interpolated values, so a plain (non-f) string is used.
diagnosisQuery = '''
/*
Description: Identify diagnosis keys and names for low back pain.
*/

SELECT DISTINCT 
  diagnosiskey, 
  diagnosisname, 
  type, 
  value
FROM diagnosisterminologydim
WHERE 
  (
    (
      LOWER(diagnosisname) LIKE '%back%pain%'
      AND 
      (
      LOWER(diagnosisname) LIKE '%lumb%'
      OR LOWER(diagnosisname) LIKE '%low%'
      )
    )
    OR
    (
      value LIKE 'M51.1'
      OR value LIKE 'M54.4'
      OR value LIKE 'M54.9'
    )
  )
  AND 
  (
    type LIKE 'ICD-10-CM'
    --OR type LIKE 'ICD-9-CM'
  )
ORDER BY type, value
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    diagnosisQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Distinct diagnosis keys, in order of first appearance; used by the final diagnosis query.
lbp_keys = results_df_pd['diagnosiskey'].drop_duplicates()
LBP_key_workinglist = lbp_keys.tolist()
print('number of diagnosis keys:', len(LBP_key_workinglist))

In [None]:
# One row per (patient, diagnosis name) for the cohort; because of DISTINCT,
# counting rows per diagnosis name below counts patients, not events.
diagnosisQuery = f'''
/*
Created By: Michelle Tong
Description: Review all diagnoses for patients in cohort.
*/

SELECT DISTINCT
  patientdurablekey,
  diagnosisname
FROM 
  diagnosiseventfact
  
  WHERE
  patientdurablekey IN {tuple(patientdurablekey_workinglist)}

'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(
    diagnosisQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Optional sanity checks on patients and diagnosis names.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    print('DiagnosisEventFact -  total diagnosis names: ')
    PlumsExtractor.col_to_list(results_df_pd, 'diagnosisname')

# TOTAL DIAGNOSES: patients per diagnosis name, then expressed as a fraction of the cohort list.
n_cohort_patients = len(patientdurablekey_workinglist)
patients_per_diagnosis = results_df_pd['diagnosisname'].value_counts(sort=True)
print(len(patients_per_diagnosis))
diagnosis_prevalence = patients_per_diagnosis / n_cohort_patients

# DIAGNOSES WITH MORE THAN 5% PREVALENCE IN THE COHORT
# The threshold is a fraction (0.05 == 5%), not a percentage value.
percent_occurance_in_population = 0.05
# Equivalent number of patients who must carry a diagnosis for it to pass the threshold.
cutoff = n_cohort_patients * percent_occurance_in_population
print(cutoff)

# Diagnoses that match the criterion (strictly greater than the threshold).
is_prevalent = diagnosis_prevalence > percent_occurance_in_population
diagnosis_counts = diagnosis_prevalence[is_prevalent]
print(len(diagnosis_counts))
diagnosisname_workinglist = diagnosis_counts.index.tolist()
diagnosis_counts.iloc[:50]

In [None]:
# Diagnosis-name fragments that flag a spinal infection: infection in bone,
# intervertebral disc, vertebral endplate and vertebral body, disc and adjacent
# vertebral body, or facet joint.
exclude_diag_list = [
    'infection',
    'osteomyelitis',
    'discitis',
    'spondylitis',
    'spondylodiscitis',
    'septic facet joint',
]
# OR-joined, case-insensitive substring predicates on the aliased column a.diagnosisname.
exclude_diag_query_txt = ' OR '.join(
    f"LOWER(a.diagnosisname) LIKE '%{fragment}%'" for fragment in exclude_diag_list
)

def _sql_in_list(values):
    """Render a Python sequence as a parenthesised SQL value list for ``IN``.

    Interpolating ``tuple(values)`` relies on Python ``repr``: a string that
    contains an apostrophe is emitted in double quotes or with a backslash
    escape (not a SQL string literal), a one-element tuple gets a trailing
    comma, and an empty tuple renders ``()``, which is invalid SQL.

    Args:
        values: Iterable of strings, numbers or ``None``.

    Returns:
        ``"(v1, v2, ...)"`` with strings single-quoted and embedded quotes
        doubled; ``"(NULL)"`` for an empty input so the predicate matches no row.
    """
    def _literal(value):
        if value is None:
            return 'NULL'
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return str(value)

    rendered = ', '.join(_literal(value) for value in values)
    return f'({rendered})' if rendered else '(NULL)'


# Diagnosis events of the cohort that are either prevalent diagnoses
# (diagnosisname_workinglist) or low-back-pain keys (LBP_key_workinglist),
# joined to the terminology table and labelled with a coarse diagnosis category.
diagnosisQuery = f'''
/*
Description: Select relevant diagnoses for patient cohort.
*/

SELECT DISTINCT
  a.patientdurablekey,
  a.diagnosiskey,
  a.diagnosisname,
  a.startdatekey,
  a.departmentname, 
  a.departmentspecialty,
  a.encounterkey, -- 
  a.departmentkey, -- 
  a.enddatekey, -- 
  a.documentedbyprovidertype, -- 
  a.documentedbyproviderprimaryspecialty, -- 
  a.type,
  b.type as value_type,
  b.value,
  CASE
      WHEN LOWER(a.diagnosisname) LIKE '%back%pain%' THEN 'low back pain'
      WHEN LOWER(a.diagnosisname) LIKE '%radicul%' THEN 'radiculopathy'
      WHEN LOWER(a.diagnosisname) LIKE '%spinal stenosis%' THEN 'spinal stenosis'
      WHEN (LOWER(a.diagnosisname) LIKE '%herniated%disc%' OR LOWER(a.diagnosisname) LIKE '%disc%herniation%') THEN 'herniated disc'
      WHEN LOWER(a.diagnosisname) LIKE '%fusion%' THEN 'fusion'
      WHEN LOWER(a.diagnosisname) LIKE '%fracture%' THEN 'fracture'
      WHEN LOWER(a.diagnosisname) LIKE '%failed back%' THEN 'failed back'
      WHEN LOWER(a.diagnosisname) LIKE '%scoliosis%' THEN 'scoliosis'
      WHEN LOWER(a.diagnosisname) LIKE '%abscess%' THEN 'abscess'
      WHEN ({exclude_diag_query_txt}) THEN 'infection'
      WHEN (LOWER(a.diagnosisname) LIKE '%tumor%' OR LOWER(a.diagnosisname) LIKE '%metastasis%' OR LOWER(a.diagnosisname) LIKE '%oncology%') THEN 'cancer'
      WHEN LOWER(a.diagnosisname) LIKE '%depression%' THEN 'depression'
      WHEN LOWER(a.diagnosisname) LIKE '%anxiety%' THEN 'anxiety'
      ELSE 'category not specified'
      END AS diagnosistype
FROM (
  SELECT 
    * 
  FROM
    diagnosiseventfact
    WHERE
      patientdurablekey IN {tuple(patientdurablekey_workinglist)}
      AND 
      (
        diagnosisname IN {_sql_in_list(diagnosisname_workinglist)}
        OR
        diagnosiskey IN {_sql_in_list(LBP_key_workinglist)}
      )
      AND startdatekey > 0
) AS a
LEFT JOIN diagnosisterminologydim as b 
ON a.diagnosiskey = b.diagnosiskey
'''

# Execute the query; the result is returned as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(diagnosisQuery, runtime_flag=True, df_type='pandas')

# Save table
PlumsFiles.save_df_to_parquet(results_df_pd, PlumsFiles.get_datapath('diagnosiseventfact_parquet'))
PlumsFiles.save_df_to_csv(results_df_pd, PlumsFiles.get_datapath('diagnosiseventfact_csv'))

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'diagnosisname')

results_df_pd.head(10)

# End