In [None]:
import pandas as pd

import os
import pyrootutils
# Resolve the project root, starting the search at the current working directory
# and using a `.git` entry as the root indicator.
root = pyrootutils.setup_root(
    search_from=os.path.abspath(''),
    indicator=[".git"],
    pythonpath=True, # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True, # load environment variables from .env if exists in root directory
)

from utils.file_management.config_loader import load_yaml, process_config_values
from utils.file_management.file_manager import FileManager
from utils.query_utils.extractor import Extractor

from utils.model_utils.data_utils import convert_categorical_to_binary

In [None]:
# Read the cohort YAML and post-process its values
config_path = f"{root!s}/src/config/LBP_cohort.yaml"
config = process_config_values(load_yaml(config_path))

# File manager built from the configured file directory; used below to look up data paths by key
PlumsFiles = FileManager(config.get('file_directory'))

# Toggle for the auxiliary sanity checks that follow each query
check_query_flag = True
PlumsExtractor = Extractor(num_results_flag=True, display_results_flag=True)

# Cohort patients (one key per CSV row)
patient_keys_csv = PlumsFiles.get_datapath('patientdurablekey_csv')
patientdurablekey_list = pd.read_csv(patient_keys_csv)['patientdurablekey'].tolist()
print(len(patientdurablekey_list))

# Cohort imaging accessions (one accession number per CSV row)
accession_csv = PlumsFiles.get_datapath('accessionnumber_csv')
accessionnumber_list = pd.read_csv(accession_csv)['accessionnumber'].tolist()
print(len(accessionnumber_list))

# Data Prep

### Reference dates

In [None]:
# Reference table: each patient's earliest exam among the cohort accessions.
# Later cells reference `tmp_img_df` by name inside their SQL, so the variable
# name has to stay exactly as it is.
imagingfact_path = PlumsFiles.get_datapath('imagingfact_parquet')
cohort_accessions = tuple(accessionnumber_list)

imaging_query = f"""
/*
Description: Reference dataframe with dates of first MRI
*/

WITH min_dates AS (
    SELECT 
        patientdurablekey,
        MIN(examstartdatekey) AS first_examstartdatekey
    FROM read_parquet('{imagingfact_path}')
    WHERE accessionnumber IN {cohort_accessions}
    GROUP BY patientdurablekey
)

SELECT DISTINCT
    a.patientdurablekey, 
    a.accessionnumber,  
    a.examstartdatekey as first_examstartdatekey,
    a.examstartdatekey//10000 as first_examyear,
FROM read_parquet('{imagingfact_path}') as a
JOIN min_dates as b
ON a.patientdurablekey = b.patientdurablekey
AND a.examstartdatekey = b.first_examstartdatekey

WHERE a.accessionnumber IN {cohort_accessions}

ORDER BY
    a.examstartdatekey,
    a.patientdurablekey
"""

tmp_img_df = PlumsExtractor.run_query(imaging_query, runtime_flag=True)
tmp_img_df.head()

## Outcome labels

### Each patient has multiple outcomes (one-hot-encoding) 
Does the patient have an NSAID prescription, opioid prescription, or physical therapy? (one-to-many)

In [None]:
# Outcome labels (one-to-many): medication types ordered, plus physical-therapy
# referrals, between each patient's first exam date key and that key + 10000
# (one year later when the key is a YYYYMMDD integer — TODO confirm key format).
interventionQuery = f'''
/*
Description: Does the patient have an NSAID prescription, opioid prescription, or physical therapy? (one-to-many)
*/

SELECT DISTINCT
    a.patientdurablekey,
    a.medicationtype AS interventiontype
FROM 
  read_parquet('{PlumsFiles.get_datapath('medicationorderfact_parquet')}') as a
LEFT JOIN tmp_img_df as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {tuple(patientdurablekey_list)}
  AND a.startdatekey BETWEEN b.first_examstartdatekey AND b.first_examstartdatekey + 10000

UNION

SELECT 
  a.patientdurablekey,
  'PHYSICAL THERAPY' AS interventiontype
FROM 
  read_parquet('{PlumsFiles.get_datapath('referraleventfact_parquet')}') as a
LEFT JOIN tmp_img_df as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {tuple(patientdurablekey_list)}
  AND a.startdatekey BETWEEN b.first_examstartdatekey AND b.first_examstartdatekey + 10000
  -- keep physical-therapy referrals only; an unfiltered CASE without ELSE
  -- would emit a NULL interventiontype row for every other referral
  AND UPPER(a.proceduretype) LIKE '%PHYSICAL THERAPY%'

ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(interventionQuery,runtime_flag=True,df_type='pandas')

# Save table
PlumsFiles.save_df_to_parquet(results_df_pd,PlumsFiles.get_datapath('medicationorderfact_analysis_parquet'))
PlumsFiles.save_df_to_csv(results_df_pd,PlumsFiles.get_datapath('medicationorderfact_analysis_csv'))

# Optional sanity checks on the patient keys and the intervention types returned
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    print('MedicationOrderFact -  total medication names: ')
    PlumsExtractor.col_to_list(results_df_pd, 'interventiontype')
    
results_df_pd.head()

In [None]:
# Multi-label outcome table: one row per patient, one column per feature returned
# by convert_categorical_to_binary (presumably 0/1 indicators — verify against the helper).
count_key = 'patientdurablekey'
group_keys = ['interventiontype']
df_outcomes_multiple_categories = results_df_pd.copy()

df_binary, feature_columns = convert_categorical_to_binary(
    df_outcomes_multiple_categories,
    group_keys,
    count_key,
)

# Collapse to a single row per patient by summing each feature column
per_patient_groups = df_binary.groupby(count_key)
df_binary = per_patient_groups[feature_columns].sum().reset_index()

# Independent copy used as the saved / displayed outcome table
df_outcomes_one_hot = df_binary.copy()

# Save table (parquet and CSV)
multilabel_parquet_path = PlumsFiles.get_datapath('interventiontype_multilabel_analysis_parquet')
multilabel_csv_path = PlumsFiles.get_datapath('interventiontype_multilabel_analysis_csv')
PlumsFiles.save_df_to_parquet(df_outcomes_one_hot, multilabel_parquet_path)
PlumsFiles.save_df_to_csv(df_outcomes_one_hot, multilabel_csv_path)

df_outcomes_one_hot

### Each patient has one outcome (ordinal) 
What is the strongest class of medication prescribed? No, NSAID, or opioid prescription? (one-to-one)


In [None]:
# Ordinal outcome: map each medication order in the window after the first exam
# to a step number (1-3, else 0); the per-patient maximum is taken in the next cell.
medication_orders_path = PlumsFiles.get_datapath('medicationorderfact_parquet')
cohort_patient_keys = tuple(patientdurablekey_list)

interventionQuery = f'''
/*
Description: What is the strongest class of medication prescribed? No, NSAID, or opioid prescription? (one-to-one)
*/

SELECT DISTINCT
    a.patientdurablekey,
    CASE WHEN LOWER(a.medicationtype) LIKE '%step 1%' THEN 1
        WHEN LOWER(a.medicationtype) LIKE '%step 2%' THEN 2
        WHEN LOWER(a.medicationtype) LIKE '%step 3%' THEN 3
    ELSE 0
    END AS interventiontype
FROM 
  read_parquet('{medication_orders_path}') as a
LEFT JOIN tmp_img_df as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {cohort_patient_keys}
  AND a.startdatekey BETWEEN b.first_examstartdatekey AND b.first_examstartdatekey + 10000
  
ORDER BY
  a.patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    interventionQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Optional sanity check on the patient keys returned
if check_query_flag==True:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

results_df_pd.head()

In [None]:
# One row per patient: keep the highest interventiontype value found for that patient
df_single_outcomes = results_df_pd.groupby('patientdurablekey', as_index=False)['interventiontype'].max()

# Cohort patients with no row in the query result get interventiontype 0.
# Walk the cohort list in its own order (de-duplicated) instead of taking a set
# difference, so the rows appended to the saved table come out in the same order on every run.
patients_with_result = set(df_single_outcomes['patientdurablekey'])
missing_patients = [
    key for key in dict.fromkeys(patientdurablekey_list)
    if key not in patients_with_result
]
print(len(missing_patients))

# Append the zero-labelled rows; skipped when nobody is missing so that an empty
# frame is never concatenated (which could alter column dtypes).
if missing_patients:
    df_missing = pd.DataFrame({'patientdurablekey': missing_patients, 'interventiontype': 0})
    df_single_outcomes = pd.concat([df_single_outcomes, df_missing], ignore_index=True)

# Save table
PlumsFiles.save_df_to_parquet(df_single_outcomes,PlumsFiles.get_datapath('interventiontype_singlelabel_analysis_parquet'))
PlumsFiles.save_df_to_csv(df_single_outcomes,PlumsFiles.get_datapath('interventiontype_singlelabel_analysis_csv'))

df_single_outcomes

## Predictors

### Predictors demographics (one-to-one)

In [None]:
# Raw (uncleaned) demographics rows for the cohort patients
patdurabledim_path = PlumsFiles.get_datapath('patdurabledim_parquet')
cohort_patient_keys = tuple(patientdurablekey_list)

dataQuery = f'''
/*
Create Date: All patient demographics (pre-cleaning)
*/

SELECT DISTINCT
    *
FROM 
  read_parquet('{patdurabledim_path}')
WHERE 
  patientdurablekey IN {cohort_patient_keys}
ORDER BY
  patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    dataQuery,
    runtime_flag=True,
    df_type='pandas',
)

results_df_pd.head()

In [None]:
# Distribution of unique values within a category
# Counts each distinct `maritalstatus` value in `results_df_pd`
# (assumes `results_df_pd` still holds the raw demographics result from the preceding cell).
results_df_pd.value_counts('maritalstatus')

In [None]:
# Cleaned demographics: raw category values are collapsed into a small set of
# analysis categories, joined with the minimum ageatimaging per patient and the
# first exam year taken from tmp_img_df.
# NOTE(review): the socialsupport CASE maps an empty maritalstatus ('') to
# 'partnered' — confirm this is intended rather than 'unknown'.
patdurabledim_path = PlumsFiles.get_datapath('patdurabledim_parquet')
cohort_patient_keys = tuple(patientdurablekey_list)
allowed_sexes = ('female', 'male')

dataQuery = f'''
/*
Create Date: Patient demographics mapped to meaningful categories (cleaned)
*/

SELECT DISTINCT
    a.patientdurablekey,
    sex,
    b.ageatfirstimaging,
    c.first_examyear as yearatfirstimaging,
    CASE WHEN preferredlanguage LIKE 'English' THEN 'english'
        ELSE 'Non-english'
        END AS preferredlanguage,
    CASE WHEN ucsfderivedraceethnicity_x LIKE 'White' THEN 'white'
        WHEN ucsfderivedraceethnicity_x LIKE 'Asian' THEN 'asian'
        WHEN ucsfderivedraceethnicity_x LIKE 'Latinx' THEN 'latinx'
        WHEN ucsfderivedraceethnicity_x LIKE 'Black or African American' THEN 'black or african american'
        WHEN ucsfderivedraceethnicity_x LIKE 'Unknown/Declined' THEN 'unknown'
        ELSE 'other' -- captures other, multi-race, southwest asian, north african, pacific islander, native american
        END AS raceethnicity,
    CASE WHEN smokingstatus LIKE 'Every Day' THEN 'smoker'
        WHEN smokingstatus LIKE 'Heavy Smoker' THEN 'smoker'
        WHEN smokingstatus LIKE 'Light Smoker' THEN 'smoker'
        WHEN smokingstatus LIKE 'Some Days' THEN 'smoker'
        WHEN smokingstatus LIKE 'Smoker, Current Status Unknown' THEN 'smoker'
        WHEN smokingstatus LIKE 'Former' THEN 'former'
        WHEN smokingstatus LIKE 'Passive Smoke Exposure - Never Smoker' THEN 'never'
        WHEN smokingstatus LIKE 'Never' THEN 'never'
        ELSE 'unknown'
        END AS smokingstatus,
    CASE WHEN religion LIKE 'Unknown' THEN 'unknown'
        WHEN religion LIKE '*Unspecified' THEN 'unknown'
        WHEN religion LIKE 'none' THEN 'none'
        WHEN religion LIKE 'No Faith' THEN 'none'
        WHEN religion LIKE 'Non-Denominational' THEN 'none'
        WHEN religion LIKE 'Spiritual & Not Religious' THEN 'none'
        ELSE 'religious' 
        END AS religion,
    CASE WHEN maritalstatus LIKE 'Married' THEN 'partnered'
        WHEN maritalstatus LIKE 'Significant Other' THEN 'partnered'
        WHEN maritalstatus LIKE 'Registered Domestic Partner' THEN 'partnered'
        WHEN maritalstatus LIKE '' THEN 'partnered'
        WHEN maritalstatus LIKE 'Single' THEN 'single/separated'
        WHEN maritalstatus LIKE 'Divorced' THEN 'single/separated'
        WHEN maritalstatus LIKE 'Widowed' THEN 'single/separated'
        WHEN maritalstatus LIKE 'Legally Separated' THEN 'single/separated'
        WHEN maritalstatus LIKE 'RDP-Widowed' THEN 'single/separated' 
            -- can use Reg Dom Partner filling status for 2 years after change in status for benefits 
        WHEN maritalstatus LIKE 'RDP-Dissolved' THEN 'single/separated' 
            -- can use Reg Dom Partner filling status for 2 years after change in status for benefits 
        ELSE 'unknown'
        END AS socialsupport

FROM 
  read_parquet('{patdurabledim_path}') as a
INNER JOIN (
  SELECT DISTINCT
      patientdurablekey,
      MIN(ageatimaging) as ageatfirstimaging,
  FROM 
    read_parquet('{patdurabledim_path}')
  GROUP BY 
    patientdurablekey   
  ) as b
ON a.patientdurablekey = b.patientdurablekey
INNER JOIN(
    SELECT DISTINCT
        patientdurablekey,
        first_examyear,
    FROM 
        tmp_img_df
    ) as c
ON a.patientdurablekey = c.patientdurablekey

WHERE 
  a.patientdurablekey IN {cohort_patient_keys}
  AND LOWER(sex) IN {allowed_sexes} 


ORDER BY
  a.patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    dataQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Optional sanity check on the patient keys returned
if check_query_flag==True:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Independent copy used by the summary and plotting cells that follow
df_patient_info = results_df_pd.copy()

# Save table (parquet and CSV)
demographics_parquet_path = PlumsFiles.get_datapath('patdurabledim_analysis_parquet')
demographics_csv_path = PlumsFiles.get_datapath('patdurabledim_analysis_csv')
PlumsFiles.save_df_to_parquet(results_df_pd, demographics_parquet_path)
PlumsFiles.save_df_to_csv(results_df_pd, demographics_csv_path)

results_df_pd.head()

In [None]:
# Counts and shares of each cleaned religion category, side by side.
# A bare expression in the middle of a cell is not rendered, so both views are
# combined into the cell's final expression instead of being two separate lines.
religion_counts = df_patient_info.value_counts('religion')
pd.DataFrame({
    'count': religion_counts,
    'fraction': religion_counts / len(df_patient_info),
})

In [None]:
import matplotlib.pyplot as plt

# One histogram per numeric demographic column, each shown as its own figure
for hist_column in ('ageatfirstimaging', 'yearatfirstimaging'):
    plt.hist(df_patient_info[hist_column])
    plt.show()

### Predictors exam start date (one-to-one)

In [None]:
# Every medication type ordered for the cohort patients (no date window applied).
# NOTE(review): this saves to the same 'medicationorderfact_analysis_*' datapath keys
# as the earlier outcome-label cell — confirm the overwrite is intended.
# NOTE(review): tmp_img_df is joined but none of its columns are used in the
# active SELECT/WHERE — the join looks like a leftover; confirm.
medication_orders_path = PlumsFiles.get_datapath('medicationorderfact_parquet')
cohort_patient_keys = tuple(patientdurablekey_list)

interventionQuery = f'''
/*
Create Date: Patient interventions
*/

SELECT DISTINCT
    a.patientdurablekey,
    --a.startdatekey,
    --b.first_examstartdatekey,
    a.medicationtype AS interventiontype
FROM 
  read_parquet('{medication_orders_path}') as a
LEFT JOIN tmp_img_df as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {cohort_patient_keys}
  
ORDER BY
  a.patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    interventionQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Save table (parquet and CSV)
orders_parquet_path = PlumsFiles.get_datapath('medicationorderfact_analysis_parquet')
orders_csv_path = PlumsFiles.get_datapath('medicationorderfact_analysis_csv')
PlumsFiles.save_df_to_parquet(results_df_pd, orders_parquet_path)
PlumsFiles.save_df_to_csv(results_df_pd, orders_csv_path)

# Optional sanity checks on the patient keys and intervention types returned
if check_query_flag==True:
    for checked_column, banner in (
        ('patientdurablekey', None),
        ('interventiontype', 'MedicationOrderFact -  total medication names: '),
    ):
        if banner is not None:
            print(banner)
        PlumsExtractor.col_to_list(results_df_pd, checked_column)

results_df_pd.head()

### Predictors billing (one-to-one)

In [None]:
# Raw (uncleaned) billing-account rows for the cohort patients
billing_path = PlumsFiles.get_datapath('billingaccountfact_parquet')
cohort_patient_keys = tuple(patientdurablekey_list)

dataQuery = f'''
/*
Create Date: Patient billing information for insurance (pre-cleaned)
*/

SELECT DISTINCT
    *
FROM 
  read_parquet('{billing_path}')
WHERE 
  patientdurablekey IN {cohort_patient_keys}
ORDER BY
  patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    dataQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Optional sanity check on the patient keys returned
if check_query_flag==True:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

results_df_pd.head()

In [None]:
# Distribution of unique values within a category
# Counts each distinct `fc1_plan` value in `results_df_pd`
# (assumes `results_df_pd` still holds the raw billing result from the preceding cell).
results_df_pd.value_counts('fc1_plan')

In [None]:
# Cleaned billing information: fc1_plan is collapsed into a primary-insurance
# category (text) and a matching numeric key, one row per distinct
# (patientdurablekey, encounterkey, category) combination.
# The out-of-state Medicaid pattern ends with a wildcard so that the plan name
# matches with or without trailing whitespace; with an exact trailing-space
# literal, a value stored without that space would fall through to 'other'.
dataQuery = f'''
/*
Create Date: Patient billing information for insurance mapped to meaningful categories (cleaned)
*/

SELECT DISTINCT
    patientdurablekey,
    encounterkey,
    CASE WHEN fc1_plan LIKE 'PPO' THEN 'PPO' -- preferred provider organizations
        WHEN fc1_plan LIKE '%Medicare%' THEN 'medicare' -- older using social security
        WHEN fc1_plan LIKE 'Managed Medi-Cal' THEN 'medicaid' -- low-income
        WHEN fc1_plan LIKE 'Medi-Cal Standard' THEN 'medicaid' -- low-income
        WHEN fc1_plan LIKE 'Medicaid Out-of-State%' THEN 'medicaid' -- low-income
        WHEN fc1_plan LIKE 'HMO' THEN 'HMO/POS/EPO'
        WHEN fc1_plan LIKE 'IPA' THEN 'HMO/POS/EPO'
        WHEN fc1_plan LIKE 'POS' THEN 'HMO/POS/EPO' -- between PPO and HMO, closer to HMO
        WHEN fc1_plan LIKE 'EPO' THEN 'HMO/POS/EPO' -- between PPO and HMO, closer to HMO
        ELSE 'other' -- Worker's Comp, TriCare, GOV, Indemnity(sued), MIA
        END AS primaryinsurance,
    CASE WHEN fc1_plan LIKE 'PPO' THEN 5 -- preferred provider organizations
        WHEN fc1_plan LIKE '%Medicare%' THEN 4 -- older using social security
        WHEN fc1_plan LIKE 'Managed Medi-Cal' THEN 3 -- low-income
        WHEN fc1_plan LIKE 'Medi-Cal Standard' THEN 3 -- low-income
        WHEN fc1_plan LIKE 'Medicaid Out-of-State%' THEN 3 -- low-income
        WHEN fc1_plan LIKE 'HMO' THEN 2
        WHEN fc1_plan LIKE 'IPA' THEN 2
        WHEN fc1_plan LIKE 'POS' THEN 2 -- between PPO and HMO, closer to HMO
        WHEN fc1_plan LIKE 'EPO' THEN 2 -- between PPO and HMO, closer to HMO
        ELSE 1 -- Worker's Comp, TriCare, GOV, Indemnity(sued), MIA
        END AS primaryinsurancekey,
FROM 
  read_parquet('{PlumsFiles.get_datapath('billingaccountfact_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}
ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery,runtime_flag=True,df_type='pandas')

# Optional sanity check on the patient keys returned
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    
# Save table
PlumsFiles.save_df_to_parquet(results_df_pd,PlumsFiles.get_datapath('billingaccountfact_analysis_parquet'))
PlumsFiles.save_df_to_csv(results_df_pd,PlumsFiles.get_datapath('billingaccountfact_analysis_csv'))

results_df_pd.head()


In [None]:
# NOTE: `df_patient_info` is rebound here to a copy of the current `results_df_pd`
# (the cleaned billing table when cells run in order).
df_patient_info = results_df_pd.copy()

# Counts and shares of each primary-insurance category, side by side.
# A bare expression in the middle of a cell is not rendered, so both views are
# combined into the cell's final expression instead of being two separate lines.
insurance_counts = df_patient_info.value_counts('primaryinsurance')
pd.DataFrame({
    'count': insurance_counts,
    'fraction': insurance_counts / len(df_patient_info),
})

### Predictors derived from diagnoses (one-to-one)
Acute/Chronic LBP, Sciatica, Disc Pathology

In [None]:
# Diagnosis-derived predictors: one row per distinct combination of flags per
# patient, built from diagnosisname text patterns, restricted to diagnoses that
# start before one month after the patient's first exam.
#
# The one-month cutoff is computed on the integer date key (assumed YYYYMMDD —
# the in-query comment states the same assumption). Adding 100 moves the month
# forward by one, except in December where it would produce month 13 and thereby
# exclude every January diagnosis; December keys therefore get +8900
# (+10000 for the year, -1100 for the month) to land in January of the next year.
#
# NOTE(review): the 'bowelbladder' CASE tests '%bladder%' in both arms of its OR,
# so the second arm is redundant and no bowel term is matched — confirm the pattern.
# NOTE(review): 'lbplaterality' encodes unilateral as 1 and bilateral as 3 — confirm.
inputQuery = f'''
/*
Create Date: Patient demographics mapped to meaningful categories (cleaned)
*/

SELECT DISTINCT
  a.patientdurablekey,
  --diagnosisname,
  
  -- START DEMOGRAPHICS/PSYCHOSOCIAL
  CASE WHEN LOWER(a.diagnosisname) LIKE '%anxiety%' THEN 1
      ELSE 0
      END AS 'anxiety',
  CASE WHEN LOWER(a.diagnosisname) LIKE '%depression%' THEN 1
      ELSE 0
      END AS 'depression',
  CASE WHEN (Anxiety==1 OR Depression==1) THEN 1
    ELSE 0
    END AS 'negativepsychstate',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%obes%' AND LOWER(a.diagnosisname) NOT LIKE '%nonobes%') THEN 1
      ELSE 0
      END AS 'obesity',
  -- END DEMOGRAPHICS/PSYCHOSOCIAL
  
  -- START SYMPTOMS/DIAGNOSES
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%back%pain%' AND LOWER(a.diagnosisname) LIKE '%acute%') THEN 'acute'
      WHEN (LOWER(a.diagnosisname) LIKE '%back%pain%') AND (LOWER(a.diagnosisname) LIKE '%chronic%') 
          AND (LOWER(a.diagnosisname) NOT LIKE '%unspecif%chronic%') THEN 'chronic'
      ELSE 'unspecified'
      END AS 'lbpduration',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%sciatic%' AND LOWER(a.diagnosisname) NOT LIKE '%without%') THEN 1
      ELSE 0
      END AS 'sciatica',
  CASE WHEN ((LOWER(a.diagnosisname) LIKE '%facet%' OR LOWER(diagnosisname) LIKE '%joint%')
          AND LOWER(a.diagnosisname) LIKE '%arthropathy%') THEN 1
      ELSE 0
      END AS 'facetjointarthropathy',
  CASE WHEN LOWER(a.diagnosisname) LIKE '%scoliosis%' THEN 1
      ELSE 0
      END AS 'scoliosis',
  CASE WHEN ((LOWER(a.diagnosisname) LIKE '%disc %' 
      OR LOWER(a.diagnosisname) LIKE '%disc'
      OR LOWER(a.diagnosisname) LIKE '%disc,%'
      OR LOWER(a.diagnosisname) LIKE '%discogenic%'
      OR LOWER(a.diagnosisname) LIKE '%discectomy%'
      ) AND LOWER(a.diagnosisname) NOT LIKE '%optic%') THEN 1 --discomfort, discharge, discussion, discrepancy)
      ELSE 0
      END AS 'discpathology',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%stenosis%' AND (LOWER(a.diagnosisname) LIKE '%spin%' 
          OR LOWER(a.diagnosisname) LIKE '%lumb%' OR LOWER(a.diagnosisname) LIKE '%foramina%'
          OR LOWER(a.diagnosisname) LIKE '%thoracic%')) THEN 1
      ELSE 0
      END AS 'spinalstenosis',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%sacroiliac%' 
      OR LOWER(a.diagnosisname) LIKE '% si %'
      OR LOWER(a.diagnosisname) LIKE 'si %') THEN 1
      ELSE 0
      END AS 'sacroiliacjoint',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%diabet%' AND LOWER(a.diagnosisname) NOT LIKE '%history%'
       AND LOWER(a.diagnosisname) NOT LIKE '%screening%' AND LOWER(a.diagnosisname) NOT LIKE '%antidiabet%' 
       AND LOWER(a.diagnosisname) NOT LIKE '%pre-diabet%') THEN 1 -- diabetes, diabetic
       ELSE 0
       END AS 'diabetes',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%back%pain%' AND LOWER(a.diagnosisname) LIKE '%right%') THEN 1 -- 'unilateral'
      WHEN ((LOWER(a.diagnosisname) LIKE '%back%pain%') AND (LOWER(a.diagnosisname) LIKE '%left%')) THEN 1 -- 'unilateral' 
      WHEN ((LOWER(a.diagnosisname) LIKE '%back%pain%') AND (LOWER(a.diagnosisname) LIKE '%bilateral%')) THEN 3 --'bilateral' 
      ELSE 0 --'unspecified'
      END AS 'lbplaterality',
  CASE WHEN ((LOWER(a.diagnosisname) LIKE '%radicul%' AND LOWER(a.diagnosisname) NOT LIKE '%without%') OR
      (LOWER(a.diagnosisname) LIKE '%radiat%' AND 
      (LOWER(a.diagnosisname) LIKE '%back%pain%' OR LOWER(a.diagnosisname) LIKE '%lbp%'))) THEN 1 -- radiating, radiation
      ELSE 0
      END AS 'radiculopathy',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%numbness%' OR LOWER(a.diagnosisname) LIKE '%tingling%') THEN 1 
      ELSE 0
      END AS 'numbnesstingling',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%osteoarthritis%'
      OR LOWER(a.diagnosisname) LIKE '%osteoarthrosis%') THEN 1
      ELSE 0
      END AS 'osteoarthritisososteoarthritis',
  CASE WHEN (LOWER(a.diagnosisname) LIKE '%osteoporosis%'
      OR LOWER(a.diagnosisname) LIKE '%osteopenia%') THEN 1
      ELSE 0
      END AS 'osteopeniaosteoporosis',
  CASE WHEN (LOWER(diagnosisname) LIKE '%bladder%' OR 
      (LOWER(diagnosisname) LIKE '%bladder%' 
      AND (LOWER(diagnosisname) LIKE '%dysfunction%'
      OR LOWER(diagnosisname) LIKE '%disorder%'
      OR LOWER(diagnosisname) LIKE '%irrita%')) ) THEN 1 -- iritable, irritation
      ELSE 0
      END AS 'bowelbladder',
  CASE WHEN ((LOWER(diagnosisname))  LIKE '%fibromyalgia%' OR LOWER(diagnosisname) LIKE '%fibrosis%') THEN 1
      ELSE 0
      END AS 'fibromyalgiafibrosis'
  -- END SYMPTOMS/DIAGNOSES
FROM 
  read_parquet('{PlumsFiles.get_datapath('diagnosiseventfact_parquet')}') as a
LEFT JOIN tmp_img_df as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {tuple(patientdurablekey_list)}
  AND a.startdatekey < CASE
        WHEN (b.first_examstartdatekey // 100) % 100 = 12
            THEN b.first_examstartdatekey + 8900 -- December: roll over to January of the next year
        ELSE b.first_examstartdatekey + 100
      END
  -- adds 1 month to first_examstartdatekey assuming the format is YYYYMMDD

ORDER BY
  a.patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(inputQuery,runtime_flag=True,df_type='pandas')

# Optional sanity check on the patient keys returned
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    
# Save table
PlumsFiles.save_df_to_parquet(results_df_pd,PlumsFiles.get_datapath('diagnosiseventfact_analysis_parquet'))
PlumsFiles.save_df_to_csv(results_df_pd,PlumsFiles.get_datapath('diagnosiseventfact_analysis_csv'))

# Patients that have at least one qualifying diagnosis row. The result can hold
# several rows per patient, so duplicates are dropped before the keys are
# interpolated into the IN (...) lists of later queries (still a pandas Series).
patientdurablekey_study_list = results_df_pd['patientdurablekey'].drop_duplicates()

results_df_pd.head()

In [None]:
# Distribution of unique values within a category
# Counts rows of `results_df_pd` per `negativepsychstate` value
# (assumes `results_df_pd` still holds the diagnosis-predictor result from the preceding cell).
results_df_pd.value_counts('negativepsychstate')

In [None]:
# Collapse the diagnosis flags to one row per patient (indexed by patientdurablekey).
# lbpduration is first converted from text to an ordinal code so that the
# per-patient maximum picks the most specific duration seen for that patient.
lbp_duration_codes = {'unspecified': 0, 'acute': 1, 'chronic': 2}

df_prep = results_df_pd.copy()
# Assign the whole column in one step. Chained indexing of the form
# df[col][mask] = value may write to a temporary copy (SettingWithCopyWarning)
# and has no effect at all under pandas Copy-on-Write.
# Labels outside the mapping are left untouched.
df_prep['lbpduration'] = df_prep['lbpduration'].map(
    lambda label: lbp_duration_codes.get(label, label)
)

df_predictors = df_prep.groupby('patientdurablekey', as_index=True).max()

# Print the value distribution of every predictor column
column_names = list(df_predictors.columns)
for col in column_names:
    print(df_predictors[col].value_counts())
    
df_predictors

### Images

In [None]:
# Distinct body_part_examined values in the MRI metadata CSV for the study patients
mri_metadata_csv = config.get('mri_cfg').get('save_search_cfg',{}).get('metadata_csv',None)
study_patient_keys = tuple(patientdurablekey_study_list)

imageQuery = f'''
/*
Create Date: MRI image metadata
*/

SELECT DISTINCT
    --a.*,
    a.body_part_examined
FROM 
  read_csv('{mri_metadata_csv}') as a
LEFT JOIN tmp_img_df as b
ON a.accession_number = b.accessionnumber

WHERE 
  b.patientdurablekey IN {study_patient_keys}

'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    imageQuery,
    runtime_flag=True,
    df_type='pandas',
)

results_df_pd['body_part_examined'].tolist()

In [None]:
# Spine MRI series for the study patients: keeps rows whose body_part_examined
# contains SPINE and none of the excluded terms, and builds a subject_id of the
# form <patientdurablekey>_<accession_number>.
# NOTE(review): the NOT LIKE '%TMJ%' predicate appears twice in the WHERE clause;
# the second occurrence is redundant (or was meant to be a different term) — confirm.
mri_metadata_csv = config.get('mri_cfg').get('save_search_cfg',{}).get('metadata_csv',None)
study_patient_keys = tuple(patientdurablekey_study_list)

imageQuery = f'''
/*
Create Date: MRI image metadata 
*/

SELECT DISTINCT
    --a.*,
    b.patientdurablekey,
    a.accession_number,
    a.dcm_dirs,
    a.scan_type,
    a.body_part_examined,
    CONCAT(b.patientdurablekey, '_', a.accession_number) AS subject_id  
FROM 
  read_csv('{mri_metadata_csv}') as a
LEFT JOIN tmp_img_df as b
ON a.accession_number = b.accessionnumber

WHERE 
  b.patientdurablekey IN {study_patient_keys}
  AND UPPER(a.body_part_examined) LIKE '%SPINE%'
  AND UPPER(a.body_part_examined) NOT LIKE '%BRAIN%' -- TBD remove ORBIT
  AND UPPER(a.body_part_examined) NOT LIKE '%HEAD%'
  AND UPPER(a.body_part_examined) NOT LIKE '%PELVIS%'
  AND UPPER(a.body_part_examined) NOT LIKE '%ABDOMEN%'
  AND UPPER(a.body_part_examined) NOT LIKE '%TMJ%'
  AND UPPER(a.body_part_examined) NOT LIKE '%TSPINE%'
  AND UPPER(a.body_part_examined) NOT LIKE '%TMJ%'
  
  --AND LOWER(a.scan_type) LIKE 't1-ax'
  
ORDER BY
  --a.scan_type,
  b.patientdurablekey
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    imageQuery,
    runtime_flag=True,
    df_type='pandas',
)

# Save table (parquet and CSV)
imaging_parquet_path = PlumsFiles.get_datapath('imagingeventfact_analysis_parquet')
imaging_csv_path = PlumsFiles.get_datapath('imagingeventfact_analysis_csv')
PlumsFiles.save_df_to_parquet(results_df_pd, imaging_parquet_path)
PlumsFiles.save_df_to_csv(results_df_pd, imaging_csv_path)

# Optional sanity check on the patient keys returned
if check_query_flag==True:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Independent copy consumed by the sequence-completeness filter in a later cell
filtered_df = results_df_pd.copy()

results_df_pd.head()

In [None]:
# Visualize MR images
import pydicom as dicom
import matplotlib.pyplot as plt

# Number of rows and columns per figure
rows, cols = 4, 8
images_per_figure = rows * cols

# Figures are started only for batches that begin below this row index. The
# bound is capped by the table length so that no empty figure is created (an
# empty batch would leave the panel counter undefined or stale).
max_batch_start = min(130, len(results_df_pd))

# Slice shown for each series; clamped below for series with fewer files.
preferred_slice_index = 6

# Loop through the dataframe in batches of `images_per_figure` series
for start_idx in range(0, max_batch_start, images_per_figure):
    stop_idx = min(start_idx + images_per_figure, len(results_df_pd))

    fig, axes = plt.subplots(rows, cols, figsize=(20, 10))
    fig.suptitle(f'Images {start_idx + 1} to {stop_idx}', fontsize=16)
    axes = axes.ravel()  # Flatten the axes array for easy indexing

    # Hide ticks on every panel up front; panels without an image simply stay blank
    for ax in axes:
        ax.axis('off')

    # Process each series in the current batch (positional row access)
    for panel, row_idx in enumerate(range(start_idx, stop_idx)):
        record = results_df_pd.iloc[row_idx]
        path = record['dcm_dirs']

        # os.listdir returns entries in arbitrary order; sort so that the same
        # file is picked on every run
        files = sorted(os.listdir(path))
        if not files:
            continue  # nothing to show for an empty series directory

        # Read the DICOM image, falling back to the last file for short series
        slice_index = min(preferred_slice_index, len(files) - 1)
        ds = dicom.dcmread(os.path.join(path, files[slice_index]))

        # Plot image
        axes[panel].imshow(ds.pixel_array, cmap='gray')
        axes[panel].set_title(f"{record['body_part_examined']}{record['scan_type']}")

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    plt.close(fig)  # release the figure so repeated batches do not accumulate in memory

In [None]:
# Keep only patients whose series cover every required MRI sequence
required_scan_types = ['t1-sag', 't1-ax', 't2-sag', 't2-ax']
required_scan_set = set(required_scan_types)

# Boolean Series indexed by patientdurablekey: True when that patient's scan
# types include all required ones
valid_accessions = filtered_df.groupby('patientdurablekey')['scan_type'].apply(
    lambda scan_types: required_scan_set <= set(scan_types)
)

# Restrict the table to the patients flagged True above
complete_patient_keys = valid_accessions[valid_accessions].index
keep_row = filtered_df['patientdurablekey'].isin(complete_patient_keys)
filtered_df = filtered_df[keep_row]

df_images = filtered_df.copy()

# Number of rows that survive the filter
print(len(filtered_df))
filtered_df.head()

# Distinct patient keys present in the filtered image table
common_pdk = list(set(df_images['patientdurablekey']))

# Function to get image data based on scan type
def get_image_data(df, key_value, scan_type):
    """Return the DICOM directory for one patient and scan type.

    Args:
        df: DataFrame with 'patientdurablekey', 'scan_type' and 'dcm_dirs' columns.
        key_value: Patient key to match against 'patientdurablekey'.
        scan_type: Scan type label to match against 'scan_type'.

    Returns:
        The 'dcm_dirs' value of the last matching row, or None when no row
        matches or when a required column is absent from ``df``.
    """
    try:
        matches = df.loc[
            (df['patientdurablekey'] == key_value) &
            (df['scan_type'] == scan_type),
            'dcm_dirs'
        ]
    except KeyError:
        # A required column is missing: report "no image" instead of falling
        # through to attribute access on a placeholder value.
        return None
    if matches.empty:
        return None
    return matches.values[-1]

# Wide table: one row per patient, one column per required sequence holding the
# DICOM directory returned by get_image_data, plus the patient's first subject_id.
sequence_columns = {
    't1_sag': 't1-sag',
    't1_ax': 't1-ax',
    't2_sag': 't2-sag',
    't2_ax': 't2-ax',
}

imaging_table = {'patientdurablekey': common_pdk}
for column_label, sequence_label in sequence_columns.items():
    imaging_table[column_label] = [
        get_image_data(df_images, pdk, sequence_label) for pdk in common_pdk
    ]
imaging_table['subject_id'] = [
    df_images.loc[df_images['patientdurablekey'] == pdk, 'subject_id'].tolist()[0]
    for pdk in common_pdk
]

df_new = pd.DataFrame(imaging_table)
print(len(df_new))


# Write the table next to the other analysis outputs
save_path = config.get('analysis_data_dir') + '/analysis_imagingpaths.csv'
print(save_path)
PlumsFiles.save_df_to_csv(df_new, save_path)

df_new

## Query Development Only
Predictors derived from diagnoses

### Defining Diagnosisname Predictors

In [None]:
# Exploration: list the distinct diagnosis names that match the fibromyalgia / fibrosis patterns
diagnosis_events_path = PlumsFiles.get_datapath('diagnosiseventfact_parquet')

inputQuery = f'''
/*
Description: Exploration to define diagnosisname values in each category (e.g. ethnicity)
*/

SELECT DISTINCT
  --patientdurablekey,
  diagnosisname
FROM 
  read_parquet('{diagnosis_events_path}')
WHERE
   LOWER(diagnosisname) LIKE '%fibromyalgia%'
   OR LOWER(diagnosisname) LIKE '%fibrosis%'
  
'''

# Execute and fetch the result as a pandas DataFrame
results_df_pd = PlumsExtractor.run_query(
    inputQuery,
    runtime_flag=True,
    df_type='pandas',
)

list(set(results_df_pd['diagnosisname']))

### Disc Pathology Predictors

In [None]:
# Exploration: list the distinct diagnosis names containing "disc", to help
# define the disc-pathology patterns.
# The result is ordered by diagnosisname: with SELECT DISTINCT, an ORDER BY column
# must be part of the select list, so ordering by the unselected
# patientdurablekey is ill-defined (presumably rejected by the engine — the
# SQL looks like DuckDB; verify against Extractor).
inputQuery = f'''
/*
Description: Exploration to define diagnosisname criteria for binary categories (e.g. disc pathology)
*/

SELECT DISTINCT
  --patientdurablekey,
  diagnosisname
FROM 
  read_parquet('{PlumsFiles.get_datapath('diagnosiseventfact_parquet')}')
WHERE
   LOWER(diagnosisname) LIKE '%disc%'
   --AND LOWER(diagnosisname) NOT LIKE '%back%pain%'
   
  
ORDER BY
  diagnosisname
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(inputQuery,runtime_flag=True,df_type='pandas')

results_df_pd

In [None]:
set(results_df_pd['diagnosisname'])

# End