In [None]:
import pandas as pd

import os
import pyrootutils
root = pyrootutils.setup_root(
    search_from=os.path.abspath(''),
    indicator=[".git"],
    pythonpath=True, # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True, # load environment variables from .env if exists in root directory
)

from utils.file_management.config_loader import load_yaml, process_config_values
from utils.file_management.file_manager import FileManager 
from utils.query_utils.extractor import Extractor

from tableone import TableOne
import matplotlib.pyplot as plt

In [None]:
# Cohort configuration: parse the YAML file, then post-process its values.
config_path = str(root) + '/config/LBP_cohort.yaml'
raw_config = load_yaml(config_path)
config = process_config_values(raw_config)

# File manager built from the configured 'file_directory' entry.
PlumsFiles = FileManager(config.get('file_directory'))

# When True, the auxiliary sanity checks after each query are executed.
check_query_flag = True
PlumsExtractor = Extractor(num_results_flag=True, display_results_flag=True)

# Patient identifiers of the cohort, as a plain Python list.
patient_key_table = pd.read_csv(PlumsFiles.get_datapath('patientdurablekey_csv'))
patientdurablekey_list = list(patient_key_table['patientdurablekey'])
print(len(patientdurablekey_list))

# Imaging accession numbers of the cohort, as a plain Python list.
accession_table = pd.read_csv(PlumsFiles.get_datapath('accessionnumber_csv'))
accessionnumber_list = list(accession_table['accessionnumber'])
print(len(accessionnumber_list))

# Data Loading

## Categorical Data

### Load Predictors, with labels

In [None]:
# Select the distinct demographic attributes of the cohort patients from the
# imputed patient-dimension parquet file.
dataQuery = f'''
/*
Description: Patient demographics
*/

SELECT DISTINCT
    patientdurablekey,
    ageatfirstimaging,
    yearatfirstimaging,
    sex,
    preferredlanguage,
    raceethnicity,
    smokingstatus,
    --religion,
    socialsupport,
    primaryinsurance
FROM 
  read_parquet('{PlumsFiles.get_datapath('patdurabledim_analysis_imputed_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}

ORDER BY
  patientdurablekey
'''

# Execute the query; the result comes back as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity check on the returned patient keys.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Independent copy of the demographics, used by the later predictor join.
df_predictors1 = results_df_pd.copy()

results_df_pd.head()

In [None]:
# Predictor Labels: diagnosis rows for the cohort patients.
dataQuery = f'''
/*
Description: Patient diagnoses
*/

SELECT DISTINCT
    *
FROM 
  read_parquet('{PlumsFiles.get_datapath('diagnosiseventfact_analysis_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}
        
ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Work on a copy so the raw query result stays untouched.
df_predictors3 = results_df_pd.copy()

# Recode the lbpduration labels to integers so that the per-patient max below
# ranks chronic > acute > unspecified. The column is reassigned as a whole:
# chained indexing (df[col][mask] = value) warns on pandas 2.x and does not
# modify the frame at all under Copy-on-Write. Values outside the mapping are
# left unchanged.
lbpduration_codes = {'unspecified': 0, 'acute': 1, 'chronic': 2}
df_predictors3['lbpduration'] = df_predictors3['lbpduration'].map(
    lambda label: lbpduration_codes.get(label, label)
)

# Collapse to one row per patient, keeping the column-wise maximum.
df_predictors3 = df_predictors3.groupby('patientdurablekey', as_index=True).max().reset_index()

results_df_pd.head()

In [None]:
# Note data
# TODO: point this at your project's cohort note-text CSV before running.
# (The placeholder used to sit behind a `#`, which left the assignment without
# a value and made the whole cell a SyntaxError.)
note_filepath = '/path_to_your_project/cohort_note_text.csv'

dataQuery = f'''
/*
Description: Patient clinical charts
*/

SELECT DISTINCT
    *,
FROM 
  read_csv('{note_filepath}')

ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Each feature group is a column-name prefix; every column starting with that
# prefix belongs to the group.
feature_groups = ['gpt_endplate', 'gpt_disc', 'gpt_scs', 'gpt_fj', 'gpt_lrs', 'gpt_fs', 'gpt_sij', 'gpt_olisth', 'gpt_curv', 'gpt_frac']

# Row-wise maximum across the columns of each feature group.
max_values = {
    group: results_df_pd[results_df_pd.filter(regex=f'^{group}').columns].max(axis=1)
    for group in feature_groups
}

# One column per feature group, then one row per patient (max over the
# patient's rows).
df_predictors4 = pd.DataFrame(max_values)
df_predictors4['patientdurablekey'] = results_df_pd['patientdurablekey']
df_predictors4 = df_predictors4.groupby('patientdurablekey').max().reset_index()
# Missing feature values become 0 (assignment instead of inplace=True).
df_predictors4 = df_predictors4.fillna(0)

df_predictors4.head()

In [None]:
# Cohort patients that have no row in df_predictors4.
present_keys = set(df_predictors4['patientdurablekey'])
missing_patients = list(set(patientdurablekey_list) - present_keys)
print(len(missing_patients))

# One placeholder row per missing patient: every non-key column is set to -1.
placeholder_columns = df_predictors4.columns.drop('patientdurablekey')
missing_data = {'patientdurablekey': missing_patients}
missing_data.update({column: -1 for column in placeholder_columns})
df_missing = pd.DataFrame(missing_data)

# Stack the placeholder rows under the existing ones with a fresh index.
df_predictors4 = pd.concat([df_predictors4, df_missing], ignore_index=True)

df_predictors4

### Combine Predictors, with labels

In [None]:
# Join the three predictor frames on patientdurablekey.
dataQuery = f'''
/*
Description: Combine predictors from each relational table into one df
*/

SELECT DISTINCT
    *
FROM 
  (SELECT DISTINCT
  a.*
  FROM df_predictors1 as a
  ) as d
INNER JOIN df_predictors3 as c
ON d.patientdurablekey = c.patientdurablekey
INNER JOIN df_predictors4 as e
ON d.patientdurablekey = e.patientdurablekey
        
ORDER BY
  c.patientdurablekey
'''

# Execute the join; the result comes back as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity check on the returned patient keys.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Remove the duplicated join-key columns and the anxiety/depression columns.
redundant_columns = ['patientdurablekey_1', 'patientdurablekey_2', 'anxiety', 'depression']
df_predictors = results_df_pd.drop(redundant_columns, axis=1)
df_predictors.head()

### Load Outcomes, with labels

In [None]:
# Outcome labels: attach a text intervention label to every predictor row.
# Codes 2 and 3 both map to 'opioids'; anything other than 1-3 is 'unspecified'.
dataQuery = f'''
/*
Description: Patient medications
*/

SELECT DISTINCT
    CASE WHEN a.interventiontype=1 THEN 'nsaids'
      WHEN a.interventiontype=2 THEN 'opioids'
      WHEN a.interventiontype=3 THEN 'opioids'
    ELSE 'unspecified'
    END AS interventiontype,
    b.*
FROM 
  read_parquet('{PlumsFiles.get_datapath('interventiontype_singlelabel_analysis_parquet')}') as a
INNER JOIN df_predictors as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {tuple(patientdurablekey_list)}
        
ORDER BY
  a.patientdurablekey
'''

# Execute the query; the result comes back as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity checks: patient keys and the label distribution.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    print(results_df_pd.value_counts('interventiontype'))

results_df_pd.head()

### Summarize Data Distributions

In [None]:
# Keep only rows whose gpt_disc is not the -1 placeholder, index by patient,
# and drop every row that still has a missing value.
has_note_features = results_df_pd['gpt_disc'] != -1
df_data_with_notes = results_df_pd[has_note_features].set_index('patientdurablekey').dropna()

# Every column that is not listed as continuous is summarised as categorical.
continuous = ['ageatfirstimaging', 'yearatfirstimaging']
categorical = [name for name in df_data_with_notes.columns if name not in continuous]
df_summary = TableOne(
    df_data_with_notes,
    groupby='interventiontype',
    categorical=categorical,
    continuous=continuous,
    pval=True,
)

# Fill the MODEL / INDEPENDENT_VAR placeholders of the configured output path.
output_template = PlumsFiles.get_datapath('model_output_dir')
save_path = output_template.replace('MODEL','classification_1class_meds').replace('INDEPENDENT_VAR','master_data_for_analysis_with_report_tableone_stats.csv')
df_summary.to_csv(save_path, index=True)
df_summary

In [None]:
# Master table: index by patient and drop every row with a missing value.
df_data = results_df_pd.set_index('patientdurablekey').dropna()

# Fill the MODEL / INDEPENDENT_VAR placeholders of the configured output path
# and write the master table to CSV.
output_template = PlumsFiles.get_datapath('model_output_dir')
save_path = output_template.replace('MODEL','classification_1class_meds').replace('INDEPENDENT_VAR','master_data_for_analysis_revision1.csv')
df_data.to_csv(save_path, index=True)

# Summary table grouped by intervention type; every column that is not listed
# as continuous is summarised as categorical.
continuous = ['ageatfirstimaging', 'yearatfirstimaging']
categorical = [name for name in df_data.columns if name not in continuous]
df_summary = TableOne(
    df_data,
    groupby='interventiontype',
    categorical=categorical,
    continuous=continuous,
    pval=True,
)
df_summary

In [None]:
# Histograms over the full master table.
hist_figsize = [23, 15]
df_data.hist(figsize=hist_figsize)
plt.tight_layout()
plt.show()

In [None]:
# Histograms restricted to rows labelled 'nsaids'.
is_nsaids = df_data['interventiontype'] == 'nsaids'
df_data[is_nsaids].hist(figsize=[23, 15])
plt.tight_layout()
plt.show()

In [None]:
# Histograms restricted to rows labelled 'opioids'.
is_opioids = df_data['interventiontype'] == 'opioids'
df_data[is_opioids].hist(figsize=[23, 15])
plt.tight_layout()
plt.show()

## Data Mapped from Categorical to Numerical

### Load Predictors, Categorical --> Numerical

In [None]:
# Predictor Labels
#
# Two encodings of the demographics are defined below. Previously both were
# assigned to `dataQuery`, so the one-hot variant was overwritten before it
# could run. They now have distinct names and the variant to execute is chosen
# explicitly; the ordinal variant is still the one that runs.

# Variant 1: raceethnicity expanded into one 0/1 column per category;
# primaryinsurance is selected unchanged.
demographics_onehot_query = f'''
/*
Description: Patient demographics with raceethnicity One Hot Encoded 
*/

SELECT DISTINCT
    patientdurablekey,
    ageatfirstimaging,
    yearatfirstimaging,
    CASE WHEN sex LIKE 'Male' THEN 1
        ELSE 0
        END AS sex,
    CASE WHEN preferredlanguage LIKE 'english' THEN 1
        ELSE 0
        END AS preferredlanguage,
    CASE WHEN raceethnicity LIKE 'white' THEN 1
        ELSE 0
        END AS race_white,
    CASE WHEN raceethnicity LIKE 'asian' THEN 1
        ELSE 0
        END AS race_asian,
    CASE WHEN raceethnicity LIKE 'latinx' THEN 1
        ELSE 0
        END AS race_latinx,
    CASE WHEN raceethnicity LIKE 'black or african american' THEN 1
        ELSE 0
        END AS race_blackorafricanamerican,
    CASE WHEN raceethnicity LIKE 'unknown' THEN 1
        ELSE 0
        END AS race_unknown,
    CASE WHEN raceethnicity LIKE 'other' THEN 1
        ELSE 0
        END AS race_other,
    CASE WHEN smokingstatus LIKE 'smoker' THEN 2
        WHEN smokingstatus LIKE 'former' THEN 1
        WHEN smokingstatus LIKE 'never' THEN 0
        ELSE -1
        END AS smokingstatus,
    --CASE WHEN religion LIKE 'unknown' THEN -1
    --    WHEN religion LIKE 'none' THEN 0
    --    ELSE 1
    --    END AS religion,
    CASE WHEN socialsupport LIKE 'partnered' THEN 1
        WHEN socialsupport LIKE 'single/separated' THEN 0
        ELSE -1
        END AS socialsupport,
    primaryinsurance
FROM 
  read_parquet('{PlumsFiles.get_datapath('patdurabledim_analysis_imputed_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}

ORDER BY
  patientdurablekey
'''

# Variant 2: raceethnicity and primaryinsurance mapped to integer codes.
# NOTE(review): raceethnicity code 5 is unused ('other' maps to 6) and
# 'unknown' shares -1 with the ELSE branch -- confirm this is intended.
demographics_ordinal_query = f'''
/*
Description: Patient demographics with raceethnicity ordinally encoded
*/

SELECT DISTINCT
    patientdurablekey,
    ageatfirstimaging,
    yearatfirstimaging,
    CASE WHEN sex LIKE 'Male' THEN 1
        ELSE 0
        END AS sex,
    CASE WHEN preferredlanguage LIKE 'english' THEN 1
        ELSE 0
        END AS preferredlanguage,
    CASE WHEN raceethnicity LIKE 'white' THEN 1
        WHEN raceethnicity LIKE 'asian' THEN 2
        WHEN raceethnicity LIKE 'latinx' THEN 3
        WHEN raceethnicity LIKE 'black or african american' THEN 4
        WHEN raceethnicity LIKE 'unknown' THEN -1
        WHEN raceethnicity LIKE 'other' THEN 6
        ELSE -1
        END AS raceethnicity,
    CASE WHEN smokingstatus LIKE 'smoker' THEN 2
        WHEN smokingstatus LIKE 'former' THEN 1
        WHEN smokingstatus LIKE 'never' THEN 0
        ELSE -1
        END AS smokingstatus,
    --CASE WHEN religion LIKE 'unknown' THEN -1
    --    WHEN religion LIKE 'none' THEN 0
    --    ELSE 1
    --    END AS religion,
    CASE WHEN socialsupport LIKE 'partnered' THEN 1
        WHEN socialsupport LIKE 'single/separated' THEN 0
        ELSE -1
        END AS socialsupport,
    CASE WHEN primaryinsurance LIKE 'PPO' THEN 4
        WHEN primaryinsurance LIKE 'HMO/POS/EPO' THEN 3
        WHEN primaryinsurance LIKE 'medicare' THEN 2
        WHEN primaryinsurance LIKE 'medicaid' THEN 1
        WHEN primaryinsurance LIKE 'other' THEN 0
        ELSE -1
        END AS primaryinsurance
FROM 
  read_parquet('{PlumsFiles.get_datapath('patdurabledim_analysis_imputed_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}

ORDER BY
  patientdurablekey
'''

# Select the encoding to run (swap in demographics_onehot_query to use the
# one-hot variant instead).
dataQuery = demographics_ordinal_query

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Independent copy of the encoded demographics, used by the later join.
df_predictors1 = results_df_pd.copy()

results_df_pd.head()

In [None]:
# Predictor Labels: diagnosis rows for the cohort patients.
dataQuery = f'''
/*
Description: Patient diagnoses
*/

SELECT DISTINCT
    *
FROM 
  read_parquet('{PlumsFiles.get_datapath('diagnosiseventfact_analysis_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}
        
ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Work on a copy so the raw query result stays untouched.
df_predictors3 = results_df_pd.copy()

# Recode the lbpduration labels to integers so that the per-patient max below
# ranks chronic > acute > unspecified. The column is reassigned as a whole:
# chained indexing (df[col][mask] = value) warns on pandas 2.x and does not
# modify the frame at all under Copy-on-Write. Values outside the mapping are
# left unchanged.
lbpduration_codes = {'unspecified': 0, 'acute': 1, 'chronic': 2}
df_predictors3['lbpduration'] = df_predictors3['lbpduration'].map(
    lambda label: lbpduration_codes.get(label, label)
)

# Collapse to one row per patient, keeping the column-wise maximum.
df_predictors3 = df_predictors3.groupby('patientdurablekey', as_index=True).max().reset_index()

results_df_pd.head()

In [None]:
# Note data
# TODO: point this at your project's cohort note-text CSV before running.
# (The placeholder used to sit behind a `#`, which left the assignment without
# a value and made the whole cell a SyntaxError.)
note_filepath = '/path_to_your_project/cohort_note_text.csv'

dataQuery = f'''
/*
Description: Patient clinical charts
*/

SELECT DISTINCT
    *,
FROM 
  read_csv('{note_filepath}')

ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Check whether query makes sense
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Each feature group is a column-name prefix; every column starting with that
# prefix belongs to the group.
feature_groups = ['gpt_endplate', 'gpt_disc', 'gpt_scs', 'gpt_fj', 'gpt_lrs', 'gpt_fs', 'gpt_sij', 'gpt_olisth', 'gpt_curv', 'gpt_frac']

# Row-wise maximum across the columns of each feature group.
max_values = {
    group: results_df_pd[results_df_pd.filter(regex=f'^{group}').columns].max(axis=1)
    for group in feature_groups
}

# One column per feature group, then one row per patient (max over the
# patient's rows).
df_predictors4 = pd.DataFrame(max_values)
df_predictors4['patientdurablekey'] = results_df_pd['patientdurablekey']
df_predictors4 = df_predictors4.groupby('patientdurablekey').max().reset_index()
# Missing feature values become 0 (assignment instead of inplace=True).
df_predictors4 = df_predictors4.fillna(0)

df_predictors4.head()

In [None]:
# Cohort patients that have no row in df_predictors4.
present_keys = set(df_predictors4['patientdurablekey'])
missing_patients = list(set(patientdurablekey_list) - present_keys)
print(len(missing_patients))

# One placeholder row per missing patient: every non-key column is set to 0.
placeholder_columns = df_predictors4.columns.drop('patientdurablekey')
missing_data = {'patientdurablekey': missing_patients}
missing_data.update({column: 0 for column in placeholder_columns})
df_missing = pd.DataFrame(missing_data)

# Stack the placeholder rows under the existing ones with a fresh index.
df_predictors4 = pd.concat([df_predictors4, df_missing], ignore_index=True)
df_predictors4

### Combine Predictors, Categorical --> Numerical

In [None]:
# Join the three numerically encoded predictor frames on patientdurablekey.
dataQuery = f'''
/*
Description: For analysis
*/

SELECT DISTINCT
    *
FROM 
  (SELECT DISTINCT
  a.*,
  --b.primaryinsurance
  FROM df_predictors1 as a
  --INNER JOIN df_predictors2 as b
  --  ON a.patientdurablekey = b.patientdurablekey
  ) as d
INNER JOIN df_predictors3 as c
ON d.patientdurablekey = c.patientdurablekey
INNER JOIN df_predictors4 as e
ON d.patientdurablekey = e.patientdurablekey
        
ORDER BY
  c.patientdurablekey
'''

# Execute the join; the result comes back as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity check on the returned patient keys.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Remove the duplicated join-key columns and the anxiety/depression columns.
redundant_columns = ['patientdurablekey_1', 'patientdurablekey_2', 'anxiety', 'depression']
df_predictors = results_df_pd.drop(redundant_columns, axis=1)
df_predictors.head()

In [None]:
# Sanity check: print the value frequencies of every column after the first.
column_names = df_predictors.columns[1:].tolist()
print(column_names)
for name in column_names:
    counts = df_predictors[name].value_counts()
    print(counts)

### Load Outcomes, Categorical --> Numerical

In [None]:
# Outcome labels: attach an integer intervention label to every predictor row.
# Source codes 2 and 3 both map to 2; anything other than 1-3 maps to 0.
dataQuery = f'''
/*
Description: Patient medications
*/

SELECT DISTINCT
    CASE WHEN a.interventiontype=1 THEN 1
      WHEN a.interventiontype=2 THEN 2
      WHEN a.interventiontype=3 THEN 2
    ELSE 0
    END AS interventiontype,
    b.*
FROM 
  read_parquet('{PlumsFiles.get_datapath('interventiontype_singlelabel_analysis_parquet')}') as a
INNER JOIN df_predictors as b
ON a.patientdurablekey = b.patientdurablekey
WHERE 
  a.patientdurablekey IN {tuple(patientdurablekey_list)}
        
ORDER BY
  a.patientdurablekey
'''

# Execute the query; the result comes back as a pandas DataFrame.
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity checks: patient keys and the label distribution.
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')
    print(results_df_pd.value_counts('interventiontype'))

results_df_pd.head()

### Summarize Data Distributions

In [None]:
# Numerical master table: index by patient and drop every row with a missing
# value.
df_data = results_df_pd.set_index('patientdurablekey').dropna()

# Fill the MODEL / INDEPENDENT_VAR placeholders of the configured output path
# and write the numerical master table to CSV.
output_template = PlumsFiles.get_datapath('model_output_dir')
save_path = output_template.replace('MODEL','classification_1class_meds').replace('INDEPENDENT_VAR','master_numerical_data_for_analysis_revision1.csv')
df_data.to_csv(save_path, index=True)

# Summary table grouped by intervention type; every column that is not listed
# as continuous is summarised as categorical.
continuous = ['ageatfirstimaging', 'yearatfirstimaging']
categorical = [name for name in df_data.columns if name not in continuous]
df_summary = TableOne(
    df_data,
    groupby='interventiontype',
    categorical=categorical,
    continuous=continuous,
    pval=True,
)
df_summary

In [None]:
# Plain-text rendering of the TableOne summary built in the previous cell.
print(df_summary)

In [None]:
# Column names, non-null counts and dtypes of the numerical master table.
df_data.info()