In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import cohen_kappa_score

import os
import pyrootutils
# Locate the project root, starting the search at the current working
# directory (os.path.abspath('') resolves to it) and using the presence of a
# ".git" entry as the marker. The local `utils` imports that follow presumably
# rely on pythonpath=True having put the root on sys.path -- TODO confirm.
root = pyrootutils.setup_root(
    search_from=os.path.abspath(''),
    indicator=[".git"],
    pythonpath=True, # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True, # load environment variables from .env if exists in root directory
)
from utils.file_management.config_loader import load_yaml, process_config_values
from utils.file_management.file_manager import FileManager


### Load Clinical Chart and Note Data (Labels Encoded Numerically)

In [None]:
# Load yaml file with dataset information
config_path = str(root) + '/config/datasets/cohort03_MriNoninvasive.yaml'
config = process_config_values(load_yaml(config_path))

# Load paths to data
PlumsFiles = FileManager(config.get('file_directory'))

# Paths to preprocessed data. The configured 'model_output_dir' entry is a
# template containing the placeholders MODEL and INDEPENDENT_VAR, which are
# substituted here with the model name and the file name respectively.
master_data_path = (
    PlumsFiles.get_datapath('model_output_dir')
    .replace('MODEL', 'classification_1class_meds')
    .replace('INDEPENDENT_VAR', 'master_data_for_analysis.csv')
)
master_encoded_data_path = (
    PlumsFiles.get_datapath('model_output_dir')
    .replace('MODEL', 'classification_1class_meds')
    .replace('INDEPENDENT_VAR', 'master_numerical_data_for_analysis.csv')
)

# Load data (the numerically encoded version of the master table)
prepped_data_df = pd.read_csv(master_encoded_data_path)

# Columns to compare: the patient key, the chart-derived (tabular) labels, and
# the note-derived `gpt_*` labels.
# NOTE: every element needs a trailing comma. Without one after 'scoliosis',
# Python silently concatenates it with 'gpt_disc' into 'scoliosisgpt_disc',
# which is not a column and makes the selection raise a KeyError.
comparison_columns = [
    'patientdurablekey',
    'discpathology', 'spinalstenosis', 'facetjointarthropathy', 'sacroiliacjoint', 'scoliosis',
    'gpt_disc', 'gpt_scs', 'gpt_fj', 'gpt_sij', 'gpt_curv',
    'gpt_frac', 'gpt_olisth', 'gpt_endplate', 'gpt_lrs', 'gpt_fs',
]
comparison_df = prepped_data_df[comparison_columns]

# Preview only the first rows rather than rendering the whole patient-level table
comparison_df.head()

### Cohen's kappa

In [None]:
# Each entry is (reported variable name, tabular/chart column, text/GPT column)
col_pairs = [('disc','discpathology','gpt_disc'),
             ('spinalstenosis','spinalstenosis','gpt_scs'),
             ('fj','facetjointarthropathy','gpt_fj'),
             ('sij','sacroiliacjoint','gpt_sij'),
             ('curvature','scoliosis','gpt_curv'),
            ]
results = []
for name, tabular_col, text_col in col_pairs:
    # Clamp both label columns into [0, 1] so any value above 1 counts as 1
    # (and any negative value as 0) before comparing the two sources.
    y1 = np.clip(comparison_df[tabular_col], 0, 1)
    y2 = np.clip(comparison_df[text_col], 0, 1)

    # Sum of the clamped labels per source; computed once and reused below
    n_tabular = np.sum(y1)
    n_text = np.sum(y2)

    # calculate Cohen's Kappa
    score = cohen_kappa_score(y1, y2)

    # The text-to-tabular ratio is undefined when the tabular source has no
    # positive labels; report NaN instead of dividing by zero.
    ratio_text2tab = np.round(n_text / n_tabular, 3) if n_tabular else np.nan

    results.append({
        'variable': name,
        'kappa': np.round(score, 3),
        'n_tabular': n_tabular,
        'n_text': n_text,
        'ratio_text2tab': ratio_text2tab,
        })
agreement_df = pd.DataFrame(results)

# Save
# The original cell left `save_path` unassigned (a bare `#TODO`), which is a
# syntax error. Default to a `data_source_agreement` folder under the project
# root, keeping the file name from the TODO.
# TODO: point this elsewhere if results belong in a different location.
save_dir = os.path.join(str(root), 'data_source_agreement')
os.makedirs(save_dir, exist_ok=True)  # to_csv does not create missing directories
save_path = os.path.join(save_dir, 'cohenkappa_tabular_and_text.csv')
agreement_df.to_csv(save_path)

agreement_df