In [1]:
# This notebook processes the data to generate the small-diag ICD9 and CCS-diag labels for the diagnosis prediction subtask.
# It also lightly cleans the data: entries without ICD9 codes are removed, which reduces the dataset size.
# To generate labels for other code sets (small-proc ICD9, CCS, NDC), this notebook must be adapted accordingly.

In [17]:
# GENERATING INDEX FILE FOR SMALL DIAG/PROC ICD9
import json
import pandas as pd
import numpy as np

# Load the merged simplified-ICD JSON object. Only its keys are used below:
# the "D_" / "P_" key prefix decides which index map a key is added to.
with open("../data/extended/preprocessing/ICDandCCSmappings/merged_simplified_icd_text.json") as file:
    icdFile = json.load(file)

# Rebuild the dict in ascending key order so the positions handed out below
# are reproducible from run to run.
icdFile = dict(sorted(icdFile.items(), key=lambda item: item[0]))

diagMap = {}
procMap = {}

for code in icdFile:
    # Each map hands out consecutive positions 0, 1, 2, ... in sorted-key
    # order, so the current size of a map is always its next free position.
    if code.startswith("D_"):
        diagMap[code] = len(diagMap)
    elif code.startswith("P_"):
        procMap[code] = len(procMap)

# Final counter values (one past the last position assigned in each map).
diagIdx = len(diagMap)
procIdx = len(procMap)

#diagMap 1234 entries, procMap 100 entries
print(f"The resulting diagMap has {len(diagMap)} entries, procMap has {len(procMap)} entries")

# Persist both code -> position maps as JSON.
with open("../data/extended_folds/preprocessing/smallICDdiagMapping.json", "w") as file:
    json.dump(diagMap, file)

with open("../data/extended_folds/preprocessing/smallICDprocMapping.json", "w") as file:
    json.dump(procMap, file)
    

1234 100
The resulting diagMap has 1234 entries, procMap has 100


In [18]:
def processString(string, charsToRemove):
    """Return `string` with every occurrence of each item of `charsToRemove` deleted.

    `charsToRemove` is iterated item by item: passing a str removes each of
    its characters, passing a list of strings removes each listed substring.
    Items are removed one after another, in iteration order.
    """
    cleaned = string
    for unwanted in charsToRemove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [109]:
# GENERATING LABELS FOR SMALL DIAG ICD9
# For every cross-validation fold: read the fold CSV, drop the rows that have
# no NEXT_SMALL_DIAG_ICD9 value, attach a multi-hot label vector with one slot
# per entry of diagMap, and write the result back with the task name inserted
# into the file name.
# Requires `diagMap` (code -> position) and `processString` from earlier cells.
cvFolds = 10
numIcdCodes = len(diagMap)

for i in range(cvFolds):
    print("Fold Number {}:\n".format(i))
    filename = '/clinicalBERT/data/extended_folds/discharge_subjectsplit/fold' + str(i)
    taskname = '_codeprediction'
    # extension = '_notext.csv'
    extension = '_text.csv'
    foldDf = pd.read_csv(filename+extension).drop(["Unnamed: 0"], axis=1)
    # Keep only the rows that actually carry codes for the next admission.
    cleanFoldDf = foldDf[foldDf['NEXT_SMALL_DIAG_ICD9'].notna()].reset_index(drop=True)
    # Adding new column for label; object dtype so each cell can hold a whole list.
    cleanFoldDf['LABEL_NEXT_SMALL_DIAG_ICD9'] = np.nan
    cleanFoldDf['LABEL_NEXT_SMALL_DIAG_ICD9'] = cleanFoldDf['LABEL_NEXT_SMALL_DIAG_ICD9'].astype(object)

    for index in cleanFoldDf.index:
        # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
        # selects the same dtype and works on every NumPy version.
        labels = np.zeros(numIcdCodes, dtype=int)
        # Strip brackets, quotes and spaces, then split on commas
        # (presumably the cell holds a stringified list of codes - TODO confirm).
        rawCodes = cleanFoldDf.loc[index, 'NEXT_SMALL_DIAG_ICD9']
        icd9Codes = processString(rawCodes, charsToRemove = "[]\' ").split(',')
        for code in icd9Codes:
            # print(code, diagMap[code])
            # A code that is missing from diagMap raises KeyError here.
            labelIndex = diagMap[code]
            labels[labelIndex] = 1
        # tolist() yields plain Python ints, so the CSV cell is written as
        # "[0, 1, ...]" instead of "[np.int64(0), ...]" under NumPy >= 2.
        cleanFoldDf.at[index, 'LABEL_NEXT_SMALL_DIAG_ICD9'] = labels.tolist()
    # The index is written as well; readers of this file drop it as "Unnamed: 0".
    cleanFoldDf.to_csv(filename+taskname+extension)
    

In [23]:
# GENERATING INDEX FILE FOR CCS DIAG/PROC
import json
import pandas as pd
import numpy as np

# Load the merged CCS JSON object. Only its keys are used below: the
# "D_" / "P_" key prefix decides which index map a key is added to.
with open("../data/extended/preprocessing/ICDandCCSmappings/merged_ccs_text.json") as file:
    ccsFile = json.load(file)

# Rebuild the dict in ascending key order so the positions are reproducible.
ccsFile = dict(sorted(ccsFile.items(), key=lambda item: item[0]))

# Number the "D_" keys and the "P_" keys independently, each starting at 0 and
# following the sorted key order.
# NOTE: this rebinds diagMap / procMap, replacing any maps built by an earlier cell.
diagMap = {code: position
           for position, code in enumerate(key for key in ccsFile if key.startswith("D_"))}
procMap = {code: position
           for position, code in enumerate(key for key in ccsFile if key.startswith("P_"))}

# Final counter values (one past the last position assigned in each map).
diagIdx = len(diagMap)
procIdx = len(procMap)

print(f"The resulting diagMap has {len(diagMap)} entries, procMap has {len(procMap)} entries")

# Persist both code -> position maps as JSON.
with open("../data/extended_folds/preprocessing/CCSdiagMapping.json", "w") as file:
    json.dump(diagMap, file)

with open("../data/extended_folds/preprocessing/CCSprocMapping.json", "w") as file:
    json.dump(procMap, file)
    
    

The resulting diagMap has 255 entries, procMap has 229 entries


In [28]:
# GENERATING LABELS FOR CCS DIAG
# For every cross-validation fold: read the fold CSV, drop the rows that have
# no NEXT_DIAG_CCS value, attach a multi-hot label vector with one slot per
# entry of diagMap, and write the result to the destination path.
# Requires `diagMap` (code -> position) and `processString` from earlier cells.
cvFolds = 10
numCCSCodes = len(diagMap)

for i in range(cvFolds):
    print("Fold Number {}:\n".format(i))
    origin_filename = '/clinicalBERT/data/extended_folds/fold' + str(i) +'_codeprediction'
    # NOTE(review): destination_filename already ends in '_notext.csv' and
    # `extension` is appended to it again when writing, so the output is named
    # e.g. 'fold0_notext.csv_text.csv'. Left as-is because the intended output
    # name cannot be determined from this notebook - confirm and fix.
    destination_filename = '/clinicalBERT/data/extended_folds/discharge_subjectsplit/fold' + str(i) + '_notext.csv'
    # Assigned explicitly: this cell used to rely on `extension` still being
    # defined by the ICD9 label cell, which breaks when run on its own.
    extension = '_text.csv'
    foldDf = pd.read_csv(origin_filename+extension).drop(["Unnamed: 0"], axis=1)
    # Keep only the rows that actually carry codes for the next admission.
    cleanFoldDf = foldDf[foldDf['NEXT_DIAG_CCS'].notna()].reset_index(drop=True)
    # Adding new column for label; object dtype so each cell can hold a whole list.
    cleanFoldDf['LABEL_NEXT_DIAG_CCS'] = np.nan
    cleanFoldDf['LABEL_NEXT_DIAG_CCS'] = cleanFoldDf['LABEL_NEXT_DIAG_CCS'].astype(object)

    for index in cleanFoldDf.index:
        # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
        # selects the same dtype and works on every NumPy version.
        labels = np.zeros(numCCSCodes, dtype=int)
        # Strip brackets, quotes and spaces, then split on commas
        # (presumably the cell holds a stringified list of codes - TODO confirm).
        rawCodes = cleanFoldDf.loc[index, 'NEXT_DIAG_CCS']
        ccsCodes = processString(rawCodes, charsToRemove = "[]\' ").split(',')
        for code in ccsCodes:
            # print(code, diagMap[code])
            # A code that is missing from diagMap raises KeyError here.
            labelIndex = diagMap[code]
            labels[labelIndex] = 1
        # tolist() yields plain Python ints, so the CSV cell is written as
        # "[0, 1, ...]" instead of "[np.int64(0), ...]" under NumPy >= 2.
        cleanFoldDf.at[index, 'LABEL_NEXT_DIAG_CCS'] = labels.tolist()
    # The index is written as well; readers of this file drop it as "Unnamed: 0".
    cleanFoldDf.to_csv(destination_filename+extension)

Fold Number 0:

Fold Number 1:

Fold Number 2:

Fold Number 3:

Fold Number 4:

Fold Number 5:

Fold Number 6:

Fold Number 7:

Fold Number 8:

Fold Number 9:



In [17]:
import pandas as pd
import numpy as np
import torch

# Scratch check of the per-admission score aggregation: several per-note score
# vectors that share a HADM_ID are combined into one vector via
#     (max + sum / scalingFactor) / (1 + numNotes / scalingFactor)
# applied element-wise.
scalingFactor = 2
foldDf = pd.read_csv('fold0_codeprediction_text.csv')#.drop(["Unnamed: 0", "ADMITTIME"], axis=1)
# Take an independent copy of the first rows: adding a column to a slice of
# foldDf would raise SettingWithCopyWarning and may not behave as intended.
subDf = foldDf[["SUBJECT_ID","HADM_ID","Label"]].iloc[0:6, :].copy()

# One random score vector (5 integers drawn from {0, 1, 2}) per selected row.
# Sized from the frame itself so a file with fewer than 6 rows still works.
fake_scores = [np.random.randint(low=0, high=3, size=5) for _ in range(len(subDf))]

subDf["fake_scores"] = fake_scores
print(subDf)
# Per admission: number of notes, element-wise max and element-wise sum of the
# score vectors. The vectors are returned as lists from the aggregation.
newDf = subDf.groupby(['HADM_ID']).agg(
    numNotes=('HADM_ID', lambda x: len(x)),
    maxScore=('fake_scores', lambda x: np.vstack(x).max(axis=0).tolist()),
    sumScore=('fake_scores', lambda x: np.vstack(x).sum(axis=0).tolist()))

# Convert the lists back to arrays so the arithmetic below is element-wise.
newDf['maxScore'] = newDf['maxScore'].apply(np.array)
newDf['sumScore'] = newDf['sumScore'].apply(np.array)

yPredScores = (newDf['maxScore'] + newDf['sumScore'] / scalingFactor) / (1 + newDf['numNotes'] / scalingFactor)
        # temp = (df_sort.groupby(['HADM_ID'])['pred_score'].agg(max) + df_sort.groupby(['HADM_ID'])['pred_score'].agg(sum)/
        #         scaling_factor)/(1+df_sort.groupby(['HADM_ID'])['pred_score'].agg(len)/scaling_factor)
print(newDf)
print(yPredScores)

# Intermediate terms of the formula, printed for manual verification.
print(newDf['sumScore'] / scalingFactor)
print(newDf['maxScore'] + newDf['sumScore'] / scalingFactor)
print(1 + newDf['numNotes'] / scalingFactor)


   SUBJECT_ID   HADM_ID  Label      fake_scores
0       177.0  196896.0    0.0  [1, 2, 1, 1, 1]
1       198.0  131286.0    0.0  [0, 2, 1, 0, 0]
2       198.0  131286.0    0.0  [0, 2, 2, 2, 0]
3       198.0  131286.0    0.0  [2, 1, 0, 2, 1]
4       198.0  131286.0    0.0  [1, 1, 1, 2, 2]
5       199.0  185360.0    0.0  [1, 1, 1, 0, 0]
          numNotes         maxScore         sumScore
HADM_ID                                             
131286.0       4.0  [2, 2, 2, 2, 2]  [3, 6, 4, 6, 3]
185360.0       1.0  [1, 1, 1, 0, 0]  [1, 1, 1, 0, 0]
196896.0       1.0  [1, 2, 1, 1, 1]  [1, 2, 1, 1, 1]
HADM_ID
131286.0    [1.1666666666666667, 1.6666666666666667, 1.333...
185360.0                            [1.0, 1.0, 1.0, 0.0, 0.0]
196896.0                            [1.0, 2.0, 1.0, 1.0, 1.0]
dtype: object
HADM_ID
131286.0    [1.5, 3.0, 2.0, 3.0, 1.5]
185360.0    [0.5, 0.5, 0.5, 0.0, 0.0]
196896.0    [0.5, 1.0, 0.5, 0.5, 0.5]
Name: sumScore, dtype: object
HADM_ID
131286.0    [3.5, 5.0, 4.0, 5.0

In [16]:
from torchmetrics.functional import precision_recall
# Toy multilabel example: three samples with four binary labels each.
preds  = [np.array(row) for row in ([0, 0, 1, 1], [1, 1, 0, 1], [0, 1, 1, 1])]
target = [np.array(row) for row in ([0, 0, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0])]

# print(precision_recall(preds, target, average='macro', mdmc_average='global', num_classes=4))
# print(precision_recall(preds, target, average='micro', mdmc_average='global', num_classes=4))

from sklearn.metrics import precision_recall_fscore_support

# Macro first, then micro.
for averaging in ('macro', 'micro'):
    print(precision_recall_fscore_support(target, preds, average=averaging))

# #Precision tp/(tp+fp)
# #Recall tp/(tp+fn)
# # 'micro':
# # Calculate metrics globally by counting the total true positives, false negatives and false positives.

# # 'macro':
# # Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

# Worked micro example for the arrays above: tp - 5, fp - 3, fn - 1
# p = 5/(5+3) = 0.625
# r = 5/(5+1)

# Threshold every prediction vector at 1 to obtain boolean masks.
for entry in preds:
    print(entry.ravel() >= 1)



(0.6666666666666666, 0.875, 0.7416666666666667, None)
(0.625, 0.8333333333333334, 0.7142857142857143, None)
