In [156]:
import os
import pandas as pd
import json

# Function to extract entities from the JSON data
def extract_entities(json_data):
    """Flatten one admission's JSON document into per-annotation records.

    Args:
        json_data: Mapping with an 'hadm_id' entry and a 'notes' list. Each
            note provides 'note_id' and an 'annotations' list whose items
            provide 'covered_text', 'type', 'code' and 'code_system'.

    Returns:
        A list of dicts with the keys HADM_ID, Note_ID, Entity_Text,
        Entity_Type, Code and Code_System -- one dict per annotation, in the
        order the notes and annotations appear in the document.

    Raises:
        KeyError: If any of the keys named above is absent.
    """
    admission_id = json_data['hadm_id']
    records = []
    for note in json_data['notes']:
        # Read the note id before touching the annotations so a malformed note
        # fails on the same key regardless of how many annotations it has.
        note_id = note['note_id']
        records.extend(
            {
                'HADM_ID': admission_id,
                'Note_ID': note_id,
                'Entity_Text': ann['covered_text'],
                'Entity_Type': ann['type'],
                'Code': ann['code'],
                'Code_System': ann['code_system'],
            }
            for ann in note['annotations']
        )
    return records

In [157]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, entity_data):
    """Load one annotation JSON file and append its entity records to a list.

    Args:
        json_file_path: Path of the JSON file to read.
        entity_data: List that is extended in place with the records that
            ``extract_entities`` produces for this file.

    Returns:
        None. The result is delivered through ``entity_data``.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file parses identically on every machine.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    # The document is fully parsed at this point, so the file is already closed
    # while the (purely in-memory) extraction runs.
    entity_data.extend(extract_entities(json_data))


In [158]:
# Function to search for JSON files in a folder and its subfolders
def search_files(folder_path):
    """Collect the entities of every ``.json`` file below ``folder_path``.

    The folder is walked recursively. Each file whose name ends in '.json' is
    passed to ``extract_info_from_json`` and its records are gathered into a
    single DataFrame. Progress and the number of distinct HADM_ID values are
    printed.

    Args:
        folder_path: Root folder to scan.

    Returns:
        A DataFrame with the columns HADM_ID, Note_ID, Entity_Text,
        Entity_Type, Code and Code_System. When no JSON file is found the
        frame is empty but still has these columns.
    """
    entity_data = []
    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                extract_info_from_json(json_file_path, entity_data)
                # Visual separator after each processed file
                print("=" * 50)

    # Name the columns explicitly: without this an empty scan produces a frame
    # with no columns at all and the HADM_ID lookup below raises KeyError.
    entity_df = pd.DataFrame(
        entity_data,
        columns=['HADM_ID', 'Note_ID', 'Entity_Text', 'Entity_Type', 'Code', 'Code_System'],
    )
    print("Total unique HADM_ID count:", entity_df['HADM_ID'].nunique())

    return entity_df

In [159]:
# Root folder that is scanned recursively for annotation JSON files.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where this exact folder exists.
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Walk the folder tree and gather every annotation into one DataFrame.
entity_df = search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique HADM_ID count: 2


In [160]:
# Plain-text preview of the extracted entity table.
print(entity_df)

    HADM_ID  Note_ID                    Entity_Text     Entity_Type      Code  \
0    100197    25762                            IPH  MapType.APPROX       431   
1    100197    25762     intraventricular extension  MapType.APPROX       431   
2    100197    25762                  hydrocephalus  MapType.APPROX     331.4   
3    100197    25762    intraparenchymal hemorrhage  MapType.APPROX       431   
4    100197    25762                            DNR           Human    V49.86   
..      ...      ...                            ...             ...       ...   
81   101525   566978                   WBC elevated           Human   D72.829   
82   101525  1071556  fracture of the left clavicle           Human  S42.002A   
83   101525  1071962                      neck pain           Human     M54.2   
84   101525  1071963                      neck pain           Human     M54.2   
85   101525   240524                   Sinus rhythm           Human     93000   

   Code_System  
0     ICD-

In [161]:
# Keep only the entity text and its code columns for the tagging step.
data = entity_df.drop(columns=['HADM_ID', 'Note_ID', 'Entity_Type'])

In [162]:
# Plain-text preview of the reduced table (Entity_Text, Code, Code_System).
print(data)

                      Entity_Text      Code Code_System
0                             IPH       431    ICD-9-CM
1      intraventricular extension       431    ICD-9-CM
2                   hydrocephalus     331.4    ICD-9-CM
3     intraparenchymal hemorrhage       431    ICD-9-CM
4                             DNR    V49.86    ICD-9-CM
..                            ...       ...         ...
81                   WBC elevated   D72.829   ICD-10-CM
82  fracture of the left clavicle  S42.002A   ICD-10-CM
83                      neck pain     M54.2   ICD-10-CM
84                      neck pain     M54.2   ICD-10-CM
85                   Sinus rhythm     93000         CPT

[86 rows x 3 columns]


In [163]:
# Function to split text and create new rows with B or I values

def split_text_to_rows(row):
    """Split an entity's text on whitespace into one tagged record per token.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Entity_Text', 'Code' and 'Code_System'.

    Returns:
        A list of dicts with the keys Entity_Text, Tag, Code and Code_System.
        The first token is tagged 'B', every following token 'I'. The code
        and code system of ``row`` are copied onto each record. Text without
        any token yields an empty list.
    """
    tokens = row['Entity_Text'].split()
    return [
        {
            'Entity_Text': token,
            # Position 0 begins the entity; all later positions are inside it.
            'Tag': 'I' if position else 'B',
            'Code': row['Code'],
            'Code_System': row['Code_System'],
        }
        for position, token in enumerate(tokens)
    ]

# Expand every entity row into its per-token records and gather them in one list
new_rows = []
for index, row in data.iterrows():
    new_rows += split_text_to_rows(row)

# Build the token-level frame in a single step from the collected records
new_df = pd.DataFrame(new_rows)

print(new_df)

          Entity_Text Tag   Code Code_System
0                 IPH   B    431    ICD-9-CM
1    intraventricular   B    431    ICD-9-CM
2           extension   I    431    ICD-9-CM
3       hydrocephalus   B  331.4    ICD-9-CM
4    intraparenchymal   B    431    ICD-9-CM
..                ...  ..    ...         ...
217              pain   I  M54.2   ICD-10-CM
218              neck   B  M54.2   ICD-10-CM
219              pain   I  M54.2   ICD-10-CM
220             Sinus   B  93000         CPT
221            rhythm   I  93000         CPT

[222 rows x 4 columns]


In [164]:
# Lift pandas' row and column display limits for this block only, so the whole
# token-level frame is printed; the previous options are restored on exit.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(new_df)

          Entity_Text Tag      Code Code_System
0                 IPH   B       431    ICD-9-CM
1    intraventricular   B       431    ICD-9-CM
2           extension   I       431    ICD-9-CM
3       hydrocephalus   B     331.4    ICD-9-CM
4    intraparenchymal   B       431    ICD-9-CM
5          hemorrhage   I       431    ICD-9-CM
6                 DNR   B    V49.86    ICD-9-CM
7            DIABETES   B    250.00    ICD-9-CM
8                TYPE   I    250.00    ICD-9-CM
9                  II   I    250.00    ICD-9-CM
10     HYPERLIPIDEMIA   B     272.4    ICD-9-CM
11           GLAUCOMA   B     365.9    ICD-9-CM
12     OSTEOARTHRITIS   B    715.90    ICD-9-CM
13            CAROTID   B    433.10    ICD-9-CM
14           STENOSIS   I    433.10    ICD-9-CM
15               left   I    433.10    ICD-9-CM
16            60-69%,   I    433.10    ICD-9-CM
17                 rt   I    433.10    ICD-9-CM
18                 50   I    433.10    ICD-9-CM
19          extubated   B     96.71   IC

##################################################################################################################

##################################################################################################################

In [185]:
# Function to extract information from a JSON file
def extract_info_from_json(json_file_path):
    """Read one annotation JSON file and return one row per annotation.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        A list of 9-item lists ordered as
        [hadm_id, category, description, begin, end, code, code_system,
        covered_text, text]. ``hadm_id`` is taken from the document root,
        ``category`` and ``text`` from the note, and every other field
        (including ``description``) from the annotation. Keys that are absent
        yield ``None``. Notes without annotations contribute no rows, and a
        missing or null 'notes' / 'annotations' entry is treated as empty.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file parses identically on every machine.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Document-level value: identical for every row, so read it once.
    hadm_id = data.get("hadm_id")

    rows = []
    for note in data.get("notes") or []:
        # Note-level values are shared by all annotations of this note.
        category = note.get("category")
        text = note.get("text")

        for annotation in note.get("annotations") or []:
            rows.append([
                hadm_id,
                category,
                # The annotation's own description; the note-level one was
                # never used in the output.
                annotation.get("description"),
                annotation.get("begin"),
                annotation.get("end"),
                annotation.get("code"),
                annotation.get("code_system"),
                annotation.get("covered_text"),
                text,
            ])

    return rows

In [186]:
# Function to search for JSON files in a folder and its subfolders
def search_files(folder_path):
    """Gather the annotation rows of every ``.json`` file below ``folder_path``.

    The folder is walked recursively. Each file whose name ends in '.json' is
    announced on stdout and passed to ``extract_info_from_json``; the rows
    returned for all files are combined into one DataFrame.

    Args:
        folder_path: Root folder to scan.

    Returns:
        A DataFrame with the columns HADM_ID, Category, Description, Begin,
        End, Code, Code_System, Covered_Text and Text.
    """
    column_names = ['HADM_ID', 'Category', 'Description', 'Begin', 'End',
                    'Code', 'Code_System', 'Covered_Text', 'Text']
    collected = []

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            # Anything that is not a JSON file is ignored
            if not name.endswith('.json'):
                continue
            json_file_path = os.path.join(current_dir, name)
            print("Processing:", json_file_path)
            collected.extend(extract_info_from_json(json_file_path))

    return pd.DataFrame(collected, columns=column_names)

In [187]:
# Scan the data folder (recursively) and build one row per annotation.
# NOTE(review): absolute, machine-specific Windows path; the cell only runs
# where this exact folder exists.
folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"
df = search_files(folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json


In [188]:
# Lift pandas' row and column display limits for this block only, so the whole
# annotation frame is printed; the previous options are restored on exit.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(df)

    HADM_ID           Category  \
0    100197  Discharge summary   
1    100197  Discharge summary   
2    100197  Discharge summary   
3    100197  Discharge summary   
4    100197  Discharge summary   
5    100197  Discharge summary   
6    100197  Discharge summary   
7    100197  Discharge summary   
8    100197  Discharge summary   
9    100197  Discharge summary   
10   100197  Discharge summary   
11   101525        Respiratory   
12   101525          Radiology   
13   101525          Radiology   
14   101525          Radiology   
15   101525  Discharge summary   
16   101525  Discharge summary   
17   101525  Discharge summary   
18   101525  Discharge summary   
19   101525  Discharge summary   
20   101525  Discharge summary   
21   101525  Discharge summary   
22   101525  Discharge summary   
23   101525  Discharge summary   
24   101525  Discharge summary   
25   101525  Discharge summary   
26   101525  Discharge summary   
27   101525          Physician   
28   101525   

In [189]:
# Keep only HADM_ID, Code, Code_System and Text. This rebinds `df`, so running
# the cell a second time raises KeyError because the columns are already gone.
df = df.drop(columns=['Category', 'Description', 'Begin', 'End', 'Covered_Text'])

In [190]:
# Bare expression: the notebook shows the reduced frame as the cell's output.
df

Unnamed: 0,HADM_ID,Code,Code_System,Text
0,100197,431,ICD-9-CM,Admission Date: [**2136-10-23**] ...
1,100197,431,ICD-9-CM,Admission Date: [**2136-10-23**] ...
2,100197,331.4,ICD-9-CM,Admission Date: [**2136-10-23**] ...
3,100197,431,ICD-9-CM,Admission Date: [**2136-10-23**] ...
4,100197,V49.86,ICD-9-CM,Admission Date: [**2136-10-23**] ...
...,...,...,...,...
81,101525,D72.829,ICD-10-CM,"TSICU\n HPI:\n 41yo M p/w severe HA, N/V. ..."
82,101525,S42.002A,ICD-10-CM,[**2134-3-21**] 8:16 PM\n HUMERUS (AP & LAT) L...
83,101525,M54.2,ICD-10-CM,[**2134-3-23**] 6:05 PM\n CT C-SPINE W/O CONTR...
84,101525,M54.2,ICD-10-CM,"[**Last Name (LF) **],[**First Name3 (LF) 1046..."


In [191]:
# Split each note's text on whitespace and give every token its own row; the
# other columns (HADM_ID, Code, Code_System) are repeated on each token row.
df = df.assign(Text=df['Text'].str.split()).explode('Text').reset_index(drop=True)

# NOTE: after the line above 'Text' holds a single token per row, not the full
# note. Uncomment the next line to drop that token column as well.
# df = df.drop(['Text'], axis=1)


       HADM_ID   Code Code_System              Text
0       100197    431    ICD-9-CM         Admission
1       100197    431    ICD-9-CM             Date:
2       100197    431    ICD-9-CM  [**2136-10-23**]
3       100197    431    ICD-9-CM         Discharge
4       100197    431    ICD-9-CM             Date:
...        ...    ...         ...               ...
45938   101525  93000         CPT          previous
45939   101525  93000         CPT           tracing
45940   101525  93000         CPT         available
45941   101525  93000         CPT               for
45942   101525  93000         CPT       comparison.

[45943 rows x 4 columns]


In [192]:
# Lift pandas' row and column display limits for this block only, so every
# token row is printed; the previous options are restored on exit.
# NOTE(review): at this point `df` has one row per token, so this prints the
# entire frame.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(df)

       HADM_ID      Code Code_System  \
0       100197       431    ICD-9-CM   
1       100197       431    ICD-9-CM   
2       100197       431    ICD-9-CM   
3       100197       431    ICD-9-CM   
4       100197       431    ICD-9-CM   
5       100197       431    ICD-9-CM   
6       100197       431    ICD-9-CM   
7       100197       431    ICD-9-CM   
8       100197       431    ICD-9-CM   
9       100197       431    ICD-9-CM   
10      100197       431    ICD-9-CM   
11      100197       431    ICD-9-CM   
12      100197       431    ICD-9-CM   
13      100197       431    ICD-9-CM   
14      100197       431    ICD-9-CM   
15      100197       431    ICD-9-CM   
16      100197       431    ICD-9-CM   
17      100197       431    ICD-9-CM   
18      100197       431    ICD-9-CM   
19      100197       431    ICD-9-CM   
20      100197       431    ICD-9-CM   
21      100197       431    ICD-9-CM   
22      100197       431    ICD-9-CM   
23      100197       431    ICD-9-CM   
