In [1]:
import os
import pandas as pd
import json

def extract_entities(json_data):
    """Flatten the annotations of one parsed JSON document into row dicts.

    Args:
        json_data: Mapping with a ``'hadm_id'`` entry and a ``'notes'``
            iterable. Each note must provide ``'note_id'`` and an
            ``'annotations'`` iterable whose items carry ``'covered_text'``
            and ``'type'``.

    Returns:
        list[dict]: One dict per annotation, in document order, with the keys
        ``'HADM_ID'``, ``'Note_ID'``, ``'Entity_Text'`` and ``'Entity_Type'``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    admission_id = json_data['hadm_id']
    rows = []
    for note in json_data['notes']:
        # Read the note id once per note, before touching its annotations.
        current_note_id = note['note_id']
        rows.extend(
            {
                'HADM_ID': admission_id,
                'Note_ID': current_note_id,
                'Entity_Text': ann['covered_text'],
                'Entity_Type': ann['type'],
            }
            for ann in note['annotations']
        )
    return rows

In [2]:
def extract_info_from_json(json_file_path, entity_data):
    """Parse one JSON file and append its entity records to ``entity_data``.

    Args:
        json_file_path: Path of the JSON file to read.
        entity_data: List that is extended in place with the row dicts
            produced by ``extract_entities`` for this file.

    Returns:
        None. The result is delivered by mutating ``entity_data``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding). Without it,
    # open() uses the locale encoding, e.g. a legacy code page on Windows,
    # which can garble or reject non-ASCII characters in the text.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    # The file handle is no longer needed once the document is parsed.
    entity_data.extend(extract_entities(json_data))


In [7]:
def search_files(folder_path):
    """Collect entity records from every ``.json`` file under ``folder_path``.

    The folder is walked recursively with ``os.walk``. Each file whose name
    ends in ``.json`` is passed to ``extract_info_from_json``, and the
    accumulated records are returned as a DataFrame. Progress and the number
    of distinct ``HADM_ID`` values are printed to stdout.

    Args:
        folder_path: Root folder to search.

    Returns:
        pandas.DataFrame: Columns ``HADM_ID``, ``Note_ID``, ``Entity_Text``
        and ``Entity_Type``, one row per extracted entity. The frame is empty
        (but still has these columns) when no JSON file is found.
    """
    columns = ['HADM_ID', 'Note_ID', 'Entity_Text', 'Entity_Type']
    entity_data = []  # Row dicts accumulated across all files

    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                extract_info_from_json(json_file_path, entity_data)
                # Visual separator after each processed file
                print("=" * 50)

    # Passing the columns explicitly keeps the schema stable when nothing was
    # found; otherwise the empty frame has no 'HADM_ID' column and the lookup
    # below raises KeyError.
    entity_df = pd.DataFrame(entity_data, columns=columns)
    print("Total unique HADM_ID count:", entity_df['HADM_ID'].nunique())

    return entity_df

In [10]:
# Root folder containing the JSON files (searched recursively).
# The default is a machine-specific absolute path; set the
# MEDICAL_CODING_DATA_DIR environment variable to run the notebook against a
# different location without editing this cell.
main_folder_path = os.environ.get(
    "MEDICAL_CODING_DATA_DIR",
    r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2",
)

# Walk the folder tree and build the entity DataFrame
entity_df = search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique HADM_ID count: 2


In [11]:
# Print the plain-text representation of the DataFrame returned by search_files
print(entity_df)

    HADM_ID  Note_ID                    Entity_Text     Entity_Type
0    100197    25762                            IPH  MapType.APPROX
1    100197    25762     intraventricular extension  MapType.APPROX
2    100197    25762                  hydrocephalus  MapType.APPROX
3    100197    25762    intraparenchymal hemorrhage  MapType.APPROX
4    100197    25762                            DNR           Human
..      ...      ...                            ...             ...
81   101525   566978                   WBC elevated           Human
82   101525  1071556  fracture of the left clavicle           Human
83   101525  1071962                      neck pain           Human
84   101525  1071963                      neck pain           Human
85   101525   240524                   Sinus rhythm           Human

[86 rows x 4 columns]
