In [12]:
import os
import pandas as pd
import json

# Function to extract entities from the JSON data
def extract_entities(json_data):
    """Flatten one admission's annotated notes into a list of entity records.

    Args:
        json_data: Mapping with an 'hadm_id' entry and a 'notes' list. Each
            note provides 'note_id' and an 'annotations' list whose items
            provide 'covered_text', 'type', 'code' and 'code_system'.

    Returns:
        A list holding one dict per annotation with the keys HADM_ID,
        Note_ID, Entity_Text, Entity_Type, Code and Code_System, ordered by
        note and then by annotation.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    admission_id = json_data['hadm_id']
    records = []
    for note in json_data['notes']:
        # Read the note id up front so it is shared by all of this note's records
        current_note_id = note['note_id']
        records.extend(
            {
                'HADM_ID': admission_id,
                'Note_ID': current_note_id,
                'Entity_Text': annotation['covered_text'],
                'Entity_Type': annotation['type'],
                'Code': annotation['code'],
                'Code_System': annotation['code_system'],
            }
            for annotation in note['annotations']
        )
    return records

In [13]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, entity_data):
    """Load one annotation JSON file and append its entities to ``entity_data``.

    Args:
        json_file_path: Path of the JSON file to read.
        entity_data: List that is extended in place with the records that
            ``extract_entities`` returns for this file.

    Returns:
        None. The result is delivered by mutating ``entity_data``.
    """
    # Decode explicitly as UTF-8. Without ``encoding`` Python uses the locale's
    # preferred encoding (for example cp1252 on Windows), which can corrupt or
    # reject non-ASCII characters in the annotated text.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    # Parsing is finished, so the file is already closed while the entities
    # are extracted and appended.
    entity_data.extend(extract_entities(json_data))


In [14]:
# Function to search for JSON files in a folder and its subfolders
def search_files(folder_path):
    """Collect the entities of every ``.json`` file under ``folder_path``.

    The folder is walked recursively. Each file whose name ends in '.json' is
    passed to ``extract_info_from_json``, and progress is printed per file.

    Args:
        folder_path: Root folder to search.

    Returns:
        A pandas DataFrame built from all collected entity records. It is an
        empty DataFrame (no rows, no columns) when no JSON file is found or
        when the files contain no annotations.
    """
    entity_data = []  # Shared accumulator filled by extract_info_from_json
    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                extract_info_from_json(json_file_path, entity_data)
                # Visual separator after each processed file
                print("=" * 50)

    entity_df = pd.DataFrame(entity_data)

    # A frame built from an empty list has no columns at all, so indexing
    # 'HADM_ID' unconditionally would raise KeyError; report zero instead.
    if 'HADM_ID' in entity_df.columns:
        unique_hadm_ids = entity_df['HADM_ID'].nunique()
    else:
        unique_hadm_ids = 0
    print("Total unique HADM_ID count:", unique_hadm_ids)

    return entity_df

In [27]:
# Root folder holding the JSON files (its subfolders are searched as well).
# Set the MEDICAL_CODING_DATA_DIR environment variable to run the notebook
# against a different copy of the data; when it is unset, the original local
# path below is used.
main_folder_path = os.environ.get(
    "MEDICAL_CODING_DATA_DIR",
    r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2",
)

# Search the folder tree and gather every annotated entity into one DataFrame
entity_df = search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique HADM_ID count: 2


In [28]:
# Preview the combined entity table as plain text (long frames are abbreviated with ".." rows)
print(entity_df)

    HADM_ID  Note_ID                    Entity_Text     Entity_Type      Code  \
0    100197    25762                            IPH  MapType.APPROX       431   
1    100197    25762     intraventricular extension  MapType.APPROX       431   
2    100197    25762                  hydrocephalus  MapType.APPROX     331.4   
3    100197    25762    intraparenchymal hemorrhage  MapType.APPROX       431   
4    100197    25762                            DNR           Human    V49.86   
..      ...      ...                            ...             ...       ...   
81   101525   566978                   WBC elevated           Human   D72.829   
82   101525  1071556  fracture of the left clavicle           Human  S42.002A   
83   101525  1071962                      neck pain           Human     M54.2   
84   101525  1071963                      neck pain           Human     M54.2   
85   101525   240524                   Sinus rhythm           Human     93000   

   Code_System  
0     ICD-

In [30]:
# Keep only the entity text with its code and code system; the admission id,
# note id and entity type columns are removed (entity_df itself is not modified)
data = entity_df.drop(columns=['HADM_ID', 'Note_ID', 'Entity_Type'])

In [31]:
# Preview the reduced table (Entity_Text, Code, Code_System) as plain text
print(data)

                      Entity_Text      Code Code_System
0                             IPH       431    ICD-9-CM
1      intraventricular extension       431    ICD-9-CM
2                   hydrocephalus     331.4    ICD-9-CM
3     intraparenchymal hemorrhage       431    ICD-9-CM
4                             DNR    V49.86    ICD-9-CM
..                            ...       ...         ...
81                   WBC elevated   D72.829   ICD-10-CM
82  fracture of the left clavicle  S42.002A   ICD-10-CM
83                      neck pain     M54.2   ICD-10-CM
84                      neck pain     M54.2   ICD-10-CM
85                   Sinus rhythm     93000         CPT

[86 rows x 3 columns]


In [33]:
# Function to split text and create new rows with B or I values

def split_text_to_rows(row):
    """Expand one entity row into one record per whitespace-separated word.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Entity_Text', 'Code' and 'Code_System'.

    Returns:
        A list of dicts with the keys Entity_Text (a single word), Tag, Code
        and Code_System. The first word of the entity is tagged 'B' and every
        following word 'I'; the code fields are copied from ``row`` unchanged.
        The list is empty when the text contains no words.
    """
    return [
        {
            'Entity_Text': token,
            'Tag': 'B' if position == 0 else 'I',
            'Code': row['Code'],
            'Code_System': row['Code_System'],
        }
        for position, token in enumerate(row['Entity_Text'].split())
    ]

# Expand every entity row into per-word B/I records, accumulated in one flat list
new_rows = []
for index, row in data.iterrows():
    new_rows += split_text_to_rows(row)

# Build the word-level table in a single step from the collected records
new_df = pd.DataFrame(new_rows)

print(new_df)

          Entity_Text Tag   Code Code_System
0                 IPH   B    431    ICD-9-CM
1    intraventricular   B    431    ICD-9-CM
2           extension   I    431    ICD-9-CM
3       hydrocephalus   B  331.4    ICD-9-CM
4    intraparenchymal   B    431    ICD-9-CM
..                ...  ..    ...         ...
217              pain   I  M54.2   ICD-10-CM
218              neck   B  M54.2   ICD-10-CM
219              pain   I  M54.2   ICD-10-CM
220             Sinus   B  93000         CPT
221            rhythm   I  93000         CPT

[222 rows x 4 columns]
