In [10]:
import os
import json
import stanza
# Fetch the English Stanza models, then build the neural pipeline object
# that the tokenization code in this notebook calls as `nlp`.
stanza.download(lang="en")
nlp = stanza.Pipeline(lang="en")

In [15]:
def tokenize_text(text):
    """Return the text of every word the `nlp` pipeline finds in ``text``.

    Words are collected sentence by sentence, in the order the pipeline
    reports them.

    Args:
        text: The string handed to the `nlp` pipeline.

    Returns:
        A list with one string (``word.text``) per word.
    """
    parsed = nlp(text)
    collected = []
    for sentence in parsed.sentences:
        for word in sentence.words:
            collected.append(word.text)
    return collected

In [16]:
def extract_info_from_json(json_file_path, hadm_id_set):
    """Print the notes and annotations found in one JSON file.

    The file's 'hadm_id' is added to ``hadm_id_set``. Then, for every entry
    under 'notes', the tokenized note text, the note metadata ('note_id',
    'category', 'description') and the fields of each annotation are printed,
    followed by a dashed separator line.

    Args:
        json_file_path: Path of the JSON file to read.
        hadm_id_set: Set collecting the 'hadm_id' values seen so far; it is
            mutated in place.

    Returns:
        None. When the file has no 'hadm_id', a warning is printed, the set
        is left untouched and no notes are processed.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, and keep the file open only while it is being parsed.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    hadm_id = data.get('hadm_id', None)
    if hadm_id is None:
        print(f"Warning: 'hadm_id' not found in {json_file_path}")
        return None

    hadm_id_set.add(hadm_id)

    # Annotation keys, in the order they are printed.
    annotation_fields = ('begin', 'end', 'code', 'code_system',
                         'description', 'type', 'covered_text')

    # `or []` also covers a 'notes' key whose value is null.
    for note in data.get('notes') or []:
        # The recorded run of this notebook printed `Tokens: []` for a note
        # that carries annotations, so the note body is evidently not stored
        # under 'covered_text'. Fall back to 'text' -- presumably the key the
        # "with_text" files use for the note body; TODO confirm against data.
        note_text = note.get('covered_text') or note.get('text') or ''
        tokens = tokenize_text(note_text)
        print("Tokens:", tokens)

        # Note-level metadata, with the admission id repeated for reference.
        print("hadm_id:", hadm_id)
        print("note_id:", note.get('note_id', 'N/A'))
        print("category:", note.get('category', 'N/A'))
        print("description:", note.get('description', 'N/A'))

        for annotation in note.get('annotations') or []:
            for field in annotation_fields:
                print(f"{field}:", annotation.get(field, 'N/A'))

        # Separator after each note.
        print("-" * 50)


In [17]:
def main(folder_path):
    """Process every '.json' file found under ``folder_path``.

    The folder is walked recursively. Each matching file is announced,
    passed to ``extract_info_from_json`` together with a shared set of
    'hadm_id' values, and followed by a separator line. The size of that
    set is printed once the walk is finished.

    Args:
        folder_path: Root folder to search.
    """
    seen_hadm_ids = set()

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            # Only JSON files are of interest.
            if not name.endswith('.json'):
                continue

            path = os.path.join(current_dir, name)
            print("Processing:", path)
            extract_info_from_json(path, seen_hadm_ids)
            # Separator after each processed file.
            print("=" * 50)

    print("Total unique hadm_id count:", len(seen_hadm_ids))

In [18]:
if __name__ == "__main__":
    # Root folder that main() walks recursively looking for '.json' files.
    # NOTE(review): this is a machine-specific absolute path, so the cell
    # only runs as-is where this folder exists.
    folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold"
    # Process every JSON file under the folder and print the results.
    main(folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json
Tokens: []
hadm_id: 100197
note_id: 25762
category: Discharge summary
description: Report
begin: 374
end: 377
code: I61.8
code_system: ICD-10-CM
description: Other nontraumatic intracerebral hemorrhage
type: Human
covered_text: IPH
begin: 383
end: 409
code: I61.5
code_system: ICD-10-CM
description: Nontraumatic intracerebral hemorrhage, intraventricular
type: Human
covered_text: intraventricular extension
begin: 430
end: 443
code: G91.9
code_system: ICD-10-CM
description: Hydrocephalus, unspecified
type: Human
covered_text: hydrocephalus
begin: 835
end: 862
code: I61.8
code_system: ICD-10-CM
description: Other nontraumatic intracerebral hemorrhage
type: Human
covered_text: intraparenchymal hemorrhage
begin: 1577
end: 1580
code: Z66
code_system: ICD-10-CM
description: Do not resuscitate
type: Human
covered_text: DNR
begin: 1840
end: 1856
cod

In [None]:
# NOTE(review): in the cells visible here, `tokens` is only ever assigned as a
# local variable inside functions, so this line would raise NameError on a
# fresh kernel unless `tokens` is defined at notebook scope somewhere not
# shown -- TODO confirm.
print(tokens)