In [34]:
import os
import json
import stanza
# Setup: fetch the default English model package, then build the pipeline
# object (`nlp`) that the NLP cells further down call on the collected text.
# NOTE(review): the log output shows Pipeline() checking resources.json again
# on its own; per that log message `download_method=None` turns this off —
# confirm against the stanza docs before changing.
stanza.download('en') # download English model
nlp = stanza.Pipeline('en') # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.59MB/s]                    
2024-03-05 19:48:22 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-05 19:48:22 INFO: Downloading default packages for language: en (English) ...
2024-03-05 19:48:24 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\default.zip
2024-03-05 19:48:28 INFO: Finished downloading models and saved to C:\Users\okechukwu chude\stanza_resources
2024-03-05 19:48:28 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.54MB/s]                    
2024-03-05 19:48:29 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json

In [35]:
# Global list holding the extracted note texts. The NLP cells further down
# join its entries with newlines and pass the result to `nlp`.
extracted_texts = []

In [36]:
def extract_info_from_json(json_file_path, hadm_id_set):
    """Read one annotated-notes JSON file and return its notes as plain dicts.

    The file must contain a JSON object. Its ``hadm_id`` is recorded in
    ``hadm_id_set`` and every entry of its ``notes`` list is copied into a
    dict restricted to the fields used downstream.

    Args:
        json_file_path: Path of the JSON file to read.
        hadm_id_set: Set that receives the file's ``hadm_id`` (mutated in place).

    Returns:
        A list with one dict per note, with keys ``note_id``, ``category``,
        ``description``, ``annotations`` (a list of dicts with keys ``begin``,
        ``end``, ``code``, ``code_system``, ``description``, ``type``,
        ``covered_text``) and ``text``. Keys absent from the file map to
        ``None``. Returns ``None`` (after printing a warning) when the file
        has no ``hadm_id``; in that case ``hadm_id_set`` is left untouched.
    """
    note_fields = ('note_id', 'category', 'description')
    annotation_fields = (
        'begin', 'end', 'code', 'code_system',
        'description', 'type', 'covered_text',
    )

    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the locale's encoding, which can garble or reject
    # non-ASCII note text on platforms whose default is not UTF-8.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # The file is closed here; everything below works on the parsed data only.

    hadm_id = data.get('hadm_id', None)
    if hadm_id is None:
        print(f"Warning: 'hadm_id' not found in {json_file_path}")
        return None

    hadm_id_set.add(hadm_id)

    extracted_info = []
    # `or []` also covers an explicit JSON null for 'notes' / 'annotations'.
    for note in data.get('notes') or []:
        note_info = {field: note.get(field) for field in note_fields}
        note_info['annotations'] = [
            {field: annotation.get(field) for field in annotation_fields}
            for annotation in note.get('annotations') or []
        ]
        note_info['text'] = note.get('text')
        extracted_info.append(note_info)

    return extracted_info

In [37]:
def search_files(folder_path):
    """Recursively process every ``.json`` file under ``folder_path``.

    Each file is passed to ``extract_info_from_json``. The ``text`` of every
    note it returns is appended to the module-level ``extracted_texts`` list,
    so the NLP cells have input to work on. Notes without text are skipped.
    Texts accumulate across calls: reset ``extracted_texts`` before calling
    again if a fresh collection is wanted.

    Args:
        folder_path: Root directory to search; subfolders are included.

    Returns:
        None. The path of each processed file, a separator line per file and
        the final count of unique ``hadm_id`` values are printed.
    """
    # Unique hadm_id values seen across all files
    hadm_id_set = set()

    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.json'):
                continue

            json_file_path = os.path.join(root, filename)
            print("Processing:", json_file_path)

            notes = extract_info_from_json(json_file_path, hadm_id_set)
            # `notes` is None for a file without 'hadm_id' (the warning has
            # already been printed); treat that as "no notes".
            for note in notes or []:
                text = note.get('text')
                if text:
                    extracted_texts.append(text)

            # Separator after each processed file
            print("=" * 50)

    print("Total unique hadm_id count:", len(hadm_id_set))


In [38]:
# Root folder containing the JSON files; its subfolders are searched too.
# NOTE(review): this is an absolute, machine-specific Windows path — point it
# at the local copy of the data before running the notebook elsewhere.
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold"

# Walk the folder tree and process every .json file found in it
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\101173-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\101525-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\102181-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpatient\ICD-10\1.0\102242-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\with_text\gold\Inpa

In [39]:
# Run the stanza pipeline over all collected note texts (joined with
# newlines) and print every named entity it finds.
# The guard keeps an empty string from being sent through the pipeline when
# nothing has been collected.
if extracted_texts:
    doc = nlp('\n'.join(extracted_texts))
    for sentence in doc.sentences:
        for entity in sentence.ents:
            print(f'Text: {entity.text}\tEntity type: {entity.type}')
else:
    print("No extracted texts to analyse.")

In [40]:
# Print only the first sentence.
# `doc` is the document the previous cell parsed from the same
# `extracted_texts`, so it is reused here instead of running the whole
# pipeline over every note a second time.
# The `doc.sentences` check avoids an IndexError when no sentence was produced.
if extracted_texts and doc.sentences:
    print(f"First Sentence: {doc.sentences[0].text}")