In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
pip install stanza



In [11]:
import os
import json
import stanza
stanza.download('en') # download English model
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner') # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [12]:
# Global variable to store all extracted texts
extracted_texts = []

In [13]:
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc


In [14]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

            extracted_texts.append(note_info)  # Append note information to the global variable

In [15]:
def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))


In [16]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = "/content/drive/MyDrive/#medical coding/with_text/gold"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/187213-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/196522-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/193800-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/162131-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/142289-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/128511-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/142321-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/190860-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/ICD-10/1.0/192435-ICD-10.json
Processing: /content/drive/MyDrive/#medical coding/with_text/gold/Profee/

In [17]:
# We can access named entities from 'processed_text' field of each note
for note_info in extracted_texts:
    if 'processed_text' in note_info:
        processed_text = note_info['processed_text']
        for sentence in processed_text.sentences:
            for entity in sentence.ents:
                print(f"Entity: {entity.text}\tType: {entity.type}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Entity: 2009	Type: DATE
Entity: 88 year old	Type: DATE
Entity: ICU	Type: ORG
Entity: GI	Type: ORG
Entity: SC	Type: ORG
Entity: 8	Type: CARDINAL
Entity: 3	Type: CARDINAL
Entity: 10	Type: CARDINAL
Entity: 10	Type: CARDINAL
Entity: 8	Type: CARDINAL
Entity: 141/46	Type: CARDINAL
Entity: 62	Type: CARDINAL
Entity: 18 98%	Type: PERCENT
Entity: 1	Type: CARDINAL
Entity: 1	Type: CARDINAL
Entity: 120s-170s	Type: QUANTITY
Entity: 2nd	Type: ORDINAL
Entity: ICU	Type: ORG
Entity: 30 years ago	Type: DATE
Entity: many years ago	Type: DATE
Entity: 68	Type: CARDINAL
Entity: 24	Type: CARDINAL
Entity: 99%	Type: PERCENT
Entity: 25	Type: CARDINAL
Entity: 190	Type: CARDINAL
Entity: 9 -
11 minutes	Type: TIME
Entity: OSH	Type: GPE
Entity: 5U	Type: CARDINAL
Entity: 24	Type: CARDINAL
Entity: MICU	Type: ORG
Entity: GI	Type: ORG
Entity: weekly	Type: DATE
Entity: less than 25	Type: CARDINAL
Entity: Aggrenox	Type: PERSON
Entity: 2 weeks	Type: DATE
Entit

In [19]:
# Print the first sentence and its associated hadm_id
for note_info in extracted_texts:
    if 'processed_text' in note_info:
        processed_text = note_info['processed_text']
        # Extract the first sentence
        first_sentence = processed_text.sentences[0].text
        # Print the first sentence and its associated hadm_id
        print(f"HADM_ID: {note_info['hadm_id']} - First Sentence: {first_sentence}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   HPI:
   55 y.o. male with C6 paraplegia ([**2069**]) c/b spasticity and autonomic
   dysreflexia presents from OSH with UTI and widely labile blood
   pressures.
HADM_ID: 187213 - First Sentence: [**2156-12-11**] 7:36 AM
HADM_ID: 187213 - First Sentence: CVICU
   HPI:
   HD14   POD 4-Redo sternotomy/AVR(#21 St. [**Male First Name (un) 1104**] tissue)
   Ejection Fraction:>65%
   Hemoglobin A1c:5.3
   Pre-Op Weight:267.86 lbs   121.5 kgs
   Baseline Creatinine:1.0
   PMHX: Hypertension,Hyperlipidemia,Aortic Stenosis,H/o Endocarditis from
   PPM wire infection s/p MV repair [**2151**],Atrial Fibrillation s/p sucessful
   DCCV [**2156-11-9**],SSS s/p DDD pacemaker [**9-5**],Moderate Pulmonary
   Hypertension,Barrett's esophagus,Cervical CA,CVA per patient [**10-7**]
   without residual,Pacemaker wire infection [**10-7**],Chronic Back pain,
   Arthritis, s/p Mitral valve repair [**11/2152**],s/p DDD
   Pacemaker([**Company