In [57]:
import os
import json
import stanza
import csv
import pandas as pd

# Fetch the English 'mimic' package together with the 'i2b2' NER model
stanza.download(
    'en',
    package='mimic',
    processors={'ner': 'i2b2'},
)

# Build the neural pipeline from those same models; `nlp` is called on raw note text later on
nlp = stanza.Pipeline(
    'en',
    package='mimic',
    processors={'ner': 'i2b2'},
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 4.65MB/s]                    
2024-03-18 13:41:57 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-18 13:41:57 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| pretrain        | mimic          |
| backward_charlm | mimic          |
| forward_charlm  | mimic          |

2024-03-18 13:41:57 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-18 13:41:57 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-18 13:41:57 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\

In [58]:
def process_text_with_stanza(text):
    """Run the module-level Stanza pipeline `nlp` on `text` and return whatever it produces."""
    return nlp(text)

In [59]:
# Module-level accumulator: one dict per note, appended by extract_info_from_json
extracted_texts = list()

In [60]:
# Fields copied from each note / annotation record, in the order they are stored
_NOTE_FIELDS = ('note_id', 'category', 'description')
_ANNOTATION_FIELDS = (
    'begin', 'end', 'code', 'code_system', 'description', 'type', 'covered_text',
)


def _copy_fields(record, fields):
    """Return a new dict with `fields` taken from `record`; absent keys map to None."""
    return {field: record.get(field) for field in fields}


def extract_info_from_json(json_file_path, hadm_id_set):
    """Load one admission JSON file and append a dict per note to `extracted_texts`.

    Each appended dict holds 'hadm_id', 'note_id', 'category', 'description',
    'annotations' (a list of dicts with the keys in `_ANNOTATION_FIELDS`) and
    'text'. When the note text is non-empty it is run through
    `process_text_with_stanza`, and the result is stored under
    'processed_text' with its sentence count under 'num_sentences'.

    Parameters
    ----------
    json_file_path : str
        Path of the JSON file to read.
    hadm_id_set : set
        Mutated in place: the file's 'hadm_id' is added to it.

    Returns
    -------
    None
        Results are delivered through `extracted_texts` and `hadm_id_set`.
        If the file has no 'hadm_id', a warning is printed and nothing is added.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8: without `encoding`, open() uses the locale's
    # preferred encoding, which is not UTF-8 on a typical Windows setup.
    # The file is closed as soon as it is parsed instead of being held open
    # for the whole (slow) NLP pass.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    hadm_id = data.get('hadm_id')
    if hadm_id is None:
        print(f"Warning: 'hadm_id' not found in {json_file_path}")
        return None

    hadm_id_set.add(hadm_id)

    # `or []` also covers an explicit JSON null, which .get() would return as None
    for note in data.get('notes') or []:
        note_info = {'hadm_id': hadm_id}
        note_info.update(_copy_fields(note, _NOTE_FIELDS))
        note_info['annotations'] = [
            _copy_fields(annotation, _ANNOTATION_FIELDS)
            for annotation in note.get('annotations') or []
        ]
        note_info['text'] = note.get('text')

        # Only non-empty text is parsed; other notes carry neither
        # 'processed_text' nor 'num_sentences'.
        if note_info['text']:
            processed_text = process_text_with_stanza(note_info['text'])
            note_info['processed_text'] = processed_text
            note_info['num_sentences'] = len(processed_text.sentences)

        extracted_texts.append(note_info)

In [61]:
def search_files(folder_path):
    """Walk `folder_path` recursively and feed every `.json` file to `extract_info_from_json`.

    Prints the path of each file before it is processed, a 50-character '='
    rule after it, and finally the number of distinct hadm_id values collected.
    Returns None.
    """
    seen_hadm_ids = set()
    separator = "=" * 50

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.endswith('.json'):
                continue  # only JSON files are of interest

            json_file_path = os.path.join(current_dir, name)
            print("Processing:", json_file_path)
            extract_info_from_json(json_file_path, seen_hadm_ids)
            print(separator)

    print("Total unique hadm_id count:", len(seen_hadm_ids))

In [62]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Start from an empty result list so that re-running this cell does not append
# the same notes a second time (which would inflate every downstream count).
extracted_texts.clear()

# Walk the folder tree and extract every JSON file found in it
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique hadm_id count: 2


In [63]:
# Add up the per-note sentence counts; notes that were never parsed have no
# 'num_sentences' entry and contribute nothing to the total
total_sentences = sum(
    entry.get('num_sentences', 0)
    for entry in extracted_texts
)

print("Total number of sentences:", total_sentences)

Total number of sentences: 481


In [65]:
from stanza.utils.conll import CoNLL

def tokenize_sentences(doc):
    """Return one list of token texts per sentence in `doc.sentences`, in document order."""
    return [
        [tok.text for tok in sent.tokens]
        for sent in doc.sentences
    ]

# Attach a token-level view to every note that has a parsed document
for note_info in extracted_texts:
    if 'processed_text' not in note_info:
        continue  # nothing was parsed for this note
    note_info['tokenized_sentences'] = tokenize_sentences(note_info['processed_text'])

# Preview only the first few notes so the cell output stays readable
MAX_NOTES_TO_PREVIEW = 10

for note_info in extracted_texts[:MAX_NOTES_TO_PREVIEW]:
    # Notes without parsed text have nothing to show
    if 'tokenized_sentences' not in note_info:
        continue

    print(f"HADM_ID: {note_info['hadm_id']}")
    print(f"Note ID: {note_info['note_id']}")
    # One line per sentence, tokens separated by single spaces
    for sentence_tokens in note_info['tokenized_sentences']:
        print(' '.join(sentence_tokens))
    print()  # blank line between notes

HADM_ID: 100197
Note ID: 25762
Admission Date : [ **2136-10-23 **] Discharge Date : [ **2136-10-24 **]
Date of Birth : [ ** 2056-7-14 **] Sex : M
Service : NEUROSURGERY
Allergies : No Known Allergies / Adverse Drug Reactions
Attending :[** First Name3 ( LF ) 1835 **] Chief Complaint : Found down
Major Surgical or Invasive Procedure : None
History of Present Illness : 80M p/w a large R IPH with intraventricular extension , midline shift , and hydrocephalus .
The patient was found lying face up in his bathtub after having shaved .
There was no water in the tub and the shower was not turned on .
He was found by a neighbor .
[** Name ( NI ) ** ] EMS , he was moaning and there was " tone in his left arm " which may have been consistent with posturing .
He was taken to [ ** Hospital1 18 ** ] [ ** Location ( un ) 620 ** ] and head CT was performed , revealing a large intraparenchymal hemorrhage extending from the lower midbrain into the hypothalamus , thalamus and basal ganglia on the right ,

In [66]:
# Every token of every tokenized sentence counts as one "word";
# notes without 'tokenized_sentences' are skipped
total_word_count = sum(
    len(sentence_tokens)
    for note_info in extracted_texts
    if 'tokenized_sentences' in note_info
    for sentence_tokens in note_info['tokenized_sentences']
)

print("Total number of words:", total_word_count)

Total number of words: 9873
