In [47]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 10.7MB/s]                    
2024-03-17 14:52:44 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-17 14:52:44 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| pretrain        | mimic          |
| backward_charlm | mimic          |
| forward_charlm  | mimic          |

2024-03-17 14:52:44 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-17 14:52:44 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-17 14:52:44 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\

In [48]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc

In [49]:
# Global variable to store all extracted texts
extracted_texts = []

In [50]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Extract sentence information and generate labels
                sentence_info = []
                for sent in processed_text.sentences:
                    sentence_start = sent.words[0].start_char
                    sentence_end = sent.words[-1].end_char
                    include_sentence = False

                    for annotation in note_info['annotations']:
                        begin = annotation['begin']
                        end = annotation['end']
                        covered_text = annotation['covered_text']

                        # Check if the sentence contains the covered text
                        if sentence_start <= begin and sentence_end >= end:
                            include_sentence = True
                            tokens = [word.text for word in sent.words]
                            start_token_idx, end_token_idx = find_token_indices(tokens, begin, end, covered_text)
                            
                            # Generate labels for the covered text
                            labels = generate_labels(tokens, [annotation], start_token_idx, end_token_idx)
                            break

                    if include_sentence:
                        sentence_info.append({
                            'sentence_id': sent.index,
                            'words': tokens,
                            'labels': labels
                        })

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable

# Function to find the token indices corresponding to the annotation
def find_token_indices(tokens, begin, end, covered_text):
    char_counter = 0
    start_token_idx = None
    end_token_idx = None

    for idx, token in enumerate(tokens):
        if char_counter == begin:
            start_token_idx = idx
        if char_counter == end:
            end_token_idx = idx
            break
        char_counter += len(token) + 1  # Add 1 for the space after each token

    return start_token_idx, end_token_idx


In [51]:
def find_word_indices(tokens, begin, end):
    # Create an empty list to store the indices of words found within the given range
    word_indices = []

    # Iterate through each token in the list of tokens
    for idx, token in enumerate(tokens):
        # Check if the start character of the token matches the beginning of the range
        # or if the token spans the beginning of the range
        if token.startchar == begin or (token.startchar < begin and token.endchar > begin):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
        # Check if the end character of the token matches the end of the range
        # or if the token spans the end of the range
        if token.endchar == end or (token.startchar < end and token.endchar > end):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
            break
    
    # Return the list of word indices found within the given range
    return word_indices


In [52]:
def generate_labels(words, annotations, start_token_idx, end_token_idx):
    labels = ['O'] * len(words)

    for annotation in annotations:
        code = annotation['code']

        # Update labels for the matched words
        if start_token_idx is not None and end_token_idx is not None:
            labels[start_token_idx] = f'B-{code}'
            for idx in range(start_token_idx + 1, end_token_idx + 1):
                labels[idx] = f'I-{code}'

    return labels

In [53]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [54]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique hadm_id count: 2


In [55]:
import pandas as pd

# Function to create a DataFrame from the extracted sentence information
def create_dataframe(extracted_texts):
    data = []

    for note_info in extracted_texts:
        if 'sentence_info' in note_info:
            for sentence_info in note_info['sentence_info']:
                words = sentence_info['words']
                labels = sentence_info['labels']

                for word, label in zip(words, labels):
                    data.append({
                        'hadm_id': note_info['hadm_id'],
                        'note_id': note_info['note_id'],
                        'sentence_id': sentence_info['sentence_id'],
                        'word': word,
                        'label': label
                    })

    df = pd.DataFrame(data)
    return df

df = create_dataframe(extracted_texts)
print(df)

      hadm_id  note_id  sentence_id     word label
0      100197    25762            6  History     O
1      100197    25762            6       of     O
2      100197    25762            6  Present     O
3      100197    25762            6  Illness     O
4      100197    25762            6        :     O
...       ...      ...          ...      ...   ...
1627   101525  1071963            4     pain     O
1628   101525  1071963            4        .     O
1629   101525   240524            0    Sinus     O
1630   101525   240524            0   rhythm     O
1631   101525   240524            0        .     O

[1632 rows x 5 columns]


In [56]:
label_counts = df['label'].value_counts()
print(label_counts)

label
O    1632
Name: count, dtype: int64
