In [127]:
#pip install stanza

In [128]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.57MB/s]                    
2024-03-15 13:50:05 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-15 13:50:05 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| backward_charlm | mimic          |
| forward_charlm  | mimic          |
| pretrain        | mimic          |

2024-03-15 13:50:05 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-15 13:50:05 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-15 13:50:05 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\

In [129]:
# Global variable to store all extracted texts
extracted_texts = []

In [130]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc


In [131]:
# Function to find token indices based on annotations
def find_token_indices(tokens, begin, end, covered_text):
    token_indices = []
    start_idx = 0
    end_idx = 0
    for i, token in enumerate(tokens):
        end_idx += len(token)
        if start_idx >= begin and end_idx <= end:
            token_indices.append(i)
        start_idx = end_idx + 1
    return token_indices

In [132]:
def generate_labels(tokens, annotations):
    labels = ['O'] * len(tokens)

    # Match annotations with tokens
    for annotation in annotations:
        begin = annotation['begin']
        end = annotation['end']
        code = annotation['code']
        covered_text = annotation['covered_text']

        # Find the token indices corresponding to the annotation
        token_indices = find_token_indices(tokens, begin, end, covered_text)

        # Update labels for the matched tokens
        for idx in token_indices:
            labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

    return labels


In [133]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                tokens, word_positions = tokenize_text(note_info['text'])
                note_info['tokens'] = tokens
                note_info['word_positions'] = word_positions

                # Generate labels for tokens
                if note_info['tokens']:
                    labels = generate_labels(note_info['tokens'], note_info['annotations'])
                    note_info['labels'] = labels

            extracted_texts.append(note_info)  # Append note information to the global variable


In [134]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [135]:
# Function to extract information about sentences and named entities

def search_json_files(folder_path):
    # Initialize a list to store all JSON file paths
    json_files = []

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    return json_files

In [136]:
# Function to tokenize text using Stanza
def tokenize_text(text):
    # Process the text with the initialized pipeline
    doc = nlp(text)
    # Extract tokens from the document
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    word_positions = []
    
    # Extract word positions if available
    for sent in doc.sentences:
        for word in sent.words:
            if word.misc is not None:  # Check if the 'misc' attribute exists
                start_char = word.misc.get('start_char', None)
                end_char = word.misc.get('end_char', None)
                word_positions.append((start_char, end_char))
            else:
                # If 'misc' attribute is not present, set word positions to None
                word_positions.append((None, None))
    
    return tokens, word_positions


In [137]:
def generate_labels_from_json(json_file):
    data = []  # Initialize an empty list to store data

    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        json_data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = json_data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Tokenize the note text
                words = tokenize_text(note_text)

                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Generate labels for words
                labels = generate_labels(words, annotations)

                # Append data to the list
                for i, (word, label) in enumerate(zip(words, labels)):
                    data.append({'sentence_id': 0, 'word': word, 'label': label})

    return data

In [138]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)


Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json


Total unique hadm_id count: 1


In [139]:
data = []

# Iterate through each note information in extracted_texts
for note_info in extracted_texts:
    print("Note Info:", note_info)  # Debug print
    # Check if 'sentence_info' key exists
    if 'sentence_info' in note_info:
        # Extract information for each word in the note
        for sent_info in note_info['sentence_info']:
            print("Sent Info:", sent_info)  # Debug print
            for word, label in zip(sent_info['words'], sent_info['labels']):
                # Append data for each word to the list
                data.append({'sentence_id': note_info['note_id'], 'word': word, 'label': label})

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

Note Info: {'hadm_id': 100197, 'note_id': 25762, 'category': 'Discharge summary', 'description': 'Report', 'annotations': [{'begin': 374, 'end': 377, 'code': '431', 'code_system': 'ICD-9-CM', 'description': 'Intracerebral hemorrhage', 'type': 'MapType.APPROX', 'covered_text': 'IPH'}, {'begin': 383, 'end': 409, 'code': '431', 'code_system': 'ICD-9-CM', 'description': 'Intracerebral hemorrhage', 'type': 'MapType.APPROX', 'covered_text': 'intraventricular extension'}, {'begin': 430, 'end': 443, 'code': '331.4', 'code_system': 'ICD-9-CM', 'description': 'Obstructive hydrocephalus', 'type': 'MapType.APPROX', 'covered_text': 'hydrocephalus'}, {'begin': 835, 'end': 862, 'code': '431', 'code_system': 'ICD-9-CM', 'description': 'Intracerebral hemorrhage', 'type': 'MapType.APPROX', 'covered_text': 'intraparenchymal hemorrhage'}, {'begin': 1577, 'end': 1580, 'code': 'V49.86', 'code_system': 'ICD-9-CM', 'description': 'Do not resuscitate status', 'type': 'Human', 'covered_text': 'DNR'}, {'begin': 

In [140]:
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        for word, label in zip(sent_info['words'], sent_info['labels']):
            data.append({'sentence_id': sent_info['sentence_id'], 'word': word, 'label': label})

word_df = pd.DataFrame(data)

KeyError: 'sentence_info'