In [1]:
#pip install stanza

In [2]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 6.56MB/s]                    
2024-03-16 11:20:28 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-16 11:20:28 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| pretrain        | mimic          |
| forward_charlm  | mimic          |
| backward_charlm | mimic          |

2024-03-16 11:20:28 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-16 11:20:28 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-16 11:20:28 INFO: File 

In [3]:
# Global variable to store all extracted texts
extracted_texts = []

In [4]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc


In [5]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Count number of sentences and store it
                num_sentences = len(processed_text.sentences)
                note_info['num_sentences'] = num_sentences

                # Extract sentence information and generate labels
                sentence_info = []
                for sent_id, sent in enumerate(processed_text.sentences):
                    tokens = [word.text for word in sent.words]
                    labels = generate_labels(tokens, note_info['annotations'])
                    sentence_info.append({'sentence_id': sent_id, 'words': tokens, 'labels': labels})

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable



In [6]:
def generate_labels(words, annotations):
    labels = ['O'] * len(words)

    for annotation in annotations:
        begin = annotation['begin']
        end = annotation['end']
        code = annotation['code']
        covered_text = annotation['covered_text']

        # Find the word indices corresponding to the annotation
        word_indices = find_word_indices(words, begin, end, covered_text)

        # Update labels for the matched words
        for idx in word_indices:
            labels[idx] = f'B-{code}' if idx == word_indices[0] else f'I-{code}'

    return labels

In [7]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [8]:
# Function to extract information about sentences and named entities

def search_json_files(folder_path):
    # Initialize a list to store all JSON file paths
    json_files = []

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    return json_files

In [9]:
# Function to generate labels for tokens from JSON files

def generate_labels_from_json(json_file):
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Tokenize the note text
                tokens = tokenize_text(note_text)

                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Initialize labels for tokens
                labels = ['O'] * len(tokens)

                # Match annotations with tokens
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    code = annotation['code']
                    covered_text = annotation['covered_text']

                    # Find the token indices corresponding to the annotation
                    token_indices = find_word_indices(tokens, begin, end, covered_text)

                    # Update labels for the matched tokens
                    for idx in token_indices:
                        labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

                

In [10]:
# Function to tokenize text using Stanza

def tokenize_text(text):
    # Process the text with the initialized pipeline
    doc = nlp(text)
    # Extract tokens from the document
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    print(doc.ents)

    return tokens

In [11]:
# Function to find the word indices corresponding to the annotation
def find_word_indices(words, begin, end, covered_text):
    word_indices = []
    char_counter = 0

    for idx, word in enumerate(words):
        if char_counter + len(word) >= begin and char_counter <= end:
            word_indices.append(idx)
        char_counter += len(word) + 1  # Add 1 for space between words

    return word_indices

In [13]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json


Total unique hadm_id count: 1


DATA FRAME FOR SENTENCES

In [14]:
# Create a DataFrame from the extracted texts
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        data.append({'sentence_id': sent_info['sentence_id'], 'words': sent_info['words'], 'labels': sent_info['labels']})

sent_df = pd.DataFrame(data)

In [15]:
sent_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,"[Admission, Date, :, [, **2136-10-23, **], Dis...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1,1,"[Date, of, Birth, :, [, **, 2056-7-14, **], Se...","[O, O, O, O, O, O, O, O, O, O, O]"
2,2,"[Service, :, NEUROSURGERY]","[O, O, O]"
3,3,"[Allergies, :, No, Known, Allergies, /, Advers...","[O, O, O, O, O, O, O, O, O]"
4,4,"[Attending, :[**, First, Name3, (, LF, ), 1835...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


DATAFRAME FOR WORDS

In [16]:
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        for word, label in zip(sent_info['words'], sent_info['labels']):
            data.append({'sentence_id': sent_info['sentence_id'], 'word': word, 'label': label})

word_df = pd.DataFrame(data)

In [17]:
word_df.head()

Unnamed: 0,sentence_id,word,label
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O


In [18]:
label_counts = word_df['label'].value_counts()
print(label_counts)

label
O          1294
I-431         7
I-I61.5       6
B-431         2
B-I61.8       1
I-I61.8       1
B-I61.5       1
B-G91.9       1
I-G91.9       1
B-331.4       1
I-331.4       1
Name: count, dtype: int64


In [20]:
# Extracting the first character of each label
word_df['label'] = word_df['label'].str[0]

# Displaying the resulting DataFrame
print(word_df)

      sentence_id            word label
0               0       Admission     O
1               0            Date     O
2               0               :     O
3               0               [     O
4               0    **2136-10-23     O
...           ...             ...   ...
1311           46            None     O
1312           47       Completed     O
1313           47              by     O
1314           47  :[**2136-10-24     O
1315           47             **]     O

[1316 rows x 3 columns]


In [21]:
label_counts = word_df['label'].value_counts()
print(label_counts)

label
O    1294
I      16
B       6
Name: count, dtype: int64


In [22]:
filtered_df = word_df[word_df['label'] == 'B']
print(filtered_df)

     sentence_id       word label
209           11  posterior     B
212           11     Doctor     B
222           11  ventricle     B
867           11  posterior     B
870           11     Doctor     B
880           11  ventricle     B


In [23]:
ventilation = word_df[word_df['word'] == 'hydrocephalus']
print(ventilation)

     sentence_id           word label
75             6  hydrocephalus     O
733            6  hydrocephalus     O
