In [1]:
#pip install stanza

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import stanza
import csv

# Build an English pipeline
stanza.download('en') # download English model
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner') # initialize English neural pipeline

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.78MB/s]                    
2024-03-12 10:04:52 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-12 10:04:52 INFO: Downloading default packages for language: en (English) ...
2024-03-12 10:04:54 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\default.zip
2024-03-12 10:04:58 INFO: Finished downloading models and saved to C:\Users\okechukwu chude\stanza_resources
2024-03-12 10:04:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.91MB/s]                    
2024-03-12 10:04:59 INFO: Downloaded file to C:\Use

In [3]:
# Global variable to store all extracted texts
extracted_texts = []

In [4]:
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc


In [5]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Count number of sentences and store it
                num_sentences = len(processed_text.sentences)
                note_info['num_sentences'] = num_sentences

            extracted_texts.append(note_info)  # Append note information to the global variable

In [6]:
def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))

    # # Write extracted information to CSV file
    # write_to_csv(extracted_texts)


In [7]:
# def write_to_csv(data):
#     # Specify the path for the CSV file
#     csv_file_path = "extracted_info.csv"

#     # Define the field names for the CSV file
#     fieldnames = ['hadm_id', 'note_id', 'category', 'description', 'begin', 'end', 'code', 'system', 'description','covered_text', 'text']

#     # Write data to the CSV file
#     with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.DictWriter(file, fieldnames=fieldnames)
#         writer.writeheader()
#         for item in data:
#             writer.writerow(item)

In [9]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\101173-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100463-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\101173-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Profee\ICD-10\1.0\101525

In [None]:
# # We can access named entities from 'processed_text' field of each note
# for note_info in extracted_texts:
#     if 'processed_text' in note_info:
#         processed_text = note_info['processed_text']
#         for sentence in processed_text.sentences:
#             for entity in sentence.ents:
#                 print(f"Entity: {entity.text}\tType: {entity.type}")

In [None]:
# Print the first sentence and its associated hadm_id
#for note_info in extracted_texts:
    #if 'processed_text' in note_info:
        #processed_text = note_info['processed_text']
        # Extract the first sentence
        #first_sentence = processed_text.sentences[0].text
        # Print the first sentence and its associated hadm_id
        #print(f"HADM_ID: {note_info['hadm_id']} - First Sentence: {first_sentence}")
        # num_sentences = len(processed_text.sentences)
        # print(f"HADM_ID: {note_info['hadm_id']} - Number of Sentences: {num_sentences}")

In [10]:
# Calculate total number of sentences
total_sentences = sum(note_info['num_sentences'] for note_info in extracted_texts if 'num_sentences' in note_info)

# Print total number of sentences
print("Total number of sentences:", total_sentences)

Total number of sentences: 2750


In [11]:
#pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


**Sentence Tokenisation**

In [12]:
from stanza.utils.conll import CoNLL

def tokenize_sentences(doc):

    tokenized_sentences = []  # Initialize an empty list to store tokenized sentences
    for sentence in doc.sentences:  # Iterate through each sentence in the document
        tokens = [token.text for token in sentence.tokens]  # Extract text of each token in the sentence
        tokenized_sentences.append(tokens)  # Append the list of tokens to the tokenized_sentences list
    return tokenized_sentences  # Return the tokenized sentences

# Loop through each note_info in the extracted_texts list
for note_info in extracted_texts:
    # Check if the 'processed_text' key exists in the current note_info
    if 'processed_text' in note_info:
        processed_text = note_info['processed_text']  # Extract the processed text from note_info
        tokenized_sentences = tokenize_sentences(processed_text)  # Tokenize the sentences in the processed text
        note_info['tokenized_sentences'] = tokenized_sentences  # Add the tokenized sentences to note_info

# Loop through each note_info in the extracted_texts list
for i, note_info in enumerate(extracted_texts):
    if i >= 10:  # Stop after processing the first 50 notes
        break

    # Check if the 'tokenized_sentences' key exists in the current note_info
    if 'tokenized_sentences' in note_info:
        print(f"HADM_ID: {note_info['hadm_id']}")  # Print the HADM_ID
        print(f"Note ID: {note_info['note_id']}")  # Print the note ID
        tokenized_sentences = note_info['tokenized_sentences']  # Extract the tokenized sentences from note_info
        # Loop through each sentence_tokens list in the tokenized_sentences list
        for sentence_tokens in tokenized_sentences:
            print(' '.join(sentence_tokens))  # Print the tokens of each sentence separated by spaces
        print()  # Print an empty line after printing all sentences

HADM_ID: 100197
Note ID: 25762
Admission Date : [ **2136-10-23** ]
Discharge Date : [ **2136-10-24**]
Date of Birth : [ **2056-7-14** ]
Sex : M
Service : NEUROSURGERY
Allergies : No Known Allergies / Adverse Drug Reactions
Attending :[** First Name3 ( LF ) 1835 ** ]
Chief Complaint : Found down
Major Surgical or Invasive Procedure : None
History of Present Illness : 80M p/w a large R IPH with intraventricular extension , midline shift , and hydrocephalus .
The patient was found lying face up in his bathtub after having shaved .
There was no water in the tub and the shower was not turned on .
He was found by a neighbor .
[ ** Name ( NI ) ** ] EMS , he was moaning and there was " tone in his left arm " which may have been consistent with posturing .
He was taken to [ ** Hospital1 18 ** ]
[ ** Location ( un ) 620** ] and head CT was performed , revealing a large intraparenchymal hemorrhage extending from the lower midbrain into the hypothalamus , thalamus and basal ganglia on the right , 

In [13]:
# Initialize total word count
total_word_count = 0

# Loop through each note_info in the extracted_texts list
for note_info in extracted_texts:
    # Check if the 'tokenized_sentences' key exists in the current note_info
    if 'tokenized_sentences' in note_info:
        tokenized_sentences = note_info['tokenized_sentences']  # Extract the tokenized sentences from note_info

        # Iterate through each sentence_tokens list in the tokenized_sentences list
        for sentence_tokens in tokenized_sentences:
            # Increment the total word count by the number of words in the current sentence
            total_word_count += len(sentence_tokens)

# Print the total number of words
print("Total number of words:", total_word_count)


Total number of words: 49160


**PART OF SPEECH TAGGING**

NER DATA FORMAT

In [14]:
import pandas as pd


data = []

for note_info in extracted_texts:
    if 'tokenized_sentences' in note_info and 'annotations' in note_info:
        tokenized_sentences = note_info['tokenized_sentences']
        annotations = note_info['annotations']

        sentence_id = 0
        for sentence_tokens in tokenized_sentences:
            words = sentence_tokens
            labels = ['O'] * len(words)  # Initialize all labels as 'O' (Outside)

            # Match annotations with words in the sentence
            for annotation in annotations:
                begin = annotation['begin']
                end = annotation['end']
                code = annotation['code']
                covered_text = annotation['covered_text']

                # Find the word indices corresponding to the annotation
                word_indices = []
                for i, word in enumerate(words):
                    if covered_text.startswith(word):
                        word_indices.append(i)
                        covered_text = covered_text[len(word):]
                        if not covered_text:
                            break

                # Update labels for the matched words
                for idx in word_indices:
                    labels[idx] = f'B-{code}' if idx == word_indices[0] else f'I-{code}'

            # Append data for each word in the sentence
            for word, label in zip(words, labels):
                data.append({'sentence_id': sentence_id, 'words': word, 'labels': label})

            sentence_id += 1

# Create the DataFrame
ner_df = pd.DataFrame(data)

In [15]:
ner_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23**,O


In [16]:
# Count the occurrences of each unique value in the 'labels' column
label_counts = ner_df['labels'].value_counts()

# Display the counts
print(label_counts)

labels
O             48290
B-800.70         67
B-S02.0XXB       45
B-G47.00         26
B-S06.4X0A       25
              ...  
B-S62.313A        1
B-V45.89          1
B-331.4           1
B-0JQ00ZZ         1
B-532.40          1
Name: count, Length: 166, dtype: int64


In [17]:
# Calculate the total number of evidence words
total_evidence_words = total_word_count - label_counts.get('O', 0)

# Print the total number of evidence words
print("Total number of evidence words:", total_evidence_words)


Total number of evidence words: 870


EXTRACT EVIDENCE FROM THE JSON FILES

In [18]:
def search_json_files(folder_path):
    # Initialize a list to store all JSON file paths
    json_files = []

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    return json_files


In [19]:
# Initialize a global notes DataFrame
global_df = pd.DataFrame()

In [22]:
def generate_labels_from_json(json_file):
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Tokenize the note text
                tokens = tokenize_text(note_text)

                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Initialize labels for tokens
                labels = ['O'] * len(tokens)

                # Match annotations with tokens
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    code = annotation['code']
                    covered_text = annotation['covered_text']

                    # Find the token indices corresponding to the annotation
                    token_indices = find_token_indices(tokens, begin, end, covered_text)

                    # Update labels for the matched tokens
                    for idx in token_indices:
                        labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

                # Create a DataFrame for the current note
                note_df = pd.DataFrame({'tokens': tokens, 'labels': labels})

                # Append the note DataFrame to the global DataFrame
                # Append the note DataFrame to the global DataFrame
                global global_df
                global_df = pd.concat([global_df, note_df], ignore_index=True)

In [23]:
def tokenize_text(text):
    # Process the text with the initialized pipeline
    doc = nlp(text)
    # Extract tokens from the document
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    return tokens

In [24]:
def find_token_indices(tokens, begin, end, covered_text):
    token_indices = []
    start_idx = 0
    end_idx = 0
    for i, token in enumerate(tokens):
        end_idx += len(token)
        if start_idx >= begin and end_idx <= end:
            token_indices.append(i)
        start_idx = end_idx + 1
    return token_indices

In [25]:
# Spath to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing"

# Search for JSON files in the main folder and its subfolders
json_files = search_json_files(main_folder_path)

# Iterate through each JSON file and generate labels for tokens
for json_file in json_files:
    print(f"Processing {json_file}...")
    generate_labels_from_json(json_file)


Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\101173-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100197-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100463-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\101173-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Profee\ICD-10

In [28]:
print(global_df)


               tokens labels
0           Admission      O
1                Date      O
2                   :      O
3                   [      O
4      **2136-10-23**      O
...               ...    ...
41509           sleep      O
41510           study      O
41511             for      O
41512             you      O
41513               .      O

[41514 rows x 2 columns]


In [29]:
filtered_df = global_df[global_df['labels'] != 'O']
print(filtered_df)

        tokens    labels
83          in   B-I61.5
84         his   I-I61.5
85     bathtub   I-I61.5
86       after   I-I61.5
87      having   I-I61.5
...        ...       ...
41451       up  B-532.40
41453        .  B-780.57
41454        .  I-780.57
41455      You  I-780.57
41456     will  I-780.57

[1068 rows x 2 columns]


In [34]:
label_for_hydrocephalus = global_df.loc[global_df['tokens'] == 'bathtub', 'labels'].values[0]
print("Label':", label_for_hydrocephalus)

Label': I-I61.5
