In [137]:
#pip install stanza

In [138]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.38MB/s]                    
2024-03-13 22:06:25 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-13 22:06:25 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| forward_charlm  | mimic          |
| backward_charlm | mimic          |
| pretrain        | mimic          |

2024-03-13 22:06:25 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-13 22:06:25 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-13 22:06:25 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\

In [139]:
# Global variable to store all extracted texts
extracted_texts = []

In [140]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc


In [141]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Count number of sentences and store it
                num_sentences = len(processed_text.sentences)
                note_info['num_sentences'] = num_sentences

                # Extract sentence information and generate labels
                sentence_info = []
                for sent_id, sent in enumerate(processed_text.sentences):
                    tokens = [word.text for word in sent.words]
                    labels = generate_labels(tokens, note_info['annotations'])
                    sentence_info.append({'sentence_id': sent_id, 'words': tokens, 'labels': labels})

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable



In [142]:
# Function to generate labels for tokens based on annotations
def generate_labels(tokens, annotations):
    labels = ['O'] * len(tokens)

    # Match annotations with tokens
    for annotation in annotations:
        begin = annotation['begin']
        end = annotation['end']
        code = annotation['code']
        covered_text = annotation['covered_text']

        # Find the token indices corresponding to the annotation
        token_indices = find_token_indices(tokens, begin, end, covered_text)

        # Update labels for the matched tokens
        for idx in token_indices:
            labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

    return labels

In [143]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [144]:
# Function to extract information about sentences and named entities

def search_json_files(folder_path):
    # Initialize a list to store all JSON file paths
    json_files = []

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    return json_files

In [145]:
# Function to generate labels for tokens from JSON files

def generate_labels_from_json(json_file):
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Tokenize the note text
                tokens = tokenize_text(note_text)

                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Initialize labels for tokens
                labels = ['O'] * len(tokens)

                # Match annotations with tokens
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    code = annotation['code']
                    covered_text = annotation['covered_text']

                    # Find the token indices corresponding to the annotation
                    token_indices = find_token_indices(tokens, begin, end, covered_text)

                    # Update labels for the matched tokens
                    for idx in token_indices:
                        labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

                

In [146]:
# Function to tokenize text using Stanza

def tokenize_text(text):
    # Process the text with the initialized pipeline
    doc = nlp(text)
    # Extract tokens from the document
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    print(doc.ents)

    return tokens

In [147]:
# Function to find token indices based on annotations

def find_token_indices(tokens, begin, end, covered_text):
    token_indices = []
    start_idx = 0
    end_idx = 0
    for i, token in enumerate(tokens):
        end_idx += len(token)
        if start_idx >= begin and end_idx <= end:
            token_indices.append(i)
        start_idx = end_idx + 1
    return token_indices

In [148]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json


Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\101173-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100463-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\101173-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Profee\ICD-10\1.0\101525-ICD-10.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Profee\ICD-10\1.0\103715-IC

DATA FRAME FOR SENTENCES

In [167]:
# Create a DataFrame from the extracted texts
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        data.append({'sentence_id': sent_info['sentence_id'], 'words': sent_info['words'], 'labels': sent_info['labels']})

sent_df = pd.DataFrame(data)

In [168]:
sent_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,"[Admission, Date, :, [, **2136-10-23, **], Dis...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1,1,"[Date, of, Birth, :, [, **, 2056-7-14, **], Se...","[O, O, O, O, O, O, O, O, O, O, O]"
2,2,"[Service, :, NEUROSURGERY]","[O, O, O]"
3,3,"[Allergies, :, No, Known, Allergies, /, Advers...","[O, O, O, O, O, O, O, O, O]"
4,4,"[Attending, :[**, First, Name3, (, LF, ), 1835...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


DATAFRAME FOR WORDS

In [202]:
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        for word, label in zip(sent_info['words'], sent_info['labels']):
            data.append({'sentence_id': sent_info['sentence_id'], 'word': word, 'label': label})

word_df = pd.DataFrame(data)

In [203]:
word_df.head()

Unnamed: 0,sentence_id,word,label
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O


In [204]:
label_counts = word_df['label'].value_counts()
print(label_counts)

label
O             37476
I-V89.2XXA      795
I-E819.9        795
I-74174         472
I-800.70        190
I-S02.0XXB      183
B-V89.2XXA      141
B-E819.9        141
I-880.03         81
I-S41.112A       81
B-74174          74
I-94003          70
I-71046          64
B-800.70         49
B-S02.0XXB       41
B-71046          34
B-880.03         22
B-S41.112A       22
I-93000          16
I-V42.5          15
I-Z94.7          15
B-94003          14
B-93000          12
B-S06.6X0A        6
I-S06.4X0A        5
B-H40.9           3
B-365.9           3
I-S06.6X0A        2
B-S06.4X0A        2
B-R94.31          2
B-Z94.7           2
B-794.31          2
B-V42.5           2
I-E78.5           1
I-272.4           1
B-272.4           1
I-794.31          1
I-H40.9           1
I-365.9           1
B-R19.7           1
I-R94.31          1
B-E78.5           1
B-787.91          1
Name: count, dtype: int64


In [205]:
filtered_df = word_df[word_df['label'] == 'I-V89.2XXA']
print(filtered_df)

       sentence_id          word       label
12559           15             :  I-V89.2XXA
12560           15           non  I-V89.2XXA
12614           15            81  I-V89.2XXA
12615           15            **  I-V89.2XXA
12616           15             ]  I-V89.2XXA
...            ...           ...         ...
19581           27            20  I-V89.2XXA
19582           27         Gauge  I-V89.2XXA
19583           27             -  I-V89.2XXA
19584           27  [**2134-3-22  I-V89.2XXA
19585           27           **]  I-V89.2XXA

[795 rows x 3 columns]


In [195]:
# Extracting the first character of each label
word_df['label'] = word_df['label'].str[0]

# Displaying the resulting DataFrame
print(word_df)

       sentence_id          word label short_label
0                0     Admission     O           O
1                0          Date     O           O
2                0             :     O           O
3                0             [     O           O
4                0  **2136-10-23     O           O
...            ...           ...   ...         ...
40837          121         sleep     O           O
40838          121         study     O           O
40839          121           for     O           O
40840          121           you     O           O
40841          121             .     O           O

[40842 rows x 4 columns]


In [196]:
label_counts = word_df['label'].value_counts()
print(label_counts)

label
O    37476
I     2790
B      576
Name: count, dtype: int64


In [198]:
filtered_df = word_df[word_df['label'] == 'B']
print(filtered_df)

       sentence_id         word label short_label
11768            0  ventilation     B           B
11810            1         Type     B           B
11902            3            (     B           B
11912            4         work     B           B
11947            5            ;     B           B
...            ...          ...   ...         ...
38683           19            .     B           B
38687           20          are     B           B
38706           22     possible     B           B
38717           23      stomach     B           B
38725           24            .     B           B

[576 rows x 4 columns]


In [201]:
ventilation = word_df[word_df['word'] == 'hydrocephalus']
print(ventilation)

      sentence_id           word label short_label
75              6  hydrocephalus     O           O
5955            6  hydrocephalus     O           O


#################################################################################################################

In [149]:
# Search for JSON files in the main folder and its subfolders
json_files = search_json_files(main_folder_path)

# Iterate through each JSON file and generate labels for tokens
for json_file in json_files:
    print(f"Processing {json_file}...")
    generate_labels_from_json(json_file)


Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
[{
  "text": "Known Allergies",
  "type": "PROBLEM",
  "start_char": 174,
  "end_char": 189
}, {
  "text": "Adverse Drug Reactions",
  "type": "PROBLEM",
  "start_char": 192,
  "end_char": 214
}, {
  "text": "a large R IPH",
  "type": "PROBLEM",
  "start_char": 364,
  "end_char": 377
}, {
  "text": "intraventricular extension",
  "type": "PROBLEM",
  "start_char": 383,
  "end_char": 409
}, {
  "text": "midline\nshift",
  "type": "PROBLEM",
  "start_char": 411,
  "end_char": 424
}, {
  "text": "hydrocephalus",
  "type": "PROBLEM",
  "start_char": 430,
  "end_char": 443
}, {
  "text": "water in the tub",
  "type": "PROBLEM",
  "start_char": 531,
  "end_char": 547
}, {
  "text": "moaning",
  "type": "PROBLEM",
  "start_char": 639,
  "end_char": 646
}, {
  "text": "posturing",
  "type": "PROBLEM",
  "start_char": 720,
  "end_char": 729
}, {
  "

Match Extracted Evidence with Json Data

In [188]:
def extract_named_entities(text):
    # Process the text with Stanza pipeline
    doc = nlp(text)

    # Extract named entities from the processed document
    named_entities = []
    for sent_id, sent in enumerate(doc.sentences):
        for ent in sent.ents:
            named_entities.append({'sentence_id': sent_id, 'word': ent.text, 'label': ent.type})

    return named_entities


In [189]:
def display_matching_results(json_file):
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Iterate through each annotation
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    covered_text = annotation['covered_text']

                    # Extract named entities from the covered text
                    named_entities = extract_named_entities(covered_text)
                    for entity in named_entities:
                        print("Sentence ID:", entity['sentence_id'], "\tWord:", entity['word'], "\tLabel:", entity['label'])

                        # Create a dataframe
                        df = pd.DataFrame(named_entities)

In [190]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing"

# Search for JSON files in the main folder and its subfolders
json_files = []

In [191]:
# Recursively search for JSON files in the folder and its subfolders
for root, dirs, files in os.walk(main_folder_path):
    for file in files:
        if file.endswith('.json'):
            json_files.append(os.path.join(root, file))

# Iterate through each JSON file and display matching results
for json_file in json_files:
    print(f"Processing {json_file}...")
    display_matching_results(json_file)

Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
Sentence ID: 0 	Word: IPH 	Label: PROBLEM
Sentence ID: 0 	Word: intraventricular extension 	Label: PROBLEM
Sentence ID: 0 	Word: hydrocephalus 	Label: PROBLEM
Sentence ID: 0 	Word: intraparenchymal hemorrhage 	Label: PROBLEM
Sentence ID: 0 	Word: DIABETES TYPE II 	Label: PROBLEM
Sentence ID: 0 	Word: HYPERLIPIDEMIA 	Label: PROBLEM
Sentence ID: 0 	Word: GLAUCOMA 	Label: PROBLEM
Sentence ID: 0 	Word: OSTEOARTHRITIS 	Label: PROBLEM
Sentence ID: 0 	Word: CAROTID STENOSIS 	Label: TEST
Sentence ID: 0 	Word: rt 	Label: TEST
Sentence ID: 0 	Word: extubated 	Label: TREATMENT
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json...
Sentence ID: 0 	Word: Vent 	Label: TREATMENT
Sentence ID: 0 	Word: SDH 	Label: PROBLEM
Sentence ID: 0 	Word: Alcoholism 	Label: PRO

##############################################

In [192]:
def extract_named_entities(text):
    # Process the text with Stanza pipeline
    doc = nlp(text)

    # Extract named entities from the processed document
    named_entities = []
    for sent_id, sent in enumerate(doc.sentences):
        for ent in sent.ents:
            named_entities.append({'sentence_id': sent_id, 'word': ent.text, 'label': ent.type})

    return named_entities

def display_matching_results(json_file):
    dataframes = []
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Iterate through each annotation
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    covered_text = annotation['covered_text']

                    # Extract named entities from the covered text
                    named_entities = extract_named_entities(covered_text)
                    df = pd.DataFrame(named_entities)
                    dataframes.append(df)

    return pd.concat(dataframes, ignore_index=True)

# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing"

# Search for JSON files in the main folder and its subfolders
json_files = []

# Recursively search for JSON files in the folder and its subfolders
for root, dirs, files in os.walk(main_folder_path):
    for file in files:
        if file.endswith('.json'):
            json_files.append(os.path.join(root, file))

# Create an empty list to store dataframes from each JSON file
dataframes = []

# Iterate through each JSON file and display matching results
for json_file in json_files:
    print(f"Processing {json_file}...")
    dataframe = display_matching_results(json_file)
    dataframes.append(dataframe)

# Concatenate all dataframes into one
final_df = pd.concat(dataframes, ignore_index=True)

# Display the final dataframe
print(final_df)

Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100463-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\101173-ICD-10.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100197-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\100463-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-9\1.0\101173-ICD-9.json...
Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Profee\ICD-10

###############################################

#################################          ClinicalBERT              ################################


In [154]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [155]:
# Load ClinicalBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)  # binary classification (e.g., entity or not)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [156]:
# Function to extract named entities using ClinicalBERT
def extract_named_entities_clinicalbert(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # Map labels back to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    entities = []
    entity = {"text": "", "start": None, "end": None}
    for idx, (token, label) in enumerate(zip(tokens, predicted_labels[0])):
        if label == 1:  # Assuming label 1 corresponds to the entity class
            if entity["text"] == "":
                entity["text"] = token
                entity["start"] = idx
            else:
                entity["text"] += " " + token
        else:
            if entity["text"] != "":
                entity["end"] = idx
                entities.append(entity)
                entity = {"text": "", "start": None, "end": None}

    return entities

In [157]:
# Modify the function to display matching results
def display_matching_results_with_clinicalbert(json_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

        notes = data.get('notes', [])

        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                annotations = note.get('annotations', [])
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    covered_text = annotation['covered_text']

                    print("Start char:", begin, "\tEnd char:", end, "\tCovered text:", covered_text)

                    # Extract named entities using ClinicalBERT
                    named_entities = extract_named_entities_clinicalbert(covered_text)
                    print("Named Entities (ClinicalBERT):", named_entities)


In [158]:
# Modify the main loop to use the new function
for json_file in json_files:
    print(f"Processing {json_file}...")
    display_matching_results_with_clinicalbert(json_file)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
Start char: 374 	End char: 377 	Covered text: IPH
Named Entities (ClinicalBERT): [{'text': 'i', 'start': 1, 'end': 2}]
Start char: 383 	End char: 409 	Covered text: intraventricular extension
Named Entities (ClinicalBERT): [{'text': 'in', 'start': 1, 'end': 2}]
Start char: 430 	End char: 443 	Covered text: hydrocephalus
Named Entities (ClinicalBERT): []
Start char: 835 	End char: 862 	Covered text: intraparenchymal hemorrhage
Named Entities (ClinicalBERT): [{'text': '##chy', 'start': 5, 'end': 6}, {'text': '##hage', 'start': 10, 'end': 11}]
Start char: 1577 	End char: 1580 	Covered text: DNR
Named Entities (ClinicalBERT): []
Start char: 1840 	End char: 1856 	Covered text: DIABETES TYPE II
Named Entities (ClinicalBERT): []
Start char: 1859 	End char: 1873 	Covered text: HYPERLIPIDEMIA
Named Entities (ClinicalBERT): [{'text': '##ide', 'start'

###############################                BiLSTM-CRF

In [159]:
pip install transformers torch torchcrf


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [160]:
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForTokenClassification
from TorchCRF import CRF

In [161]:
# Load ClinicalBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
base_model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)  #  binary classification (e.g., entity or not)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [162]:
# Extend the base model with CRF layer
class BiLSTM_CRF(nn.Module):
    def __init__(self, base_model, hidden_dim, num_labels):
        super(BiLSTM_CRF, self).__init__()
        self.base_model = base_model
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels

        # BiLSTM layer
        self.lstm = nn.LSTM(base_model.config.hidden_size, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        # Linear layer to project LSTM output to label space
        self.hidden2label = nn.Linear(hidden_dim, num_labels)

        # CRF layer
        self.crf = CRF(num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.logits

        # BiLSTM layer
        lstm_out, _ = self.lstm(sequence_output)

        # Project to label space
        emissions = self.hidden2label(lstm_out)

        # CRF layer
        if labels is not None:
            loss = -self.crf(emissions, labels, mask=attention_mask.byte(), reduction='token_mean')
            return loss
        else:
            return self.crf.viterbi_decode(emissions, mask=attention_mask.byte())


In [163]:
# Instantiate the BiLSTM-CRF model
hidden_dim = 256  # You can adjust this hyperparameter
num_labels = 2    
model = BiLSTM_CRF(base_model, hidden_dim, num_labels)

In [164]:
# Function to convert labels to indices
def labels_to_index(labels):
    label_map = {'B': 1, 'I': 1, 'O': 0}  # Assuming 'B' and 'I' as entity labels, 'O' as non-entity
    return torch.tensor([label_map[label] for label in labels], dtype=torch.long)


In [165]:
# Function to convert indices to labels
def index_to_labels(indices, attention_mask):
    labels = []
    for idx, mask in zip(indices, attention_mask):
        if mask:
            labels.append('B' if idx == 1 else 'O')
    return labels

In [166]:
# Modify the function to display matching results with BiLSTM-CRF
def display_matching_results_with_bilstm_crf(json_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

        notes = data.get('notes', [])

        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                annotations = note.get('annotations', [])
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    covered_text = annotation['covered_text']

                    print("Start char:", begin, "\tEnd char:", end, "\tCovered text:", covered_text)

                    # Tokenize input text
                    inputs = tokenizer(covered_text, return_tensors="pt", truncation=True, padding=True)

                    # Convert labels to indices
                    label_indices = labels_to_index(['O'] * len(inputs["input_ids"][0]))  # Initialize with 'O' for non-entity

                    # Perform inference
                    with torch.no_grad():
                        outputs = model(inputs["input_ids"], inputs["attention_mask"])

                    # Convert indices to labels
                    predicted_labels = index_to_labels(outputs[0], inputs["attention_mask"][0])
                    print("Predicted Labels (BiLSTM-CRF):", predicted_labels)

# Modify the main loop to use the new function
for json_file in json_files:
    print(f"Processing {json_file}...")
    display_matching_results_with_bilstm_crf(json_file)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\testing\gold\Inpatient\ICD-10\1.0\100197-ICD-10.json...
Start char: 374 	End char: 377 	Covered text: IPH
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O']
Start char: 383 	End char: 409 	Covered text: intraventricular extension
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O', 'B', 'O', 'B', 'O']
Start char: 430 	End char: 443 	Covered text: hydrocephalus
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O', 'B', 'O', 'B']
Start char: 835 	End char: 862 	Covered text: intraparenchymal hemorrhage
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O']
Start char: 1577 	End char: 1580 	Covered text: DNR
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O', 'B']
Start char: 1840 	End char: 1856 	Covered text: DIABETES TYPE II
Predicted Labels (BiLSTM-CRF): ['B', 'O', 'B', 'O', 'B']
Start char: 1859 	End char: 1873 	Covered text: HYPERLIPIDEMIA
Predicted Labels (