<a href="https://colab.research.google.com/github/okechukwuchude/Automating-Medical-Coding/blob/main/Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#pip install stanza

In [2]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.72MB/s]                    
2024-03-20 23:04:43 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-20 23:04:43 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| backward_charlm | mimic          |
| forward_charlm  | mimic          |
| pretrain        | mimic          |

2024-03-20 23:04:43 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-20 23:04:43 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-20 23:04:43 INFO: File 

In [3]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc

In [4]:
# Global variable to store all extracted texts
extracted_texts = []

In [5]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Extract sentence information and generate labels
                sentence_info = []
                for sent in processed_text.sentences:
                    tokens = [word.text for word in sent.words]
                    labels = ['O'] * len(tokens)  # Initialize labels as 'O'

                    for annotation in note_info['annotations']:
                        begin = annotation['begin']
                        end = annotation['end']

                        word_indices = find_word_indices(sent.words, begin, end)

                        # Assign labels
                        if word_indices:
                            start_idx, end_idx = word_indices[0], word_indices[-1]
                            labels[start_idx] = 'B'
                            for idx in range(start_idx + 1, end_idx + 1):
                                labels[idx] = 'I'
                        # for idx in word_indices:
                        #     if sent.words[idx].start_char == begin:
                        #         labels[idx] = 'B'
                        #     elif sent.words[idx].start_char > begin and sent.words[idx].end_char < end:
                        #         labels[idx] = 'I'

                    sentence_info.append({
                        'sentence_id': sent.index,
                        'words': tokens,
                        'labels': labels
                    })

                    # Print the result for each sentence
                    print('Sentence:', ' '.join(tokens))
                    print('Labels:', ' '.join(labels))
                    print()

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable

#

In [6]:
def find_word_indices(words, begin, end):
    # Create an empty list to store the indices of words found within the given range
    word_indices = []

    # Iterate through each token in the list of tokens
    for idx, word in enumerate(words):
        # Check if the start character of the token matches the beginning of the range
        # or if the token spans the beginning of the range
        if word.start_char == begin or (word.start_char < begin and word.end_char > begin):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
        # Check if the end character of the token matches the end of the range
        # or if the token spans the end of the range
        if word.end_char == end or (word.start_char < end and word.end_char > end):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
            break

    # Return the list of word indices found within the given range
    return word_indices

In [7]:
def generate_labels(words, annotations, start_token_idx, end_token_idx):
    labels = ['O'] * len(words)

    for annotation in annotations:
        code = annotation['code']

        # # Update labels for the matched words
        # if start_token_idx is not None and end_token_idx is not None:
        #     labels[start_token_idx] = f'B-{code}'
        #     for idx in range(start_token_idx + 1, end_token_idx + 1):
        #         labels[idx] = f'I-{code}'

    return labels

In [8]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [9]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path =  r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"


# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json


Sentence: Admission Date : [ **2136-10-23 **] Discharge Date : [ **2136-10-24 **]
Labels: O O O O O O O O O O O O

Sentence: Date of Birth : [ ** 2056-7-14 **] Sex : M
Labels: O O O O O O O O O O O

Sentence: Service : NEUROSURGERY
Labels: O O O

Sentence: Allergies : No Known Allergies / Adverse Drug Reactions
Labels: O O O O O O O O O

Sentence: Attending :[** First Name3 ( LF ) 1835 **] Chief Complaint : Found down
Labels: O O O O O O O O O O O O O O

Sentence: Major Surgical or Invasive Procedure : None
Labels: O O O O O O O

Sentence: History of Present Illness : 80M p/w a large R IPH with intraventricular extension , midline shift , and hydrocephalus .
Labels: O O O O O O O O O O B O B I O O O O O B O

Sentence: The patient was found lying face up in his bathtub after having shaved .
Labels: O O O O O O O O O O O O O O

Sentence: There was no water in the tub and the shower was not turned on .
Labels: O O O O O O O O O O O O O O O

Sentence: He was found by a neighbor .
Labels: O

In [10]:
import pandas as pd

# Function to create a DataFrame from the extracted sentence information
def create_dataframe(extracted_texts):
    data = []

    for note_info in extracted_texts:
        if 'sentence_info' in note_info:
            for sentence_info in note_info['sentence_info']:
                words = sentence_info['words']
                labels = sentence_info['labels']

                for word, label in zip(words, labels):
                    data.append({
                        'hadm_id': note_info['hadm_id'],
                        'note_id': note_info['note_id'],
                        'sentence_id': sentence_info['sentence_id'],
                        'word': word,
                        'label': label
                    })

    df = pd.DataFrame(data)
    return df

df = create_dataframe(extracted_texts)
print(df)

      hadm_id  note_id  sentence_id          word label
0      100197    25762            0     Admission     O
1      100197    25762            0          Date     O
2      100197    25762            0             :     O
3      100197    25762            0             [     O
4      100197    25762            0  **2136-10-23     O
...       ...      ...          ...           ...   ...
9868   101525   240524            2       tracing     O
9869   101525   240524            2     available     O
9870   101525   240524            2           for     O
9871   101525   240524            2    comparison     O
9872   101525   240524            2             .     O

[9873 rows x 5 columns]


In [11]:
label_counts = df['label'].value_counts()
print(label_counts)

label
O    9647
I     140
B      86
Name: count, dtype: int64


In [12]:
token_df= df.drop(['hadm_id', 'note_id',], axis=1)

In [13]:
token_df

Unnamed: 0,sentence_id,word,label
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O
...,...,...,...
9868,2,tracing,O
9869,2,available,O
9870,2,for,O
9871,2,comparison,O


In [14]:
token_df.rename(columns = {'word':'words', 'label':'labels'}, inplace = True)

In [15]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O


In [16]:
x = token_df[['sentence_id', 'words']]
y = token_df['labels']

SPLITTING DATA

In [17]:
# Splitting data into train and test sets with stratified sampling

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify=y)

In [18]:
#build the training and test data
train_data = pd.DataFrame({'sentence_id': x_train['sentence_id'], 'words': x_train['words'], 'labels': y_train})
test_data = pd.DataFrame({'sentence_id': x_test['sentence_id'], 'words': x_test['words'], 'labels': y_test})

In [19]:
train_data

Unnamed: 0,sentence_id,words,labels
5436,5,Events,O
3721,9,current,O
8924,12,7,O
3963,16,mg,O
9502,7,the,O
...,...,...,...
3890,14,Dorsalis,O
2299,58,.,O
9803,3,woman,O
8620,6,[**2134-3-24,O


MODEL TRAINING

BERT

In [20]:
#pip install simpletransformers

In [21]:
from simpletransformers.ner import NERModel, NERArgs

In [22]:
label = token_df['labels'].unique().tolist()
label

['O', 'B', 'I']

In [23]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [24]:
model = NERModel('bert', 'bert-base-cased', labels=label, args=args, use_cuda = False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from sklearn.metrics import accuracy_score

model.train_model(train_data, eval_data = test_data, acc = accuracy_score)

100%|██████████| 1/1 [00:09<00:00,  9.19s/it]
Epochs 1/1. Running Loss:    0.2241: 100%|██████████| 3/3 [00:49<00:00, 16.53s/it]
Epoch 1 of 1: 100%|██████████| 1/1 [00:51<00:00, 51.65s/it]


(3, 0.9740698536237081)

In [26]:
result, model_outputs, preds_list = model.eval_model(test_data)

100%|██████████| 1/1 [00:09<00:00,  9.74s/it]
Running Evaluation: 100%|██████████| 3/3 [00:17<00:00,  5.96s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
result

{'eval_loss': 0.24045485258102417,
 'precision': 0.0,
 'recall': 0.0,
 'f1_score': 0.0}

In [28]:
prediction, model_output = model.predict(['There is also intraventricular extension into the ipsilateral and contralateral lateral ventricles.'])

100%|██████████| 1/1 [00:10<00:00, 10.38s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  3.25it/s]


In [29]:
prediction

[[{'There': 'O'},
  {'is': 'O'},
  {'also': 'O'},
  {'intraventricular': 'O'},
  {'extension': 'O'},
  {'into': 'O'},
  {'the': 'O'},
  {'ipsilateral': 'O'},
  {'and': 'O'},
  {'contralateral': 'O'},
  {'lateral': 'O'},
  {'ventricles.': 'O'}]]

BioBERT

In [30]:
bio_label = token_df['labels'].unique().tolist()
bio_label

['O', 'B', 'I']

In [45]:
from transformers import BertTokenizer, BertForMaskedLM, BertForTokenClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


Converting tokens to token IDs and create attention masks

In [46]:
# # Convert tokens to token IDs and create attention masks
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
# x_train_tokenized = tokenizer(x_train['words'].tolist(), padding=True, truncation=True, return_tensors="pt")
# x_test_tokenized = tokenizer(x_test['words'].tolist(), padding=True, truncation=True, return_tensors="pt")

In [47]:
# Define the model configuration
model_name = "dmis-lab/biobert-base-cased-v1.2"
num_labels = 3  # Number of NER labels (excluding 'O')

In [49]:
# Define optimizer and loss function
model = BertForTokenClassification.from_pretrained(model_name, num_labels=(num_labels))
#optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
optimizer = AdamW(model.parameters(), lr=2e-5)
#loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Convert labels to label IDs

In [58]:
# # Convert labels to label IDs
label_map = {label: idx for idx, label in enumerate(token_df['labels'].unique())}
# y_train_ids = torch.tensor([label2id[label] for label in y_train.tolist()])
# y_test_ids = torch.tensor([label2id[label] for label in y_test.tolist()])

CREATE DATALOADER

In [59]:
formatted_trainset = []
formatted_valset = []

for _, row in train_data.iterrows():
    input_ids = tokenizer.encode(row['words'], add_special_tokens=True)
    attention_mask = [1] * len(input_ids)
    labels = [label_map[label] for label in row['labels']]

    formatted_trainset.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    })

for _, row in test_data.iterrows():
    input_ids = tokenizer.encode(row['words'], add_special_tokens=True)
    attention_mask = [1] * len(input_ids)
    labels = [label_map[label] for label in row['labels']]

    formatted_valset.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    })

In [60]:
train_dataloader = DataLoader(formatted_trainset, batch_size=16, shuffle=True, collate_fn=lambda x: dict(
    input_ids=pad_sequence([torch.tensor(example['input_ids']) for example in x], batch_first=True),
    attention_mask=pad_sequence([torch.tensor(example['attention_mask']) for example in x], batch_first=True),
    labels=pad_sequence([torch.tensor([label_map[label] for label in example['labels']]) for example in x], batch_first=True)
))

val_dataloader = DataLoader(formatted_valset, batch_size=32, shuffle=False, collate_fn=lambda x: dict(
    input_ids=pad_sequence([torch.tensor(example['input_ids']) for example in x], batch_first=True),
    attention_mask=pad_sequence([torch.tensor(example['attention_mask']) for example in x], batch_first=True),
    labels=pad_sequence([torch.tensor([label_map[label] for label in example['labels']]) for example in x], batch_first=True)
))

In [61]:
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [62]:
from tqdm import tqdm

In [63]:
# Fine-tuning loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    # Iterate over the training dataloader
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        # Move batch tensors to the correct device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation loop
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_loss += loss.item()

    # Calculate average loss
    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)

    print("Epoch {} - Train Loss: {:.4f} - Val Loss: {:.4f}".format(epoch + 1, train_loss, val_loss))

Epoch 1:   0%|          | 0/494 [00:00<?, ?it/s]


KeyError: 0