<a href="https://colab.research.google.com/github/okechukwuchude/Automating-Medical-Coding/blob/main/Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install stanza



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| forward_charlm  | mimic          |
| backward_charlm | mimic          |
| pretrain        | mimic          |

INFO:stanza:File exists: /root/stanza_resources/en/tokenize/mimic.pt
INFO:stanza:File exists: /root/stanza_resources/en/pos/mimic_charlm.pt
INFO:stanza:File exists: /root/stanza_resources/en/lemma/mimic_nocharlm.pt
INFO:stanza:File exists: /root/stanza_resources/en/depparse/mimic_charlm.pt
INFO:stanza:File exists: /root/stanza_resources/en/ner/i2b2.pt
INFO:stanza:File exists: /root/stanza_resources/en/forward_charlm/mimic.pt
INFO:stanza:File exists: /root/stanza_resources/en/

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | mimic          |
| pos       | mimic_charlm   |
| lemma     | mimic_nocharlm |
| depparse  | mimic_charlm   |
| ner       | i2b2           |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [7]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc

In [8]:
# Global variable to store all extracted texts
extracted_texts = []

In [9]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Extract sentence information and generate labels
                sentence_info = []
                for sent in processed_text.sentences:
                    tokens = [word.text for word in sent.words]
                    labels = ['O'] * len(tokens)  # Initialize labels as 'O'

                    for annotation in note_info['annotations']:
                        begin = annotation['begin']
                        end = annotation['end']

                        word_indices = find_word_indices(sent.words, begin, end)

                        # Assign labels
                        if word_indices:
                            start_idx, end_idx = word_indices[0], word_indices[-1]
                            labels[start_idx] = 'B'
                            for idx in range(start_idx + 1, end_idx + 1):
                                labels[idx] = 'I'
                        # for idx in word_indices:
                        #     if sent.words[idx].start_char == begin:
                        #         labels[idx] = 'B'
                        #     elif sent.words[idx].start_char > begin and sent.words[idx].end_char < end:
                        #         labels[idx] = 'I'

                    sentence_info.append({
                        'sentence_id': sent.index,
                        'words': tokens,
                        'labels': labels
                    })

                    # Print the result for each sentence
                    print('Sentence:', ' '.join(tokens))
                    print('Labels:', ' '.join(labels))
                    print()

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable

#

In [10]:
def find_word_indices(words, begin, end):
    # Create an empty list to store the indices of words found within the given range
    word_indices = []

    # Iterate through each token in the list of tokens
    for idx, word in enumerate(words):
        # Check if the start character of the token matches the beginning of the range
        # or if the token spans the beginning of the range
        if word.start_char == begin or (word.start_char < begin and word.end_char > begin):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
        # Check if the end character of the token matches the end of the range
        # or if the token spans the end of the range
        if word.end_char == end or (word.start_char < end and word.end_char > end):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
            break

    # Return the list of word indices found within the given range
    return word_indices

In [11]:
def generate_labels(words, annotations, start_token_idx, end_token_idx):
    labels = ['O'] * len(words)

    for annotation in annotations:
        code = annotation['code']

        # # Update labels for the matched words
        # if start_token_idx is not None and end_token_idx is not None:
        #     labels[start_token_idx] = f'B-{code}'
        #     for idx in range(start_token_idx + 1, end_token_idx + 1):
        #         labels[idx] = f'I-{code}'

    return labels

In [12]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [13]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = '/content/drive/MyDrive/#medical coding/testing'

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Sentence: Brief Hospital Course : # Lower GI Bleed : Admitted with a Hct of 29 from baseline 43 and orthostatic by vital signs .
Labels: O O O O O O O O O O O O O O O O O O O O O O O O

Sentence: She was aggressively volume resuscitated with 5 L of crystalloid and transfused 2 units of PRBCs after continuning to pass dilute blood with a Golytely prep , which was then held the first night of the hospitalization after completing half of the prep .
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence: On hospital day 2 , she underwent colonoscopy , which was remarkable for bleeding in the cecum , the site of 2 of her polypectomies 9 days prior to admission ; 4 clips were placed with adequate hemostasis .
Labels: O O O O O O O O O O O O O B I I I I I I I I I I I O O O O O O O O O O O O O O

Sentence: Her volume and hematocrit subsequently remained stable .
Labels: O O O O 

In [14]:
import pandas as pd

# Function to create a DataFrame from the extracted sentence information
def create_dataframe(extracted_texts):
    data = []

    for note_info in extracted_texts:
        if 'sentence_info' in note_info:
            for sentence_info in note_info['sentence_info']:
                words = sentence_info['words']
                labels = sentence_info['labels']

                for word, label in zip(words, labels):
                    data.append({
                        'hadm_id': note_info['hadm_id'],
                        'note_id': note_info['note_id'],
                        'sentence_id': sentence_info['sentence_id'],
                        'word': word,
                        'label': label
                    })

    df = pd.DataFrame(data)
    return df

df = create_dataframe(extracted_texts)
print(df)

       hadm_id  note_id  sentence_id           word label
0       101525   566615            0   Demographics     O
1       101525   566615            0            Day     O
2       101525   566615            0             of     O
3       101525   566615            0     intubation     O
4       101525   566615            0              :     O
...        ...      ...          ...            ...   ...
40837   100463    55049          107            **]     O
40838   100463    55049          108      Completed     O
40839   100463    55049          108             by     O
40840   100463    55049          108  :[**2109-7-31     O
40841   100463    55049          108            **]     O

[40842 rows x 5 columns]


In [15]:
label_counts = df['label'].value_counts()
print(label_counts)

O    39926
I      582
B      334
Name: label, dtype: int64


In [16]:
token_df= df.drop(['hadm_id', 'note_id',], axis=1)

In [17]:
token_df

Unnamed: 0,sentence_id,word,label
0,0,Demographics,O
1,0,Day,O
2,0,of,O
3,0,intubation,O
4,0,:,O
...,...,...,...
40837,107,**],O
40838,108,Completed,O
40839,108,by,O
40840,108,:[**2109-7-31,O


In [18]:
token_df.rename(columns = {'word':'words', 'label':'labels'}, inplace = True)

In [19]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Demographics,O
1,0,Day,O
2,0,of,O
3,0,intubation,O
4,0,:,O


In [20]:
x = token_df[['sentence_id', 'words']]
y = token_df['labels']

SPLITTING DATA

In [21]:
# Splitting data into train and test sets with stratified sampling

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify=y)

In [22]:
#build the training and test data
train_data = pd.DataFrame({'sentence_id': x_train['sentence_id'], 'words': x_train['words'], 'labels': y_train})
test_data = pd.DataFrame({'sentence_id': x_test['sentence_id'], 'words': x_test['words'], 'labels': y_test})

In [23]:
train_data

Unnamed: 0,sentence_id,words,labels
37630,187,Fax,O
17339,78,take,O
952,19,],O
26404,13,base,O
19850,12,Delivery,O
...,...,...,...
4722,4,recieved,O
7362,13,RSBI,O
24824,13,-,O
37815,195,1-13,O


MODEL TRAINING

BERT

In [24]:
pip install simpletransformers



In [23]:
from simpletransformers.ner import NERModel, NERArgs

In [24]:
label = token_df['labels'].unique().tolist()
label

['O', 'B', 'I']

In [25]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [26]:
model = NERModel('bert', 'bert-base-cased', labels=label, args=args, use_cuda = False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from sklearn.metrics import accuracy_score

model.train_model(train_data, eval_data = test_data, acc = accuracy_score)

  return [


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/7 [00:00<?, ?it/s]

(7, 0.42912608810833525)

In [28]:
result, model_outputs, preds_list = model.eval_model(test_data)

  return [


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
result

{'eval_loss': 0.07785882630075018,
 'precision': 0.0,
 'recall': 0.0,
 'f1_score': 0.0}

In [30]:
prediction, model_output = model.predict(['There is also intraventricular extension into the ipsilateral and contralateral lateral ventricles.'])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
prediction

[[{'There': 'O'},
  {'is': 'O'},
  {'also': 'O'},
  {'intraventricular': 'O'},
  {'extension': 'O'},
  {'into': 'O'},
  {'the': 'O'},
  {'ipsilateral': 'O'},
  {'and': 'O'},
  {'contralateral': 'O'},
  {'lateral': 'O'},
  {'ventricles.': 'O'}]]

BioBERT

In [32]:
bio_label = token_df['labels'].unique().tolist()
bio_label

['O', 'B', 'I']

In [33]:
from transformers import BertTokenizer, BertForMaskedLM, BertForTokenClassification
import torch

Converting tokens to token IDs and create attention masks

In [34]:
# Convert tokens to token IDs and create attention masks
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
x_train_tokenized = tokenizer(x_train['words'].tolist(), padding=True, truncation=True, return_tensors="pt")
x_test_tokenized = tokenizer(x_test['words'].tolist(), padding=True, truncation=True, return_tensors="pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Convert labels to label IDs

In [35]:
# Convert labels to label IDs
label2id = {label: idx for idx, label in enumerate(df['label'].unique())}
y_train_ids = torch.tensor([label2id[label] for label in y_train.tolist()])
y_test_ids = torch.tensor([label2id[label] for label in y_test.tolist()])

CREATE DATALOADER

In [36]:
from torch.utils.data import TensorDataset, DataLoader

In [37]:
# Set the desired batch size
batch_size = 32

# Create DataLoader for training and validation
train_dataset = TensorDataset(x_train_tokenized['input_ids'],
                              x_train_tokenized['attention_mask'],
                              y_train_ids)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(x_test_tokenized['input_ids'],
                            x_test_tokenized['attention_mask'],
                            y_test_ids)
val_loader = DataLoader(val_dataset, batch_size = batch_size , shuffle=False)

In [38]:
# Define optimizer and loss function
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.2', num_labels=len(label2id))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Fine-tuning loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training loop
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Get the logits from the model output
        loss = loss_fn(logits.view(-1, len(label2id)), labels.view(-1))  # Calculate the loss
        loss.backward()
        optimizer.step()

    # Evaluation loop
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            val_loss += loss_fn(logits.view(-1, len(label2id)), labels.view(-1)).item()  # Calculate validation loss
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print training and validation metrics
    train_loss = loss.item()
    val_loss /= len(val_loader)
    accuracy = correct / total
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {accuracy:.4f}")

Epoch 1/10


ValueError: Expected input batch_size (2560) to match target batch_size (32).

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from simpletransformers.ner import NERModel, NERArgs



In [None]:
# Initialize NER model arguments
args = NERArgs()
args.num_train_epochs = 10  # Number of training epochs
args.learning_rate = 2e-5   # Learning rate
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

# Create a list to store the F1 scores for each fold
f1_scores = []

# Use stratified sampling to split the data into folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds
for fold, (train_idx, val_idx) in enumerate(skf.split(x, y)):
    print(f"Fold {fold + 1}")

    # Split the data into train and validation sets
    X_train, X_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create pandas DataFrames for train and validation data
    train_data = pd.DataFrame({'sentence_id': X_train['sentence_id'], 'words': X_train['words'], 'labels': y_train})
    val_data = pd.DataFrame({'sentence_id': X_val['sentence_id'], 'words': X_val['words'], 'labels': y_val})

    # Convert labels to binary format
    mlb = MultiLabelBinarizer()
    y_train_binary = mlb.fit_transform(train_data['labels'].values)
    y_val_binary = mlb.transform(val_data['labels'].values)

    # Flatten y_val_binary for accuracy calculation
    y_val_flat = y_val_binary.flatten()

    # Create the NER model with BioBERT
    model = NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', args=args, labels=mlb.classes_.tolist(), use_cuda=False)

    # Train the model
    model.train_model(train_data, train_labels=y_train_binary, eval_data=val_data, eval_labels=y_val_binary)

    # Evaluate the model on the validation data
    result, model_outputs, preds_list = model.eval_model(val_data, eval_labels=y_val_binary, acc=accuracy_score)

    # Calculate and store the F1 score for this fold
    f1 = f1_score(y_val_flat, preds_list, average='micro')
    f1_scores.append(f1)

    print(f"Validation F1 score: {f1:.4f}")
    print()

# Print the average F1 score across all folds
print(f"Average F1 score: {sum(f1_scores) / len(f1_scores):.4f}")

Fold 1


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/7 [00:00<?, ?it/s]