In [66]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


In [67]:
from torch import cuda

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [68]:
# Folder that holds training.csv and testing.csv.
# Set the NER_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset we fall back to the original author's local folder so
# the notebook keeps working unchanged on that machine.
DATA_DIR = Path(
    os.environ.get(
        "NER_DATA_DIR",
        r'C:\Users\okechukwu chude\Documents\NLP\text extraction\bert-crf\#project files',
    )
)

# Each CSV is read into a DataFrame; later cells use the `sentence` and
# `word_labels` columns.
train_data = pd.read_csv(DATA_DIR / 'training.csv')
test_data = pd.read_csv(DATA_DIR / 'testing.csv')

In [69]:
# Row counts of the (train, test) splits.
tuple(len(frame) for frame in (train_data, test_data))

(6748, 1688)

In [70]:
# Peek at the first five rows of the test split.
test_data.head(n=5)

Unnamed: 0,sentence,word_labels
0,Disp :* 10 Tablet ( s ) * Refills :* 0 * 10 . ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Hematology : Mod anemia post-op Endocrine : RI...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,Compared to tracing # 1 no diagnostic interim ...,"O,O,O,O,O,O,O,O,O,O"
3,Dilated RV cavity .,"O,O,O,O"
4,FEW RBC'S .,"O,O,O"


In [71]:
# Show a bounded, untruncated preview rather than the entire frame: printing
# every row floods the notebook, bloats the saved file, and writes the full
# text of the split into the stored output.
# NOTE(review): the rows look like clinical notes - confirm they are
# de-identified before sharing outputs.
PREVIEW_ROWS = 20  # enough rows to eyeball sentence/label alignment
print(test_data.head(PREVIEW_ROWS).to_string())

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [72]:
# Raw text of the 7th test row (positional index 6).
test_data["sentence"].iloc[6]

'There is an expansile 3.5 cm mass in the tail of the pancreas .'

In [73]:
# Comma-separated word-level tags of the same row (positional index 6).
test_data["word_labels"].iloc[6]

'O,O,O,O,O,O,B,I,I,I,I,I,I,O'

In [74]:
# Function to generate the dictionaries
def generate_label_dicts(test_data):
    """Build the label <-> integer id lookup tables.

    Iterates over ``test_data['word_labels']``, where every entry is a
    comma-separated string of tags, and assigns each distinct tag the next
    free integer id in order of first appearance.

    Returns:
        (label2id, id2label): ``label2id`` maps tag -> id and ``id2label``
        is the inverse mapping id -> tag.
    """
    label2id = {}
    for tag_string in test_data['word_labels']:
        for tag in tag_string.split(','):
            # setdefault only stores a value the first time a tag is seen,
            # so ids are handed out 0, 1, 2, ... in first-appearance order.
            label2id.setdefault(tag, len(label2id))

    id2label = {tag_id: tag for tag, tag_id in label2id.items()}
    return label2id, id2label

# Generate dictionaries
# NOTE(review): the mapping is derived from the test split only; a tag that
# occurs solely in the training split would be missing - confirm both splits
# share the same tag set.
label2id, id2label = generate_label_dicts(test_data)

for heading, mapping in (("label2id:", label2id), ("id2label:", id2label)):
    print(heading, mapping)

label2id: {'O': 0, 'B': 1, 'I': 2}
id2label: {0: 'O', 1: 'B', 2: 'I'}


In [75]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [76]:
# Hub checkpoint whose tokenizer is used for word-piece splitting below.
MODEL_NAME = "medicalai/ClinicalBERT"
# Fixed sequence length (in word pieces, special tokens included) that every
# example is truncated or padded to.
MAX_LEN = 128
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [77]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word while keeping labels aligned.

    The sentence is stripped and split on whitespace, and `text_labels` is
    split on commas; words and labels are then paired positionally. Each word
    is passed to `tokenizer.tokenize` on its own, and the word's label is
    repeated once for every sub-word piece produced, so the two returned
    lists always have the same length.

    Tokenizing one word at a time is slower than tokenizing the whole
    sentence, but it makes the word -> word-piece label mapping trivial.

    Returns:
        (tokenized_sentence, labels): the flat list of word pieces and the
        equally long list of per-piece labels.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces = []
    piece_tags = []
    # zip pairs words with tags positionally and stops at the shorter list
    for current_word, current_tag in zip(words, word_tags):
        sub_tokens = tokenizer.tokenize(current_word)
        pieces += sub_tokens
        # every piece of the word inherits the word-level tag
        piece_tags += [current_tag] * len(sub_tokens)

    return pieces, piece_tags

In [78]:
class dataset(Dataset):
    """Map-style dataset yielding fixed-length tensors for token classification.

    Each row of `dataframe` must provide a `sentence` column (whitespace
    separated words) and a `word_labels` column (comma-separated tags, one
    per word). Indexing returns a dict of three ``torch.long`` tensors, each
    of length `max_len`:

    * ``ids``     - word-piece ids, wrapped in [CLS] ... [SEP] and padded
    * ``mask``    - 1 for real tokens, 0 for [PAD]
    * ``targets`` - label ids aligned one-to-one with ``ids``

    The class relies on two module-level names defined elsewhere in the
    notebook: `tokenize_and_preserve_labels` and `label2id`.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the frame, the tokenizer and the target sequence length."""
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Encode the row at `index` into ``ids`` / ``mask`` / ``targets`` tensors."""
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # Both special tokens get the outside label. The "O" for [SEP] must be
        # appended at the very end: `insert(-1, ...)` would place it *before*
        # the last element and shift the final word piece's label onto [SEP].
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate
            # NOTE(review): plain slicing drops the trailing [SEP] for
            # over-long inputs - confirm whether that is intended.
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad tokens with [PAD] and labels with the outside tag
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # step 4: obtain the attention mask (0 on padding, 1 elsewhere)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        # step 6: convert string labels to integer ids
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long),
        }

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

In [79]:
# Report the (rows, columns) shape of each split.
for template, frame in (("TRAIN Dataset: {}", train_data), ("TEST Dataset: {}", test_data)):
    print(template.format(frame.shape))

TRAIN Dataset: (6748, 2)
TEST Dataset: (1688, 2)


In [80]:
# Wrap both splits so that indexing yields padded id/mask/target tensors.
training_set = dataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = dataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [81]:
# Encoded input ids of the first test example (padded out to MAX_LEN).
first_example = testing_set[0]
first_example["ids"]

tensor([  101, 27920, 10410,   131,   115, 10150, 21783, 10123,   113,   187,
          114,   115, 48056, 19503, 10107,   131,   115,   121,   115, 10150,
          119, 10360, 28229, 25750, 10157, 43093, 12214, 25699, 21783, 10123,
        11546,   131, 10464,   113,   122,   114, 21783, 10123, 10514,   185,
        53398, 10237,   113, 14234, 10233, 19573,   114, 10142,   124, 13990,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [82]:
# print the first 30 tokens and corresponding labels
preview_tokens = tokenizer.convert_ids_to_tokens(testing_set[0]["ids"][:30])
preview_targets = testing_set[0]["targets"][:30]
for piece, target in zip(preview_tokens, preview_targets):
    # left-align the word piece in a 10-character column, then its tag
    print('{0:10}  {1}'.format(piece, id2label[target.item()]))

[CLS]       O
dis         O
##p         O
:           O
*           O
10          O
table       O
##t         O
(           O
s           O
)           O
*           O
ref         O
##ill       O
##s         O
:           O
*           O
0           O
*           O
10          O
.           O
az          O
##ith       O
##rom       O
##y         O
##cin       O
250         O
mg          O
table       O
##t         O


In [83]:
import json
import os

# Generate labeled_texts.jsonl
def _sample_to_record(index):
    """Convert the encoded test example at `index` into a JSON-serialisable dict."""
    sample = testing_set[index]
    label_ids = sample["targets"].tolist()
    return {
        "id": index,
        "input_ids": sample["ids"].tolist(),
        "text_labels": [id2label[label_id] for label_id in label_ids],
        "labels": label_ids,
    }

# Index by position (not by iterating the dataset) so that exactly
# len(testing_set) examples are encoded.
labeled_texts = [_sample_to_record(index) for index in range(len(testing_set))]

# One JSON object per line.
with open("ClinicalBERT_test_labeled_texts.jsonl", "w") as outfile:
    outfile.writelines(json.dumps(record) + "\n" for record in labeled_texts)

# Generate label2id.jsonl
# The whole mapping is written as a single JSON object.
with open("ClinicalBERT_test_label2id.jsonl", "w") as outfile:
    outfile.write(json.dumps(label2id))
