In [41]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


In [42]:
from torch import cuda
# Run on the GPU when torch reports one is available, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [43]:
from pathlib import Path

# Single place that points the notebook at the project data. Both CSV files
# live in this directory, so relocating the data only requires editing this line.
DATA_DIR = Path(r'C:\Users\okechukwu chude\Documents\NLP\text extraction\bert-crf\#project files')

# Each CSV is expected to provide a `sentence` column and a comma-separated
# `word_labels` column (both are read by later cells).
train_data = pd.read_csv(DATA_DIR / 'training.csv')
test_data = pd.read_csv(DATA_DIR / 'testing.csv')

In [44]:
# Number of rows in the training and test frames, respectively.
train_data.shape[0], test_data.shape[0]

(6748, 1688)

In [45]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,sentence,word_labels
0,[ **2167-7-7 ** ] 1:18 AM CHEST ( SINGLE VIEW ...,"O,O,O,O,O,O,B,I,I,I,I,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Overall left ventricular systolic function is ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,( every 6 hours ) as needed for Pain / HA : Do...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,Successful embolization of a right hepatic art...,"O,O,O,O,O,O,O,O,B,O,O,O,O,O,O,O,O"
4,INDICATION : 23 -year-old female with traumati...,"O,O,O,O,O,O,B,I,I,O"


In [46]:
# Raw text of the second training row (positional index 1).
train_data["sentence"].iloc[1]

'Overall left ventricular systolic function is low normal ( LVEF 50 - 55 % ) .'

In [47]:
# Comma-separated label string of the same row (positional index 1).
train_data["word_labels"].iloc[1]

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

In [48]:
# Build the label <-> integer id lookup tables
def generate_label_dicts(train_data):
    """
    Derive the label vocabulary from the `word_labels` column.

    Every entry of ``train_data['word_labels']`` is split on commas; each
    distinct label receives the next free integer id, in order of first
    appearance.

    Returns a ``(label2id, id2label)`` pair of dictionaries that are inverses
    of each other.
    """
    label2id = {}

    for tag_string in train_data['word_labels']:
        for tag in tag_string.split(','):
            # setdefault only assigns when the tag is new, so ids stay stable
            label2id.setdefault(tag, len(label2id))

    id2label = {tag_id: tag for tag, tag_id in label2id.items()}

    return label2id, id2label

# Build both lookup tables from the labels seen in the training frame
label2id, id2label = generate_label_dicts(train_data)

# Show the two mappings so the label vocabulary can be checked by eye
print(f"label2id: {label2id}")
print(f"id2label: {id2label}")

label2id: {'O': 0, 'B': 1, 'I': 2}
id2label: {0: 'O', 1: 'B', 2: 'I'}


In [49]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [50]:
# Fixed sequence length (in word pieces) that the dataset class below pads or
# truncates every example to.
MAX_LEN = 128

# Tokenizer for the "praneethvasarla/med-bert" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("praneethvasarla/med-bert")


tokenizer_config.json: 100%|██████████| 1.19k/1.19k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.56MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.02MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<?, ?B/s] 


In [51]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word while keeping labels aligned.

    The sentence is split on whitespace and `text_labels` on commas; the two
    sequences are paired positionally. Each word is passed to
    `tokenizer.tokenize` on its own, and the word's label is repeated once for
    every word piece produced, so the returned lists always have equal length.

    If the word and label counts differ, pairing stops at the shorter of the
    two and the surplus is ignored.

    Returns a `(tokens, labels)` tuple of lists.
    """
    word_pieces, piece_labels = [], []

    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    for word, tag in zip(words, word_tags):
        pieces = tokenizer.tokenize(word)

        # Keep the word pieces and replicate the word-level tag for each one
        word_pieces += pieces
        piece_labels += [tag] * len(pieces)

    return word_pieces, piece_labels

In [52]:
class dataset(Dataset):
    """
    Torch dataset that turns (sentence, word_labels) rows into fixed-length
    tensors for token classification.

    Parameters
    ----------
    dataframe : DataFrame with a `sentence` column (whitespace-separated words)
        and a `word_labels` column (comma-separated labels).
    tokenizer : object providing `tokenize` and `convert_tokens_to_ids`.
    max_len : length every returned sequence is padded or truncated to.

    Each item is a dict of three `torch.long` tensors of length `max_len`:
    `ids` (token ids), `mask` (1 for real tokens, 0 for `[PAD]`) and
    `targets` (label ids). `[CLS]`, `[SEP]` and padding positions are labelled
    "O". Label ids are looked up in the module-level `label2id` mapping.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        # Positional lookup, so frames whose index is not 0..n-1 (e.g. after
        # filtering or shuffling) still work.
        sentence = self.data["sentence"].iloc[index]
        word_labels = self.data["word_labels"].iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        maxlen = self.max_len

        # step 2: truncate the word pieces first, leaving room for the two
        # special tokens, so that a long sentence still ends with [SEP]
        room = max(maxlen - 2, 0)
        tokenized_sentence = tokenized_sentence[:room]
        labels = labels[:room]

        # step 3: add special tokens and an outside label for each of them.
        # The [SEP] label is appended at the end; `insert(-1, ...)` would put
        # it before the last real label and shift that label onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # Guard for max_len < 2, where the two special tokens alone overflow
        tokenized_sentence = tokenized_sentence[:maxlen]
        labels = labels[:maxlen]

        # step 4: pad up to max_len
        n_pad = maxlen - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ["[PAD]"] * n_pad
        labels = labels + ["O"] * n_pad

        # step 5: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 6: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [53]:
# Report (rows, columns) of both frames before wrapping them in datasets
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

TRAIN Dataset: (6748, 2)
TEST Dataset: (1688, 2)


In [54]:
# Wrap both frames with the same tokenizer and sequence length
training_set = dataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = dataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [55]:
# Inspect the encoded input ids of the first training example (padded or truncated to MAX_LEN entries).
training_set[0]["ids"]

tensor([  101,  1031,  1008,  1008, 20294,  2581,  1011,  1021,  1011,  1021,
         1008,  1008,  1033,  1015,  1024,  2324,  2572,  3108,  1006,  2309,
         3193,  1007, 12528,  1001,  1031,  1008,  1008, 12528,  2193,  1006,
         2557,  6483,  1007,  2753, 22907,  2629,  1008,  1008,  1033,  3114,
         1024,  1054,  1013,  1051,  1052,  2532,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,  1035,
         1035,  1035,  1035,  1035,   102,     0,     0,     0])

In [56]:
# Show the first 30 word pieces of the first training example next to their labels
first_example = training_set[0]
preview_tokens = tokenizer.convert_ids_to_tokens(first_example["ids"][:30])
preview_label_ids = first_example["targets"][:30]

for token, label in zip(preview_tokens, preview_label_ids):
    # label is a 0-d tensor; .item() yields the plain int key used by id2label
    print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
[           O
*           O
*           O
216         O
##7         O
-           O
7           O
-           O
7           O
*           O
*           O
]           O
1           O
:           O
18          O
am          O
chest       B
(           I
single      I
view        I
)           I
clip        O
#           O
[           O
*           O
*           O
clip        O
number      O
(           O


In [57]:
import json
import os

# Collect one record per training example: token ids plus labels in both
# integer and text form
labeled_texts = []

for index in range(len(training_set)):
    sample = training_set[index]
    target_ids = sample["targets"].tolist()
    labeled_texts.append({
        "id": index,
        "input_ids": sample["ids"].tolist(),
        "text_labels": [id2label[target_id] for target_id in target_ids],
        "labels": target_ids,
    })

# Write the records as JSON Lines: one JSON object per line
with open("med-bert_labeled_texts.jsonl", "w") as outfile:
    outfile.writelines(json.dumps(record) + "\n" for record in labeled_texts)

# Write the label -> id mapping as a single JSON object
with open("med-bert_label2id.jsonl", "w") as outfile:
    outfile.write(json.dumps(label2id))
