In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


In [2]:
from torch import cuda
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [5]:
# Load the training, testing and validation CSV splits into DataFrames.
# NOTE(review): these are absolute paths on one user's machine, so the notebook
# cannot run elsewhere without editing them — consider a configurable data directory.
train_data = pd.read_csv(r'/home/chudeo/bert-crf-project/#project files/training.csv')
test_data = pd.read_csv(r'/home/chudeo/bert-crf-project/#project files/testing.csv')
validation_data = pd.read_csv(r'/home/chudeo/bert-crf-project/#project files/validation.csv')

In [6]:
# Row counts of the train, test and validation frames, displayed as a tuple.
len(train_data), len(test_data), len(validation_data)

(5905, 1266, 1265)

In [7]:
# Preview the first rows of the validation frame.
validation_data.head()

Unnamed: 0,sentence,word_labels
0,Attending :[** First Name3 ( LF ) 5084 **] Chi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,B,I,O"
1,Attending :[** First Name3 ( LF ) 1505 **] Chi...,"O,O,O,O,O,O,O,O,O,O,O,O,B"
2,The HCT remained stable .,"O,O,O,O,O"
3,2 . New or newly apparent left upper lobe opac...,"O,O,O,O,O,O,B,I,I,I,O"
4,"History : relative immobility , spends a lot o...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [8]:
# Raw sentence text of the validation row at position 1.
validation_data.iloc[1].sentence

'Attending :[** First Name3 ( LF ) 1505 **] Chief Complaint : Dyspnea'

In [9]:
# Comma-separated label string of the validation row at position 1.
validation_data.iloc[1].word_labels

'O,O,O,O,O,O,O,O,O,O,O,O,B'

In [10]:
# Build the label <-> id lookup tables
def generate_label_dicts(validation_data):
    """Create forward and reverse mappings between label strings and integer ids.

    Every comma-separated tag found under ``validation_data['word_labels']``
    is visited in row order; each tag not seen before receives the next free
    integer id, starting at 0.

    Args:
        validation_data: Object whose ``'word_labels'`` item iterates over
            comma-separated label strings (a DataFrame in this notebook).

    Returns:
        A ``(label2id, id2label)`` tuple of dicts: ``label2id`` maps each tag
        to its id and ``id2label`` is the inverse mapping.
    """
    label2id = {}
    id2label = {}

    for row_labels in validation_data['word_labels']:
        for tag in row_labels.split(','):
            if tag in label2id:
                continue
            # Ids are handed out densely in order of first appearance.
            next_id = len(label2id)
            label2id[tag] = next_id
            id2label[next_id] = tag

    return label2id, id2label

# Generate dictionaries.
# The maps are built from every split, not just validation: `dataset.__getitem__`
# looks each label up in `label2id`, so a tag that occurs only in the training or
# test data would otherwise raise a KeyError. The validation rows come first so
# that the ids of tags present in validation stay exactly as before (ids are
# assigned in order of first appearance).
label2id, id2label = generate_label_dicts(
    pd.concat([validation_data, train_data, test_data], ignore_index=True)
)

print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'O': 0, 'B': 1, 'I': 2}
id2label: {0: 'O', 1: 'B', 2: 'I'}


In [11]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [12]:
# Load the tokenizer published under the "praneethvasarla/med-bert" checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("praneethvasarla/med-bert")
# Fixed sequence length: passed as `max_len` to every `dataset` instance, which
# truncates or pads each example to exactly this many tokens.
MAX_LEN = 128


Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [13]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence one whitespace-separated word at a time and repeat
    each word's label for every word piece the tokenizer produces, so the
    returned tokens and labels stay aligned one-to-one.

    Args:
        sentence: Text whose words are separated by whitespace.
        text_labels: Comma-separated labels, one per word of ``sentence``.
        tokenizer: Object exposing ``tokenize(word)`` returning the word pieces.

    Returns:
        ``(tokenized_sentence, labels)``: two lists of equal length.

    Note:
        Words and labels are paired with ``zip``, so if their counts differ
        the surplus entries of the longer sequence are ignored.
    """
    word_pieces = []
    piece_labels = []

    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    for word, tag in zip(words, word_tags):
        # One word may expand into several sub-word pieces.
        pieces = tokenizer.tokenize(word)
        word_pieces.extend(pieces)
        # Every piece inherits the label of the word it came from.
        piece_labels.extend([tag] * len(pieces))

    return word_pieces, piece_labels

In [14]:
class dataset(Dataset):
    """Token-classification dataset producing fixed-length encoded examples.

    Each item is built from one row of ``dataframe`` (columns ``sentence`` and
    ``word_labels``): the sentence is tokenized word by word, wrapped in
    ``[CLS]``/``[SEP]``, truncated or padded to ``max_len`` and converted to
    id tensors. Label strings are converted through the module-level
    ``label2id`` mapping.

    Args:
        dataframe: Frame with ``sentence`` and ``word_labels`` columns.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Exact number of tokens every returned example has.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded example at position ``index``.

        Returns:
            dict with ``'ids'``, ``'mask'`` and ``'targets'``, each a
            ``torch.long`` tensor of length ``max_len``.
        """
        # step 1: tokenize (and adapt corresponding labels)
        # Positional lookup: `index` runs over 0..len-1, which only coincides
        # with the frame's index labels when it has a default RangeIndex.
        sentence = self.data.sentence.iloc[index]
        word_labels = self.data.word_labels.iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        # append, not insert(-1, ...): insert(-1) would place the "O" *before*
        # the last label, shifting the final word piece's label onto [SEP].
        labels.append("O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate (this cuts from the end, so the trailing [SEP] is dropped)
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # step 4: obtain the attention mask (0 on padding, 1 elsewhere)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        # `label2id` is the notebook-level label -> id mapping.
        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the wrapped frame (captured at construction)."""
        return self.len

In [15]:
# Report the (rows, columns) shape of each split, one line per split.
for template, frame in (
    ("TRAIN Dataset: {}", train_data),
    ("TEST Dataset: {}", test_data),
    ("Validation Dataset: {}", validation_data),
):
    print(template.format(frame.shape))

TRAIN Dataset: (5905, 2)
TEST Dataset: (1266, 2)
Validation Dataset: (1265, 2)


In [16]:
# Wrap each split in a `dataset` sharing the same tokenizer and sequence length.
training_set, testing_set, validation_set = (
    dataset(frame, tokenizer, MAX_LEN)
    for frame in (train_data, test_data, validation_data)
)

In [17]:
# Encoded input ids of the first validation example (padded/truncated to MAX_LEN).
validation_set[0]["ids"]

tensor([  101,  7052,  1024,  1031,  1008,  1008,  2034,  2171,  2509,  1006,
         1048,  2546,  1007,  2753,  2620,  2549,  1008,  1008,  1033,  2708,
        12087,  1024,  2157,  8292,  2890, 21700,  2099,  4649,  3258,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [18]:
# Show the first 30 tokens of the first validation example next to their labels.
first_sample = validation_set[0]
first_tokens = tokenizer.convert_ids_to_tokens(first_sample["ids"][:30])
first_targets = first_sample["targets"][:30]
for token, label in zip(first_tokens, first_targets):
    print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
attending   O
:           O
[           O
*           O
*           O
first       O
name        O
##3         O
(           O
l           O
##f         O
)           O
50          O
##8         O
##4         O
*           O
*           O
]           O
chief       O
complaint   O
:           O
right       O
ce          B
##re        B
##bella     B
##r         B
les         I
##ion       I
.           O


In [19]:
import json
import os

# Generate labeled_texts.jsonl: one JSON record per validation example, holding
# its position, input ids, label strings and label ids.
labeled_texts = []
for example_id in range(len(validation_set)):
    encoded = validation_set[example_id]
    target_ids = encoded["targets"].tolist()
    labeled_texts.append({
        "id": example_id,
        "input_ids": encoded["ids"].tolist(),
        "text_labels": [id2label[target] for target in target_ids],
        "labels": target_ids,
    })

with open("med-bert_labeled_texts.jsonl", "w") as outfile:
    # One serialized record per line.
    outfile.writelines(json.dumps(record) + "\n" for record in labeled_texts)

# Generate label2id.jsonl (a single JSON object, written without a trailing newline)
with open("med-bert_label2id.jsonl", "w") as outfile:
    outfile.write(json.dumps(label2id))
