In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaConfig, RobertaForTokenClassification


In [15]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [16]:
token_df = pd.read_csv('//home/chudeo/project/33k_sentence.csv')

  data = token_df.fillna(method='ffill')


In [18]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [19]:
#checking for null values
token_df.isnull().sum()

sentence_id      0
words          248
labels           0
dtype: int64

In [20]:
data = token_df.fillna(method='ffill')
data.head()

  data = token_df.fillna(method='ffill')


Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [21]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,sentence_id,words,labels,sentence,word_labels
0,0,Baseline,O,Baseline artifact .,"O,O,O"
1,0,artifact,O,Baseline artifact .,"O,O,O"
2,0,.,O,Baseline artifact .,"O,O,O"
3,1,Probable,O,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
4,1,sinus,B,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"


In [22]:

label2id = {k: v for v, k in enumerate(data.labels.unique())}
id2label = {v: k for v, k in enumerate(data.labels.unique())}
label2id

{'O': 0, 'B': 1, 'I': 2}

In [23]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Baseline artifact .,"O,O,O"
1,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
2,Low limb lead voltage .,"O,O,O,O,O"
3,Leftward axis .,"O,O,O"
4,Late R wave progression .,"O,O,O,O,O"


In [24]:
filtered_df = data[~data['word_labels'].apply(lambda x: all(label == 'O' for label in x.split(',')))]
data = filtered_df.reset_index(drop=True)

In [25]:
data.head()

Unnamed: 0,sentence,word_labels
0,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
1,Other ST - T wave abnormalities .,"O,B,I,I,I,I,O"
2,Sinus tachycardia with occasional aberrancy .,"B,I,O,B,I,O"
3,ST segment abnormalities likely secondary to r...,"B,I,I,O,O,O,O,O"
4,Slight ST segment abnormalities at baseline li...,"O,B,I,I,O,O,O,O,O,O,O"


In [26]:
# Extract words and labels from DataFrame
split_data = {
    "words": [word for sent in data["sentence"].str.split() for word in sent],
    "labels": [label.split(',') for label in data["word_labels"]]
}

# Split data into sentences and labels
sentences = split_data["words"]
labels = split_data["labels"]

In [27]:
# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
tokenized_sentences = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)


In [28]:
# Define classes
tag2idx = {"O": 0, "B": 1, "I": 2}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
num_classes = len(tag2idx)
MAX_LEN= 128
TRAIN_BATCH_SIZE =16
VALID_BATCH_SIZE = 8

In [29]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [30]:
class NERDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels.insert(0, "O")
        labels.insert(-1, "O")

        if len(tokenized_sentence) > self.max_len:
            tokenized_sentence = tokenized_sentence[:self.max_len]
            labels = labels[:self.max_len]
        else:
            tokenized_sentence = tokenized_sentence + ['[PAD]' for _ in range(self.max_len - len(tokenized_sentence))]
            labels = labels + ["O" for _ in range(self.max_len - len(labels))]

        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(attn_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len


In [31]:
# Assuming you have initialized tokenizer, label2id, and MAX_LEN earlier
train_size = 0.8
train_data = data.sample(frac=train_size, random_state=200)
test_data = data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

train_dataset = NERDataset(train_data, tokenizer, MAX_LEN)
test_dataset = NERDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (2189, 2)
TRAIN Dataset: (1751, 2)
TEST Dataset: (438, 2)


In [32]:
train_dataset[0]

{'input_ids': tensor([    3, 38261,  1116, 45195, 24873,  1825,    35,  2940,   597,  5632,
         25398,  2336,  1640, 10975, 12606, 46571,  1640,   879,    43, 18616,
         12606,   742,   288,     4,   406, 13753,   176,  1409,   565,  6433,
         10975, 12606,   176, 27915,    12,  1225,    12,  1978, 12606,   742,
            43,     6,  8573,  1452,    29,    73,   642, 41589,   495,  3376,
           846, 10975, 12606,   176, 27915,    12,  1225,    12,   466, 12606,
           742,     6,  8108,   104,    29,    73,   642,   510,  5683,     6,
          1397, 43682, 10100,    29,    73,   642,   448,   846, 36427,     6,
           463, 19397,   642,   922, 43462, 11108, 11497, 17699,  8155,   642,
         40826,  5632,   698,  1208, 37283,  1116,   605,   994,  4226,   417,
          2459,   642, 22423,     4,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,   

In [33]:
train_dataset[0]['input_ids']

tensor([    3, 38261,  1116, 45195, 24873,  1825,    35,  2940,   597,  5632,
        25398,  2336,  1640, 10975, 12606, 46571,  1640,   879,    43, 18616,
        12606,   742,   288,     4,   406, 13753,   176,  1409,   565,  6433,
        10975, 12606,   176, 27915,    12,  1225,    12,  1978, 12606,   742,
           43,     6,  8573,  1452,    29,    73,   642, 41589,   495,  3376,
          846, 10975, 12606,   176, 27915,    12,  1225,    12,   466, 12606,
          742,     6,  8108,   104,    29,    73,   642,   510,  5683,     6,
         1397, 43682, 10100,    29,    73,   642,   448,   846, 36427,     6,
          463, 19397,   642,   922, 43462, 11108, 11497, 17699,  8155,   642,
        40826,  5632,   698,  1208, 37283,  1116,   605,   994,  4226,   417,
         2459,   642, 22423,     4,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3])

In [34]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(train_dataset[0]["input_ids"][:30]), train_dataset[0]["labels"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

<unk>       O
History     O
of          O
Present     O
Ill         O
ness        O
:           O
80          O
F           O
with        O
critical    O
AS          B
(           O
[           O
**          O
Location    O
(           O
un          O
)           O
109         O
**          O
]           O
0           O
.           O
7           O
cm          O
2           O
by          O
T           O
TE          O


In [35]:
# Define your DataLoader parameters
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

# Create your DataLoader
train_loader = DataLoader(train_dataset, **train_params)
test_loader = DataLoader(test_dataset, **test_params)


In [41]:
# Load model directly
from transformers import AutoTokenizer, RobertaModel
from transformers import AdamW

model = RobertaModel.from_pretrained("FacebookAI/roberta-base")

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-05)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
import torch
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [44]:
ids = train_dataset[0]["input_ids"].unsqueeze(0)
mask = train_dataset[0]["attention_mask"].unsqueeze(0)
targets = train_dataset[0]["labels"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask)
initial_loss = outputs[0]
initial_loss

tensor([[[-0.0311,  0.0826,  0.0029,  ..., -0.1956, -0.0363, -0.0276],
         [-0.0712,  0.0966,  0.2057,  ..., -0.5175,  0.0230,  0.1683],
         [-0.1321, -0.0217,  0.0505,  ..., -0.3311, -0.0199, -0.0772],
         ...,
         [-0.0356, -0.0057,  0.1431,  ..., -0.2721, -0.0549,  0.0409],
         [-0.0407,  0.0514,  0.0745,  ..., -0.1618, -0.0230,  0.0375],
         [-0.0619,  0.0386,  0.0564,  ..., -0.1142, -0.0150,  0.0568]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [45]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 768])

In [46]:
MAX_GRAD_NORM = 10

In [49]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        targets = batch['labels'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [50]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1


AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'loss'