In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

# Build an English pipeline
#stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
#nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
token_df = pd.read_csv('//home/chudeo/project/33k_sentence.csv')

In [4]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [5]:
token_df.count()

sentence_id    625510
words          625262
labels         625510
dtype: int64

In [6]:
#checking for null values
token_df.isnull().sum()

sentence_id      0
words          248
labels           0
dtype: int64

In [7]:
data = token_df.fillna(method='ffill')
data.head()

  data = token_df.fillna(method='ffill')


Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [8]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,sentence_id,words,labels,sentence,word_labels
0,0,Baseline,O,Baseline artifact .,"O,O,O"
1,0,artifact,O,Baseline artifact .,"O,O,O"
2,0,.,O,Baseline artifact .,"O,O,O"
3,1,Probable,O,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
4,1,sinus,B,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"


In [9]:

label2id = {k: v for v, k in enumerate(data.labels.unique())}
id2label = {v: k for v, k in enumerate(data.labels.unique())}
label2id

{'O': 0, 'B': 1, 'I': 2}

In [10]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Baseline artifact .,"O,O,O"
1,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
2,Low limb lead voltage .,"O,O,O,O,O"
3,Leftward axis .,"O,O,O"
4,Late R wave progression .,"O,O,O,O,O"


In [11]:
len(data)


11262

In [12]:
data.iloc[1].sentence

'Probable sinus tachycardia with atrial premature beats .'

In [13]:
data.iloc[1].word_labels

'O,B,I,O,O,O,O,O'

#### **Keeping only Evidence sentence**

In [14]:
filtered_df = data[~data['word_labels'].apply(lambda x: all(label == 'O' for label in x.split(',')))]


In [15]:
len(filtered_df)

2189

In [16]:
data = filtered_df.reset_index(drop=True)

In [17]:
data.iloc[1].sentence

'Other ST - T wave abnormalities .'

In [18]:
data.iloc[1].word_labels

'O,B,I,I,I,I,O'

In [19]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

2024-04-17 23:54:26.791593: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# Extract words and labels from DataFrame
split_data = {
    "words": [word for sent in data["sentence"].str.split() for word in sent],
    "labels": [label.split(',') for label in data["word_labels"]]
}

In [21]:
# Split data into sentences and labels
sentences = split_data["words"]
labels = split_data["labels"]

In [22]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1")
tokenized_sentences = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [23]:
# Define classes
tag2idx = {"O": 0, "B": 1, "I": 2}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
num_classes = len(tag2idx)
MAX_LEN= 128
TRAIN_BATCH_SIZE =16
VALID_BATCH_SIZE = 8

In [24]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [25]:
class NERDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels.insert(0, "O")
        labels.insert(-1, "O")

        if len(tokenized_sentence) > self.max_len:
            tokenized_sentence = tokenized_sentence[:self.max_len]
            labels = labels[:self.max_len]
        else:
            tokenized_sentence = tokenized_sentence + ['[PAD]' for _ in range(self.max_len - len(tokenized_sentence))]
            labels = labels + ["O" for _ in range(self.max_len - len(labels))]

        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(attn_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len


In [26]:
# Assuming you have initialized tokenizer, label2id, and MAX_LEN earlier
train_size = 0.8
train_data = data.sample(frac=train_size, random_state=200)
test_data = data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

train_dataset = NERDataset(train_data, tokenizer, MAX_LEN)
test_dataset = NERDataset(test_data, tokenizer, MAX_LEN)


FULL Dataset: (2189, 2)
TRAIN Dataset: (1751, 2)
TEST Dataset: (438, 2)


In [27]:
train_dataset[0]

{'input_ids': tensor([  101,  1607,  1104,  1675,  6946,   131,  2908,   175,  1114,  3607,
          1112,   113,   164,   115,   115,  2450,   113,  8362,   114, 11523,
           115,   115,   166,   121,   119,   128,  3975,  1477,  1118, 34029,
           164,   115,   115, 18615,  1545,   118,  1429,   118,  1572,   115,
           115,   166,   114,   117, 52781,  1830,   188,   120,   185,  2265,
         47776,  1665,  1964,   164,   115,   115, 18615,  1545,   118,  1429,
           118,   130,   115,   115,   166,   117, 34574,  1116,   188,   120,
           185,  4329,  1306,   117,  1322, 13335,  2881, 10721,   188,   120,
           185,   182,  1964,  6949,   117,  1105,  8828, 26600, 58355,  1150,
          8218,  1114,  1275,  1285,  1607,  1104,  4146,  3381, 57477, 20080,
         25362,   119,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [28]:
train_dataset[0]['input_ids']

tensor([  101,  1607,  1104,  1675,  6946,   131,  2908,   175,  1114,  3607,
         1112,   113,   164,   115,   115,  2450,   113,  8362,   114, 11523,
          115,   115,   166,   121,   119,   128,  3975,  1477,  1118, 34029,
          164,   115,   115, 18615,  1545,   118,  1429,   118,  1572,   115,
          115,   166,   114,   117, 52781,  1830,   188,   120,   185,  2265,
        47776,  1665,  1964,   164,   115,   115, 18615,  1545,   118,  1429,
          118,   130,   115,   115,   166,   117, 34574,  1116,   188,   120,
          185,  4329,  1306,   117,  1322, 13335,  2881, 10721,   188,   120,
          185,   182,  1964,  6949,   117,  1105,  8828, 26600, 58355,  1150,
         8218,  1114,  1275,  1285,  1607,  1104,  4146,  3381, 57477, 20080,
        25362,   119,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [29]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(train_dataset[0]["input_ids"][:30]), train_dataset[0]["labels"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
history     O
of          O
present     O
illness     O
:           O
80          O
f           O
with        O
critical    O
as          B
(           O
[           O
*           O
*           O
location    O
(           O
un          O
)           O
109         O
*           O
*           O
]           O
0           O
.           O
7           O
cm          O
##2         O
by          O
tte         O


In [30]:
# Define your DataLoader parameters
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

# Create your DataLoader
train_loader = DataLoader(train_dataset, **train_params)
test_loader = DataLoader(test_dataset, **test_params)

In [31]:

# Define model
model = BertForTokenClassification.from_pretrained("dmis-lab/biobert-large-cased-v1.1", num_labels=num_classes)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-05)

Some weights of the model checkpoint at dmis-lab/biobert-large-cased-v1.1 were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not init

In [32]:
import torch
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(58996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

In [33]:
ids = train_dataset[0]["input_ids"].unsqueeze(0)
mask = train_dataset[0]["attention_mask"].unsqueeze(0)
targets = train_dataset[0]["labels"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(0.9104, device='cuda:0', grad_fn=<NllLossBackward0>)

In [34]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 3])

In [35]:
MAX_GRAD_NORM = 10

In [36]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        targets = batch['labels'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [37]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 0.9713519811630249
Training loss per 100 training steps: 0.18953767920484638
Training loss epoch: 0.1829959372227842
Training accuracy epoch: 0.8543510790302025
Training epoch: 2
Training loss per 100 training steps: 0.15377211570739746
Training loss per 100 training steps: 0.09225820591396625
Training loss epoch: 0.09085429466583513
Training accuracy epoch: 0.8879519994701747
Training epoch: 3
Training loss per 100 training steps: 0.07668819278478622
Training loss per 100 training steps: 0.07050679649899502
Training loss epoch: 0.06933418078856035
Training accuracy epoch: 0.914399501287691
Training epoch: 4
Training loss per 100 training steps: 0.07705891132354736
Training loss per 100 training steps: 0.05437370922674637
Training loss epoch: 0.05478924531489611
Training accuracy epoch: 0.9331763833610405
Training epoch: 5
Training loss per 100 training steps: 0.05463910102844238
Training loss per 100 training steps: 0.04408973252

In [38]:
from seqeval.metrics import classification_report as seqeval_classification_report

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1

            # compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.num_labels)
            active_labels = targets.view(-1)

            eval_preds.extend(torch.argmax(active_logits, axis=1).cpu().numpy())
            eval_labels.extend(active_labels.cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps

    labels = [[id2label[id_]] for id_ in eval_labels]
    predictions = [[id2label[id_]] for id_ in eval_preds]

    print(f"Validation Loss: {eval_loss}")
    print("Classification Report:")
    print(seqeval_classification_report(labels, predictions))

    return labels, predictions

labels, predictions = valid(model, test_loader)


Validation Loss: 0.0772979489442977
Classification Report:
              precision    recall  f1-score   support

           _       0.82      0.78      0.80      2446

   micro avg       0.82      0.78      0.80      2446
   macro avg       0.82      0.78      0.80      2446
weighted avg       0.82      0.78      0.80      2446



In [39]:
from seqeval.metrics import classification_report as seqeval_classification_report
from sklearn.metrics import classification_report as sklearn_classification_report

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1

            # compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.num_labels)
            active_labels = targets.view(-1)

            eval_preds.extend(torch.argmax(active_logits, axis=1).cpu().numpy())
            eval_labels.extend(active_labels.cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps

    labels = [[id2label[id_]] for id_ in eval_labels]
    predictions = [[id2label[id_]] for id_ in eval_preds]

    print(f"Validation Loss: {eval_loss}")

    # Get unique labels
    unique_labels = set([label for sublist in labels for label in sublist] + [label for sublist in predictions for label in sublist])
    unique_labels = sorted(unique_labels)

    # Fill in empty lists for missing labels
    filled_labels = []
    filled_predictions = []
    for lbl, pred in zip(labels, predictions):
        filled_lbl = lbl + [l for l in unique_labels if l not in lbl]
        filled_pred = pred + [l for l in unique_labels if l not in pred]
        filled_labels.append(filled_lbl)
        filled_predictions.append(filled_pred)

    print("Classification Report (SeqEval):")
    print(seqeval_classification_report(filled_labels, filled_predictions))

    print("Classification Report (Sklearn):")
    print(sklearn_classification_report([label[0] for label in filled_labels], [pred[0] for pred in filled_predictions]))

    return labels, predictions

labels, predictions = valid(model, test_loader)


Validation Loss: 0.0772979489442977
Classification Report (SeqEval):
              precision    recall  f1-score   support

           _       0.98      0.97      0.98     57125

   micro avg       0.98      0.97      0.98     57125
   macro avg       0.98      0.97      0.98     57125
weighted avg       0.98      0.97      0.98     57125

Classification Report (Sklearn):
              precision    recall  f1-score   support

           B       0.76      0.75      0.75      1385
           I       0.76      0.68      0.72      1061
           O       0.99      0.99      0.99     53618

    accuracy                           0.98     56064
   macro avg       0.84      0.81      0.82     56064
weighted avg       0.98      0.98      0.98     56064



#### **Predicted Labels** 

In [48]:
def get_predicted_labels(model, dataloader):
    all_pred_labels = []
    
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Get the predicted labels
            predicted_labels = torch.argmax(logits, dim=-1)
            
            # Convert predicted labels to the required format
            batch_pred_labels = []
            for i in range(len(predicted_labels)):
                sentence_tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
                sentence_labels = [id2label[label.item()] for label in predicted_labels[i]]
                
                # Convert the sentence-level predictions to the required format
                for j, label in enumerate(sentence_labels):
                    if label != 'O':
                        start = 0
                        for k in range(j):
                            start += len(tokenizer.tokenize(sentence_tokens[k]))
                        end = start + len(tokenizer.tokenize(sentence_tokens[j]))
                        batch_pred_labels.append({
                            'label': label,
                            'start': start,
                            'end': end
                        })
            all_pred_labels.append(batch_pred_labels)
    
    return all_pred_labels

In [49]:
all_pred_labels = get_predicted_labels(model, test_loader)

#### **NERVALUATE**

In [42]:
import os
import json
import pandas as pd
from nervaluate import Evaluator
import torch

In [53]:
# Load the CSV file
data = pd.read_csv('/home/chudeo/project/Evidence_sentences.csv')

In [59]:
# Extract the true labels
all_true_labels = []
for _, row in data.iterrows():
    sentence = row['sentence']
    word_labels = row['word_labels'].split(',')
    true_labels = []
    for i, label in enumerate(word_labels):
        if label != 'O':
            true_labels.append({
                'label': label,
                'start': 0,
                'end': 0  # You'll need to calculate the start and end positions based on the tokenization
            })
    all_true_labels.append(true_labels)

In [60]:
# Get the predicted labels from the model
all_pred_labels = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=-1)

    batch_pred_labels = []
    for i in range(len(predicted_labels)):
        sentence_tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
        sentence_labels = [id2label[label.item()] for label in predicted_labels[i]]

        for j, label in enumerate(sentence_labels):
            if label != 'O':
                start = 0
                for k in range(j):
                    start += len(tokenizer.tokenize(sentence_tokens[k]))
                end = start + len(tokenizer.tokenize(sentence_tokens[j]))
                batch_pred_labels.append({
                    'label': label,
                    'start': start,
                    'end': end
                })
    all_pred_labels.append(batch_pred_labels)

In [63]:
# Ensure that the lengths of all_true_labels and all_pred_labels match
if len(all_true_labels) > len(all_pred_labels):
    all_pred_labels.extend([[] for _ in range(len(all_true_labels) - len(all_pred_labels))])
elif len(all_true_labels) < len(all_pred_labels):
    all_true_labels.extend([[] for _ in range(len(all_pred_labels) - len(all_true_labels))])


In [64]:
# Get the unique labels from the true labels
unique_labels = set()
for true_labels in all_true_labels:
    for label_dict in true_labels:
        unique_labels.add(label_dict['label'])
unique_labels = list(unique_labels)


In [65]:
# Pass the unique labels as the tags parameter
evaluator = Evaluator(all_true_labels, all_pred_labels, tags=unique_labels)

# Returns overall metrics and metrics for each tag
results, results_per_tag = evaluator.evaluate()

print(results)

{'ent_type': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 14255, 'spurious': 2317, 'possible': 14255, 'actual': 2317, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'partial': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 14255, 'spurious': 2317, 'possible': 14255, 'actual': 2317, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'strict': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 14255, 'spurious': 2317, 'possible': 14255, 'actual': 2317, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'exact': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 14255, 'spurious': 2317, 'possible': 14255, 'actual': 2317, 'precision': 0.0, 'recall': 0.0, 'f1': 0}}


In [66]:
print(results_per_tag)

{'B': {'ent_type': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 6577, 'spurious': 1371, 'possible': 6577, 'actual': 1371, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'partial': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 6577, 'spurious': 1371, 'possible': 6577, 'actual': 1371, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'strict': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 6577, 'spurious': 1371, 'possible': 6577, 'actual': 1371, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'exact': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 6577, 'spurious': 1371, 'possible': 6577, 'actual': 1371, 'precision': 0.0, 'recall': 0.0, 'f1': 0}}, 'I': {'ent_type': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 7678, 'spurious': 946, 'possible': 7678, 'actual': 946, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'partial': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 7678, 'spurious': 946, 'possible': 7678, 'actual': 946, 'precision': 0.0, 'recall': 0.0,