In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
)


import evaluate
from seqeval.metrics import f1_score, precision_score, recall_score
from datasets import load_dataset, load_from_disk
from transformers import (AutoModel,AutoModelForSequenceClassification,
                          AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForTokenClassification,
                          get_linear_schedule_with_warmup, set_seed)
from tqdm import tqdm
from loguru import logger as loguru_logger
import numpy as np

import sys
sys.path.append("../")


2023-06-23 15:26:54.280063: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.2/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /mnt/sdc/niallt/venvs/39nlp/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda112.so...


In [2]:

class DatasetInfo:
  def __init__(self, name,
               type="ner", 
               metric=None, 
               load_from_disk=True,
               isMultiSentence=False, 
               validationSubsets=["test"],
               lr=[5e-5, 2e-5, 1e-5], 
               batch_size=[32], 
               epochs=3, 
               runs=1,
               num_labels=None):

    self.name = name
    self.isMultiSentence = isMultiSentence
    self.validationSubsets = validationSubsets
    self.lr = lr
    self.batch_size = batch_size
    self.epochs = epochs
    self.runs = runs
    self.load_from_disk = load_from_disk
    self.type = type
    self.num_labels = num_labels

    if metric == None:
      self.metric = "accuracy"
    else:
      self.metric = metric

    self.fullName = name + "-" + self.metric

class ModelInfo:
  def __init__(self, pretrainedPath, modelPath, isCustom=False, isAdapterTuning=False, use_token_type_ids=True):
    self.pretrainedPath = pretrainedPath
    self.modelPath = modelPath

    self.logsPath = pretrainedPath + f"/"

    self.isCustom = isCustom
    self.isAdapterTuning = isAdapterTuning
    self.use_token_type_ids = use_token_type_ids

  def get_logs_path(self, datasetName):
    return self.logsPath + f"{datasetName}.txt" if not self.isAdapterTuning else self.logsPath + f"{datasetName}-adapter.txt"
  
  def load_model(self, num_labels, ds):
    if self.isCustom:
      if ds.type == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(self.modelPath, num_labels=num_labels)
      elif ds.type == "ner":
        model = AutoModelForTokenClassification.from_pretrained(self.modelPath, num_labels=num_labels)

      if self.isAdapterTuning:
        model.trainAdaptersOnly()
    else:
      if ds.type == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(self.modelPath, num_labels=num_labels)
      elif ds.type == "ner":
        model = AutoModelForTokenClassification.from_pretrained(self.modelPath, num_labels=num_labels)
    
    return model

In [3]:
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, bias="all"
)
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, revision=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=8, target_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='all', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

### Test with i2b2 2010 dataset for now

In [4]:
i2b2_2010_data_dir = "/mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/"
i2b2_2010_dataset = load_from_disk(i2b2_2010_data_dir)

In [5]:
i2b2_2010_dataset['train'][2]

{'tokens': ['23238893'], 'ner_tags_str': ['O'], 'ner_tags': [6]}

In [5]:
# load a tokenizer and model
model_name_or_path = "roberta-base"
if model_name_or_path == "roberta-base":
    
    tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [29]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [6]:
def load_datasets(info):
  """#Dataset Utilities"""
  
  if not info.load_from_disk:
    dataset = load_dataset(info.name)
  else:
    dataset = load_from_disk(info.name)

  if info.type == "classification":
    num_labels = len(set(dataset["train"]["labels"]))
    def mappingFunction(samples, **kargs):
      if info.isMultiSentence:
        outputs = tokenizer(samples[dataset["train"].column_names[0]],
                            samples[dataset["train"].column_names[1]],
                            max_length=512,
                            truncation=True,
                            padding=kargs["padding"])
      else:
        outputs = tokenizer(samples[dataset["train"].column_names[0]],
                            truncation=True,
                            max_length=512,
                            padding=kargs["padding"])

      outputs["labels"] = samples["labels"]

      return outputs
  elif info.type == "ner":
    # print(dataset)
    num_labels = len(dataset["info"][0]["all_ner_tags"])
    def mappingFunction(all_samples_per_split, **kargs):
      tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"],
                                                      is_split_into_words=True, 
                                                      truncation=True,
                                                      max_length=512,
                                                      padding=kargs["padding"])  
      total_adjusted_labels = []

      for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []

        for wid in word_ids_list:
          if(wid is None):
            adjusted_label_ids.append(-100)
          elif(wid!=prev_wid):
            i = i + 1
            adjusted_label_ids.append(existing_label_ids[i])
            prev_wid = wid
          else:
            adjusted_label_ids.append(existing_label_ids[i])
            
        total_adjusted_labels.append(adjusted_label_ids)

      tokenized_samples["labels"] = total_adjusted_labels
      
      return tokenized_samples

  tokenizedTrainDataset = dataset["train"].map(mappingFunction,
                                              batched=True,
                                              remove_columns=dataset["train"].column_names,
                                              fn_kwargs={"padding": "do_not_pad"})
    
  tokenizedValDatasets = []

  for name in info.validationSubsets:
    tokenizedValDataset = dataset[name].map(mappingFunction,
                                            batched=True,
                                            remove_columns=dataset[name].column_names,
                                            fn_kwargs={"padding": "do_not_pad"})
    
    tokenizedValDatasets.append(tokenizedValDataset)

  if info.num_labels != None:
    num_labels = info.num_labels

  return tokenizedTrainDataset, tokenizedValDatasets, num_labels, dataset["info"][0]["all_ner_tags"]

### encode datasets

In [7]:
datasets = [
    DatasetInfo("/mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/", #Use the pre-processing code in BioLM (https://github.com/facebookresearch/bio-lm)
                metric="f1",
                load_from_disk=True,
                type="ner",
                isMultiSentence=False,
                lr=[5e-5, 2e-5, 1e-5],
                epochs=3,
                batch_size=[16],
                runs=1)]

In [17]:
datasets[0].metric

'f1'

In [10]:
train_dataset, valid_dataset, num_labels, all_ner_tags = load_datasets(datasets[0])
    

Loading cached processed dataset at /mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/train/cache-35c6a2e5c9e31505.arrow
Loading cached processed dataset at /mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/test/cache-3af080fc1df5e84d.arrow


In [182]:
all_ner_tags

['I-test',
 'B-problem',
 'I-treatment',
 'B-test',
 'B-treatment',
 'I-problem',
 'O']

In [75]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6726
})

In [76]:
valid_dataset[0]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 27626
})

In [12]:
i2b2_2010_dataset['train'][30]

{'tokens': ['The',
  'pathology',
  ',',
  'unfortunately',
  ',',
  'revealed',
  'an',
  'aggressive',
  'adenocarcinoma',
  '(',
  'micropapillary',
  'type',
  ';',
  'mucin',
  'producing',
  ')',
  '.'],
 'ner_tags_str': ['B-test',
  'I-test',
  'O',
  'O',
  'O',
  'O',
  'B-problem',
  'I-problem',
  'I-problem',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'ner_tags': [3, 0, 6, 6, 6, 6, 1, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6]}

In [11]:
train_dataset[30]

{'input_ids': [0,
  20,
  38951,
  2156,
  9574,
  2156,
  1487,
  41,
  4353,
  2329,
  225,
  1975,
  9636,
  179,
  4982,
  36,
  14926,
  6884,
  1115,
  31867,
  1907,
  25606,
  38791,
  179,
  5591,
  4839,
  479,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  3,
  0,
  6,
  6,
  6,
  6,
  1,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  -100]}

In [13]:
tokenizer.decode(train_dataset[30]["input_ids"])

'<s> The pathology, unfortunately, revealed an aggressive adenocarcinoma ( micropapillary type ; mucin producing ).</s>'

In [40]:
num_labels

7

#### Setup dataloaders

In [58]:
i2b2_2010_dataset['train'][0]

{'tokens': ['910458031'], 'ner_tags_str': ['O'], 'ner_tags': [6]}

In [8]:
collate_fn = DataCollatorForTokenClassification(tokenizer)


In [47]:
train_dataset.select(range(0, 10))

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [53]:
# Instantiate dataloaders.
train_dataloader = DataLoader(train_dataset.select(range(0, 500)), shuffle=True, collate_fn=collate_fn, batch_size=8)
eval_dataloader = DataLoader(
    valid_dataset[0].select(range(0, 1000)), shuffle=False, collate_fn=collate_fn, batch_size=8
)

In [49]:
for batch in train_dataloader:
    
    
    print(batch.keys())
    
    break

dict_keys(['input_ids', 'attention_mask', 'labels'])


#### debug


In [46]:
# sanity check what comes out and goes into token classifier
base_model = AutoModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
inputs = tokenizer("Hello, my dog is cute and stinky", return_tensors="pt")

In [48]:
outputs = base_model(**inputs)

In [51]:
outputs[0].shape

torch.Size([1, 8, 768])

### load peft model

In [12]:
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path,
                                                        num_labels = num_labels,
                                                        return_dict=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [13]:
num_labels

7

In [20]:
# outputs = model(**inputs)

In [21]:
# outputs.logits.shape

torch.Size([1, 11, 7])

In [42]:
model

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [30]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, revision=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=16, target_modules=['query', 'value'], lora_alpha=8, lora_dropout=0.1, fan_in_fan_out=False, bias='all', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [14]:


# load peft model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 407,822 || all params: 124,360,718 || trainable%: 0.32793474222302255


In [29]:
model

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defau

In [39]:
model.model.roberta.encoder.layer[0].attention.self.query.weight.shape

torch.Size([768, 768])

In [43]:
model.model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight.shape

torch.Size([8, 768])

#### Setup training

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu" 

In [21]:
model.to(device)

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defau

In [62]:
# metrics
metric = evaluate.load("accuracy")

In [55]:
num_epochs = 2

In [56]:
optimizer = AdamW(params=model.parameters(), lr=0.001)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [57]:
def evaluate_model(model, eval_dataloader, all_labels):
    model.eval()
    predictions = []
    references = []

    for batch in tqdm(eval_dataloader):
              

        for key, value in batch.items():
            batch[key] = value.cuda()
        with torch.no_grad():
            output = np.argmax(model(**batch).logits.cpu().detach().numpy(), axis=-1)

            # print(f"output shape: {output.shape}")
            predictions.append([])
            references.append([])
            for prediction, label in zip(output.reshape(-1), batch["labels"].cpu().detach().numpy().reshape(-1)):
                if label != -100:
                    predictions[-1].append(all_labels[prediction])
                    references[-1].append(all_labels[label])





    print(predictions[-1])
    print(references[-1])
    
    # print(f"length of references: {len(references)}")
    metrics_dict = {"precision": precision_score(references, predictions), 
                "recall": recall_score(references, predictions),
                "f1": f1_score(references, predictions)}
    
    

    return metrics_dict, predictions, references

In [58]:

for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    eval_metric, preds, labels = evaluate_model(model, eval_dataloader, all_ner_tags)
    print(f"epoch {epoch}:", eval_metric)

100%|██████████| 63/63 [00:04<00:00, 15.42it/s]
100%|██████████| 125/125 [00:02<00:00, 53.80it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-test', 'I-test', 'O', 'O', 'O', 'O', 'O', 'O', 'B-problem', 'B-problem', 'B-problem', 'I-problem', 'O', 'O', 'O', 'B-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'O', 'O', 'O', 'O', 'O', 'B-test', 'I-test', 'I-test', 'O', 'O', 'B-problem', 'O', 'B-problem', 'B-problem', 'O', 'B-test', 'B-test', 'I-test', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-problem', 'I-problem', 'I-problem', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-treatment', 'B-treatment', 'O', 'B-treatment', 'I-treatment', 'I-treatment', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-problem', 'B-problem

100%|██████████| 63/63 [00:03<00:00, 16.55it/s]
100%|██████████| 125/125 [00:02<00:00, 54.92it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-test', 'I-test', 'O', 'O', 'O', 'O', 'O', 'O', 'B-problem', 'B-problem', 'B-problem', 'I-problem', 'O', 'O', 'O', 'B-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'I-problem', 'O', 'I-test', 'O', 'O', 'O', 'B-test', 'I-test', 'I-test', 'O', 'O', 'B-problem', 'O', 'B-problem', 'B-problem', 'O', 'B-test', 'B-test', 'I-test', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-problem', 'I-problem', 'I-problem', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-treatment', 'I-treatment', 'I-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-treatment', 'B-treatment', 'O', 'B-treatment', 'I-treatment', 'I-treatment', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [81]:
model.device

device(type='cuda', index=0)

In [95]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [151]:
len(eval_dataloader)

3454

In [93]:
references.shape

torch.Size([8, 14])

In [89]:
predictions.shape

torch.Size([8, 14])

In [24]:
all_labels = all_ner_tags

In [189]:
datasets[0].metric

'f1'

In [20]:
model.device

device(type='cpu')

100%|██████████| 3454/3454 [01:17<00:00, 44.53it/s]

['I-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'O', 'B-treatment', 'B-treatment', 'B-treatment', 'B-problem', 'B-treatment', 'I-treatment', 'B-treatment', 'I-treatment', 'O', 'I-treatment', 'O', 'I-treatment', 'B-treatment', 'B-treatment', 'I-treatment']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']





In [26]:
print(f"precision: {precision_score(references, predictions)}\nrecall: {recall_score(references, predictions)}\nf1: {f1_score(references, predictions)}")

precision: 0.03969077955278091
recall: 0.19139855548260012
f1: 0.06574736883653659
