In [2]:
import argparse

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" #6,7import pandas as pd 2080s = 0,3,5,6,8 Nvidia-smi ids: 0, 3, 5, 6, 8 Actual id: 5,6,7,8,9 

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
)


import evaluate
from seqeval.metrics import f1_score, precision_score, recall_score
from datasets import load_dataset, load_from_disk, load_metric
from transformers import (AutoModel,AutoModelForSequenceClassification,
                          AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForTokenClassification, Trainer, TrainingArguments,
                          get_linear_schedule_with_warmup, set_seed)
from tqdm import tqdm
from loguru import logger as loguru_logger
import numpy as np

import sys
sys.path.append("../")


In [3]:

class DatasetInfo:
  def __init__(self, name,
               type="ner", 
               metric=None, 
               load_from_disk=True,
               isMultiSentence=False, 
               validationSubsets=["test"],
               lr=[5e-5, 2e-5, 1e-5], 
               batch_size=[32], 
               epochs=3, 
               runs=1,
               num_labels=None):

    self.name = name
    self.isMultiSentence = isMultiSentence
    self.validationSubsets = validationSubsets
    self.lr = lr
    self.batch_size = batch_size
    self.epochs = epochs
    self.runs = runs
    self.load_from_disk = load_from_disk
    self.type = type
    self.num_labels = num_labels

    if metric == None:
      self.metric = "accuracy"
    else:
      self.metric = metric

    self.fullName = name + "-" + self.metric

class ModelInfo:
  def __init__(self, pretrainedPath, modelPath, isCustom=False, isAdapterTuning=False, use_token_type_ids=True):
    self.pretrainedPath = pretrainedPath
    self.modelPath = modelPath

    self.logsPath = pretrainedPath + f"/"

    self.isCustom = isCustom
    self.isAdapterTuning = isAdapterTuning
    self.use_token_type_ids = use_token_type_ids

  def get_logs_path(self, datasetName):
    return self.logsPath + f"{datasetName}.txt" if not self.isAdapterTuning else self.logsPath + f"{datasetName}-adapter.txt"
  
  def load_model(self, num_labels, ds):
    if self.isCustom:
      if ds.type == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(self.modelPath, num_labels=num_labels)
      elif ds.type == "ner":
        model = AutoModelForTokenClassification.from_pretrained(self.modelPath, num_labels=num_labels)

      if self.isAdapterTuning:
        model.trainAdaptersOnly()
    else:
      if ds.type == "classification":
        model = AutoModelForSequenceClassification.from_pretrained(self.modelPath, num_labels=num_labels)
      elif ds.type == "ner":
        model = AutoModelForTokenClassification.from_pretrained(self.modelPath, num_labels=num_labels)
    
    return model

In [4]:
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, bias="all"
)
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=8, target_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='all', modules_to_save=None, init_lora_weights=True)

In [5]:
def load_datasets(info):
  """#Dataset Utilities"""
  
  if not info.load_from_disk:
    dataset = load_dataset(info.name)
  else:
    dataset = load_from_disk(info.name)

  if info.type == "classification":
    num_labels = len(set(dataset["train"]["labels"]))
    def mappingFunction(samples, **kargs):
      if info.isMultiSentence:
        outputs = tokenizer(samples[dataset["train"].column_names[0]],
                            samples[dataset["train"].column_names[1]],
                            max_length=512,
                            truncation=True,
                            padding=kargs["padding"])
      else:
        outputs = tokenizer(samples[dataset["train"].column_names[0]],
                            truncation=True,
                            max_length=512,
                            padding=kargs["padding"])

      outputs["labels"] = samples["labels"]

      return outputs
  elif info.type == "ner":
    # print(dataset)
    num_labels = len(dataset["info"][0]["all_ner_tags"])
    def mappingFunction(all_samples_per_split, **kargs):
      tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"],
                                                      is_split_into_words=True, 
                                                      truncation=True,
                                                      max_length=512,
                                                      padding=kargs["padding"])  
      total_adjusted_labels = []

      for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []

        for wid in word_ids_list:
          if(wid is None):
            adjusted_label_ids.append(-100)
          elif(wid!=prev_wid):
            i = i + 1
            adjusted_label_ids.append(existing_label_ids[i])
            prev_wid = wid
          else:
            adjusted_label_ids.append(existing_label_ids[i])
            
        total_adjusted_labels.append(adjusted_label_ids)

      tokenized_samples["labels"] = total_adjusted_labels
      
      return tokenized_samples

  tokenizedTrainDataset = dataset["train"].map(mappingFunction,
                                              batched=True,
                                              remove_columns=dataset["train"].column_names,
                                              fn_kwargs={"padding": "do_not_pad"})
    
  tokenizedValDatasets = []

  for name in info.validationSubsets:
    tokenizedValDataset = dataset[name].map(mappingFunction,
                                            batched=True,
                                            remove_columns=dataset[name].column_names,
                                            fn_kwargs={"padding": "do_not_pad"})
    
    tokenizedValDatasets.append(tokenizedValDataset)

  if info.num_labels != None:
    num_labels = info.num_labels

  return tokenizedTrainDataset, tokenizedValDatasets, num_labels, dataset["info"][0]["all_ner_tags"]

### Test with i2b2 2010 dataset for now

In [4]:
i2b2_2010_data_dir = "/mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/"
i2b2_2010_dataset = load_from_disk(i2b2_2010_data_dir)

In [5]:
i2b2_2010_dataset['train'][2]

{'tokens': ['23238893'], 'ner_tags_str': ['O'], 'ner_tags': [6]}

In [7]:
"roberta-base" in model_name_or_path

True

In [8]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

### encode datasets

In [7]:
# load a tokenizer and model
model_name_or_path = "roberta-base"
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
# model_name_or_path = "emilyalsentzer/Bio_ClinicalBERT"
if model_name_or_path == "roberta-base" or "roberta" in model_name_or_path:
    
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [8]:
dataset_path = "/mnt/sdd/niallt/bio-lm/data/tasks/i2b2-2012_hf_dataset/"

In [9]:
datasets = [
    DatasetInfo(dataset_path, #Use the pre-processing code in BioLM (https://github.com/facebookresearch/bio-lm)
                metric="f1",
                load_from_disk=True,
                type="ner",
                isMultiSentence=False,
                lr=[5e-5, 2e-5, 1e-5],
                epochs=3,
                batch_size=[32],
                runs=1)]

In [10]:
datasets[0].metric

'f1'

In [11]:
train_dataset, valid_dataset, num_labels, all_ner_tags = load_datasets(datasets[0])
    

Loading cached processed dataset at /mnt/sdd/niallt/bio-lm/data/tasks/i2b2-2012_hf_dataset/train/cache-94d1197983df291b.arrow
Loading cached processed dataset at /mnt/sdd/niallt/bio-lm/data/tasks/i2b2-2012_hf_dataset/test/cache-e6d1f9f51471284d.arrow


In [12]:
all_ner_tags

['O',
 'I-OCCURRENCE',
 'B-PROBLEM',
 'I-TEST',
 'B-EVIDENTIAL',
 'B-CLINICAL_DEPT',
 'I-TREATMENT',
 'I-EVIDENTIAL',
 'B-TEST',
 'B-TREATMENT',
 'I-PROBLEM',
 'B-OCCURRENCE',
 'I-CLINICAL_DEPT']

In [13]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6797
})

In [14]:
valid_dataset[0]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5664
})

In [15]:
train_dataset[6611]

{'input_ids': [0,
  20,
  3186,
  21,
  3447,
  66,
  13,
  10,
  127,
  43682,
  2617,
  4047,
  271,
  14970,
  2156,
  576,
  39,
  1136,
  23,
  5,
  3062,
  8,
  380,
  991,
  11287,
  450,
  15,
  29541,
  6940,
  118,
  19673,
  14,
  37,
  2226,
  148,
  13316,
  10195,
  1258,
  148,
  4878,
  9,
  5,
  22259,
  479,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  0,
  0,
  8,
  3,
  0,
  2,
  10,
  10,
  10,
  10,
  10,
  10,
  0,
  0,
  11,
  1,
  0,
  0,
  0,
  0,
  2,
  2,
  2,
  0,
  0,
  8,
  8,
  8,
  8,
  0,
  0,
  0,
  0,
  9,
  6,
  6,
  0,
  9,
  6,
  6,
  6,
  0,
  -100]}

In [16]:
tokenizer.decode(train_dataset[6611]["input_ids"])

'<s> The patient was ruled out for a myocardial infarction, given his fall at the airport and bigeminy seen on electrocardiogram that he developed during conscious sedation during reduction of the fracture.</s>'

In [17]:
num_labels

13

#### Setup dataloaders

In [16]:
i2b2_2010_dataset['train'][0]

NameError: name 'i2b2_2010_dataset' is not defined

In [18]:
collate_fn = DataCollatorForTokenClassification(tokenizer)


In [25]:
train_dataset.select(range(0, 10))

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [26]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6797
})

In [27]:
# Instantiate dataloaders.
subset = True

if subset:


    train_dataloader = DataLoader(train_dataset.select(range(0, 3000)), shuffle=True, collate_fn=collate_fn, batch_size=8)
    eval_dataloader = DataLoader(
        valid_dataset[0].select(range(0, 1000)), shuffle=False, collate_fn=collate_fn, batch_size=8
    )
else:
    train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=8)
    eval_dataloader = DataLoader(
        valid_dataset[0], shuffle=False, collate_fn=collate_fn, batch_size=8
    )
print(f"Train dataset size: {len(train_dataloader)}")
print(f"Validation dataset size: {len(eval_dataloader)}")

Train dataset size: 375
Validation dataset size: 125


In [28]:
subset

True

In [29]:
valid_dataset

[Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 5664
 })]

In [31]:
for batch in train_dataloader:
    print(batch.keys())
    break

dict_keys(['input_ids', 'attention_mask', 'labels'])


#### debug


In [28]:
# sanity check what comes out and goes into token classifier
base_model = AutoModel.from_pretrained(model_name_or_path)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
inputs = tokenizer("Hello, my dog is cute and stinky", return_tensors="pt")

In [30]:
outputs = base_model(**inputs)

In [31]:
outputs[0].shape

torch.Size([1, 11, 768])

### load peft model

In [19]:
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path,
                                                        num_labels = num_labels,
                                                        return_dict=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

In [20]:
num_labels

13

In [20]:
# outputs = model(**inputs)

In [21]:
# outputs.logits.shape

torch.Size([1, 11, 7])

In [21]:
model

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [22]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=8, target_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='all', modules_to_save=None, init_lora_weights=True)

# load peft model

In [23]:

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 417050 || all params: 124369946 || trainable%: 0.335330209116598


In [24]:
model

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defau

In [25]:
model.model.roberta.encoder.layer[0].attention.self.query.weight.shape

torch.Size([768, 768])

In [26]:
model.model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight.shape

torch.Size([8, 768])

#### Setup training

In [27]:
device = "cuda" if torch.cuda.is_available() else "cpu" 

In [28]:
model.to(device)

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defau

In [29]:
num_epochs = datasets[0].epochs

In [30]:
datasets[0].batch_size[0]

32

In [31]:
optimizer = AdamW(params=model.parameters(), lr=0.001)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataset)/datasets[0].batch_size[0] * num_epochs),
    num_training_steps=(len(train_dataset)/datasets[0].batch_size[0] * num_epochs),
)

# Using hf trainer and compute metrics

In [32]:
metric = evaluate.load("seqeval")

In [33]:
metric

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [34]:
label_list = all_ner_tags


In [35]:
label_list

['O',
 'I-OCCURRENCE',
 'B-PROBLEM',
 'I-TEST',
 'B-EVIDENTIAL',
 'B-CLINICAL_DEPT',
 'I-TREATMENT',
 'I-EVIDENTIAL',
 'B-TEST',
 'B-TREATMENT',
 'I-PROBLEM',
 'B-OCCURRENCE',
 'I-CLINICAL_DEPT']

In [36]:
def compute_metrics(p):
    predictions, labels = p
    # print(f"logits shape: {predictions.shape}")
    # print(f"labels: {labels}")
    predictions = np.argmax(predictions, axis=2)
    # print(f"predictions: {predictions}")
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # print(f"True predictions: {true_predictions}")
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # print(f"True labels: {true_labels}")
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [37]:
# model

In [40]:
args = TrainingArguments(
    output_dir = f"/mnt/sdd/efficient_ml_data/saved_models/peft/i2b2_2012/roberta-base/",
    evaluation_strategy = "epoch",
    # learning_rate=2e-5,
    # learning_rate=0.001,
    per_device_train_batch_size=datasets[0].batch_size[0],
    per_device_eval_batch_size=datasets[0].batch_size[0],
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=False,
    remove_unused_columns=False
)

In [41]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset[0],
    data_collator=collate_fn,
    tokenizer=tokenizer,
    optimizers = (optimizer, lr_scheduler),
    compute_metrics=compute_metrics
)

In [42]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


inside if
KAKAKAKAKAK
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6797
})
213
BBBBLALALALALALA
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6797
})
{'input_ids': tensor([[    0,    91,    21,  ...,     1,     1,     1],
        [    0,   221, 18373,  ...,     1,     1,     1],
        [    0,   777,    12,  ...,     1,     1,     1],
        ...,
        [    0,   289, 41713,  ...,     1,     1,     1],
        [    0,  6310, 15040,  ...,     1,     1,     1],
        [    0,    20, 12464,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    0,    0,  ..., -100, -100, -100],
        [-100,    2,    2,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.400842,0.748335,0.727252,0.737643,0.879257


KOKOKOKOKO
{'input_ids': tensor([[    0,  4516,   448,  ...,     1,     1,     1],
        [    0,   158,    73,  ...,     1,     1,     1],
        [    0, 15421, 29146,  ...,     1,     1,     1],
        ...,
        [    0,    91,    21,  ...,     1,     1,     1],
        [    0,   832, 30960,  ...,     1,     1,     1],
        [    0,    91,  1143,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,   11,   11,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,   11,   11,  ..., -100, -100, -100],
        ...,
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    2,   10,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])}
cheeeeeeeeeeeeeeeeeeeeeeese
['labels']

KeyboardInterrupt: 

# manual train and eval 

In [18]:
def evaluate_model(model, eval_dataloader, all_labels):
    model.eval()
    predictions = []
    references = []

    for batch in tqdm(eval_dataloader):
              

        for key, value in batch.items():
            batch[key] = value.cuda()
        with torch.no_grad():
            output = np.argmax(model(**batch).logits.cpu().detach().numpy(), axis=-1)

            # print(f"output shape: {output.shape}")
            predictions.append([])
            references.append([])
            for prediction, label in zip(output.reshape(-1), batch["labels"].cpu().detach().numpy().reshape(-1)):
                if label != -100:
                    predictions[-1].append(all_labels[prediction])
                    references[-1].append(all_labels[label])





    print(predictions[-1])
    print(references[-1])
    
    # print(f"length of references: {len(references)}")
    metrics_dict = {"precision": precision_score(references, predictions), 
                "recall": recall_score(references, predictions),
                "f1": f1_score(references, predictions)}
    
    

    return metrics_dict, predictions, references

In [19]:
def train_model(model, optimizer, lr_scheduler, train_dataloader, num_epochs):
    
    # main training loop
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch.to(device)
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        eval_metric, preds, labels = evaluate_model(model, eval_dataloader, all_ner_tags)
        print(f"epoch {epoch}:", eval_metric)
        
    # return trained model and metrics etc
    return model, eval_metric, preds, labels


In [20]:
# below is mimic declutr
trained_model, eval_metric, preds, labels = train_model(model, optimizer, lr_scheduler, train_dataloader, num_epochs)

  0%|          | 0/841 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 841/841 [00:54<00:00, 15.40it/s]
100%|██████████| 3454/3454 [01:03<00:00, 54.57it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 0: {'precision': 0.7925563354266716, 'recall': 0.704349967170059, 'f1': 0.7458543368677212}


100%|██████████| 841/841 [00:52<00:00, 16.10it/s]
100%|██████████| 3454/3454 [01:27<00:00, 39.49it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 1: {'precision': 0.8215936860180966, 'recall': 0.8167925147734734, 'f1': 0.8191860656547365}


100%|██████████| 841/841 [00:56<00:00, 15.01it/s]
100%|██████████| 3454/3454 [01:27<00:00, 39.31it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 2: {'precision': 0.8213666830145496, 'recall': 0.8247373604727511, 'f1': 0.8230485707265133}


100%|██████████| 841/841 [00:53<00:00, 15.62it/s]
100%|██████████| 3454/3454 [01:03<00:00, 54.80it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 3: {'precision': 0.8480925050641458, 'recall': 0.824704530531845, 'f1': 0.8362350199733689}


100%|██████████| 841/841 [00:51<00:00, 16.46it/s]
100%|██████████| 3454/3454 [01:03<00:00, 54.48it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 4: {'precision': 0.846059683716006, 'recall': 0.8395600787918581, 'f1': 0.8427973502949608}


In [20]:
# below is bioclinical bert
trained_model, eval_metric, preds, labels = train_model(model, optimizer, lr_scheduler, train_dataloader, num_epochs)

  0%|          | 0/841 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 841/841 [00:53<00:00, 15.77it/s]
100%|██████████| 3454/3454 [01:03<00:00, 54.66it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 0: {'precision': 0.7775513900208276, 'recall': 0.7567198378426016, 'f1': 0.7669941938365341}


100%|██████████| 841/841 [00:50<00:00, 16.52it/s]
100%|██████████| 3454/3454 [01:02<00:00, 54.97it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 1: {'precision': 0.8188947319863457, 'recall': 0.7822331893892659, 'f1': 0.8001442351032183}


100%|██████████| 841/841 [00:53<00:00, 15.63it/s]
100%|██████████| 3454/3454 [01:02<00:00, 55.11it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 2: {'precision': 0.8492880238776271, 'recall': 0.8024587996827355, 'f1': 0.8252095763159881}


100%|██████████| 841/841 [00:50<00:00, 16.77it/s]
100%|██████████| 3454/3454 [01:02<00:00, 55.41it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 3: {'precision': 0.8376754412384796, 'recall': 0.8170294644693165, 'f1': 0.8272236515325014}


100%|██████████| 841/841 [00:51<00:00, 16.39it/s]
100%|██████████| 3454/3454 [01:02<00:00, 55.50it/s]


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
epoch 4: {'precision': 0.8443005952380952, 'recall': 0.8333627096736289, 'f1': 0.8387959965109918}


In [81]:
model.device

device(type='cuda', index=0)

In [24]:
model

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (defau

In [23]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [151]:
len(eval_dataloader)

3454

In [36]:
unique_preds = np.unique([np.unique(p[0])[0] for p in preds])

In [37]:
unique_preds

array(['O'], dtype='<U1')

In [38]:
unique_labels = np.unique([np.unique(l[0])[0] for l in labels])

In [39]:
unique_labels

array(['B-problem', 'B-test', 'B-treatment', 'O'], dtype='<U11')

In [24]:
all_labels = all_ner_tags

In [189]:
datasets[0].metric

'f1'

In [20]:
model.device

device(type='cpu')

100%|██████████| 3454/3454 [01:17<00:00, 44.53it/s]

['I-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'B-treatment', 'O', 'B-treatment', 'B-treatment', 'B-treatment', 'B-problem', 'B-treatment', 'I-treatment', 'B-treatment', 'I-treatment', 'O', 'I-treatment', 'O', 'I-treatment', 'B-treatment', 'B-treatment', 'I-treatment']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']





In [26]:
print(f"precision: {precision_score(references, predictions)}\nrecall: {recall_score(references, predictions)}\nf1: {f1_score(references, predictions)}")

precision: 0.03969077955278091
recall: 0.19139855548260012
f1: 0.06574736883653659
