In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
)


import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm
from loguru import logger as loguru_logger
import numpy as np

2023-05-13 14:51:01.191554: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.2/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /mnt/sdc/niallt/venvs/39nlp/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda112.so...


## Setup some parameters 

In [2]:

batch_size = 32
# model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-wecho/sampled_250000/08-03-2023--13-06/checkpoint-84000/" # 
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000/"
# model_name_or_path = "roberta-base" # | roberta-large
peft_method = "LORA" # | PROMPT_TUNING | PREFIX_TUNING | P_TUNING
device = "cuda"
num_epochs = 5

In [4]:
# peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# lr = 3e-4

### Function to load different peft setups

In [18]:
def setup_peft_model(model_name_or_path,                     
                     peft_method,
                     task_type,                     
                     device,
                     num_virtual_tokens= 20,
                     num_labels = 7):
    '''
    Function to setup the peft model for training and return a peft model based on the peft method specified.
    
    '''
    
    if peft_method == "LORA":
        loguru_logger.info("Using LORA")
        peft_type = PeftType.LORA
        lr = 3e-4
        peft_config = LoraConfig(task_type=task_type, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
    elif peft_method == "PREFIX_TUNING":
        loguru_logger.info("Using PREFIX_TUNING")
        peft_type = PeftType.PREFIX_TUNING
        peft_config = PrefixTuningConfig(task_type=task_type, num_virtual_tokens=20)
        lr = 1e-2
    elif peft_method == "PROMPT_TUNING":
        loguru_logger.info("Using PROMPT_TUNING")
        peft_type = PeftType.PROMPT_TUNING
        peft_config = PromptTuningConfig(task_type=task_type, num_virtual_tokens=10)
        lr = 1e-3
    elif peft_method == "P_TUNING":
        loguru_logger.info("Using P_TUNING")
        peft_type = PeftType.P_TUNING
        peft_config = PromptEncoderConfig(task_type=task_type, num_virtual_tokens=20, encoder_hidden_size=128)
        lr = 1e-3
        

    # load peft model
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels = num_labels,return_dict=True)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
   
    
    model.to(device)
    
    # setup optimizer and lr_scheduler
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
        num_training_steps=(len(train_dataloader) * num_epochs),
    )
    return model, peft_config, optimizer, lr_scheduler

### setup task and dataset

In [4]:
# create dictionary of various datasets and their sentence keys
task_to_keys ={
                "cola": ("sentence", None),
                "mnli": ("premise", "hypothesis"),
                "mnli-mm": ("premise", "hypothesis"),
                "mrpc": ("sentence1", "sentence2"),
                "qnli": ("question", "sentence"),
                "qqp": ("question1", "question2"),
                "rte": ("sentence1", "sentence2"),
                "sst2": ("sentence", None),
                "stsb": ("sentence1", "sentence2"),
                "wnli": ("sentence1", "sentence2"),
                "mimic-note-category": ("TEXT", None),
                "icd9-triage":("text", None),
                "icd9-triage-no-category-in-text":("text", None),
                }

In [5]:
# task for now is icd9-triage
task = "icd9-triage"

# task = "mrpc"

In [8]:
# datasets = load_dataset("glue", task)
# metric = evaluate.load("glue", "mrpc")

In [6]:
metric = evaluate.load("f1")

In [10]:
metric

EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
    pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
    average (`string`): This parameter is required for multiclass/multilabel t

In [11]:
# metric.add_batch

### load dataset 

In [7]:
# load mimic dataset

training_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/fewshot_128/"
eval_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/"
datasets = load_dataset("csv", 
                        data_files = {"train":f"{training_data_dir}/train.csv",
                                        "validation":f"{eval_data_dir}/valid.csv",
                                        "test":f"{eval_data_dir}/test.csv"},
                        cache_dir = "/mnt/sdc/niallt/.cache/")

loguru_logger.info(f"Number of training samples: {len(datasets['train'])}\n and validation samples:{len(datasets['validation'])}")

Found cached dataset csv (/mnt/sdc/niallt/.cache/csv/default-a2aa7db9a148ce03/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

2023-05-13 14:51:50.070 | INFO     | __main__:<cell line: 11>:11 - Number of training samples: 896
 and validation samples:3114


In [13]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'triage-category', 'label'],
        num_rows: 896
    })
    validation: Dataset({
        features: ['text', 'triage-category', 'label'],
        num_rows: 3114
    })
    test: Dataset({
        features: ['text', 'triage-category', 'label'],
        num_rows: 3172
    })
})

In [8]:
# get number of labels
num_labels = len(np.unique(datasets["train"]["label"]))

In [15]:
num_labels

7

In [19]:
len(datasets["train"])/batch_size * num_epochs

140.0

In [14]:
num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
num_training_steps=(len(train_dataloader) * num_epochs),

In [15]:
num_training_steps

(140,)

### pre-process/encode dataset

In [9]:
sentence1_key, sentence2_key = task_to_keys[task]

In [10]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


# set the sentence/task keys
# sentence1_key, sentence2_key = task_to_keys[task]

# for glue
# def tokenize_function(examples):
#     # max_length=None => use the model max length (it's actually the default)
#     outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=480)
#     return outputs

# own
def tokenize_function(examples):
    # max_length is important when using prompt tuning  or prefix tuning or p tuning as virtual tokens are added - which can overshoot the max length in pefts current form
    # for now set to 480 and see how it goes
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True, max_length = 480)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, max_length=480)

# own
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text', 'triage-category'],
)

# for glue
# tokenized_datasets = datasets.map(
#     tokenize_function,
#     batched=True,
#     remove_columns=["idx", "sentence1", "sentence2"],
# )
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)

Map:   0%|          | 0/896 [00:00<?, ? examples/s]

Map:   0%|          | 0/3114 [00:00<?, ? examples/s]

Map:   0%|          | 0/3172 [00:00<?, ? examples/s]

In [18]:
for batch in train_dataloader:
    print(batch["input_ids"].shape)
    

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])
torch.Size([32, 480])


In [19]:
# train_dataloader.dataset[0]

In [11]:
def compute_metrics(eval_pred):
        precision_score = evaluate.load("precision")
        recall_score = evaluate.load("recall")
        accuracy_score = evaluate.load("accuracy")
        f1_score = evaluate.load("f1")        
        roc_auc_score = evaluate.load("roc_auc", "multiclass")        

        logits, labels = eval_pred
        
        # print(f"logits are: {logits} of shape: {logits.shape}")
        #TODO add softmax to convert logits to probs
        # print(f"logits shape is: {logits.shape}")
        pred_scores = softmax(logits, axis = -1)        
        predictions = np.argmax(logits, axis = -1)
        
        # print(f"Labels are: {labels}\n")
        # print(f"Preds are: {predictions}")
        precision = precision_score.compute(predictions=predictions, references=labels, average = "macro")["precision"]
        recall = recall_score.compute(predictions=predictions, references=labels, average = "macro")["recall"]
        accuracy = accuracy_score.compute(predictions=predictions, references=labels)["accuracy"]
        f1_macro = f1_score.compute(predictions=predictions, references=labels, average = "macro")["f1"]
        f1_weighted = f1_score.compute(predictions=predictions, references=labels, average = "weighted")["f1"]
        # roc_auc has slightly different format - needs the probs/scores rather than predicted labels
        roc_auc = roc_auc_score.compute(references=labels,
                                        prediction_scores = pred_scores,
                                        multi_class = 'ovr', 
                                        average = "macro")['roc_auc']
        
        return {"precision": precision, 
                "recall": recall,
                "accuracy": accuracy,
                "f1_macro":f1_macro,
                "f1_weighted":f1_weighted,
                "roc_auc_macro":roc_auc}

## Setup PEFT model

In [19]:
model, peft_config, optimizer, lr_scheduler = setup_peft_model(model_name_or_path, peft_method = "LORA", task_type = "SEQ_CLS", device = "cuda", num_labels = num_labels)

2023-05-13 15:15:07.512 | INFO     | __main__:setup_peft_model:13 - Using LORA
Some weights of the model checkpoint at /mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized fr

trainable params: 1486862 || all params: 125541902 || trainable%: 1.1843551645409993


In [23]:
294912/1486862

0.1983452398406846

In [20]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path='/mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000/', task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)

In [21]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [23]:
# optimizer.param_groups

In [24]:
# model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels = num_labels,return_dict=True)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()
# model

In [25]:
lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7ff934fa4f40>

In [26]:
# optimizer = AdamW(params=model.parameters(), lr=lr)

# # Instantiate scheduler
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer=optimizer,
#     num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
#     num_training_steps=(len(train_dataloader) * num_epochs),
# )

In [27]:
device

'cuda'

In [13]:
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute(average = "macro")
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/28 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 28/28 [00:15<00:00,  1.86it/s]
100%|██████████| 98/98 [00:24<00:00,  4.05it/s]


epoch 0: {'f1': 0.4277379326469163}


100%|██████████| 28/28 [00:14<00:00,  1.96it/s]
100%|██████████| 98/98 [00:24<00:00,  4.01it/s]


epoch 1: {'f1': 0.7469826364838471}


100%|██████████| 28/28 [00:14<00:00,  1.95it/s]
100%|██████████| 98/98 [00:24<00:00,  4.01it/s]


epoch 2: {'f1': 0.8215974876511716}


100%|██████████| 28/28 [00:14<00:00,  1.94it/s]
100%|██████████| 98/98 [00:24<00:00,  4.00it/s]


epoch 3: {'f1': 0.8353677486601272}


100%|██████████| 28/28 [00:14<00:00,  1.94it/s]
100%|██████████| 98/98 [00:24<00:00,  4.00it/s]

epoch 4: {'f1': 0.835936696393503}





In [29]:
tokenizer.model_max_length

512

In [30]:
model.peft_config

{'default': PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, base_model_name_or_path='roberta-base', task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)}