In [1]:

import argparse
import os

#set visible cuda devices
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, LoraConfig, IA3Config,TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PeftConfig,
    PeftModel,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
    prepare_model_for_int8_training,
    # AutoPeftModel,
    prepare_model_for_kbit_training # only for latest dev version of peft
)


import evaluate
from datasets import load_dataset, load_from_disk
from transformers import (AutoModelForSequenceClassification,
                          AutoModelForTokenClassification, 
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          AutoModel,
                        AutoTokenizer,
                        get_linear_schedule_with_warmup,
                        set_seed,
                        LlamaForSequenceClassification,
                        LlamaForCausalLM,
                        LlamaTokenizer, LongformerForMaskedLM, LongformerForSequenceClassification)
import yaml
from tqdm import tqdm
from loguru import logger as loguru_logger
import numpy as np

import sys
sys.path.append("../")

from data_utils.model_utils import count_trainable_parameters, freeze_model, unfreeze_model
from sklearn.metrics import roc_auc_score

In [4]:
# # med dataset instructions
# from datasets import load_dataset

# dataset = load_dataset("nlpie/Llama2-MedTuned-Instructions")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 205048
    })
})

## Setup some parameters 

In [None]:
# function to count trainable params of a model


In [15]:
LlamaTokenizer

transformers.models.llama.tokenization_llama.LlamaTokenizer

In [2]:

batch_size = 2
# model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-wecho/sampled_250000/08-03-2023--13-06/checkpoint-84000/" # 
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
# model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000/"
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
model_name_or_path = "roberta-base" # | roberta-large
peft_method = "LORA" # | PROMPT_TUNING | PREFIX_TUNING | P_TUNING
device = "cuda"
num_epochs = 3

In [19]:
## llama model
# model_name_or_path = "decapoda-research/llama-7b-hf" # ybelkada/falcon-7b-sharded-bf16
model_name_or_path ="ybelkada/falcon-7b-sharded-bf16"

In [3]:
## open llama 3b
model_path = 'openlm-research/open_llama_3b'
model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map = "auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

In [8]:
auto_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map = "auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [9]:
auto_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

## debugging

In [2]:
# load standard model with peft
model_name_or_path = "roberta-base"

# model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map = "auto")
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [28]:
# get model hidden states
# create random prompt
prompt = "this is a test prompt and lets make it bit longer for funsies"
batch = tokenizer(prompt, return_tensors="pt", padding=True)
# pass to model and get hidden states
outputs = model(**batch, output_hidden_states=True)

In [30]:
outputs.logits.shape

torch.Size([1, 17, 2])

In [31]:
outputs.logits

tensor([[[ 0.0184,  0.1240],
         [-0.0780,  0.4109],
         [-0.2964,  0.3555],
         [-0.2192,  0.1602],
         [-0.0996,  0.2478],
         [-0.0077,  0.3706],
         [-0.2177,  0.5303],
         [-0.0495,  0.4836],
         [-0.2133,  0.2869],
         [-0.0471,  0.3550],
         [-0.0575,  0.3772],
         [-0.1148,  0.1251],
         [-0.2037,  0.4351],
         [-0.0977,  0.2900],
         [-0.1378,  0.4795],
         [-0.2089,  0.2837],
         [ 0.0451,  0.1555]]], dtype=torch.float16, grad_fn=<ToCopyBackward0>)

In [37]:
outputs.hidden_states[12].device

device(type='cpu')

In [32]:
outputs.hidden_states[12].shape

torch.Size([1, 17, 768])

In [39]:
model.classifier

Linear(in_features=768, out_features=2, bias=True)

In [40]:
# pass manually to token classifier head
model.classifier(outputs.hidden_states[12])

tensor([[[ 0.0184,  0.1240],
         [-0.0780,  0.4109],
         [-0.2964,  0.3555],
         [-0.2192,  0.1602],
         [-0.0996,  0.2478],
         [-0.0077,  0.3706],
         [-0.2177,  0.5303],
         [-0.0495,  0.4836],
         [-0.2133,  0.2869],
         [-0.0471,  0.3550],
         [-0.0575,  0.3772],
         [-0.1148,  0.1251],
         [-0.2037,  0.4351],
         [-0.0977,  0.2900],
         [-0.1378,  0.4795],
         [-0.2089,  0.2837],
         [ 0.0451,  0.1555]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<ViewBackward0>)

In [45]:
# decompose the classifier

outputs.hidden_states[12].cuda() @ model.classifier.weight.T + model.classifier.bias

tensor([[[ 0.0184,  0.1240],
         [-0.0780,  0.4109],
         [-0.2964,  0.3555],
         [-0.2192,  0.1602],
         [-0.0996,  0.2478],
         [-0.0077,  0.3706],
         [-0.2177,  0.5303],
         [-0.0495,  0.4836],
         [-0.2133,  0.2869],
         [-0.0471,  0.3550],
         [-0.0575,  0.3772],
         [-0.1148,  0.1251],
         [-0.2037,  0.4351],
         [-0.0977,  0.2900],
         [-0.1378,  0.4795],
         [-0.2089,  0.2837],
         [ 0.0451,  0.1555]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)

In [11]:
# just freeze the plm
freeze_model(model.base_model)


In [12]:
count_trainable_parameters(model)

592130

In [6]:
# load standard model with peft
model_name_or_path = "roberta-base"

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map = "auto")
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type = "SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, modules_to_save=["classifier"])
peft_model = get_peft_model(model, peft_config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [8]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules={'value', 'query'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=['classifier'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [66]:
count_trainable_parameters(peft_model.base_model.classifier)

1184260

### can we load in a base peft model with no spcific task

What can happen is the get_peft_model will end up freezing all layers if no task is specified

[x] - Works for AutoModel with no LM head
[]  - Does not work for AutoModelForMLM

In [23]:
auto_model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch.float16)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
auto_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [25]:
mlm_model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [3]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type=None, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [11]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [4]:
lora_roberta  = get_peft_model(model, peft_config)

NameError: name 'model' is not defined

In [19]:
lora_roberta

PeftModel(
  (base_model): LoraModel(
    (model): RobertaForMaskedLM(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_

In [21]:
lora_roberta.print_trainable_parameters()

trainable params: 294,912 || all params: 124,992,345 || trainable%: 0.23594404921357384


### Can we use peft on top of peft method - i.e. LORA first and then PROMPT_TUNING on top of that?

In [5]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should pr

In [7]:
lora_roberta  = get_peft_model(model, peft_config)

In [8]:
lora_roberta.print_trainable_parameters()

trainable params: 1,479,172 || all params: 125,534,212 || trainable%: 1.1783018959006968


In [9]:
# count trainable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
count_parameters(lora_roberta)

1479172

In [12]:
lora_roberta

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [8]:
lora_roberta.print_trainable_parameters()

trainable params: 1,479,172 || all params: 125,534,212 || trainable%: 1.1783018959006968


In [10]:
# now run this through the prefix tuning peft config too

second_peft_type = PeftType.PROMPT_TUNING
lr = 1e-3
second_peft_config = PromptTuningConfig(task_type="SEQ_CLS", 
                                    num_virtual_tokens=10)

full_peft_roberta = get_peft_model(lora_roberta, second_peft_config)


AttributeError: 'NoneType' object has no attribute 'named_parameters'

### IA3 

In [6]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
peft_type =PeftType.IA3
# peft_config = IA3Config(task_type="SEQ_CLS", inference_mode=False)
# for bio-distilbert
peft_config = IA3Config(task_type="SEQ_CLS", 
                        target_modules=["k_lin", "v_lin","lin1", "lin2"], 
                        feedforward_modules=["lin1","lin2"],
                        inference_mode=False)
lr = 1e-3

In [4]:
peft_config

IA3Config(peft_type=<PeftType.IA3: 'IA3'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, target_modules=None, feedforward_modules=None, fan_in_fan_out=False, modules_to_save=None, init_ia3_weights=True)

In [14]:
model_name_or_path = "nlpie/bio-distilbert-uncased" # "nlpie/bio-mobilebert" nlpie/bio-distilbert-uncased
 
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at nlpie/bio-distilbert-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 624,386 || all params: 67,579,396 || trainable%: 0.9239295361562568


In [15]:
model

PeftModelForSequenceClassification(
  (base_model): IA3Model(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(
                  in_features=768, out_features=768, bias=True
                  (ia3_l): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 768x1])
                )
                (v_lin): Linear(
        

In [35]:
model.base_model.model.mobilebert.encoder.layer[0].bottleneck.attention.dense

Linear(in_features=512, out_features=128, bias=True)

In [6]:
model.peft_config

{'default': IA3Config(peft_type=<PeftType.IA3: 'IA3'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type='SEQ_CLS', inference_mode=False, target_modules=['key', 'value', 'output.dense'], feedforward_modules=['output.dense'], fan_in_fan_out=False, modules_to_save=None, init_ia3_weights=True)}

In [33]:
model

PeftModelForSequenceClassification(
  (base_model): IA3Model(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(
                    in_features=768, out_features=768, bias=True
                    (ia3_l): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 768x1])
       

### mobile bert models
at moment does not work with mobile bert models - the peft library does not support them directly

In [27]:
# moible bert doesn't work with lora right now
mobile_bert = AutoModelForSequenceClassification.from_pretrained("nlpie/clinical-mobilebert") # nlpie/clinical-mobilebert | nlpie/bio-mobilebert

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at nlpie/clinical-mobilebert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
mobile_bert

MobileBertForSequenceClassification(
  (mobilebert): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0-23): 24 x MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_fe

In [13]:
mobile_bert.config

MobileBertConfig {
  "_name_or_path": "nlpie/bio-mobilebert",
  "architectures": [
    "MobileBertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 30522
}

In [28]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", target_modules=["key", "value", "query"], inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [29]:
# get peft model
lora_mobile_bert  = get_peft_model(mobile_bert, peft_config)

In [22]:
lora_mobile_bert

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MobileBertForSequenceClassification(
      (mobilebert): MobileBertModel(
        (embeddings): MobileBertEmbeddings(
          (word_embeddings): Embedding(30522, 128, padding_idx=0)
          (position_embeddings): Embedding(512, 512)
          (token_type_embeddings): Embedding(2, 512)
          (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
          (LayerNorm): NoNorm()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): MobileBertEncoder(
          (layer): ModuleList(
            (0-23): 24 x MobileBertLayer(
              (attention): MobileBertAttention(
                (self): MobileBertSelfAttention(
                  (query): Linear(
                    in_features=128, out_features=128, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
         

In [30]:
count_trainable_parameters(lora_mobile_bert)

222210

In [25]:
count_trainable_parameters(lora_mobile_bert.base_model.classifier)

1026

### distil bert

In [2]:
distil_bert = AutoModelForSequenceClassification.from_pretrained("nlpie/bio-distilbert-uncased") # distilbert-base-uncased nlpie/distil-biobert nlpie/distil-clinicalbert nlpie/bio-distilbert-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at nlpie/bio-distilbert-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
distil_bert

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [14]:
count_trainable_parameters(distil_bert)

65784578

In [7]:

peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", target_modules=["q_lin","k_lin", "v_lin"], inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


In [8]:
lora_distil_bert  = get_peft_model(distil_bert, peft_config)

In [9]:
lora_distil_bert 

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(
                  in_features=768, out_features=768, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=Fals

##### prefix

In [29]:
model_name_or_path = "nlpie/bio-distilbert-uncased" # "nlpie/bio-mobilebert" nlpie/bio-distilbert-uncased
peft_type = PeftType.PREFIX_TUNING
# peft_config = PrefixTuningConfig(task_type="SEQ_CLS",
#                                     num_virtual_tokens=20,
#           )

# for distil
peft_config = PrefixTuningConfig(task_type="SEQ_CLS",
                                    num_virtual_tokens=20,
                                    num_layers = 6,
                                    num_attention_heads = 12,
                                    token_dim = 768)
lr = 1e-2 # default 1e-2
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model.peft_config

{'default': PrefixTuningConfig(peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=20, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, encoder_hidden_size=768, prefix_projection=False)}

### tiny bert

In [17]:
tiny_bert

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, e

In [33]:
tiny_bert = AutoModelForSequenceClassification.from_pretrained("nlpie/tiny-biobert") # prajjwal1/bert-tiny

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpie/tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
peft_type =PeftType.IA3
peft_config = IA3Config(task_type="SEQ_CLS", inference_mode=False)
# for bio-distilbert
# peft_config = IA3Config(task_type="SEQ_CLS", 
#                         target_modules=["k_lin", "v_lin","lin1", "lin2"], 
#                         feedforward_modules=["lin1","lin2"],
#                         inference_mode=False)
lr = 1e-3

model = get_peft_model(tiny_bert, peft_config)


In [35]:
model

PeftModelForSequenceClassification(
  (base_model): IA3Model(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28996, 312, padding_idx=0)
          (position_embeddings): Embedding(512, 312)
          (token_type_embeddings): Embedding(2, 312)
          (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-3): 4 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(in_features=312, out_features=312, bias=True)
                  (key): Linear(
                    in_features=312, out_features=312, bias=True
                    (ia3_l): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 312x1])
                  )
                  (value): L

### Longformer models?

In [26]:
# load longformer 

longformer = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weig

In [4]:
longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [6]:
count_trainable_parameters(longformer)

148660994

In [27]:
count_trainable_parameters(longformer.classifier)

592130

In [72]:
"longformer" in "yikuan8/Clinical-Longformer".lower()

True

In [67]:
auto_longformer = AutoModelForSequenceClassification.from_pretrained("yikuan8/Clinical-Longformer")

Some weights of the model checkpoint at yikuan8/Clinical-Longformer were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initial

In [68]:
count_trainable_parameters(auto_longformer)

148660994

In [30]:
count_trainable_parameters(auto_longformer.classifier)

592130

In [69]:
auto_longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [31]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type=None, target_modules=["query","value","key", 
                                                         "query_global", 
                                                         "value_global",
                                                         "key_global"] ,inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [33]:
peft_longformer = get_peft_model(longformer, peft_config)

In [16]:
peft_longformer

PeftModel(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [17]:
peft_longformer.print_trainable_parameters()

trainable params: 884,736 || all params: 149,545,730 || trainable%: 0.5916156883917716


In [24]:
count_trainable_parameters(peft_longformer.classifier)

0

In [34]:
# unfreeze classifier
unfreeze_model(peft_longformer.classifier)

In [35]:
count_trainable_parameters(peft_longformer.classifier)

592130

In [42]:
peft_longformer.print_trainable_parameters()

trainable params: 1,476,866 || all params: 149,545,730 || trainable%: 0.987568150558361


In [36]:
peft_longformer

PeftModel(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [43]:
peft_longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

when no task is specified peft library freezes all non-lora weights including the classifier etc

In [19]:
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
# create set of inputs to pass to transformer model
inputs = tokenizer(["Hello, my dog is cute", "Hello, my cat is cute"], return_tensors="pt", padding="max_length")

In [23]:
inputs

{'input_ids': tensor([[    0, 31414,     6,  ...,     1,     1,     1],
        [    0, 31414,     6,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [24]:
# pass to model
outputs = peft_longformer(**inputs)

In [15]:
# can we just load with seq task type? 

In [37]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", target_modules=["query","value","key", 
                                                         "query_global", 
                                                         "value_global",
                                                         "key_global"] ,inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [38]:
lora_long_auto = get_peft_model(auto_longformer, peft_config)

In [39]:
lora_long_auto

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDic

In [40]:
lora_long_auto.print_trainable_parameters()

trainable params: 2,068,996 || all params: 150,137,860 || trainable%: 1.3780641338567101


In [44]:
lora_long_auto.classifier

ModulesToSaveWrapper(
  (original_module): LongformerClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=2, bias=True)
  )
  (modules_to_save): ModuleDict(
    (default): LongformerClassificationHead(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (out_proj): Linear(in_features=768, out_features=2, bias=True)
    )
  )
)

In [41]:
count_trainable_parameters(lora_long_auto.classifier)

1184260

In [23]:
768*768 + 768*2

591360

### Causal LM?

In [60]:
model_name = "gpt2"
clm_model = AutoModelForCausalLM.from_pretrained(model_name)

In [53]:
count_trainable_parameters(clm_model.lm_head)

38597376

In [54]:
clm_model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [61]:
clm_peft_config = LoraConfig(task_type="CAUSAL_LM",
                             
                         inference_mode=False,
                         r=8,
                         lora_alpha=16,
                         lora_dropout=0.1)

In [56]:
clm_peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules=['query', 'value', 'key', 'lm_head.dense'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [62]:
PeftModel.from_pretrained(clm_model)

TypeError: from_pretrained() missing 1 required positional argument: 'model_id'

In [63]:
peft_clm_model = get_peft_model(clm_model, clm_peft_config)



In [66]:
peft_clm_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
        

In [68]:
count_trainable_parameters(peft_clm_model)

294912

In [69]:
unfreeze_model(peft_clm_model.lm_head)

In [70]:
count_trainable_parameters(peft_clm_model)

38892288

### Masked language model?

In [2]:
model_name = "roberta-base"
# auto_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16, device_map = "auto")
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map = "auto")




In [27]:
mlm_model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [6]:
count_trainable_parameters(mlm_model.base_model)

124055040

In [7]:
count_trainable_parameters(mlm_model)

124697433

In [18]:
mlm_model.lm_head

RobertaLMHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (decoder): Linear(in_features=768, out_features=50265, bias=True)
)

In [17]:
count_trainable_parameters(mlm_model.lm_head)

590592

In [16]:
count_trainable_parameters(mlm_model.roberta.embeddings.word_embeddings)

38603520

In [None]:
mlm_model.lm_head.decoder

In [19]:
count_trainable_parameters(mlm_model.lm_head.decoder)

38653785

In [7]:
# peft_config = LoraConfig(task_type=None, target_modules=["query","value","key", "lm_head.dense"],
#                          inference_mode=False,
#                          r=8,
#                          lora_alpha=16,
#                          lora_dropout=0.1)
peft_config = LoraConfig(task_type=None,
                         inference_mode=False,
                         r=8,
                         lora_alpha=16,
                         lora_dropout=0.1,
                         modules_to_save = ["lm_head"])

In [8]:
# load peft model
peft_mlm_model = get_peft_model(mlm_model, peft_config)

In [10]:
peft_mlm_model.peft_config

{'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type=None, inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=['lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)}

In [13]:
count_trainable_parameters(peft_mlm_model.lm_head)

78491826

In [46]:
peft_mlm_model.lm_head

RobertaLMHead(
  (dense): Linear(
    in_features=768, out_features=768, bias=True
    (lora_dropout): ModuleDict(
      (default): Dropout(p=0.1, inplace=False)
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=768, out_features=8, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=8, out_features=768, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (decoder): Linear(in_features=768, out_features=50265, bias=True)
)

In [47]:
# have to unfreeze/need to just write the class for peft model and add as pull request
unfreeze_model(peft_mlm_model.lm_head.decoder)

In [48]:
count_trainable_parameters(peft_mlm_model.lm_head)

38666073

### 8 bit models

In [51]:
# can you do 8 bit roberta?
model_name_or_path = "ybelkada/falcon-7b-sharded-bf16" # | roberta-large | ybelkada/falcon-7b-sharded-bf16
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",)

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of the model checkpoint at ybelkada/falcon-7b-sharded-bf16 were not used when initializing RWForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing RWForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RWForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RWForSequenceClassification were not initialized from the model checkpoint at ybelkada/falcon-7b-sharded-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
model.dtype

torch.bfloat16

In [55]:
model.transformer.word_embeddings.weight.dtype

torch.bfloat16

In [57]:
model_prepared = prepare_model_for_kbit_training(model)

In [58]:
model_prepared.transformer.word_embeddings.weight.dtype

torch.float32

In [10]:
roberta_model_prepared.roberta.encoder.layer[0].attention.self.query.weight.dtype

torch.int8

In [18]:
roberta_model_prepared

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (key): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (value): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear8bitLt(in_features=768, out_features=768, 

In [3]:
# falcon 
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [8]:
tokenizer
if getattr(tokenizer, "pad_token_id") is None:
    print("no pad token")
    tokenizer.pad_token = tokenizer.eos_token

no pad token


In [21]:
falcon_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto", trust_remote_code = True)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of the model checkpoint at ybelkada/falcon-7b-sharded-bf16 were not used when initializing RWForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing RWForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RWForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RWForSequenceClassification were not initialized from the model checkpoint at ybelkada/falcon-7b-sharded-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
falcon_model

RWForSequenceClassification(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=4544, out_features=4672, bias=False)
          (dense): Linear8bitLt(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): MLP(
          (dense_h_to_4h): Linear8bitLt(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear8bitLt(in_features=18176, out_features=4544, bias=False)
        )
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=7, bias=False)
)

In [14]:
falcon_model.config.pad_token_id = tokenizer.pad_token_id

In [15]:
falcon_model.config.pad_token_id

11

In [4]:
# 8 bit llama
llama_model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",
                                                                )

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of the model checkpoint at decapoda-research/llama-7b-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at decapoda-research/llama-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 16bit
# llama_model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=7, torch_dtype=torch.bfloat16)

In [21]:
llama_model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [6]:
llama_model.to(device)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [5]:
# roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", output_hidden_states=True)
# custom_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, output_hidden_states=True)

In [5]:
roberta_model.state_dict().keys()

odict_keys(['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.outp

In [8]:
for p in roberta_model.parameters():
    print(p.dtype)
    break

torch.float32


In [6]:
roberta_model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [10]:
custom_model.state_dict().keys()

odict_keys(['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.outp

In [7]:
custom_model.config

RobertaConfig {
  "_name_or_path": "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [7]:
for p in custom_model.parameters():
    print(p.dtype)
    break

torch.float32


In [17]:
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# custom_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
llama_tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [8]:
roberta_tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [9]:
custom_tokenizer

RobertaTokenizerFast(name_or_path='/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [4]:
# peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# lr = 3e-4

In [12]:
optimizer = AdamW(params=roberta_model.parameters(), lr=0.001)  

In [14]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [13]:
custom_optimizer = AdamW(params=custom_model.parameters(), lr=0.001)

In [15]:
custom_optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

## Function to load different peft setups

In [3]:
def setup_peft_model(model_name_or_path,                     
                     peft_method,
                     task_type,                     
                     device,
                     num_virtual_tokens= 20,
                     num_labels = 7):
    '''
    Function to setup the peft model for training and return a peft model based on the peft method specified.
    
    '''
    
    if peft_method == "LORA":
        loguru_logger.info("Using LORA")
        peft_type = PeftType.LORA
        lr = 3e-4
        peft_config = LoraConfig(task_type=task_type, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
    elif peft_method == "PREFIX_TUNING":
        loguru_logger.info("Using PREFIX_TUNING")
        peft_type = PeftType.PREFIX_TUNING
        peft_config = PrefixTuningConfig(task_type=task_type, num_virtual_tokens=20)
        lr = 1e-2
    elif peft_method == "PROMPT_TUNING":
        loguru_logger.info("Using PROMPT_TUNING")
        peft_type = PeftType.PROMPT_TUNING
        peft_config = PromptTuningConfig(task_type=task_type, num_virtual_tokens=10)
        lr = 1e-3
    elif peft_method == "P_TUNING":
        loguru_logger.info("Using P_TUNING")
        peft_type = PeftType.P_TUNING
        peft_config = PromptEncoderConfig(task_type=task_type, num_virtual_tokens=20, encoder_hidden_size=128)
        lr = 1e-3
        

    # load peft model
    if "llama" in model_name_or_path:
        loguru_logger.info("Loading LLAMA model in 8 bit")
        # model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,
        #                                                            torch_dtype=torch.bfloat16,
        #                                                            num_labels = num_labels,return_dict=True)
        # 8 bit
        model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels, torch_dtype=torch.float16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",
                                                                )
    else:
        
        model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels = num_labels,return_dict=True)
        model.to(device)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
   
    
    
    
    # setup optimizer and lr_scheduler
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
        num_training_steps=(len(train_dataloader) * num_epochs),
    )
    return model, peft_config, optimizer, lr_scheduler

### setup task and dataset

In [3]:
# create dictionary of various datasets and their sentence keys
task_to_keys ={
                "cola": ("sentence", None),
                "mnli": ("premise", "hypothesis"),
                "mnli-mm": ("premise", "hypothesis"),
                "mrpc": ("sentence1", "sentence2"),
                "qnli": ("question", "sentence"),
                "qqp": ("question1", "question2"),
                "rte": ("sentence1", "sentence2"),
                "sst2": ("sentence", None),
                "stsb": ("sentence1", "sentence2"),
                "wnli": ("sentence1", "sentence2"),
                "mimic-note-category": ("TEXT", None),
                "icd9-triage":("text", None),
                "icd9-triage-no-category-in-text":("text", None),
                "ICD9-Triage":("text", None),
                "mednli":("sentence1", "sentence2")
                }

In [4]:
# task for now is icd9-triage
task = "ICD9-Triage"

# task = "mrpc"

### load dataset 

In [5]:
with open('../datasets.yaml', 'r') as f:
    datasets = yaml.load(f, yaml.FullLoader)

try:
    dataset_info = datasets[task]

except KeyError:
    print(f"Task name {task} not in datasets.yaml. Available tasks are: {list(datasets.keys())}")
    exit(0)



In [7]:
dataset_info

{'training_data_dir': '/mnt/sdd/efficient_ml_data/datasets/icd9-triage/no_category_in_text',
 'eval_data_dir': '/mnt/sdd/efficient_ml_data/datasets/icd9-triage/no_category_in_text',
 'data_dir': '',
 'training_file': 'train.csv',
 'validation_file': 'valid.csv',
 'test_file': 'test.csv',
 'task_type': 'SEQ_CLS',
 'label_name': 'label',
 'text_column': 'text',
 'remove_columns': ['text', 'triage-category']}

In [6]:

datasets = load_dataset("csv", 
                        data_files = {"train":f"{dataset_info['training_data_dir']}/{dataset_info['training_file']}",
                                    "validation":f"{dataset_info['eval_data_dir']}/{dataset_info['validation_file']}",
                                    "test":f"{dataset_info['eval_data_dir']}/{dataset_info['validation_file']}",
                                    },
                    cache_dir = None)

In [22]:
# # load dataset

# training_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/no_category_in_text/fewshot_64/"
# eval_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/no_category_in_text/"
# datasets = load_dataset("csv", 
#                         data_files = {"train":f"{training_data_dir}/train.csv",
#                                         "validation":f"{eval_data_dir}/valid.csv",
#                                         "test":f"{eval_data_dir}/test.csv"},
#                         cache_dir = "/mnt/sdc/niallt/.cache/")

# loguru_logger.info(f"Number of training samples: {len(datasets['train'])}\n and validation samples:{len(datasets['validation'])}")

In [7]:
# get number of labels
num_labels = len(np.unique(datasets["train"]["label"]))

In [9]:
num_labels

7

In [19]:
len(datasets["train"])/batch_size * num_epochs

140.0

### pre-process/encode dataset

In [8]:
sentence1_key, sentence2_key = task_to_keys[task]

In [9]:
model_name_or_path = "roberta-base"
batch_size = 16

In [10]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

if "llama" in model_name_or_path:
    tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


# set the sentence/task keys
# sentence1_key, sentence2_key = task_to_keys[task]

# for glue
# def tokenize_function(examples):
#     # max_length=None => use the model max length (it's actually the default)
#     outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=480)
#     return outputs

# own
def tokenize_function(examples):
    # max_length is important when using prompt tuning  or prefix tuning or p tuning as virtual tokens are added - which can overshoot the max length in pefts current form
    # for now set to 480 and see how it goes
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True, max_length = 480)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, max_length=480)

# own
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text', 'triage-category']
)

# for glue
# tokenized_datasets = datasets.map(
#     tokenize_function,
#     batched=True,
#     remove_columns=["idx", "sentence1", "sentence2"],
# )
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)

In [13]:
if "labels" not in tokenized_datasets["train"].features:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [14]:
train_dataloader.dataset[0]

{'label': 0,
 'input_ids': [0,
  35,
  4832,
  4832,
  2099,
  35,
  856,
  26331,
  35,
  3186,
  2673,
  25,
  519,
  117,
  684,
  26331,
  7,
  2196,
  4832,
  18769,
  19,
  21655,
  1499,
  6,
  1144,
  22802,
  20541,
  187,
  1423,
  73,
  139,
  538,
  15535,
  50,
  19341,
  7089,
  35,
  6286,
  7085,
  24423,
  5010,
  1640,
  5471,
  8635,
  11576,
  1423,
  73,
  139,
  2182,
  19,
  684,
  475,
  45883,
  54,
  21,
  6443,
  19,
  10,
  1144,
  22802,
  20541,
  23,
  1046,
  479,
  79,
  21,
  15423,
  19,
  13603,
  326,
  859,
  18,
  61,
  969,
  21319,
  475,
  338,
  4,
  23930,
  969,
  784,
  548,
  506,
  7606,
  19,
  6286,
  7085,
  24423,
  6701,
  7150,
  41448,
  13484,
  9,
  7606,
  4,
  79,
  9118,
  143,
  5298,
  4,
  375,
  1131,
  750,
  35,
  8944,
  33330,
  808,
  23249,
  6,
  475,
  45883,
  73,
  35685,
  6,
  6943,
  6,
  14057,
  592,
  750,
  35,
  592,
  4400,
  2678,
  6,
  697,
  19,
  985,
  6,
  9118,
  6106,
  50,
  9681,
  304,
  284,

In [15]:
# use tokenizer to decode a sample

tokenizer.decode(train_dataloader.dataset[0]["input_ids"])


"<s>: : : sex: f allergies: patient recorded as having no known allergies to drugs : sob with exertion, heart murmur since y/o major surgical or invasive procedure: mitral valve replacement(mm ce tissue y/o female with known mvp who was diagnosed with a heart murmur at age. she was evaluated with serial tte's which showed worsening mr. echo showed lvef % with mitral valve regurgitant fraction of %. she denies any symptoms. past medical history: hyperlipidemia, mvp/mr, depression, obesity social history: social etoh, live with mother, deniesda or tobacco use family history: noncontributory physical exam: y/o f in bed nad neuro aa&ox, nonfocal chest ctab resp unlab median sternotomy stable, c/d/i no d/c, rrr no m/r/g chest tubes and epicardial wires removed. abd s/nt/nd/bs+ ext warm with trace edema pertinent results: radiology preliminary report chest (pa & lat : am chest (pa & lat reason: assess lll atelectasis medical condition: year old woman with fever atelectasis seen on prio film 

In [13]:
# run through dataloader and check that the labels are correct
for batch in train_dataloader:
    # print length of input ids
    print(len(batch["input_ids"][0]))

480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480


In [16]:
from scipy.special import softmax
def compute_metrics(eval_pred):
        precision_score = evaluate.load("precision")
        recall_score = evaluate.load("recall")
        accuracy_score = evaluate.load("accuracy")
        f1_score = evaluate.load("f1")        
        roc_auc_score = evaluate.load("roc_auc", "multiclass")        

        logits, labels = eval_pred
        
        # print(f"logits are: {logits} of shape: {logits.shape}")
        #TODO add softmax to convert logits to probs
        # print(f"logits shape is: {logits.shape}")
        pred_scores = softmax(logits, axis = -1)        
        predictions = np.argmax(logits, axis = -1)
        
        # print(f"Labels are: {labels}\n")
        # print(f"Preds are: {predictions}")
        precision = precision_score.compute(predictions=predictions, references=labels, average = "macro")["precision"]
        recall = recall_score.compute(predictions=predictions, references=labels, average = "macro")["recall"]
        accuracy = accuracy_score.compute(predictions=predictions, references=labels)["accuracy"]
        f1_macro = f1_score.compute(predictions=predictions, references=labels, average = "macro")["f1"]
        f1_micro = f1_score.compute(predictions=predictions, references=labels, average = "micro")["f1"]
        f1_weighted = f1_score.compute(predictions=predictions, references=labels, average = "weighted")["f1"]
        # roc_auc has slightly different format - needs the probs/scores rather than predicted labels
        roc_auc = roc_auc_score.compute(references=labels,
                                        prediction_scores = pred_scores,
                                        multi_class = 'ovr', 
                                        average = "macro")['roc_auc']
        
        return {"precision": precision, 
                "recall": recall,
                "accuracy": accuracy,
                "f1_macro":f1_macro,
                "f1_micro":f1_micro,
                "f1_weighted":f1_weighted,
                "roc_auc_macro":roc_auc}

## Setup PEFT model

In [13]:
model, peft_config, optimizer, lr_scheduler = setup_peft_model(model_name_or_path, peft_method = "LORA", task_type = "SEQ_CLS", device = "cuda", num_labels = num_labels)

2023-07-07 15:54:44.832 | INFO     | __main__:setup_peft_model:13 - Using LORA


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of the model checkpoint at decapoda-research/llama-7b-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at decapoda-research/llama-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,251,648 || all params: 6,611,595,264 || trainable%: 0.06430593268692861


In [14]:
model.device

device(type='cuda', index=0)

In [None]:
# roberta_model, peft_config, optimizer, lr_scheduler = setup_peft_model("roberta-base", peft_method = "LORA", task_type = "SEQ_CLS", device = "cuda", num_labels = num_labels)

2023-05-26 13:53:51.951 | INFO     | __main__:setup_peft_model:13 - Using LORA
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at

trainable params: 1486862 || all params: 125541902 || trainable%: 1.1843551645409993


## debugging

In [14]:
import torchinfo
import torch.nn as nn

In [18]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(149774, device='cuda:0')


In [19]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in roberta_model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(150612, device='cuda:0')


In [21]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in declutr_model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(149774, device='cuda:0')


In [15]:
torchinfo.summary(model)

Layer (type:depth-idx)                                                      Param #
PeftModelForSequenceClassification                                          --
├─LoraModel: 1-1                                                            --
│    └─RobertaForSequenceClassification: 2-1                                --
│    │    └─RobertaModel: 3-1                                               124,349,952
│    │    └─ModulesToSaveWrapper: 3-2                                       1,191,950
Total params: 125,541,902
Trainable params: 1,486,862
Non-trainable params: 124,055,040

In [None]:
====================================================================================================
Layer (type:depth-idx)                                                      Param #
====================================================================================================
PeftModelForSequenceClassification                                          --
├─LoraModel: 1-1                                                            --
│    └─RobertaForSequenceClassification: 2-1                                --
│    │    └─RobertaModel: 3-1                                               124,349,952
│    │    └─ModulesToSaveWrapper: 3-2                                       1,191,950
====================================================================================================
Total params: 125,541,902
Trainable params: 1,486,862
Non-trainable params: 124,055,040

In [None]:
trainable params: 1486862 || all params: 125541902 || trainable%: 1.1843551645409993

In [23]:
294912/1486862

0.1983452398406846

In [13]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path='roberta-base', task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)

## Train PEFT model

In [13]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=

#### debugging - custom roberta model leads to weird memory issues

In [23]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [14]:
for batch in eval_dataloader:
    
    with torch.no_grad():
        outputs = model(**batch.to(device))
    predictions = outputs.logits.argmax(dim=-1)
    predictions, references = predictions, batch["labels"]
    print(batch.keys())
    # print(f"outputs:{outputs}")
    print(f"predictions:{predictions}")
    print(f"references:{references}")
    
    break

dict_keys(['labels', 'input_ids', 'attention_mask'])
predictions:tensor([1, 5, 5, 4], device='cuda:0')
references:tensor([0, 1, 1, 0], device='cuda:0')


In [1]:
# roberta_model

In [16]:
roberta_outputs

SequenceClassifierOutput(loss=tensor(2.0110, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0555, -0.0210, -0.1128,  0.0344,  0.1020, -0.0508, -0.0510],
        [-0.0573, -0.0200, -0.1036,  0.0467,  0.1004, -0.0551, -0.0567],
        [-0.0556, -0.0172, -0.1156,  0.0369,  0.0998, -0.0592, -0.0500],
        [-0.0651, -0.0201, -0.1072,  0.0478,  0.0974, -0.0566, -0.0518]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [17]:
custom_outputs

SequenceClassifierOutput(loss=tensor(2.0342, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1482, -0.0124, -0.0138,  0.2247,  0.0693, -0.3117,  0.2074],
        [-0.0756,  0.0031, -0.0501,  0.1328,  0.0983, -0.2762,  0.2746],
        [-0.0222,  0.0371, -0.0096,  0.2350,  0.1343, -0.2727,  0.1959],
        [-0.0484, -0.0302, -0.0769,  0.1806, -0.0051, -0.2624,  0.1530]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=(tensor([[[ 0.1712, -0.0483, -0.0230,  ..., -0.0518,  0.0839,  0.0430],
         [ 0.0998, -0.1730, -0.0247,  ..., -0.1891, -0.0604,  0.0077],
         [-0.1805, -0.3721, -0.0325,  ..., -0.2030, -0.2124,  0.0519],
         ...,
         [-0.2720,  0.0421,  0.2390,  ..., -0.5429,  0.1871, -0.1404],
         [ 0.2573, -0.1542,  0.0187,  ..., -0.9629,  0.0768,  0.2467],
         [-0.0058, -0.2664, -0.1019,  ...,  0.3818,  0.1682,  0.1829]],

        [[ 0.1712, -0.0483, -0.0230,  ..., -0.0518,  0.0839,  0.0430],
         [ 0.0998, -0.1730, -0

In [23]:
# optimizer.param_groups

In [25]:
lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7ff934fa4f40>

In [26]:
# optimizer = AdamW(params=model.parameters(), lr=lr)

# # Instantiate scheduler
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer=optimizer,
#     num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
#     num_training_steps=(len(train_dataloader) * num_epochs),
# )

In [27]:
device

'cuda'

In [14]:
metric = evaluate.load("f1")

In [15]:
metric

EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
    pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
    average (`string`): This parameter is required for multiclass/multilabel t

## Train

In [16]:
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute(average = "macro")
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/224 [00:00<?, ?it/s]

100%|██████████| 224/224 [02:54<00:00,  1.28it/s]
100%|██████████| 1557/1557 [08:14<00:00,  3.15it/s]


epoch 0: {'f1': 0.09580728769629517}


100%|██████████| 224/224 [03:03<00:00,  1.22it/s]
 13%|█▎        | 205/1557 [01:08<07:33,  2.98it/s]

 13%|█▎        | 208/1557 [01:09<07:41,  2.92it/s]

#### save model


In [None]:
model.save_pretrained('./peft_model')

## reload saved peft weights

In [9]:
# peft_model_dir = "/mnt/sdc/niallt/saved_models/peft_training/ckpts/icd9-triage-no-category-in-text/fewshot_64/mimic-roberta-base/declutr/2_anch_2_pos_min_1024/LORA/23-06-2023--11-41/"
peft_model_dir = "/mnt/sdd/efficient_ml_data/saved_models/peft/ckpts/ICD9-Triage/full/roberta-base/LORA/30-08-2023--15-58/"


In [10]:
adapter_bin = torch.load(f"{peft_model_dir}/adapter_model.bin")

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/sdd/efficient_ml_data/saved_models/peft/ckpts/ICD9-Triage/full/roberta-base/LORA/30-08-2023--15-58//adapter_model.bin'

In [29]:
adapter_bin

{'base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.weight': tensor([[-0.0171,  0.0368, -0.0372,  ..., -0.0156,  0.0369,  0.0535],
         [-0.0262,  0.0111, -0.0117,  ..., -0.0040, -0.0437,  0.0237],
         [-0.0066,  0.0427, -0.0378,  ...,  0.0340, -0.0255,  0.0529],
         ...,
         [ 0.0131, -0.0496, -0.0212,  ...,  0.0491,  0.0521, -0.0051],
         [-0.0099,  0.0169,  0.0190,  ..., -0.0289,  0.0178,  0.0147],
         [ 0.0184, -0.0021,  0.0051,  ...,  0.0164, -0.0060, -0.0724]],
        device='cuda:0'),
 'base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.weight': tensor([[-0.0051, -0.0075,  0.0026,  ..., -0.0222, -0.0095, -0.0070],
         [ 0.0025, -0.0080,  0.0042,  ...,  0.0061,  0.0087, -0.0097],
         [-0.0120, -0.0257,  0.0023,  ...,  0.0039,  0.0017, -0.0115],
         ...,
         [-0.0007,  0.0175,  0.0042,  ..., -0.0065,  0.0036, -0.0025],
         [-0.0158,  0.0124, -0.0370,  ..., -0.0119, -0.0216,  0.0210],
         [-0

In [32]:
config = PeftConfig.from_pretrained(peft_model_dir)

In [17]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='dmis-lab/biobert-v1.1', revision=None, task_type='SEQ_CLS', inference_mode=True, r=8, target_modules={'query', 'value'}, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [33]:
# load config
config = PeftConfig.from_pretrained(peft_model_dir)
# load base model 
model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels = 7)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from datasets import concatenate_datasets
all_eval = concatenate_datasets([tokenized_datasets["validation"], tokenized_datasets["test"]])

In [12]:
all_eval

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 6228
})

In [34]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [45]:
org_layer11_weights = model.roberta.encoder.layer[0].attention.self.query.weight.detach().clone()

In [46]:
org_layer11_weights

tensor([[ 0.0729, -0.0029, -0.0902,  ...,  0.1033,  0.0900, -0.1030],
        [-0.0516,  0.2061,  0.0739,  ...,  0.0657,  0.0634,  0.1282],
        [ 0.0878,  0.0698, -0.0515,  ..., -0.0426, -0.0081,  0.1100],
        ...,
        [-0.1871,  0.0172, -0.0315,  ..., -0.0503,  0.1024, -0.1165],
        [-0.2532,  0.0439,  0.0638,  ...,  0.0701, -0.1045,  0.0118],
        [-0.0516, -0.0859,  0.1027,  ..., -0.1895,  0.0033, -0.0541]])

In [35]:
# load peft model
model = PeftModel.from_pretrained(model, peft_model_dir)

RuntimeError: Error(s) in loading state_dict for PeftModelForSequenceClassification:
	size mismatch for base_model.model.classifier.modules_to_save.default.out_proj.weight: copying a param with shape torch.Size([7, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
	size mismatch for base_model.model.classifier.modules_to_save.default.out_proj.bias: copying a param with shape torch.Size([7]) from checkpoint, the shape in current model is torch.Size([2]).

In [36]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_

In [59]:
peft_loaded_model_layer11 = model.bert.encoder.layer[0].attention.self.query.weight.detach().clone()

In [60]:
torch.equal(org_layer11_weights, peft_loaded_model_layer11)

True

In [54]:
org_layer11_weights

tensor([[-0.0024, -0.0051, -0.0104,  ..., -0.0529, -0.0080, -0.0795],
        [-0.0238,  0.0116, -0.0225,  ...,  0.0170, -0.0107,  0.0075],
        [-0.0422,  0.0222,  0.0508,  ..., -0.0590, -0.0259, -0.0288],
        ...,
        [ 0.0169, -0.0077, -0.0231,  ..., -0.0427,  0.0233,  0.0325],
        [-0.0583,  0.1086, -0.0798,  ..., -0.0704, -0.0072,  0.0647],
        [ 0.0236,  0.0544,  0.0459,  ...,  0.0631, -0.0455, -0.0590]])

In [21]:
# merge and unload
model = model.merge_and_unload()

In [24]:
unloaded_peft_layer11 = model.roberta.encoder.layer[0].attention.self.query.weight.detach().clone()

In [None]:
unloaded_peft_layer11

tensor([[-0.0042,  0.0016, -0.0165,  ..., -0.0577, -0.0042, -0.0744],
        [-0.0217,  0.0038, -0.0154,  ...,  0.0222, -0.0149,  0.0017],
        [-0.0429,  0.0249,  0.0484,  ..., -0.0611, -0.0243, -0.0267],
        ...,
        [ 0.0160, -0.0047, -0.0260,  ..., -0.0447,  0.0249,  0.0347],
        [-0.0568,  0.1029, -0.0745,  ..., -0.0662, -0.0102,  0.0604],
        [ 0.0259,  0.0459,  0.0535,  ...,  0.0692, -0.0503, -0.0655]])

In [23]:
torch.equal(org_layer11_weights, unloaded_peft_layer11)

NameError: name 'org_layer11_weights' is not defined

In [27]:
from scipy.special import softmax
def compute_metrics(eval_pred):
        precision_score = evaluate.load("precision")
        recall_score = evaluate.load("recall")
        accuracy_score = evaluate.load("accuracy")
        f1_score = evaluate.load("f1")        
        roc_auc_score = evaluate.load("roc_auc", "multiclass")        

        logits, labels = eval_pred
        
        # print(f"logits are: {logits} of shape: {logits.shape}")
        #TODO add softmax to convert logits to probs
        # print(f"logits shape is: {logits.shape}")
        pred_scores = softmax(logits, axis = -1)        
        predictions = np.argmax(logits, axis = -1)
        
        # print(f"Labels are: {labels}\n")
        # print(f"Preds are: {predictions}")
        precision = precision_score.compute(predictions=predictions, references=labels, average = "macro")["precision"]
        recall = recall_score.compute(predictions=predictions, references=labels, average = "macro")["recall"]
        accuracy = accuracy_score.compute(predictions=predictions, references=labels)["accuracy"]
        f1_macro = f1_score.compute(predictions=predictions, references=labels, average = "macro")["f1"]
        f1_micro = f1_score.compute(predictions=predictions, references=labels, average = "micro")["f1"]
        f1_weighted = f1_score.compute(predictions=predictions, references=labels, average = "weighted")["f1"]
        # roc_auc has slightly different format - needs the probs/scores rather than predicted labels
        roc_auc = roc_auc_score.compute(references=labels,
                                        prediction_scores = pred_scores,
                                        multi_class = 'ovr', 
                                        average = "macro")['roc_auc']
        
        return {"precision": precision, 
                "recall": recall,
                "accuracy": accuracy,
                "f1_macro":f1_macro,
                "f1_micro":f1_micro,
                "f1_weighted":f1_weighted,
                "roc_auc_macro":roc_auc}

In [35]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [39]:
# get the number of labels from the dataset
num_labels = len(np.unique(datasets["validation"]["label"]))

In [40]:
num_labels

7

In [28]:
# run over evaluation dataloader and collect metrics
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./test_trainer",
    evaluation_strategy = "epoch",
    do_train=False,  
    do_eval=True,
    per_device_eval_batch_size=16  
)

trainer = Trainer(model=model,
                    args = training_args,
                    train_dataset=tokenized_datasets["train"],
                    eval_dataset=tokenized_datasets["validation"],
                    compute_metrics=compute_metrics,
                    tokenizer=tokenizer,
                    data_collator=collate_fn,)

metrics = trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mniall-taylor[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers`

In [29]:
metrics

{'eval_loss': 0.18856966495513916,
 'eval_precision': 0.872335055021986,
 'eval_recall': 0.8682285224869758,
 'eval_accuracy': 0.9421965317919075,
 'eval_f1_macro': 0.8697938878679062,
 'eval_f1_micro': 0.9421965317919075,
 'eval_f1_weighted': 0.9418719624376168,
 'eval_roc_auc_macro': 0.9928218321449308,
 'eval_runtime': 26.456,
 'eval_samples_per_second': 117.705,
 'eval_steps_per_second': 7.371}

In [None]:
model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    batch.to(device)
    with torch.no_grad():
        outputs = model(**batch)
        metrics

# below is miscellanous code that is not used in the tutorial

In [15]:
import torchinfo 

In [3]:
# load standard model with peft
model_name_or_path = "roberta-base"

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map = "auto")
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type = "SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classif

In [None]:
peft_model = get_peft_model(model, peft_config)

In [5]:
peft_model.peft_config

{'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)}

In [None]:
# roberta model gpu usage
18777

In [None]:
# declutr model gpu usage
19905 - 19595

In [29]:
tokenizer.model_max_length

512

In [30]:
model.peft_config

{'default': PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, base_model_name_or_path='roberta-base', task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)}

In [128]:
peft_model.model.roberta.encoder.layer[0].attention.self.query

Linear(
  in_features=768, out_features=768, bias=True
  (lora_dropout): ModuleDict(
    (default): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict(
    (default): Linear(in_features=768, out_features=8, bias=False)
  )
  (lora_B): ModuleDict(
    (default): Linear(in_features=8, out_features=768, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [139]:
query_module = peft_model.model.roberta.encoder.layer[0].attention.self.query

In [140]:
query_module

Linear(
  in_features=768, out_features=768, bias=True
  (lora_dropout): ModuleDict(
    (default): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict(
    (default): Linear(in_features=768, out_features=8, bias=False)
  )
  (lora_B): ModuleDict(
    (default): Linear(in_features=8, out_features=768, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [129]:
# convert to zeros
query_module.lora_A.default.weight.data = torch.zeros_like(query_module.lora_A.default.weight.data)
query_module.weight.data = torch.zeros_like(query_module.weight.data)

In [142]:
query_module.weight

Parameter containing:
tensor([[ 0.0729, -0.0029, -0.0902,  ...,  0.1033,  0.0900, -0.1030],
        [-0.0516,  0.2061,  0.0739,  ...,  0.0657,  0.0634,  0.1282],
        [ 0.0878,  0.0698, -0.0515,  ..., -0.0426, -0.0081,  0.1100],
        ...,
        [-0.1871,  0.0172, -0.0315,  ..., -0.0503,  0.1024, -0.1165],
        [-0.2532,  0.0439,  0.0638,  ...,  0.0701, -0.1045,  0.0118],
        [-0.0516, -0.0859,  0.1027,  ..., -0.1895,  0.0033, -0.0541]],
       device='cuda:0', dtype=torch.float16)

In [63]:
tensor.dtype

torch.float32

In [143]:
# test query module
# create tensor of shape (768, 768) and set dtype to float16
tensor = torch.rand(768, 768).to("cuda").to(torch.float16)

In [144]:

query_result = query_module(tensor)

In [145]:
query_result

tensor([[ 1.6777,  1.2158,  0.5444,  ..., -1.5322,  2.0527, -0.0883],
        [ 0.1980, -0.7061,  0.8887,  ..., -0.8794,  0.0526,  1.4717],
        [ 0.4829,  1.0625,  0.4050,  ...,  0.5127,  1.7363,  0.4033],
        ...,
        [ 0.5737, -0.4644, -0.1860,  ...,  0.7104,  0.6699,  0.5977],
        [ 1.0684, -0.0388,  0.4219,  ...,  0.7651,  1.5244,  1.4688],
        [ 0.6953,  0.8516, -0.0283,  ..., -0.2627,  1.2002,  0.5142]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [80]:
query_module.get_submodule

In [84]:
model.roberta.encoder.layer[0].attention.self.query

Linear(in_features=768, out_features=768, bias=True)

In [None]:
import torch
from torch import nn