In [1]:

import argparse
import os

#set visible cuda devices
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PeftConfig,
    PeftModel,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
    prepare_model_for_int8_training,
    # AutoPeftModel,
    prepare_model_for_kbit_training # only for latest dev version of peft
)


import evaluate
from datasets import load_dataset, load_from_disk
from transformers import (AutoModelForSequenceClassification, 
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          AutoModel,
                        AutoTokenizer,
                        get_linear_schedule_with_warmup,
                        set_seed,
                        LlamaForSequenceClassification,
                        LlamaForCausalLM,
                        LlamaTokenizer, LongformerForMaskedLM, LongformerForSequenceClassification)
import yaml
from tqdm import tqdm
from loguru import logger as loguru_logger
import numpy as np

import sys
sys.path.append("../")

from data_utils.model_utils import count_trainable_parameters, freeze_model, unfreeze_model

## Setup some parameters 

In [None]:
# function to count trainable params of a model


In [15]:
LlamaTokenizer

transformers.models.llama.tokenization_llama.LlamaTokenizer

In [3]:

batch_size = 2
# model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-wecho/sampled_250000/08-03-2023--13-06/checkpoint-84000/" # 
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
# model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000/"
# model_name_or_path = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"
model_name_or_path = "roberta-base" # | roberta-large
peft_method = "LORA" # | PROMPT_TUNING | PREFIX_TUNING | P_TUNING
device = "cuda"
num_epochs = 3

In [19]:
## llama model
# model_name_or_path = "decapoda-research/llama-7b-hf" # ybelkada/falcon-7b-sharded-bf16
model_name_or_path ="ybelkada/falcon-7b-sharded-bf16"

In [3]:
## open llama 3b
model_path = 'openlm-research/open_llama_3b'
model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map = "auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

In [8]:
auto_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map = "auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [9]:
auto_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

## debugging

In [59]:
# load standard model with peft
model_name_or_path = "roberta-base"

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map = "auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classif

In [60]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type = "SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [62]:
peft_model = get_peft_model(model, peft_config)

In [64]:
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [66]:
count_trainable_parameters(peft_model.base_model.classifier)

1184260

### can we load in a base peft model with no spcific task

What can happen is the get_peft_model will end up freezing all layers if no task is specified

[x] - Works for AutoModel with no LM head
[]  - Does not work for AutoModelForMLM

In [23]:
auto_model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch.float16)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
auto_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [25]:
mlm_model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [3]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type=None, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [11]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [4]:
lora_roberta  = get_peft_model(model, peft_config)

NameError: name 'model' is not defined

In [19]:
lora_roberta

PeftModel(
  (base_model): LoraModel(
    (model): RobertaForMaskedLM(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_

In [21]:
lora_roberta.print_trainable_parameters()

trainable params: 294,912 || all params: 124,992,345 || trainable%: 0.23594404921357384


### Can we use peft on top of peft method - i.e. LORA first and then PROMPT_TUNING on top of that?

In [5]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should pr

In [7]:
lora_roberta  = get_peft_model(model, peft_config)

In [8]:
lora_roberta.print_trainable_parameters()

trainable params: 1,479,172 || all params: 125,534,212 || trainable%: 1.1783018959006968


In [9]:
# count trainable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
count_parameters(lora_roberta)

1479172

In [12]:
lora_roberta

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [8]:
lora_roberta.print_trainable_parameters()

trainable params: 1,479,172 || all params: 125,534,212 || trainable%: 1.1783018959006968


In [10]:
# now run this through the prefix tuning peft config too

second_peft_type = PeftType.PROMPT_TUNING
lr = 1e-3
second_peft_config = PromptTuningConfig(task_type="SEQ_CLS", 
                                    num_virtual_tokens=10)

full_peft_roberta = get_peft_model(lora_roberta, second_peft_config)


AttributeError: 'NoneType' object has no attribute 'named_parameters'

### mobile bert models
at moment does not work with mobile bert models - the peft library does not support them directly

In [2]:
# moible bert doesn't work with lora right now
mobile_bert = AutoModelForSequenceClassification.from_pretrained("nlpie/bio-mobilebert")

Some weights of the model checkpoint at nlpie/bio-mobilebert were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MobileBertForSequenceClassification were not initializ

In [3]:
mobile_bert

MobileBertForSequenceClassification(
  (mobilebert): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0-23): 24 x MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_fe

In [13]:
mobile_bert.config

MobileBertConfig {
  "_name_or_path": "nlpie/bio-mobilebert",
  "architectures": [
    "MobileBertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 30522
}

In [5]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", target_modules=["key", "value", "query"], inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [6]:
# get peft model
lora_mobile_bert  = get_peft_model(mobile_bert, peft_config)

In [7]:
lora_mobile_bert

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MobileBertForSequenceClassification(
      (mobilebert): MobileBertModel(
        (embeddings): MobileBertEmbeddings(
          (word_embeddings): Embedding(30522, 128, padding_idx=0)
          (position_embeddings): Embedding(512, 512)
          (token_type_embeddings): Embedding(2, 512)
          (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
          (LayerNorm): NoNorm()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): MobileBertEncoder(
          (layer): ModuleList(
            (0-23): 24 x MobileBertLayer(
              (attention): MobileBertAttention(
                (self): MobileBertSelfAttention(
                  (query): Linear(
                    in_features=128, out_features=128, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
         

### Longformer models?

In [26]:
# load longformer 

longformer = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weig

In [4]:
longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [6]:
count_trainable_parameters(longformer)

148660994

In [27]:
count_trainable_parameters(longformer.classifier)

592130

In [72]:
"longformer" in "yikuan8/Clinical-Longformer".lower()

True

In [67]:
auto_longformer = AutoModelForSequenceClassification.from_pretrained("yikuan8/Clinical-Longformer")

Some weights of the model checkpoint at yikuan8/Clinical-Longformer were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initial

In [68]:
count_trainable_parameters(auto_longformer)

148660994

In [30]:
count_trainable_parameters(auto_longformer.classifier)

592130

In [69]:
auto_longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [31]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type=None, target_modules=["query","value","key", 
                                                         "query_global", 
                                                         "value_global",
                                                         "key_global"] ,inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [33]:
peft_longformer = get_peft_model(longformer, peft_config)

In [16]:
peft_longformer

PeftModel(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [17]:
peft_longformer.print_trainable_parameters()

trainable params: 884,736 || all params: 149,545,730 || trainable%: 0.5916156883917716


In [24]:
count_trainable_parameters(peft_longformer.classifier)

0

In [34]:
# unfreeze classifier
unfreeze_model(peft_longformer.classifier)

In [35]:
count_trainable_parameters(peft_longformer.classifier)

592130

In [42]:
peft_longformer.print_trainable_parameters()

trainable params: 1,476,866 || all params: 149,545,730 || trainable%: 0.987568150558361


In [36]:
peft_longformer

PeftModel(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [43]:
peft_longformer.classifier

LongformerClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

when no task is specified peft library freezes all non-lora weights including the classifier etc

In [19]:
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
# create set of inputs to pass to transformer model
inputs = tokenizer(["Hello, my dog is cute", "Hello, my cat is cute"], return_tensors="pt", padding="max_length")

In [23]:
inputs

{'input_ids': tensor([[    0, 31414,     6,  ...,     1,     1,     1],
        [    0, 31414,     6,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [24]:
# pass to model
outputs = peft_longformer(**inputs)

In [15]:
# can we just load with seq task type? 

In [37]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type="SEQ_CLS", target_modules=["query","value","key", 
                                                         "query_global", 
                                                         "value_global",
                                                         "key_global"] ,inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [38]:
lora_long_auto = get_peft_model(auto_longformer, peft_config)

In [39]:
lora_long_auto

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LongformerForSequenceClassification(
      (longformer): LongformerModel(
        (embeddings): LongformerEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (position_embeddings): Embedding(4098, 768, padding_idx=1)
        )
        (encoder): LongformerEncoder(
          (layer): ModuleList(
            (0-11): 12 x LongformerLayer(
              (attention): LongformerAttention(
                (self): LongformerSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDic

In [40]:
lora_long_auto.print_trainable_parameters()

trainable params: 2,068,996 || all params: 150,137,860 || trainable%: 1.3780641338567101


In [44]:
lora_long_auto.classifier

ModulesToSaveWrapper(
  (original_module): LongformerClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=2, bias=True)
  )
  (modules_to_save): ModuleDict(
    (default): LongformerClassificationHead(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (out_proj): Linear(in_features=768, out_features=2, bias=True)
    )
  )
)

In [41]:
count_trainable_parameters(lora_long_auto.classifier)

1184260

In [23]:
768*768 + 768*2

591360

### 8 bit models

In [51]:
# can you do 8 bit roberta?
model_name_or_path = "ybelkada/falcon-7b-sharded-bf16" # | roberta-large | ybelkada/falcon-7b-sharded-bf16
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",)

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of the model checkpoint at ybelkada/falcon-7b-sharded-bf16 were not used when initializing RWForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing RWForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RWForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RWForSequenceClassification were not initialized from the model checkpoint at ybelkada/falcon-7b-sharded-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
model.dtype

torch.bfloat16

In [55]:
model.transformer.word_embeddings.weight.dtype

torch.bfloat16

In [57]:
model_prepared = prepare_model_for_kbit_training(model)

In [58]:
model_prepared.transformer.word_embeddings.weight.dtype

torch.float32

In [10]:
roberta_model_prepared.roberta.encoder.layer[0].attention.self.query.weight.dtype

torch.int8

In [18]:
roberta_model_prepared

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (key): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (value): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear8bitLt(in_features=768, out_features=768, 

In [3]:
# falcon 
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [8]:
tokenizer
if getattr(tokenizer, "pad_token_id") is None:
    print("no pad token")
    tokenizer.pad_token = tokenizer.eos_token

no pad token


In [21]:
falcon_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto", trust_remote_code = True)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of the model checkpoint at ybelkada/falcon-7b-sharded-bf16 were not used when initializing RWForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing RWForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RWForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RWForSequenceClassification were not initialized from the model checkpoint at ybelkada/falcon-7b-sharded-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
falcon_model

RWForSequenceClassification(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=4544, out_features=4672, bias=False)
          (dense): Linear8bitLt(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): MLP(
          (dense_h_to_4h): Linear8bitLt(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear8bitLt(in_features=18176, out_features=4544, bias=False)
        )
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=7, bias=False)
)

In [14]:
falcon_model.config.pad_token_id = tokenizer.pad_token_id

In [15]:
falcon_model.config.pad_token_id

11

In [4]:
# 8 bit llama
llama_model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=7, torch_dtype=torch.bfloat16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",
                                                                )

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of the model checkpoint at decapoda-research/llama-7b-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at decapoda-research/llama-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 16bit
# llama_model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=7, torch_dtype=torch.bfloat16)

In [21]:
llama_model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [6]:
llama_model.to(device)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [5]:
# roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", output_hidden_states=True)
# custom_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, output_hidden_states=True)

In [5]:
roberta_model.state_dict().keys()

odict_keys(['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.outp

In [8]:
for p in roberta_model.parameters():
    print(p.dtype)
    break

torch.float32


In [6]:
roberta_model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [10]:
custom_model.state_dict().keys()

odict_keys(['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.outp

In [7]:
custom_model.config

RobertaConfig {
  "_name_or_path": "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [7]:
for p in custom_model.parameters():
    print(p.dtype)
    break

torch.float32


In [17]:
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# custom_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
llama_tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [8]:
roberta_tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [9]:
custom_tokenizer

RobertaTokenizerFast(name_or_path='/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [4]:
# peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# lr = 3e-4

In [12]:
optimizer = AdamW(params=roberta_model.parameters(), lr=0.001)  

In [14]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [13]:
custom_optimizer = AdamW(params=custom_model.parameters(), lr=0.001)

In [15]:
custom_optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

## Function to load different peft setups

In [4]:
def setup_peft_model(model_name_or_path,                     
                     peft_method,
                     task_type,                     
                     device,
                     num_virtual_tokens= 20,
                     num_labels = 7):
    '''
    Function to setup the peft model for training and return a peft model based on the peft method specified.
    
    '''
    
    if peft_method == "LORA":
        loguru_logger.info("Using LORA")
        peft_type = PeftType.LORA
        lr = 3e-4
        peft_config = LoraConfig(task_type=task_type, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
    elif peft_method == "PREFIX_TUNING":
        loguru_logger.info("Using PREFIX_TUNING")
        peft_type = PeftType.PREFIX_TUNING
        peft_config = PrefixTuningConfig(task_type=task_type, num_virtual_tokens=20)
        lr = 1e-2
    elif peft_method == "PROMPT_TUNING":
        loguru_logger.info("Using PROMPT_TUNING")
        peft_type = PeftType.PROMPT_TUNING
        peft_config = PromptTuningConfig(task_type=task_type, num_virtual_tokens=10)
        lr = 1e-3
    elif peft_method == "P_TUNING":
        loguru_logger.info("Using P_TUNING")
        peft_type = PeftType.P_TUNING
        peft_config = PromptEncoderConfig(task_type=task_type, num_virtual_tokens=20, encoder_hidden_size=128)
        lr = 1e-3
        

    # load peft model
    if "llama" in model_name_or_path:
        loguru_logger.info("Loading LLAMA model in 8 bit")
        # model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,
        #                                                            torch_dtype=torch.bfloat16,
        #                                                            num_labels = num_labels,return_dict=True)
        # 8 bit
        model = LlamaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels, torch_dtype=torch.float16, 
                                                              load_in_8bit=True, output_hidden_states=False,                                                                
                                                                device_map="auto",
                                                                )
    else:
        
        model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels = num_labels,return_dict=True)
        model.to(device)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
   
    
    
    
    # setup optimizer and lr_scheduler
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
        num_training_steps=(len(train_dataloader) * num_epochs),
    )
    return model, peft_config, optimizer, lr_scheduler

### setup task and dataset

In [6]:
# create dictionary of various datasets and their sentence keys
task_to_keys ={
                "cola": ("sentence", None),
                "mnli": ("premise", "hypothesis"),
                "mnli-mm": ("premise", "hypothesis"),
                "mrpc": ("sentence1", "sentence2"),
                "qnli": ("question", "sentence"),
                "qqp": ("question1", "question2"),
                "rte": ("sentence1", "sentence2"),
                "sst2": ("sentence", None),
                "stsb": ("sentence1", "sentence2"),
                "wnli": ("sentence1", "sentence2"),
                "mimic-note-category": ("TEXT", None),
                "icd9-triage":("text", None),
                "icd9-triage-no-category-in-text":("text", None),
                "mednli":("sentence1", "sentence2")
                }

In [7]:
# task for now is icd9-triage
task = "mednli"

# task = "mrpc"

### load dataset 

In [14]:
with open('../datasets.yaml', 'r') as f:
    datasets = yaml.load(f, yaml.FullLoader)

try:
    dataset_info = datasets[task]

except KeyError:
    print(f"Task name {task} not in datasets.yaml. Available tasks are: {list(datasets.keys())}")
    exit(0)



In [15]:
dataset_info

{'training_data_dir': '',
 'eval_data_dir': '',
 'data_dir': '/mnt/sdd/efficient_ml_data/datasets/MedNLI',
 'training_file': '',
 'validation_file': '',
 'test_file': '',
 'task_type': 'SEQ_CLS',
 'label_name': '',
 'text_column': ['sentence1 sentence2'],
 'remove_columns': ['sentence1 sentence2']}

In [20]:
datasets = load_from_disk(dataset_info["data_dir"])

In [22]:
# # load dataset

# training_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/no_category_in_text/fewshot_64/"
# eval_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/icd9-triage/no_category_in_text/"
# datasets = load_dataset("csv", 
#                         data_files = {"train":f"{training_data_dir}/train.csv",
#                                         "validation":f"{eval_data_dir}/valid.csv",
#                                         "test":f"{eval_data_dir}/test.csv"},
#                         cache_dir = "/mnt/sdc/niallt/.cache/")

# loguru_logger.info(f"Number of training samples: {len(datasets['train'])}\n and validation samples:{len(datasets['validation'])}")

In [24]:
# get number of labels
num_labels = len(np.unique(datasets["train"]["labels"]))

In [25]:
num_labels

3

In [19]:
len(datasets["train"])/batch_size * num_epochs

140.0

In [15]:
num_training_steps

(140,)

### pre-process/encode dataset

In [26]:
sentence1_key, sentence2_key = task_to_keys[task]

In [29]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

if "llama" in model_name_or_path:
    tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


# set the sentence/task keys
# sentence1_key, sentence2_key = task_to_keys[task]

# for glue
# def tokenize_function(examples):
#     # max_length=None => use the model max length (it's actually the default)
#     outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=480)
#     return outputs

# own
def tokenize_function(examples):
    # max_length is important when using prompt tuning  or prefix tuning or p tuning as virtual tokens are added - which can overshoot the max length in pefts current form
    # for now set to 480 and see how it goes
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True, max_length = 480)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, max_length=480)

# own
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['sentence1', 'sentence2'], # remove_columns=['text', 'triage-category']
)

# for glue
# tokenized_datasets = datasets.map(
#     tokenize_function,
#     batched=True,
#     remove_columns=["idx", "sentence1", "sentence2"],
# )
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)

Loading cached processed dataset at /mnt/sdd/efficient_ml_data/datasets/MedNLI/train/cache-c9adf01eef7de685.arrow


Map:   0%|          | 0/1395 [00:00<?, ? examples/s]

Loading cached processed dataset at /mnt/sdd/efficient_ml_data/datasets/MedNLI/test/cache-1c451bd3938b5117.arrow


In [30]:
train_dataloader.dataset[0]

{'labels': 2,
 'input_ids': [0,
  574,
  10155,
  58,
  8091,
  13,
  5309,
  112,
  4,
  406,
  36,
  15609,
  7012,
  321,
  4,
  245,
  228,
  793,
  2189,
  43,
  8,
  36149,
  877,
  132,
  4,
  306,
  4,
  2,
  2,
  27690,
  34,
  10944,
  5309,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [33]:
# use tokenizer to decode a sample

tokenizer.decode(train_dataloader.dataset[0]["input_ids"])


'<s>Labs were notable for Cr 1.7 (baseline 0.5 per old records) and lactate 2.4.</s></s> Patient has elevated Cr</s>'

In [13]:
# run through dataloader and check that the labels are correct
for batch in train_dataloader:
    # print length of input ids
    print(len(batch["input_ids"][0]))

480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480
480


In [11]:
def compute_metrics(eval_pred):
        precision_score = evaluate.load("precision")
        recall_score = evaluate.load("recall")
        accuracy_score = evaluate.load("accuracy")
        f1_score = evaluate.load("f1")        
        roc_auc_score = evaluate.load("roc_auc", "multiclass")        

        logits, labels = eval_pred
        
        # print(f"logits are: {logits} of shape: {logits.shape}")
        #TODO add softmax to convert logits to probs
        # print(f"logits shape is: {logits.shape}")
        pred_scores = softmax(logits, axis = -1)        
        predictions = np.argmax(logits, axis = -1)
        
        # print(f"Labels are: {labels}\n")
        # print(f"Preds are: {predictions}")
        precision = precision_score.compute(predictions=predictions, references=labels, average = "macro")["precision"]
        recall = recall_score.compute(predictions=predictions, references=labels, average = "macro")["recall"]
        accuracy = accuracy_score.compute(predictions=predictions, references=labels)["accuracy"]
        f1_macro = f1_score.compute(predictions=predictions, references=labels, average = "macro")["f1"]
        f1_micro = f1_score.compute(predictions=predictions, references=labels, average = "micro")["f1"]
        f1_weighted = f1_score.compute(predictions=predictions, references=labels, average = "weighted")["f1"]
        # roc_auc has slightly different format - needs the probs/scores rather than predicted labels
        roc_auc = roc_auc_score.compute(references=labels,
                                        prediction_scores = pred_scores,
                                        multi_class = 'ovr', 
                                        average = "macro")['roc_auc']
        
        return {"precision": precision, 
                "recall": recall,
                "accuracy": accuracy,
                "f1_macro":f1_macro,
                "f1_micro":f1_micro,
                "f1_weighted":f1_weighted,
                "roc_auc_macro":roc_auc}

## Setup PEFT model

In [13]:
model, peft_config, optimizer, lr_scheduler = setup_peft_model(model_name_or_path, peft_method = "LORA", task_type = "SEQ_CLS", device = "cuda", num_labels = num_labels)

2023-07-07 15:54:44.832 | INFO     | __main__:setup_peft_model:13 - Using LORA


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of the model checkpoint at decapoda-research/llama-7b-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at decapoda-research/llama-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,251,648 || all params: 6,611,595,264 || trainable%: 0.06430593268692861


In [14]:
model.device

device(type='cuda', index=0)

In [None]:
# roberta_model, peft_config, optimizer, lr_scheduler = setup_peft_model("roberta-base", peft_method = "LORA", task_type = "SEQ_CLS", device = "cuda", num_labels = num_labels)

2023-05-26 13:53:51.951 | INFO     | __main__:setup_peft_model:13 - Using LORA
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at

trainable params: 1486862 || all params: 125541902 || trainable%: 1.1843551645409993


## debugging

In [14]:
import torchinfo
import torch.nn as nn

In [18]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(149774, device='cuda:0')


In [19]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in roberta_model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(150612, device='cuda:0')


In [21]:
# Count the number of zeros in each parameter tensor
num_zeros = 0
for param in declutr_model.parameters():
    num_zeros += torch.numel(param) - torch.count_nonzero(param)
print("Number of zeros:", num_zeros)

Number of zeros: tensor(149774, device='cuda:0')


In [15]:
torchinfo.summary(model)

Layer (type:depth-idx)                                                      Param #
PeftModelForSequenceClassification                                          --
├─LoraModel: 1-1                                                            --
│    └─RobertaForSequenceClassification: 2-1                                --
│    │    └─RobertaModel: 3-1                                               124,349,952
│    │    └─ModulesToSaveWrapper: 3-2                                       1,191,950
Total params: 125,541,902
Trainable params: 1,486,862
Non-trainable params: 124,055,040

In [None]:
====================================================================================================
Layer (type:depth-idx)                                                      Param #
====================================================================================================
PeftModelForSequenceClassification                                          --
├─LoraModel: 1-1                                                            --
│    └─RobertaForSequenceClassification: 2-1                                --
│    │    └─RobertaModel: 3-1                                               124,349,952
│    │    └─ModulesToSaveWrapper: 3-2                                       1,191,950
====================================================================================================
Total params: 125,541,902
Trainable params: 1,486,862
Non-trainable params: 124,055,040

In [None]:
trainable params: 1486862 || all params: 125541902 || trainable%: 1.1843551645409993

In [23]:
294912/1486862

0.1983452398406846

In [13]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path='roberta-base', task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)

## Train PEFT model

In [13]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=

#### debugging - custom roberta model leads to weird memory issues

In [23]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [14]:
for batch in eval_dataloader:
    
    with torch.no_grad():
        outputs = model(**batch.to(device))
    predictions = outputs.logits.argmax(dim=-1)
    predictions, references = predictions, batch["labels"]
    print(batch.keys())
    # print(f"outputs:{outputs}")
    print(f"predictions:{predictions}")
    print(f"references:{references}")
    
    break

dict_keys(['labels', 'input_ids', 'attention_mask'])
predictions:tensor([1, 5, 5, 4], device='cuda:0')
references:tensor([0, 1, 1, 0], device='cuda:0')


In [1]:
# roberta_model

In [16]:
roberta_outputs

SequenceClassifierOutput(loss=tensor(2.0110, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0555, -0.0210, -0.1128,  0.0344,  0.1020, -0.0508, -0.0510],
        [-0.0573, -0.0200, -0.1036,  0.0467,  0.1004, -0.0551, -0.0567],
        [-0.0556, -0.0172, -0.1156,  0.0369,  0.0998, -0.0592, -0.0500],
        [-0.0651, -0.0201, -0.1072,  0.0478,  0.0974, -0.0566, -0.0518]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [17]:
custom_outputs

SequenceClassifierOutput(loss=tensor(2.0342, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1482, -0.0124, -0.0138,  0.2247,  0.0693, -0.3117,  0.2074],
        [-0.0756,  0.0031, -0.0501,  0.1328,  0.0983, -0.2762,  0.2746],
        [-0.0222,  0.0371, -0.0096,  0.2350,  0.1343, -0.2727,  0.1959],
        [-0.0484, -0.0302, -0.0769,  0.1806, -0.0051, -0.2624,  0.1530]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=(tensor([[[ 0.1712, -0.0483, -0.0230,  ..., -0.0518,  0.0839,  0.0430],
         [ 0.0998, -0.1730, -0.0247,  ..., -0.1891, -0.0604,  0.0077],
         [-0.1805, -0.3721, -0.0325,  ..., -0.2030, -0.2124,  0.0519],
         ...,
         [-0.2720,  0.0421,  0.2390,  ..., -0.5429,  0.1871, -0.1404],
         [ 0.2573, -0.1542,  0.0187,  ..., -0.9629,  0.0768,  0.2467],
         [-0.0058, -0.2664, -0.1019,  ...,  0.3818,  0.1682,  0.1829]],

        [[ 0.1712, -0.0483, -0.0230,  ..., -0.0518,  0.0839,  0.0430],
         [ 0.0998, -0.1730, -0

In [23]:
# optimizer.param_groups

In [25]:
lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7ff934fa4f40>

In [26]:
# optimizer = AdamW(params=model.parameters(), lr=lr)

# # Instantiate scheduler
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer=optimizer,
#     num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
#     num_training_steps=(len(train_dataloader) * num_epochs),
# )

In [27]:
device

'cuda'

In [14]:
metric = evaluate.load("f1")

In [15]:
metric

EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
    pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
    average (`string`): This parameter is required for multiclass/multilabel t

## Train

In [16]:
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute(average = "macro")
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/224 [00:00<?, ?it/s]

100%|██████████| 224/224 [02:54<00:00,  1.28it/s]
100%|██████████| 1557/1557 [08:14<00:00,  3.15it/s]


epoch 0: {'f1': 0.09580728769629517}


100%|██████████| 224/224 [03:03<00:00,  1.22it/s]
 13%|█▎        | 205/1557 [01:08<07:33,  2.98it/s]

 13%|█▎        | 208/1557 [01:09<07:41,  2.92it/s]

#### save model


In [None]:
model.save_pretrained('./peft_model')

#### reload saved peft weights

In [4]:
peft_model_dir = "/mnt/sdc/niallt/saved_models/peft_training/ckpts/icd9-triage-no-category-in-text/fewshot_64/mimic-roberta-base/declutr/2_anch_2_pos_min_1024/LORA/23-06-2023--11-41/"


In [6]:
# load config
config = PeftConfig.from_pretrained(peft_model_dir)
# load base model 
model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels = 7)
# load peft model
model = PeftModel.from_pretrained(model, peft_model_dir)


Some weights of the model checkpoint at /mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/ were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-robe

In [7]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

# below is miscellanous code that is not used in the tutorial

In [15]:
import torchinfo 

Layer (type:depth-idx)                                                      Param #
PeftModelForSequenceClassification                                          --
├─LoraModel: 1-1                                                            --
│    └─RobertaForSequenceClassification: 2-1                                --
│    │    └─RobertaModel: 3-1                                               124,349,952
│    │    └─ModulesToSaveWrapper: 3-2                                       1,191,950
Total params: 125,541,902
Trainable params: 1,486,862
Non-trainable params: 124,055,040

In [None]:
# roberta model gpu usage
18777

In [None]:
# declutr model gpu usage
19905 - 19595

In [29]:
tokenizer.model_max_length

512

In [30]:
model.peft_config

{'default': PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, base_model_name_or_path='roberta-base', task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)}