In [1]:
from peft import (LoraConfig, PeftType, PrefixTuningConfig,
                  PromptEncoderConfig, PromptTuningConfig, TaskType,
                  get_peft_config, get_peft_model, get_peft_model_state_dict,
                  prepare_model_for_int8_training,
                  prepare_model_for_kbit_training, set_peft_model_state_dict)
from scipy.special import softmax
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (AutoModelForSequenceClassification,
                          AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForTokenClassification,
                          LlamaForSequenceClassification, LlamaTokenizer,
                          Trainer, TrainingArguments,
                          get_linear_schedule_with_warmup, set_seed)
from model_utils import count_trainable_parameters
import pandas as pd
from tqdm import tqdm
import yaml
# import sys and append path
import sys
sys.path.append("../")
from peft_trainer import create_peft_config

  warn(f"Failed to load image Python extension: {e}")


In [3]:
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at meta-llama/Llama-2-7b-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (nor

In [43]:


model_type_mappings = {"vanilla": "roberta-base",
               "mobile": "nlpie/bio-mobilebert",
               "distil": "nlpie/distil-biobert",
               "tiny": "nlpie/tiny-biobert",
            #    "llama-7b": "meta-llama/Llama-2-7b-hf",
               }

peft_types = ["PROMPT_TUNING","LORA", "PREFIX_TUNING", "P_TUNING"]

def get_number_of_trainable_params(model_type_mappings:dict,
                                   peft_types:list,
                                   task_type:str = "SEQ_CLS",
                                   num_labels:int = 2):

    # set up empty dicts to full for dfs
    model_peft_dict = {}
    
    for model_type in model_type_mappings.keys():
        
        model_dict = {}
        model_name_or_path = model_type_mappings[model_type]
        model_args = dict(pretrained_model_name_or_path=model_name_or_path, 
                          num_labels=num_labels, 
                          output_hidden_states=False, 
                          trust_remote_code=True)

            
        if task_type == "SEQ_CLS":
            model = AutoModelForSequenceClassification.from_pretrained(**model_args)
        elif task_type == "TOKEN_CLS":
            model = AutoModelForTokenClassification.from_pretrained(**model_args)
        
        # falcon model seems to use model config to define pad token and the remote code panicks if you don't set it
        if "falcon" in model_name_or_path:
            model.config.use_cache = False            

        # count total trainable params before peft
        total_trainable_params = count_trainable_parameters(model)
        
        for peft_method in tqdm(peft_types, desc=f"model type: {model_type}"):
            
            
            # set up some PEFT params
            peft_config, lr = create_peft_config(peft_method, model_name_or_path,task_type)
            model = get_peft_model(model, peft_config)
            print(f"peft config is: {peft_config}")
            # print(model)
            model.print_trainable_parameters()
            
            # lets also confirm this directly and save to args
            n_trainable_params = count_trainable_parameters(model)
            # proportion of total trainable params
            n_trainable_params_perc = (n_trainable_params / total_trainable_params) * 100
            
            # store the model name, peft method and number of trainable params
            model_dict[peft_method] = {"n_trainable_params": [n_trainable_params],
                                 "total_trainable_params": [total_trainable_params],
                                 "n_trainable_params_perc": [n_trainable_params_perc]}
            
        model_peft_dict[model_type] = model_dict

    return model_peft_dict

In [44]:
all_df = get_number_of_trainable_params(model_type_mappings, peft_types)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should pr

peft config is: PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path='roberta-base', revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)
trainable params: 1,191,940 || all params: 125,246,980 || trainable%: 0.9516716490888643


model type: vanilla:  50%|█████     | 2/4 [00:00<00:00,  7.00it/s]2023-08-16 14:12:11.583 | INFO     | peft_trainer:create_peft_config:641 - Using PREFIX_TUNING
2023-08-16 14:12:11.615 | INFO     | peft_trainer:create_peft_config:661 - Using P_TUNING
model type: vanilla: 100%|██████████| 4/4 [00:00<00:00, 11.35it/s]

peft config is: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 294,912 || all params: 125,541,892 || trainable%: 0.23491122787921662
peft config is: PrefixTuningConfig(peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=20, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=12, encoder_hidden_size=768, prefix_projection=False)
trainable params: 368,640 || all params: 125,902,852 || trainable%: 0.2927971798446631
peft config is: PromptEncoderConfig(peft_type=<PeftType.P_TUNING: 'P_TUNING'>, auto_mapping=None, base_


Some weights of the model checkpoint at nlpie/bio-mobilebert were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MobileBertForSequenceClassification were not initiali

peft config is: PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path='nlpie/bio-mobilebert', revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=128, num_transformer_submodules=1, num_attention_heads=4, num_layers=24, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)
trainable params: 3,332 || all params: 24,585,220 || trainable%: 0.013552858180646747
peft config is: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'key', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 221,184 || all params: 24,806,404 || trainable%: 0.8916407230971486


2023-08-16 14:12:14.097 | INFO     | peft_trainer:create_peft_config:661 - Using P_TUNING
model type: mobile: 100%|██████████| 4/4 [00:00<00:00, 11.42it/s]


peft config is: PrefixTuningConfig(peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=20, token_dim=512, num_transformer_submodules=1, num_attention_heads=4, num_layers=24, encoder_hidden_size=512, prefix_projection=False)
trainable params: 491,520 || all params: 25,296,644 || trainable%: 1.9430245371678552
peft config is: PromptEncoderConfig(peft_type=<PeftType.P_TUNING: 'P_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=20, token_dim=512, num_transformer_submodules=1, num_attention_heads=4, num_layers=24, encoder_reparameterization_type=<PromptEncoderReparameterizationType.MLP: 'MLP'>, encoder_hidden_size=128, encoder_num_layers=2, encoder_dropout=0.0)
trainable params: 158,464 || all params: 24,963,588 || trainable%: 0.6347805451684269


Some weights of the model checkpoint at nlpie/distil-biobert were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpie/distil-biobert and

peft config is: PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path='nlpie/distil-biobert', revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=768, num_transformer_submodules=1, num_attention_heads=12, num_layers=6, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)
trainable params: 10,756 || all params: 65,793,796 || trainable%: 0.016348045946459753
peft config is: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 147,456 || all params: 65,941,252 || trainable%: 0.22361722825644864
peft config is: Prefix

Some weights of the model checkpoint at nlpie/tiny-biobert were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpie/tiny-biobert and are

peft config is: PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path='nlpie/tiny-biobert', revision=None, task_type='SEQ_CLS', inference_mode=False, num_virtual_tokens=10, token_dim=312, num_transformer_submodules=1, num_attention_heads=12, num_layers=4, prompt_tuning_init=<PromptTuningInit.RANDOM: 'RANDOM'>, prompt_tuning_init_text=None, tokenizer_name_or_path=None)
trainable params: 4,372 || all params: 13,878,508 || trainable%: 0.03150194530997136
peft config is: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules=['query', 'value'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 39,936 || all params: 13,918,444 || trainable%: 0.2869286250675722
peft config is: PrefixTuning




In [45]:
all_df

{'vanilla': {'PROMPT_TUNING': {'n_trainable_params': [1191940],
   'total_trainable_params': [124647170],
   'n_trainable_params_perc': [0.956251152753809]},
  'LORA': {'n_trainable_params': [294912],
   'total_trainable_params': [124647170],
   'n_trainable_params_perc': [0.23659742936803138]},
  'PREFIX_TUNING': {'n_trainable_params': [368640],
   'total_trainable_params': [124647170],
   'n_trainable_params_perc': [0.29574678671003923]},
  'P_TUNING': {'n_trainable_params': [229376],
   'total_trainable_params': [124647170],
   'n_trainable_params_perc': [0.1840202228418022]}},
 'mobile': {'PROMPT_TUNING': {'n_trainable_params': [3332],
   'total_trainable_params': [24582914],
   'n_trainable_params_perc': [0.013554129506371783]},
  'LORA': {'n_trainable_params': [221184],
   'total_trainable_params': [24582914],
   'n_trainable_params_perc': [0.8997468729703891]},
  'PREFIX_TUNING': {'n_trainable_params': [491520],
   'total_trainable_params': [24582914],
   'n_trainable_params_per

In [46]:
with open("../trainable_params.yaml", 'r') as f:
    reloaded = yaml.load(f, yaml.FullLoader)

In [48]:
reloaded["distil"]

{'LORA': {'n_trainable_params': [147456],
  'n_trainable_params_perc': [0.2241497999728751],
  'total_trainable_params': [65784578]},
 'PREFIX_TUNING': {'n_trainable_params': [184320],
  'n_trainable_params_perc': [0.2801872499660939],
  'total_trainable_params': [65784578]},
 'PROMPT_TUNING': {'n_trainable_params': [10756],
  'n_trainable_params_perc': [0.016350336700495363],
  'total_trainable_params': [65784578]},
 'P_TUNING': {'n_trainable_params': [229376],
  'n_trainable_params_perc': [0.34867746662447235],
  'total_trainable_params': [65784578]}}