In [None]:
from transformers import BertTokenizer, RobertaTokenizer
from model.engine import BertClassifier, RobertaClassifier
from model.bert import BertModel
from model.roberta import RobertaModel
from train import DoubleLoss
from transformers import AutoModel, BertModel

import torch
import torch.nn as nn
import torch.nn.functional as F


# Adapter-style tuning: only the adapter modules and the layer-normalization
# parameters are meant to be optimized. Handy snippet for listing their names:
#   trainable_keys = ["adapter", "LayerNorm"]
#   [n for n, p in model.named_parameters() if any(key in n for key in trainable_keys)]

# Tokenizer and classifier are both created from the 'roberta-base' checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaClassifier(
    model_path="roberta-base",
    num_labels=100,
    tokenizer="roberta-base",
)
# Plain Hugging Face backbone, kept for reference only:
# model_bert = AutoModel.from_pretrained("roberta-base")



In [2]:
from transformers import get_linear_schedule_with_warmup

# Fine-tuning regime; the parameter-grouping cell branches on
# 'adapter' versus 'full-finetuning'.
strategy: str = "adapter"

# Counter for the number of parameter elements handed to the optimizer;
# initialized to zero here.
trainable_params_count: int = 0

In [3]:
import torch


# Build the optimizer parameter groups for the selected fine-tuning `strategy`.
# Each group is a dict {"params": [...], "weight_decay": float}, i.e. the
# per-parameter-options format accepted by torch.optim optimizers.
# Parameters are selected by substring match on their name as reported by
# model.named_parameters(). Parameters that end up in no group are simply not
# passed to the optimizer; their requires_grad flag is not changed here.
if strategy == 'full-finetuning':
    # Every parameter is optimized; names containing "bias" or
    # "LayerNorm.weight" are exempt from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.1,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
elif strategy == 'adapter':
    # Only adapter projections, LayerNorm parameters and the `cls.*` head
    # entries listed below are optimized.
    no_decay = ["adapter.proj_up.bias", "adapter.proj_down.bias", "LayerNorm"]
    cls_bias = ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
    # Fixed typo: 'cls.predicions...' could never match a parameter name, so the
    # transform dense weight was silently left out while its bias was included.
    cls_weight = ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
    layers = ["adapter.proj_up.weight", "adapter.proj_down.weight"]
    layers.extend(cls_weight)
    no_decay.extend(cls_bias)

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in layers)],
            "weight_decay": 0.1,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
else:
    # Fail early with a clear message instead of handing an empty parameter
    # list to the optimizer further down.
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'full-finetuning' or 'adapter'."
    )

# Count the total number of parameter elements handed to the optimizer.
# Recomputed from scratch (not accumulated with +=) so that re-running this
# cell does not inflate the total.
trainable_params_count = sum(
    param.numel()
    for group in optimizer_grouped_parameters
    for param in group["params"]
)
print(f'Total Trainable params: {trainable_params_count}')

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-6)

# Linear schedule with no warmup steps.
# NOTE(review): num_training_steps is hard-coded to 32 — confirm it matches the
# real number of optimizer steps of the training run.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=32,
)
# Wrap the scheduler in a config dict.
# NOTE(review): the "interval"/"frequency" keys look like the PyTorch Lightning
# lr-scheduler config format — confirm against the consumer of `scheduler`.
scheduler = {"scheduler": lr_scheduler, "interval": "step", "frequency": 1}

Total Trainable params: 2417664


In [5]:
# Display the scheduler config wrapped in a single-element list. `scheduler` is
# the dict built in the optimizer cell with keys "scheduler", "interval" and
# "frequency".
# NOTE(review): the list-of-dicts shape resembles what PyTorch Lightning's
# configure_optimizers returns for schedulers — confirm that is the intended use.
[scheduler]

[{'scheduler': <torch.optim.lr_scheduler.LambdaLR at 0x7fb36780fc50>,
  'interval': 'step',
  'frequency': 1}]