<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/ALL_PEFT_LoRA_DoRA_PitVQA_Sentence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Download code
!git clone https://github.com/HRL-Mike/PitVQA.git

#Download Dataset
!mkdir /content/PitVQA/datasets
%cd /content/PitVQA/datasets
!gdown --id 1FoAEY_u0PTAlrscjEifi2om15A83wL78

# Unzipping the VQA EndoVis18 Dataset
!unzip -q EndoVis-18-VQA.zip
%cd /content/PitVQA

Cloning into 'PitVQA'...
remote: Enumerating objects: 401, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (138/138), done.[K
remote: Total 401 (delta 74), reused 0 (delta 0), pack-reused 262 (from 1)[K
Receiving objects: 100% (401/401), 14.44 MiB | 11.14 MiB/s, done.
Resolving deltas: 100% (199/199), done.
/content/PitVQA/datasets
Downloading...
From (original): https://drive.google.com/uc?id=1FoAEY_u0PTAlrscjEifi2om15A83wL78
From (redirected): https://drive.google.com/uc?id=1FoAEY_u0PTAlrscjEifi2om15A83wL78&confirm=t&uuid=f1e0204e-4c48-4cb4-865e-198ef8d1acea
To: /content/PitVQA/datasets/EndoVis-18-VQA.zip
100% 2.71G/2.71G [00:39<00:00, 68.2MB/s]
/content/PitVQA


In [2]:
!pip install -q timm==0.9.12 fairscale==0.4.13 scikit-learn==1.3.2 -U evaluate bert_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/60.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/266.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m266.2/266.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Dataloader

In [1]:
import os
import glob

from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from pathlib import Path
from torchvision.transforms.functional import InterpolationMode

class EndoVis18VQAGPTGen(Dataset):
    """Question/answer dataset over EndoVis-18 frames.

    Every non-blank line of every matched QA text file becomes one sample.
    A line is expected to have the form ``question|answer``.

    Args:
        seq: iterable of sequence identifiers; each one is converted with
            ``str`` and placed between ``folder_head`` and ``folder_tail``.
        folder_head: path prefix that precedes the sequence identifier.
        folder_tail: glob pattern suffix that follows the sequence identifier.

    Each item is a tuple ``(img_loc, img, question, answer)`` where ``img`` is
    the frame after resizing to 224x224 (bicubic) and ``ToTensor``.
    """

    def __init__(self, seq, folder_head, folder_tail):

        self.transform = transforms.Compose([
            transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC),  # input image size
            transforms.ToTensor(),
        ])

        # Collect the QA files of all requested sequences.
        filenames = []
        for curr_seq in seq:
            filenames.extend(glob.glob(folder_head + str(curr_seq) + folder_tail))

        # One [qa_file_path, "question|answer"] entry per non-blank line.
        self.vqas = []
        for file in filenames:
            # `with` guarantees the handle is closed even if reading fails.
            with open(file, "r") as file_data:
                for line in file_data:
                    line = line.strip("\n")
                    # Skip empty and whitespace-only lines: they carry no
                    # question/answer pair and would break the split in
                    # __getitem__.
                    if line.strip():
                        self.vqas.append([file, line])
        print('Total files: %d | Total question: %d' % (len(filenames), len(self.vqas)))

        # Labels
        self.labels = ['kidney',
                'Idle', 'Grasping', 'Retraction', 'Tissue_Manipulation',
                'Tool_Manipulation', 'Cutting', 'Cauterization', 'Suction',
                'Looping', 'Suturing', 'Clipping', 'Staple', 'Ultrasound_Sensing',
                'left-top', 'right-top', 'left-bottom', 'right-bottom']

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.vqas)

    def __getitem__(self, idx):
        """Load the frame and the question/answer pair of sample ``idx``.

        Returns:
            ``(img_loc, img, question, answer)``.

        Raises:
            ValueError: if the stored line does not contain exactly one ``|``.
        """
        qa_full_path = Path(self.vqas[idx][0])
        # The sequence folder is three directory levels above the QA file.
        seq_path = qa_full_path.parents[2]
        # Path.name is separator-agnostic (works with / and \\ alike).
        file_name = qa_full_path.name

        # img: <sequence folder>/left_fr/<QA file name up to the first '_'>.png
        img_loc = os.path.join(seq_path, 'left_fr', file_name.split('_')[0] + '.png')
        raw_image = Image.open(img_loc).convert('RGB')
        img = self.transform(raw_image)

        # question and answer
        question, answer = self.vqas[idx][1].split('|')

        return img_loc, img, question, answer

### Model

In [2]:
import torch
from torch import nn

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import ViTModel, BlipConfig, BlipTextModel

from peft import get_peft_model

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


class BLIPGPTVQAGen(nn.Module):
    """ViT image encoder + BLIP text encoder + GPT-2 decoder for generative VQA.

    Args:
        peft_config: optional PEFT configuration. When given, the GPT-2 decoder
            is wrapped with ``get_peft_model`` and the trainable-parameter
            summary is printed. When ``None``, the plain pretrained GPT-2 is
            used without any adapter.
    """

    def __init__(self, peft_config=None):
        super(BLIPGPTVQAGen, self).__init__()

        # gpt2 decoder
        gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        if peft_config is None:
            # No adapter requested: keep the unwrapped decoder instead of
            # handing `None` to get_peft_model.
            self.gpt = gpt
        else:
            self.gpt = get_peft_model(gpt, peft_config)
            self.gpt.print_trainable_parameters()  # report trainable adapter parameters

        # visual encoder
        model_name = "google/vit-base-patch16-224-in21k"
        self.visual_encoder = ViTModel.from_pretrained(model_name)

        # tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS

        # text encoder: built from the BLIP text *config* only (constructor,
        # not from_pretrained), so no pretrained BLIP weights are loaded here.
        config = BlipConfig.from_pretrained("Salesforce/blip-vqa-base")
        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        # Replace the word-embedding table so its size matches the GPT-2
        # tokenizer vocabulary. The new table is freshly initialised with
        # nn.Embedding's default initialisation.
        new_vocab_size = len(self.tokenizer)
        embedding_dim = self.text_encoder.embeddings.word_embeddings.embedding_dim
        self.text_encoder.embeddings.word_embeddings = nn.Embedding(new_vocab_size, embedding_dim)

    def _encode_with_image(self, text_inputs, image_embeds, image_atts):
        """Run the text encoder on tokenised text, cross-attending to the image features."""
        output = self.text_encoder(input_ids=text_inputs['input_ids'],
                                   attention_mask=text_inputs['attention_mask'],
                                   encoder_hidden_states=image_embeds,
                                   encoder_attention_mask=image_atts,
                                   return_dict=True)
        return output.last_hidden_state

    def forward(self, image, question_inputs, answer_inputs=None):
        """Return the GPT-2 logits for the image-conditioned question (and answer).

        Args:
            image: batch of images; moved to the module-level ``device``.
            question_inputs: mapping with ``'input_ids'`` and
                ``'attention_mask'`` for the question.
            answer_inputs: same structure for the answer. When ``None``, only
                the question embeddings are passed to the decoder.

        Returns:
            ``logits`` of the GPT-2 output.
        """
        # visual encoder
        image = image.to(device)
        image_embeds = self.visual_encoder(image).last_hidden_state  # torch.Size([bs, 197, 768])
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)  # torch.Size([bs, 197])

        question_att_mask = question_inputs['attention_mask']

        # multimodal encoder (question)
        img_question_embeds = self._encode_with_image(question_inputs, image_embeds, image_atts)  # torch.Size([bs, 25, 768])

        if answer_inputs is None:
            inputs_embeds_qa = img_question_embeds
        else:
            # multimodal encoder (answer), then question and answer are
            # concatenated along the sequence dimension
            img_answer_embeds = self._encode_with_image(answer_inputs, image_embeds, image_atts)  # torch.Size([bs, 25, 768])
            inputs_embeds_qa = torch.cat((img_question_embeds, img_answer_embeds), dim=1)

        # text decoder
        # NOTE(review): `encoder_attention_mask` appears to be used by GPT-2
        # only for cross-attention; without cross-attention, padded positions
        # are probably not masked here. Passing `attention_mask` may be what
        # was intended - confirm before changing, since it alters outputs.
        gpt_output = self.gpt(inputs_embeds=inputs_embeds_qa,
                              encoder_attention_mask=question_att_mask)
        return gpt_output.logits

# GPT2 LoRA from PEFT

In [8]:
from peft import  TaskType, LoraConfig

# Standard LoRA: rank-8 adapters (alpha 16, dropout 0.1) on the `c_attn`
# modules of GPT-2.
# Alternative target set kept for reference: ["c_attn", "c_proj"].
lora_config = LoraConfig(
    target_modules=['c_attn'],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
)

# Build the VQA model around the LoRA-wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=lora_config)



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

# GPT2 DoRA from PEFT

In [9]:
from peft import  TaskType, LoraConfig

# DoRA: same LoRA hyper-parameters as the plain setup, with `use_dora=True`
# switching the adapters from standard LoRA to DoRA.
# Alternative target set kept for reference: ["c_attn", "c_proj"].
dora_config = LoraConfig(
    use_dora=True,
    target_modules=['c_attn'],
    r=8,               # adapter rank
    lora_alpha=16,     # scaling factor
    lora_dropout=0.1,  # regularisation
    task_type=TaskType.CAUSAL_LM,  # GPT-2 is a causal LM
)

# Build the VQA model around the DoRA-wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=dora_config)

# GPT2 AdaLoRA from PEFT

In [10]:
from peft import TaskType, LoraConfig, AdaLoraConfig

# Configure AdaLoRA adapters.
# AdaLoraConfig takes its rank budget from `init_r` (starting rank) and
# `target_r` (target average rank); the generic LoRA `r` is not used by
# AdaLoRA. The ranks are therefore spelled out explicitly - these are the PEFT
# defaults, i.e. what the earlier `r=8` configuration effectively ran with.
adalora_config = AdaLoraConfig(
    task_type=TaskType.CAUSAL_LM,
    init_r=12,  # initial rank of every incremental matrix
    target_r=8,  # target average rank after budget allocation
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    # NOTE(review): in GPT-2 `c_proj` presumably matches both the attention and
    # the MLP output projections, so this is wider than "attention only".
    target_modules=["c_attn", "c_proj"],
    beta1=0.85,  # EMA coefficient for sensitivity smoothing
    beta2=0.85,  # EMA coefficient for uncertainty quantification
    orth_reg_weight=0.5  # Orthogonality regularization weight
)

# NOTE(review): recent PEFT releases appear to require `total_step` (> 0) in
# AdaLoraConfig, and rank re-allocation only happens when the training loop
# calls `update_and_allocate` - confirm against the installed PEFT version.
model = BLIPGPTVQAGen(peft_config=adalora_config)



# GPT2 VeLoRA (approximated with per-module LoRA ranks) from PEFT

In [11]:
from peft import TaskType, LoraConfig, AdaLoraConfig

# "VeLoRA" stand-in: this is a regular LoraConfig whose `rank_pattern`
# overrides the default rank (16) per module name - 16 for `c_attn`, 8 for
# `c_proj`. No dedicated VeLoRA implementation is involved.
velora_config = LoraConfig(
    rank_pattern={'c_attn': 16, 'c_proj': 8},  # per-module rank overrides
    target_modules=['c_attn', 'c_proj'],
    r=16,              # default rank
    lora_alpha=32,     # scaling factor
    lora_dropout=0.1,  # dropout rate
    task_type=TaskType.CAUSAL_LM,
)

# Build the VQA model around the wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=velora_config)



# GPT2 RSLoRA from PEFT

In [12]:
from peft import TaskType, LoraConfig

# Rank-stabilised LoRA: identical hyper-parameters to the plain LoRA setup,
# with `use_rslora=True` selecting the rank-stabilised variant (PEFT documents
# this as scaling by lora_alpha / sqrt(r)).
rslora_config = LoraConfig(
    use_rslora=True,
    target_modules=['c_attn'],  # modules that receive adapters
    r=8,                        # adapter rank
    lora_alpha=16,              # scaling factor
    lora_dropout=0.1,           # dropout inside the adapter layers
    task_type=TaskType.CAUSAL_LM,
)

# Build the VQA model around the wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=rslora_config)


# Explained Variance Adaptation (EVA) PEFT

Paper: https://arxiv.org/pdf/2410.07170

In [15]:
from peft import TaskType, LoraConfig, EvaConfig

# EVA is a data-driven initialisation for LoRA. PEFT only honours `eva_config`
# when `init_lora_weights="eva"` is set; without it the EvaConfig below is
# ignored and the result is a plain LoRA setup.
eva_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Suitable for LLMs like GPT-style models
    r=8,  # Initial rank for LoRA adaptation
    lora_alpha=16,  # Scaling factor for LoRA weights
    lora_dropout=0.1,  # Dropout to regularize LoRA layers
    target_modules=["c_attn"],  # Apply EVA-based LoRA to the `c_attn` modules
    init_lora_weights="eva",  # activate EVA so that `eva_config` takes effect
    eva_config=EvaConfig(
        rho=2.0,  # Redistribution factor for adaptive rank allocation (max rank = 2r)
        tau=0.99,  # Cosine similarity threshold for early stopping in SVD iterations
        use_label_mask=True,  # Enables label-based masking for better adaptation in structured data
        label_mask_value=-100,  # Masking value for ignored tokens
        whiten=False,  # No whitening applied to singular vectors
        adjust_scaling_factors=True  # Normalizes LoRA gradients after rank redistribution
    )
)

# NOTE(review): the EVA initialisation itself is computed from data by PEFT's
# `initialize_lora_eva_weights(peft_model, dataloader)`. It still has to be
# called on the wrapped decoder (`model.gpt`) before training; because the
# decoder is fed `inputs_embeds`, a custom forward function is probably needed
# - confirm against the PEFT EVA documentation.
model = BLIPGPTVQAGen(peft_config=eva_config)




# LoftQ PEFT

LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models: https://arxiv.org/abs/2310.08659

In [22]:
from peft import TaskType, LoraConfig, LoftQConfig

# LoRA configuration carrying LoftQ settings (4 bits, 3 alternating iterations).
# NOTE(review): PEFT appears to apply `loftq_config` only when
# `init_lora_weights="loftq"` is also set; as written this presumably behaves
# like plain LoRA. Enabling LoftQ rewrites the base weights, so verify that it
# is compatible with the GPT-2 layers before switching it on.
loftq_config = LoraConfig(
    loftq_config=LoftQConfig(
        loftq_iter=3,  # number of alternating optimisation iterations
        loftq_bits=4,  # quantisation bit width
    ),
    target_modules=['c_attn'],  # modules that receive adapters
    r=8,                        # LoRA rank
    lora_alpha=16,              # LoRA scaling factor
    lora_dropout=0.1,           # LoRA dropout rate
    task_type=TaskType.CAUSAL_LM,
)

# Build the VQA model around the wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=loftq_config)




# MoRA PEFT

In [24]:
# Fetch the MoRA repository, which ships its own copy of PEFT in `peft-mora`.
!git clone https://github.com/kongds/MoRA.git

# The working directory changes to the MoRA checkout and is not changed back
# afterwards.
%cd MoRA
# Editable install of the bundled fork. The package it builds is named `peft`,
# so it takes the place of the stock PEFT installation.
# NOTE(review): a runtime restart is presumably needed before `import peft`
# picks up the fork, and the fork may not provide the newer PEFT features used
# by the earlier cells - confirm.
!pip -q install -e ./peft-mora

Cloning into 'MoRA'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 176 (delta 57), reused 136 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (176/176), 257.45 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (57/57), done.
/content/PitVQA/MoRA
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
  Building editable for peft (pyproject.toml) ... [?25l[?25hdone


In [3]:
import torch
from peft import TaskType, LoraConfig

# MoRA adapters via the `use_mora` / `mora_type` options of LoraConfig.
# NOTE(review): these options are presumably provided by the MoRA fork of PEFT
# rather than stock PEFT, and `mora_type=6` is presumably the RoPE-based
# variant described in the MoRA README - confirm.
mora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['c_attn'],
    lora_dropout=0.1,
    r=8,
    mora_type=6,
    use_mora=True,
)

# Build the VQA model around the MoRA-wrapped decoder (rebinds `model`).
model = BLIPGPTVQAGen(peft_config=mora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 292,032 || all params: 124,731,840 || trainable%: 0.23412786983660308
