In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! mkdir data

In [3]:
! cp '/content/drive/MyDrive/image_text_dataset.zip' .

In [4]:
! unzip --qq image_text_dataset.zip -d data

In [5]:
#!  pip install torch torchvision transformers peft timm accelerate bitsandbytes
! pip install transformers torch torchvision bitsandbytes peft timm accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)
#import bitsandbytes
#from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, BitsAndBytesConfig
import timm
from tqdm import tqdm
import os

# 1. Dataset Definition
class ImageTextQADataset(Dataset):
    def __init__(self, data, image_dir, image_transform, text_tokenizer, max_length=128):
        """
        Args:
            data (list): List of dictionaries, each with 'image_path', 'question', 'answer' keys.
            image_transform (callable): Image transformation.
            text_tokenizer (callable): Text tokenizer.
            max_length (int): Max sequence length.
        """
        self.data = data
        self.image_dir = image_dir
        self.image_transform = image_transform
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = os.path.join(self.image_dir, item['image'])
        question = item['question']
        answer = item['answer']

        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)

        # Combine question and answer for training (SFT)
        prompt = f"Question: {question}\nAnswer: {answer}{self.text_tokenizer.eos_token}"

        encoded = self.text_tokenizer(prompt,
                                      max_length=self.max_length,
                                      padding="max_length",
                                      truncation=True,
                                      return_tensors="pt")

        return {
            "image": image,
            "input_ids": encoded["input_ids"].squeeze(),
            "attention_mask": encoded["attention_mask"].squeeze(),
        }

In [7]:
from torchvision import transforms

image_transform = transforms.Compose([
      transforms.Resize((224, 224)),  # Adjust size as needed
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

In [8]:
# 2. Model Definitions
class SigLIPImageEncoder(torch.nn.Module):
    def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg') # pretrained=False
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(self.model.num_features, embed_dim)

        if pretrained_path:
            self.load_state_dict(torch.load(pretrained_path))
            print(f"Loaded SigLIP image encoder from {pretrained_path}")
        else:
            print("Initialized SigLIP image encoder without pretrained weights.")

    def forward(self, image):
        features = self.model(image)
        embedding = self.projection(features)
        return embedding



In [9]:
class Phi3WithImage(torch.nn.Module):
    def __init__(self, phi3_model_name, image_encoder, image_embed_dim=512, bnb_config=None):
        super().__init__()
        self.phi3 = AutoModelForCausalLM.from_pretrained(phi3_model_name,
                                                         torch_dtype=torch.bfloat16,
                                                         device_map="auto",
                                                         trust_remote_code=True, # Important for some Phi-3 variants
                                                         quantization_config=bnb_config  # Use BitsAndBytesConfig here
                                                         ) # Important for some Phi-3 variants
        self.image_encoder = image_encoder
        self.image_embed_dim = image_embed_dim
        self.phi3_embed_dim = self.phi3.config.hidden_size

        # Project image embeddings to Phi-3's embedding space
        self.image_projection = torch.nn.Linear(image_embed_dim, self.phi3_embed_dim)

    def forward(self, image, input_ids, attention_mask):
        image_embeddings = self.image_encoder(image)
        projected_image_embeddings = self.image_projection(image_embeddings)

        # Concatenate image embeddings to the input sequence
        #  This is a simplified approach.  More sophisticated methods exist.
        #  Assumes image embeddings are prepended to the sequence.
        #  You might need to adjust this based on your specific setup.

        #  Reshape image embeddings to (batch_size, 1, phi3_embed_dim)
        projected_image_embeddings = projected_image_embeddings.unsqueeze(1)

        #  Concatenate along the sequence dimension (dim=1)
        extended_attention_mask = torch.cat([torch.ones(projected_image_embeddings.shape[:2], device=attention_mask.device), attention_mask], dim=1)
        extended_input_ids = torch.cat([torch.zeros(projected_image_embeddings.shape[:2], dtype=torch.long, device=input_ids.device), input_ids], dim=1)

        # Replace the first 'image_embed_dim' tokens with the image embeddings
        # This is a very basic way to inject the image information
        # A more sophisticated approach would involve a cross-attention mechanism
        # or a learned positional embedding for the image tokens.
        # extended_input_ids[:, :self.image_embed_dim] = projected_image_embeddings.reshape(projected_image_embeddings.shape[0], -1)

        #  Pass the concatenated input to Phi-3
        outputs = self.phi3(input_ids=extended_input_ids, attention_mask=extended_attention_mask, labels=extended_input_ids) # Labels for SFT

        return outputs



In [10]:
# 3. QLoRA Configuration
import pandas as pd
def create_lora_config(r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", target_modules=["o_proj", "qkv_proj","gate_up_proj", "down_proj"]):
    """Creates a LoRA configuration for QLoRA."""
    config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias=bias,
        task_type=task_type,
        target_modules=target_modules
    )
    return config

# 4. Data Loading and Preprocessing
def load_and_preprocess_data(data_path, image_dir, image_transform, text_tokenizer, max_length):
    """Loads and preprocesses the data."""
    import json
    df = pd.read_csv(data_path)
    data = df.to_dict('records')  # List of dictionaries

    dataset = ImageTextQADataset(data, image_dir, image_transform, text_tokenizer, max_length)
    return dataset


In [11]:
# 5. Training Loop
def train(model, dataloader, optimizer, device, num_epochs=1, save_path="qlora_phi3_model"):
    """Trains the model."""
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            image = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            optimizer.zero_grad()
            outputs = model(image, input_ids, attention_mask)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss}")

    # Save the LoRA adapter weights
    model.phi3.save_pretrained(save_path)
    print(f"LoRA adapter weights saved to {save_path}")


In [12]:
# 6. Main Execution
# Configuration
phi3_model_name = "microsoft/Phi-3-mini-4k-instruct"  # Or your specific Phi-3 variant
image_model_name = 'resnet50'
image_embed_dim = 512
max_length = 512
batch_size = 4
num_epochs = 5
data_path = "/content/data/image_text_dataset.csv"  # Path to your data file
image_dir = "/content/data"  # Path to your image directory
save_path = "/content/drive/MyDrive/qlora_phi3_model"
siglip_pretrained_path = "/content/drive/MyDrive/image_encoder.pth" # Path to your pretrained SigLIP model

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 7. BitsAndBytesConfig for QLoRA
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
        )

# 8. Load Tokenizer
text_tokenizer = AutoTokenizer.from_pretrained(phi3_model_name, trust_remote_code=True)
text_tokenizer.pad_token = text_tokenizer.eos_token # Important for training

# 9. Image Transformations
from torchvision import transforms
image_transform = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 10. Load and Preprocess Data
dataset = load_and_preprocess_data(data_path, image_dir, image_transform, text_tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 11. Load Models
image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim, pretrained_path=siglip_pretrained_path).to(device)
model = Phi3WithImage(phi3_model_name, image_encoder, image_embed_dim,  bnb_config=bnb_config).to(device)

# 12. Prepare Model for QLoRA
model.phi3 = prepare_model_for_kbit_training(model.phi3)

# 13. Create LoRA Config
lora_config = create_lora_config()

# 14. Get PEFT Model
model.phi3 = get_peft_model(model.phi3, lora_config)
#model.print_trainable_parameters()
print(model.phi3)

# 15. Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# 16. Train
train(model, dataloader, optimizer, device, num_epochs, save_path)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Loaded SigLIP image encoder from /content/drive/MyDrive/image_encoder.pth


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitud

  return fn(*args, **kwargs)
Epoch 1/5: 100%|██████████| 64/64 [03:28<00:00,  3.27s/it]


Epoch 1/5, Average Loss: 1.4678252509329468


Epoch 2/5: 100%|██████████| 64/64 [03:28<00:00,  3.25s/it]


Epoch 2/5, Average Loss: 0.16581910429522395


Epoch 3/5: 100%|██████████| 64/64 [03:28<00:00,  3.25s/it]


Epoch 3/5, Average Loss: 0.11800261272583157


Epoch 4/5: 100%|██████████| 64/64 [03:28<00:00,  3.25s/it]


Epoch 4/5, Average Loss: 0.09110973973292857


Epoch 5/5: 100%|██████████| 64/64 [03:28<00:00,  3.25s/it]


Epoch 5/5, Average Loss: 0.07081384444609284
LoRA adapter weights saved to /content/drive/MyDrive/qlora_phi3_model
