Set Up the Image and Text Encoder

In [None]:
import os
from io import BytesIO

import requests
import torch
from datasets import load_dataset
from huggingface_hub import login
from PIL import Image
from transformers import AutoModel, AutoTokenizer, CLIPProcessor, CLIPModel, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so that no credential is stored in the notebook; when
# the variable is unset, login() receives None and falls back to its prompt.
login(token=os.environ.get("HF_TOKEN"))

# Load the Wave-UI-25k dataset
dataset = load_dataset("miketes/Web-filtered-english-wave-ui-25k")

# Load the LLaMA 3.1 tokenizer and model.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
# The preprocessing step pads with padding="max_length", which requires a pad
# token; reuse EOS when the checkpoint does not define one.
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token
# The causal-LM variant is loaded (rather than the bare AutoModel backbone) so
# that the model produces vocabulary logits a training loss can be computed from.
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Load CLIP (or another image encoder) for converting images to embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the bounding box embedding layer: maps 4 box values to the LLaMA hidden size.
# torch.nn is referenced through `torch` because `nn` is only imported in a later cell.
bbox_embedding_layer = torch.nn.Linear(4, llama_model.config.hidden_size)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Mike\.cache\huggingface\token
Login successful


KeyboardInterrupt: 

Preprocess Data

In [None]:
def preprocess_function(example):
    """Convert one dataset row into CLIP image features and tokenized text.

    Args:
        example: A mapping with the keys "image", "instruction" and
            "description". "image" may be an already-decoded PIL image or a
            value that is fetched over HTTP (the original behaviour, which
            treats it as a URL).

    Returns:
        dict with "input_ids" and "attention_mask" (the tokenizer output with
        the leading batch dimension removed) and "image_embeddings" (the CLIP
        image features with the leading batch dimension removed).

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    image_source = example["image"]
    if isinstance(image_source, Image.Image):
        # NOTE(review): image datasets on the Hub often store decoded PIL
        # images rather than URLs - confirm which one this dataset provides.
        image = image_source
    else:
        # Bound the wait so one unresponsive host cannot stall the whole map().
        response = requests.get(image_source, timeout=30)
        # Fail on HTTP errors instead of handing an error page to PIL.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
    image = image.convert("RGB")

    image_inputs = clip_processor(images=image, return_tensors="pt")
    # Feature extraction only: no autograd graph is needed for CLIP here.
    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**image_inputs).squeeze(0)

    # Tokenize text data (instruction and description joined by a space)
    text_inputs = llama_tokenizer(
        example["instruction"] + " " + example["description"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    )

    return {
        # squeeze(0) removes only the batch dimension added by return_tensors="pt".
        "input_ids": text_inputs["input_ids"].squeeze(0),
        "attention_mask": text_inputs["attention_mask"].squeeze(0),
        "image_embeddings": image_embeddings
    }

# Run preprocess_function over the loaded dataset, one example at a time
processed_dataset = dataset.map(function=preprocess_function)


Create a Multi-Modal Model

In [None]:
import torch.nn as nn

class MultiModalLLaMA(nn.Module):
    """Language model wrapper that prepends one projected image embedding to the text.

    The image feature vector is linearly projected to the language model's
    hidden size and inserted as a single extra position in front of the token
    embeddings; the attention mask is extended by one column to match.
    """

    def __init__(self, llama_model, clip_model):
        """Store the language model and build the image projection.

        Args:
            llama_model: Language model exposing `config.hidden_size`,
                `get_input_embeddings()` and a forward that accepts
                `inputs_embeds` and `attention_mask`.
            clip_model: Image model; only `config.projection_dim` is read.
        """
        super().__init__()
        self.llama_model = llama_model
        self.image_projection = nn.Linear(clip_model.config.projection_dim, llama_model.config.hidden_size)
        # NOTE(review): this dropout is never applied in forward(); it is kept
        # so the module's attributes stay the same as before.
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, image_embeddings):
        """Run the language model on [image embedding, text embeddings].

        Args:
            input_ids: Token ids; indexed as (batch, seq_len) by the code below.
            attention_mask: Mask with the same leading dimensions as input_ids.
            image_embeddings: Image features, one vector per batch row, whose
                last dimension is `clip_model.config.projection_dim`.

        Returns:
            Whatever `llama_model` returns for the combined sequence, which is
            one position longer than the text input.
        """
        # get_input_embeddings() is the supported accessor for the token
        # embedding layer; Llama models have no `.embeddings` attribute.
        text_embeddings = self.llama_model.get_input_embeddings()(input_ids)

        # Project image embeddings to the LLaMA hidden size and add a sequence axis
        image_embeddings_projected = self.image_projection(image_embeddings).unsqueeze(1)
        # Match the text embedding dtype so the concatenation does not silently
        # upcast when the language model is loaded in reduced precision.
        image_embeddings_projected = image_embeddings_projected.to(dtype=text_embeddings.dtype)

        # Concatenate image embeddings with text embeddings
        embeddings = torch.cat([image_embeddings_projected, text_embeddings], dim=1)

        # Extend the attention mask by one always-attended column for the image
        # position; new_ones keeps the mask's own dtype and device.
        image_mask = attention_mask.new_ones((attention_mask.size(0), 1))
        extended_attention_mask = torch.cat([image_mask, attention_mask], dim=1)

        # Pass through LLaMA model
        outputs = self.llama_model(inputs_embeds=embeddings, attention_mask=extended_attention_mask)

        return outputs

# Build the combined image + text model from the two models loaded earlier
multi_modal_llama = MultiModalLLaMA(llama_model=llama_model, clip_model=clip_model)


Fine-Tuning

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Prepare DataLoader. Only the three model inputs are exposed, as torch
# tensors, so the default collate function can stack them into batches;
# without this the batch values would not have .to()/.cuda().
train_dataset = processed_dataset["train"].with_format(
    "torch", columns=["input_ids", "attention_mask", "image_embeddings"]
)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU, and keep
# the model on the same device as the batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multi_modal_llama.to(device)

# Set up optimizer. torch.optim.AdamW replaces the deprecated transformers.AdamW.
# NOTE(review): eps and weight_decay are set to what transformers.AdamW used by
# default so the optimisation is unchanged - confirm against your version.
optimizer = torch.optim.AdamW(multi_modal_llama.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
multi_modal_llama.train()
for epoch in range(3):  # Adjust epochs as needed
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        image_embeddings = batch["image_embeddings"].to(device)

        # Forward pass
        outputs = multi_modal_llama(input_ids, attention_mask, image_embeddings)

        # The model is called without labels, so it cannot return a loss
        # itself; compute a next-token loss from the logits instead.
        logits = getattr(outputs, "logits", None)
        if logits is None:
            raise RuntimeError(
                "Model output has no logits; load the language model with "
                "AutoModelForCausalLM so a training loss can be computed."
            )

        # The model prepends one image position, so the logits at positions
        # 0..T-1 are the predictions for text tokens 0..T-1. Padding positions
        # are set to -100 so cross_entropy ignores them.
        targets = input_ids.masked_fill(attention_mask == 0, -100)
        loss = torch.nn.functional.cross_entropy(
            logits[:, :-1].reshape(-1, logits.size(-1)).float(),
            targets.reshape(-1),
            ignore_index=-100,
        )

        # Backward pass and optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed.")


Save the Model

In [None]:
# MultiModalLLaMA is a plain nn.Module and has no save_pretrained(); save the
# wrapped language model with its own save_pretrained() and store the image
# projection weights next to it.
output_dir = "fine_tuned_llama_wave_ui"
# Saved first: save_pretrained() creates the output directory if needed.
multi_modal_llama.llama_model.save_pretrained(output_dir)
torch.save(multi_modal_llama.image_projection.state_dict(), f"{output_dir}/image_projection.pt")
