In [23]:
import base64
import requests
import json
import pandas as pd
import os
from tqdm import tqdm
import re
import torch

The pipeline is the following:

1. Get input
2. Read context
3. Get caption from the adapter
4. Overlay captions

In [16]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

### Load all our models

Note: for some model I am simply using Inference API from Hugging Face to save space and time for running this code. 

Inference API requires a hugging face token.

The free option has a limited rate of queries per hour, which is 300. In the more advanced setting (e.g. in production with more GPUs), a model can be stored locally to avoid this limitation.

In [1]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

I am using a Gemma model with 2 billion parameters + customly trained adapters.

Gemma2B turned out to be a good compromise between size and performance. With relatively few samples (300 to 500), I was able to train adapters. 

Other models were also tried for the purpose of caption generation (e.g., FLAN-T5 family - large, XL, and XXL; ...), but they didn't achieve reliable performance neither before training nor after. Notebooks with prefix-tuning of FLAN-T5 are added in the folder.

In [None]:
config = PeftConfig.from_pretrained("NursNurs/outputs_gemma2b_angry")
base_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

In [22]:
model_angry = PeftModel.from_pretrained(base_model, "NursNurs/outputs_gemma2b_angry")
model_happy = PeftModel.from_pretrained(base_model, "NursNurs/outputs_gemma2b_happy")

adapter_config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/314M [00:00<?, ?B/s]

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [25]:
base_model.to(device)
model_happy.to(device)
model_angry.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k

In [27]:
models = dict()
models['angry'] = model_angry
models['happy'] = model_happy

### Loading CLIP model for sentiment

In [10]:
# replace with your HF token

hf_token = "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"

In [11]:
API_URL = "https://api-inference.huggingface.co/models/openai/clip-vit-base-patch32"
headers = {"Authorization": f"Bearer {hf_token}"}

In [18]:
def query_clip(data):
	with open(data["image_path"], "rb") as f:
		img = f.read()
	payload={
		"parameters": data["parameters"],
		"inputs": base64.b64encode(img).decode("utf-8")
	}
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

In [19]:
def get_sentiment(img_path):
    output = query_clip({
        "image_path": img_path,
        "parameters": {"candidate_labels": ["angry", "happy"]},
    })
    try:
        return output[0]['label']
    except:
        print("The model is not available right now due to query limits. Try within the next hour")

In [None]:
def extract_between_quotes(text):
    """
    Extract content between single quotes, ignoring escaped single quotes,
    and ensure the second quote is followed by whitespace or end of string.
    """
    # Regex pattern to match content between unescaped single quotes,
    # ensuring the second quote is followed by whitespace or end of the string
    pattern = r"(?<!\\)'(.*?)(?<!\\)'(?:\s|$|<|\.)"

    # Find all matches in the text using re.DOTALL to handle multiline text
    matches = re.findall(pattern, text, re.DOTALL)


    return matches

### Load BLIP for image description

In [17]:
API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
headers = {"Authorization": "Bearer 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"}

def query_blip(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

In [20]:
def get_description(img_path):
    output = query_blip(query_blip)
    try:
        return output[0]['generated_text']
    except:
        print("The model is not available right now due to query limits. Try within the next hour")

### Prepare functions to lay text over the image

In [21]:
def get_caption_from_img(img_path):
    

SyntaxError: expected ':' (305975063.py, line 1)

In [6]:
img_path = "monkey.png"