# Runpod Instructions

Steps:
1) Go to RunPod.
2) Create a new pod with an NVIDIA A6000 GPU.
3) Use the template for PyTorch 2.0.1.
4) Ensure port 8888 is exposed so that Jupyter Notebook can run on it.
5) See the LLaVA demo instructions: https://github.com/haotian-liu/LLaVA#demo
6) Run the following commands:
```
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
pip install --upgrade pip  # enable PEP 660 support
pip install -e .
```

- Create an `Images` folder (the code below expects it at `/workspace/Images`) and upload your images to it.
- After installing, you can run the following code:


In [None]:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

In [None]:
import pandas as pd
import base64
import os

# Directory that holds the images to caption; it is passed to process_images() at the end of the notebook.
folder = "/workspace/Images"
def process_images(folder):
    """
    Base64-encode every ``.jpg``/``.png`` file found directly inside a folder.

    Args:
        folder (str): Directory to scan (sub-directories are not searched).

    Returns:
        list: One UTF-8 base64 string per matching file, in ``os.listdir`` order.
    """
    image_suffixes = (".jpg", ".png")
    encoded_images = []
    for entry in os.listdir(folder):
        # Anything that is not a (lower-case) .jpg/.png name is ignored.
        if not entry.endswith(image_suffixes):
            continue
        with open(os.path.join(folder, entry), "rb") as handle:
            raw_bytes = handle.read()
        encoded_images.append(base64.b64encode(raw_bytes).decode('utf-8'))
        print(f"encoded: {entry}")
    return encoded_images


def query_images(images_list, n = None):
    """
    Generate one caption per image with the LLaVA v1.5 7B checkpoint.

    Every image is handed to ``eval_model`` together with a fixed captioning
    prompt and the responses are collected into a DataFrame.

    Args:
        images_list (list): Image references; each entry is passed to
            ``eval_model`` unchanged as ``image_file``.
        n (int, optional): Number of leading entries of ``images_list`` to
            process. If not specified, the whole list is processed.

    Returns:
        pandas.DataFrame: A single ``caption`` column with one row per
        processed image, in input order.
    """
    from types import SimpleNamespace

    if n is None:
        n = len(images_list)
    model_path = "liuhaotian/llava-v1.5-7b"
    # NOTE(review): the previous version also called load_pretrained_model()
    # here but never used what it returned. eval_model() is only given the
    # model path/name, so it presumably loads the checkpoint itself; the
    # redundant load was dropped. Confirm against the installed LLaVA version.

    # The backslash continuations keep the indentation of the following lines
    # inside the string. The text is left untouched on purpose so the prompt
    # sent to the model does not change.
    prompt = "You are a captioning agent that takes one description of an image and reformats it into a new, clean concise description matching the following format:\
                a <describe the composition of image, closeup, medium etc> photo of a <describe woman> wearing <describe features of clothing, colour, texture> swimwear in <describe forground><describe background> <describe the mood>, \
                <other keywords used to describe the scene><other keywords of objects in image> return only the formatted text. only use the word swimwear to describe her clothing, nothing other than swimwear"

    captions = []
    # Iterate over the parameter: the old loop read an undefined name `images`.
    for i, image in enumerate(images_list[:n]):
        print(f"processing: {i}")
        args = SimpleNamespace(
            model_path=model_path,
            model_base=None,
            model_name=get_model_name_from_path(model_path),
            query=prompt,
            conv_mode=None,
            image_file=image,
            sep=",",
            temperature=0,
            top_p=None,
            num_beams=1,
            max_new_tokens=512,
        )
        # NOTE(review): this relies on eval_model() *returning* the generated
        # text; verify that the installed version does not merely print it.
        response = eval_model(args)
        print(response)
        captions.append(response)

    # Build the frame once instead of re-concatenating it for every image.
    return pd.DataFrame({"caption": captions}, columns=["caption"])


Captioning multiple images

In [None]:
import pandas as pd
from PIL import Image
import os
import gc  # Garbage Collector

def replace_words(original_text, words_to_replace, replacement):
    """
    Substitute every occurrence of each listed string with one replacement.

    Matching is plain ``str.replace``: case-sensitive and substring-based, so a
    listed word is also replaced when it appears inside a longer word. The
    substitutions run one after another in list order, each on the result of
    the previous one.

    Args:
        original_text (str): Text to clean up.
        words_to_replace (list): Strings to search for.
        replacement (str): Text inserted in place of every match.

    Returns:
        str: The text after all substitutions have been applied.
    """
    cleaned = original_text
    for target in words_to_replace:
        cleaned = cleaned.replace(target, replacement)
    return cleaned

def process_images(folder):
    """
    Collect the paths of the image files stored directly inside a folder.

    Extensions are matched case-insensitively (``.jpg``, ``.jpeg``, ``.png``),
    so files such as ``IMG_001.JPG`` are no longer skipped. The result is
    sorted by file name because ``os.listdir`` returns entries in arbitrary
    order; without sorting, limiting the number of processed images would pick
    a different subset from run to run.

    Args:
        folder (str): Directory to scan (sub-directories are not searched).

    Returns:
        list: ``folder``-joined paths of the matching files, sorted by name.
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    images_paths = []
    for filename in sorted(os.listdir(folder)):
        candidate = os.path.join(folder, filename)
        # isfile() keeps out directories that merely have an image-like name.
        if filename.lower().endswith(image_suffixes) and os.path.isfile(candidate):
            images_paths.append(candidate)
    return images_paths

def resize_image(image_path, output_folder, max_size=800):
    """
    Scale an image to fit a ``max_size`` x ``max_size`` box and save the copy.

    The aspect ratio is preserved: both sides are multiplied by the same
    factor, chosen so that the longer side becomes ``max_size``. Images smaller
    than the box are enlarged by the same rule. The copy is written to
    ``output_folder`` under the original file name.

    Args:
        image_path (str): Path of the image to resize.
        output_folder (str): Existing directory that receives the resized copy.
        max_size (int): Edge length, in pixels, of the bounding box.

    Returns:
        str or None: Path of the resized copy, or ``None`` when the image could
        not be opened, resized or saved (the error is printed, not raised).
    """
    try:
        with Image.open(image_path) as img:
            width, height = img.size
            ratio = min(max_size / width, max_size / height)
            # Clamp to 1 px: for extreme aspect ratios int() would otherwise
            # truncate the short side to 0, which is not a valid image size.
            new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name and also exists in older releases.
            resized_img = img.resize(new_size, Image.LANCZOS)

            # Save the resized image to a new file
            base_name = os.path.basename(image_path)
            new_path = os.path.join(output_folder, base_name)
            resized_img.save(new_path)
            return new_path
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a batch.
        print(f"Error processing {image_path}: {e}")
        return None

def _build_llava_args(model_path, query, image_file, max_new_tokens):
    """Bundle the settings of one ``eval_model`` call into an attribute-style object."""
    from types import SimpleNamespace

    return SimpleNamespace(
        model_path=model_path,
        model_base=None,
        model_name=get_model_name_from_path(model_path),
        query=query,
        conv_mode=None,
        image_file=image_file,
        sep=",",
        temperature=0,
        top_p=None,
        num_beams=1,
        max_new_tokens=max_new_tokens,
    )


def query_images(images_paths, n=None, output_folder="workspace/Images/resized"):
    """
    Resize and caption images with the LLaVA v1.5 7B checkpoint.

    For every image a resized copy is written to ``output_folder`` and the
    model is queried twice on that copy: once for a verbose description (in
    which a fixed list of clothing words is replaced by "swimwear") and once
    for five keywords. Both answers are joined with a space to form the
    caption. Images that cannot be resized are skipped instead of aborting
    the whole batch.

    Args:
        images_paths (list): Paths of the images to caption.
        n (int, optional): Number of leading entries of ``images_paths`` to
            process. If not specified, all of them are processed.
        output_folder (str): Directory for the resized copies; created if
            missing. NOTE(review): the default is a relative path, so it is
            resolved against the current working directory. Pass an absolute
            path if the copies should live elsewhere.

    Returns:
        pandas.DataFrame: Columns ``filename`` (base name of the resized copy)
        and ``caption``, one row per successfully processed image.
    """
    if n is None:
        n = len(images_paths)
    os.makedirs(output_folder, exist_ok=True)
    model_path = "liuhaotian/llava-v1.5-7b"
    # NOTE(review): the previous version also called load_pretrained_model()
    # here but never used what it returned. eval_model() is only given the
    # model path/name, so it presumably loads the checkpoint itself; the
    # redundant load was dropped. Confirm against the installed LLaVA version.

    prompt1 = "Verbosely describe the image with the following format a 'a (the composition, closeup, medium, portrait, ) photo of a (describe woman, hair colour, skin tone, body shape) wearing (describe features of swimwear, colour, pattern) swimwear in (describe scene in forground/background) (describe mood)'"
    prompt2 = "write 5 key words corresponding to the following categories in this format: (scene), (colour), (mood), (lighting), (background)"
    words_to_normalise = ["bikini", "swimsuit", "underwear", "lingerie", "panties"]

    rows = []
    for image_path in images_paths[:n]:
        resized_image_path = resize_image(image_path, output_folder)
        if resized_image_path is None:
            # resize_image() signals failure with None (it has already printed
            # the error); carrying on would crash on the path handling below.
            print(f"Skipping {image_path}: could not be resized")
            continue
        filename = os.path.basename(resized_image_path)
        print(f"Processing: {filename}")

        # NOTE(review): this relies on eval_model() *returning* the generated
        # text; verify that the installed version does not merely print it.
        description = eval_model(
            _build_llava_args(model_path, prompt1, resized_image_path, 512)
        )
        # Only the description is normalised; the keywords are appended as-is.
        response = replace_words(description, words_to_normalise, "swimwear")
        response += " " + eval_model(
            _build_llava_args(model_path, prompt2, resized_image_path, 121)
        )
        print(response)
        rows.append((filename, response))

        gc.collect()

    # Build the frame once instead of re-concatenating it for every image.
    return pd.DataFrame(rows, columns=["filename", "caption"])
# Caption at most 3 of the images found in `folder`; the second argument caps how many are processed.
query_images(process_images(folder), 3)