In [1]:
# # Fine-Tuning LLaVA 1.5 7B (HF version) for Instagram Captioning on Custom JSONL Data
# # Compatible with RTX 3060 / T4 (12-16 GB GPUs)

# !pip install -U "transformers>=4.39.0"
# !pip install peft bitsandbytes
# !pip install -U "trl>=0.8.3"
# !pip install wandb
# !pip install evaluate
# !pip install datasets
# !pip install rouge_score
# !pip install torchao
# # !pip install git+https://github.com/Maluuba/nlg-eval.git

Loading Libraries

In [2]:
import os
import json
import torch
import wandb
import torchao
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from trl import SFTTrainer
from peft import LoraConfig
from torch.optim import AdamW
# from datasets import load_metric
from datasets import load_dataset
from torch.utils.data import Dataset
from evaluate import load as load_metric
from datasets import Dataset as HFDataset
from multiprocessing import Pool, cpu_count
from transformers.integrations import WandbCallback
from transformers import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig

Setting Up WandB

In [3]:
os.environ["WANDB_API_KEY"] = "d707117fb8a8f4cf9916b4cf42fe630e09c93b6b"

Setting Up Script Configs

In [4]:
# data configs
DATASET = "InstaCities1M"
BASE_IMAGES_DIR = "/mnt/InstaCities1M/image/img_resized_1M/cities_instagram/"
BASE_CAPTIONS_DIR = "/mnt/InstaCities1M/captions/captions_resized_1M/cities_instagram/"
OUTPUT_JSONL_PATH = './datasset_v1.jsonl'
CITIES = ['newyork']

LLAVA_CHAT_TEMPLATE = (
    "You are a social media influencer. Write a captivating Instagram caption for this image "
    "that will engage more viewers and boost interaction. Analyze the image to decide the tone of the caption."
)

print(os.path.exists(BASE_IMAGES_DIR))
print(os.path.exists(BASE_CAPTIONS_DIR))

# Expirementation Details
PROJECT = "Snap2Caption"
RUN_NAME = "llava-7b-ft-instagram-v2"

# model configs
MODEL_NAME = "LLaVA-7B-HF (LLaVA-1.5-7B)"
TASK = "Image Captioning"
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj"]

# Search/Filter Tags
TAGS = ["llava", "image-captioning", "LoRA", "fine-tuning"]
GROUP = "llava-instagram-experiments"
NOTES = "Baseline fine-tuning on InstaCities1M with 4-bit quantized model and LoRA adapters."

# Transformer Setting
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
MODEL_SAVE_PATH = "./llava_lora_instagram"

# Optimization Strategy
TRAIN_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 8
NO_OF_EPOCHS = 10
LEARNING_RATE = 1e-4
LOGGING_STEPS = 5
WEIGHT_DECAY = 0.01

True
True


Verifying GPU

In [5]:
torch.cuda.is_available()

True

Dataset Parsing

In [6]:
images_files = []
captions_files = []

for city in CITIES:
    img = BASE_IMAGES_DIR + city + '/' + np.array(os.listdir(BASE_IMAGES_DIR + city))
    caption = BASE_CAPTIONS_DIR + city + '/' + np.array(os.listdir(BASE_CAPTIONS_DIR + city))
    images_files.extend(img)
    captions_files.extend(caption)

In [7]:
# Clean filenames
image_ids = {os.path.splitext(os.path.basename(img))[0] for img in images_files}
caption_ids = {os.path.splitext(os.path.basename(cap))[0] for cap in captions_files}

# Now match
common_ids = image_ids & caption_ids

# Filter
filtered_image_files = [img for img in images_files if os.path.splitext(os.path.basename(img))[0] in common_ids]
filtered_caption_files = [cap for cap in captions_files if os.path.splitext(os.path.basename(cap))[0] in common_ids]

images_files = filtered_image_files
captions_files = filtered_caption_files

In [8]:
len(images_files), len(captions_files)

(100000, 100000)

In [9]:
# --- Worker function ---
def process_pair(i):
    try:
        img_path = images_files[i]
        caption_path = captions_files[i]

        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read().strip().replace('\n', ' ')
            if not caption:
                return None

        # Create messages field directly
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": LLAVA_CHAT_TEMPLATE},
                    {"type": "image"}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": caption}
                ]
            }
        ]

        return {
            "image_path": img_path,
            "messages": messages
        }

    except Exception:
        return None


# --- Multiprocessing ---
with Pool(cpu_count()) as pool:
    results = list(tqdm(pool.imap(process_pair, range(len(images_files))), total=len(images_files)))


100%|██████████| 100000/100000 [00:33<00:00, 3024.88it/s]


In [10]:
data = [entry for entry in results if entry is not None]

# --- Write JSONL File ---
with open(OUTPUT_JSONL_PATH, 'w', encoding='utf-8') as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print(f"JSONL created: {OUTPUT_JSONL_PATH} with {len(data)} samples.")

JSONL created: ./datasset_v1.jsonl with 100000 samples.


Configuring Model

In [11]:
# --- Configuration ---
model_id = MODEL_ID
data_path = OUTPUT_JSONL_PATH  # path to your formatted JSONL file
output_dir = MODEL_SAVE_PATH

In [12]:
# --- Model Loading (4bit Quantization) ---
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16

)

# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     llm_int8_threshold=6.0,
#     llm_int8_has_fp16_weight=False
# )


model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto"
)

processor = AutoProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer

tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}"
    "{% for item in message['content'] %}"
    "{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}"
    "{% endfor %}"
    "{% if message['role'] == 'assistant' %}{{ eos_token }}{% endif %}"
    "{% endfor %}"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [13]:
def load_image(example):
    example["image"] = Image.open(example["image_path"]).convert("RGB")
    return example

# Load and preprocess dataset with pre-formatted messages
dataset = load_dataset("json", data_files=data_path)["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
dataset[0]

{'image_path': '/mnt/InstaCities1M/image/img_resized_1M/cities_instagram/newyork/1480879485913200243.jpg',
 'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'You are a social media influencer. Write a captivating Instagram caption for this image that will engage more viewers and boost interaction. Analyze the image to decide the tone of the caption.'},
    {'type': 'image', 'text': None}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': 'Where the dreams come true. #newyork #newyorkcity #rooseveltisland #dreams #manhattan #usa #bowtie #pocketsquare #handmade #newjersey #style #stylish #styleblogger #fashion #fashionstyle #fashionblogger #gay #gaymen #gayboy #gayguy #men #mensuit #menstyle #menswear #nyc #suitandtie #suit #shirt #ootd #onlinestore'}]}]}

In [18]:
# Read JSONL manually
dataset = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        example = json.loads(line.strip())
        dataset.append(example)

print(f"Loaded {len(dataset)} samples.")

dataset = HFDataset.from_list(dataset)

split_dataset = dataset.train_test_split(test_size=0.001, seed=42)

train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(len(train_dataset))
print(len(eval_dataset))

Loaded 100000 samples.
99900
100


In [19]:
train_dataset.map

<bound method Dataset.map of Dataset({
    features: ['image_path', 'messages'],
    num_rows: 99900
})>

In [20]:
# class SimpleDataset(Dataset):
#     def __init__(self, data):
#         self.data = data

#     def __getitem__(self, idx):
#         return self.data[idx]

#     def __len__(self):
#         return len(self.data)

# # Wrap
# train_dataset = SimpleDataset(dataset)

In [43]:
from evaluate import load
import torch
import numpy as np

def build_compute_metrics(tokenizer):
    bleu = load("bleu")
    rouge = load("rouge")

    def compute_metrics(eval_preds):

        print(eval_preds)
        
        preds, labels = eval_preds

        # Convert to numpy arrays if not already
        if isinstance(preds, tuple):
            preds = preds[0]
        preds = np.asarray(preds)
        labels = np.asarray(labels)

        # Special trick for generative models: remove -100
        labels[labels == -100] = tokenizer.pad_token_id

        # Decode
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # BLEU expects a list of list of references
        bleu_score = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])['bleu']

        rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)['rougeL']

        return {
            "bleu": round(bleu_score * 100, 2),
            "rougeL": round(rouge_score * 100, 2),
        }

    return compute_metrics

# def build_compute_metrics(tokenizer):
#     bleu = load_metric("bleu")
#     rouge = load_metric("rouge")

#     def compute_metrics(eval_preds):
#         predictions, labels = eval_preds

#         # Decode tokenized outputs into text
#         preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#         refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

#         # Compute BLEU
#         bleu_score = bleu.compute(predictions=preds, references=[[r] for r in refs])['bleu']

#         # Compute ROUGE
#         rouge_score = rouge.compute(predictions=preds, references=refs)['rougeL']

#         return {
#             "bleu": bleu_score,
#             "rougeL": rouge_score,
#         }

#     return compute_metrics

# # def compute_metrics(eval_preds, tokenizer):
# #     preds, labels = eval_preds
# #     if hasattr(preds, "predictions"):
# #         preds = preds.predictions
# #     preds = preds.argmax(dim=-1)
# #     labels = labels.long()

# #     preds = preds.tolist()
# #     labels = labels.tolist()

# #     pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
# #     label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

# #     pred_texts = [text.strip().split() for text in pred_texts]
# #     label_texts = [[text.strip().split()] for text in label_texts]

# #     try:
# #         bleu_score = bleu.compute(predictions=pred_texts, references=label_texts)["bleu"]
# #     except Exception as e:
# #         print(f"Error computing BLEU: {e}")
# #         bleu_score = 0.0
# #     try:
# #         rouge_score = rouge.compute(predictions=[" ".join(p) for p in pred_texts], references=[" ".join(l[0]) for l in label_texts])["rougeL"]
# #     except Exception as e:
# #         print(f"Error computing ROUGE: {e}")
# #         rouge_score = 0.0

# #     exact_matches = sum([" ".join(p) == " ".join(l[0]) for p, l in zip(pred_texts, label_texts)])
# #     exact_match_score = exact_matches / len(pred_texts)

# #     return {
# #         "BLEU": bleu_score,
# #         "ROUGE_L": rouge_score,
# #         "Exact_Match": exact_match_score,
# #     }

In [44]:
# --- LoRA Configuration ---
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

In [45]:
total_steps = (len(dataset) // (TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)) * NO_OF_EPOCHS
warmup_steps = int(0.05 * total_steps)

# --- SFT Trainer ---
training_args = TrainingArguments(
    
    dataloader_num_workers=4,
    logging_steps=LOGGING_STEPS,
    
    optim="adamw_torch_4bit",
    learning_rate=LEARNING_RATE,
    warmup_steps = warmup_steps,
    num_train_epochs=NO_OF_EPOCHS,
    lr_scheduler_type="constant_with_warmup",
    
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    per_device_eval_batch_size=4,
    eval_accumulation_steps=8,
    eval_strategy="steps",
    eval_steps=LOGGING_STEPS,
    
    fp16=True,
    report_to="wandb",
    
    save_strategy="epoch",
    output_dir=output_dir,

    remove_unused_columns=False
)

In [46]:
# Intiailize Wandb
wandb.init(
    project=PROJECT,
    name=RUN_NAME,
    config={
        **training_args.to_dict(),
        "custom_config": {
            "model_name": MODEL_NAME,
            "dataset": DATASET,
            "task": TASK,
            "LoRA_r": LORA_R,
            "LoRA_alpha": LORA_ALPHA,
            "LoRA_dropout": LORA_DROPOUT,
            "target_modules": TARGET_MODULES,
        }
    },
    tags=TAGS,
    group=GROUP,
    notes=NOTES,
    mode="online"
)

0,1
startup/loss,▁
startup/step,▁
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
startup/loss,0.0
startup/step,0.0
train/epoch,0.0016
train/global_step,5.0
train/grad_norm,65.30002
train/learning_rate,0.0
train/loss,48.9635


In [47]:
wandb.log({"startup/step": 0, "startup/loss": 0.0})

wandb.watch(model, log="all", log_freq=100)

In [62]:
from evaluate import load
import numpy as np

def build_compute_metrics(tokenizer):
    bleu = load("bleu")
    rouge = load("rouge")

    def compute_metrics(eval_preds):
        preds = eval_preds.predictions
        labels = eval_preds.label_ids

        # If preds are tuple (logits, ), take first
        if isinstance(preds, tuple):
            preds = preds[0]

        preds = np.array(preds)
        labels = np.array(labels)

        # Take argmax if needed
        if preds.ndim == 3:
            preds = np.argmax(preds, axis=-1)

        # Clean labels (-100 to pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        # Convert to list
        preds = preds.tolist()
        labels = labels.tolist()

        # Decode
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        # Clean whitespace
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        # Compute BLEU and ROUGE
        bleu_score = bleu.compute(predictions=decoded_preds, references=[[r] for r in decoded_labels])["bleu"]
        rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)["rougeL"]

        return {
            "bleu": round(bleu_score * 100, 2),
            "rougeL": round(rouge_score * 100, 2),
        }

    return compute_metrics

In [64]:
# # # compute_metrics_func = compute_metrics(tokenizer)
# # compute_metrics_func = build_compute_metrics(tokenizer)


# trainer = SFTTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset ,
#     eval_dataset=eval_dataset,
#     peft_config=lora_config,
#     processing_class =tokenizer,
#     # compute_metrics=compute_metrics_func,
#     # callbacks=[DynamicEvalSubsetCallback(eval_dataset, subset_size=100)],
# )
compute_metrics_func = build_compute_metrics(tokenizer)

import random
from trl import SFTTrainer

class SFTTrainerEvalSampling(SFTTrainer):
    def __init__(self, *args, eval_sample_size=16, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_sample_size = eval_sample_size

    def get_eval_dataloader(self, eval_dataset=None):
        '''
        Samples the evaluation dataset and returns a subset 
        of size self.eval_sample_size.
        '''
        if eval_dataset is None:
            eval_dataset = self.eval_dataset

        idxs = random.sample(range(len(eval_dataset)), self.eval_sample_size)
        print(idxs)
        eval_subset = eval_dataset.select(idxs)

        return super().get_eval_dataloader(eval_subset)

trainer = SFTTrainerEvalSampling(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,    # pass full 10k dataset here
    eval_sample_size=100,               # choose sample size you want
    peft_config=lora_config,
    processing_class=tokenizer,
    # compute_metrics_func = build_compute_metrics(tokenizer)

    compute_metrics=compute_metrics_func,
)



Converting train dataset to ChatML:   0%|          | 0/99900 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/99900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/99900 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/99900 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [65]:
# --- Start Fine-tuning ---
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Bleu,Rougel
5,48.9635,6.227513,3.51,19.41
10,50.1877,6.210383,3.55,19.36
15,49.3113,6.197471,3.51,19.41


[81, 14, 3, 94, 35, 31, 28, 17, 13, 86, 69, 11, 75, 54, 4, 97, 88, 27, 29, 64, 77, 84, 71, 25, 89, 53, 93, 57, 95, 0, 20, 90, 43, 79, 19, 82, 67, 6, 5, 24, 62, 22, 68, 58, 38, 16, 51, 2, 46, 99, 34, 7, 60, 61, 66, 18, 40, 39, 23, 36, 12, 85, 52, 98, 44, 74, 63, 59, 47, 8, 33, 87, 26, 83, 49, 80, 96, 32, 21, 30, 37, 76, 92, 48, 45, 72, 56, 55, 10, 73, 78, 15, 70, 1, 50, 65, 9, 91, 41, 42]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[72, 91, 40, 27, 83, 63, 50, 82, 58, 18, 33, 17, 31, 71, 68, 89, 74, 54, 95, 51, 46, 28, 88, 65, 94, 11, 6, 14, 19, 20, 92, 8, 49, 48, 59, 32, 1, 43, 79, 7, 62, 56, 34, 66, 77, 67, 41, 21, 60, 90, 96, 10, 29, 0, 16, 64, 81, 44, 73, 97, 86, 42, 12, 9, 23, 98, 84, 61, 70, 15, 38, 3, 76, 26, 25, 36, 80, 75, 35, 93, 2, 39, 30, 87, 4, 45, 57, 52, 85, 99, 55, 69, 53, 22, 24, 37, 13, 5, 78, 47]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[66, 57, 15, 31, 28, 8, 43, 2, 75, 70, 29, 91, 95, 0, 9, 80, 7, 89, 94, 4, 42, 85, 65, 30, 35, 62, 27, 69, 16, 60, 96, 90, 52, 24, 12, 77, 55, 22, 73, 26, 82, 63, 46, 3, 93, 41, 54, 6, 56, 25, 98, 21, 67, 97, 64, 45, 34, 87, 81, 61, 11, 17, 59, 49, 84, 79, 47, 51, 86, 92, 37, 99, 40, 83, 5, 13, 36, 23, 33, 44, 1, 50, 20, 72, 38, 88, 74, 58, 68, 14, 48, 53, 39, 19, 78, 76, 18, 71, 10, 32]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [None]:
# --- Save Final Model ---
trainer.model.save_pretrained(output_dir)
print(f"✅ Training complete. Model saved at {output_dir}")

In [None]:
import shutil
shutil.make_archive('llava_lora_instagram', 'zip', 'llava_lora_instagram')

Inference

In [None]:
# --- Inference function
def generate_caption(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")

    # Prepare the input prompt (same as training)
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": "You are a social media influencer. Write a captivating Instagram caption for this image that will engage more viewers and boost interaction. Analyze the image to decide the tone of the caption."},
            {"type": "image"}
        ]}
    ]

    inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
    input_tensors = processor(text=inputs, images=[image], return_tensors="pt", padding=True).to(model.device)

    # Generate
    with torch.no_grad():
        # output = model.generate(**input_tensors, max_new_tokens=80)
        output = model.generate(
            **input_tensors,
            max_new_tokens=80,
            repetition_penalty=1.2,   # Encourage less repetition
            temperature=0.7,          # Add some randomness
            top_p=0.9,                # Top-p sampling (nucleus sampling)
            do_sample=True            # Enable sampling instead of greedy decoding
        )

    # Decode
    generated_text = processor.batch_decode(output[:, input_tensors["input_ids"].shape[1]:], skip_special_tokens=True)[0]

    return generated_text.strip()

In [None]:
# --- Example usage
caption = generate_caption("./temp.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test1.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test3.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test4.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test5.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test6.jpg")
print("Generated Caption:", caption)

In [None]:
# --- Example usage
caption = generate_caption("./test7.jpg")
print("Generated Caption:", caption)

In [None]:
# def chat_with_model(messages, image=None):
#     """
#     Function to send messages to the model and get a reply.
#     - `messages`: current conversation list
#     - `image`: PIL.Image if needed for the first user input
#     """
#     # Prepare input
#     if image:
#         inputs = processor.apply_chat_template(messages, images=[image], return_tensors="pt", tokenize=True, add_generation_prompt=True)
#     else:
#         inputs = processor.apply_chat_template(messages, return_tensors="pt", tokenize=True, add_generation_prompt=True)
    
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
#     # Generate
#     with torch.no_grad():
#         output = model.generate(
#             **inputs,
#             max_new_tokens=100,
#             temperature=0.7,
#             top_p=0.9,
#             repetition_penalty=1.1,
#             do_sample=True
#         )
    
#     # Decode output
#     reply = processor.tokenizer.decode(output[0], skip_special_tokens=True)
    
#     return reply

def chat_with_model(messages, image=None):
    """
    Function to send messages to the model and get a reply.
    """
    # Step 1: Create chat template
    prompt_text = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    
    # Step 2: Encode inputs
    if image:
        inputs = processor(text=prompt_text, images=[image], return_tensors="pt", padding=True)
    else:
        inputs = processor(text=prompt_text, return_tensors="pt", padding=True)
    
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Step 3: Generate
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True
        )
    
    # Step 4: Decode output
    reply = processor.tokenizer.decode(output[0], skip_special_tokens=True)
    
    return reply


In [None]:

# ------------------
# Start a conversation
# ------------------

# Step 1: Initial messages with an image
image_path = "test2.jpg"
image = Image.open(image_path).convert("RGB")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "You are a social media influencer. Write a catchy Instagram caption for this image."},
            {"type": "image"}
        ]
    }
]

# First model reply
caption = chat_with_model(messages, image=image)
print(f"\n🧠 Model: {caption}")

# Add model's response to messages
messages.append({
    "role": "assistant",
    "content": [{"type": "text", "text": caption}]
})

# Step 2: Loop for continuous chat
while True:
    user_input = input("\n💬 Your input (type 'quit' to stop): ")
    
    if user_input.lower() == "quit":
        print("👋 Ending chat. Goodbye!")
        break
    
    # Add user's new message
    messages.append({
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    })
    
    # Get model's reply
    model_reply = chat_with_model(messages)
    print(f"\n🧠 Model: {model_reply}")
    
    # Add model's reply back to messages
    messages.append({
        "role": "assistant",
        "content": [{"type": "text", "text": model_reply}]
    })
