In [1]:
import matplotlib.pyplot as plt
import numpy as np

from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from evaluate import load
from huggingface_hub import login
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.models.auto.modeling_auto import AutoModelForImageTextToText

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
from torchvision import datasets, transforms

from PIL import Image

In [3]:
dataset: DatasetDict = load_dataset("aryachakraborty/Food_Calorie_Dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'Query', 'Response'],
        num_rows: 285
    })
})

In [4]:
dataset: DatasetDict = load_dataset("aryachakraborty/Food_Calorie_Dataset")
dataset = dataset['train'].train_test_split(test_size=0.05)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [5]:
checkpoint = "microsoft/git-base"
processor = AutoProcessor.from_pretrained(checkpoint)

def transforms_data(example_batch):
    answers = [x for x in example_batch["Response"]]
    images = [x.resize((50, 50)) for x in example_batch['image']]
    question = [x for x in example_batch['Query']]
    inputs = processor(images=images, text=question, truncation=True, max_length=512, padding='max_length')
    inputs.update({'labels': processor(text=answers, truncation=True, max_length=512, padding='max_length')['input_ids']})
    return inputs


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
train_dataset = train_dataset.select_columns(['image', 'Query', 'Response'])
test_dataset = test_dataset.select_columns(['image', 'Query', 'Response'])

train_dataset.set_transform(transforms_data)
test_dataset.set_transform(transforms_data)

In [7]:
checkpoint = "microsoft/git-base"
processor = AutoProcessor.from_pretrained(checkpoint)

In [10]:
model = AutoModelForCausalLM.from_pretrained(checkpoint)

model_name = checkpoint.split("/")[1]
training_args = TrainingArguments(
    output_dir=f"{model_name}-food-calorie",
    learning_rate=5e-5,
    num_train_epochs=10,
    fp16=False,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    save_total_limit=3,
    eval_strategy="epoch",
    eval_steps=1,
    save_strategy="epoch",
    save_steps=11,
    logging_steps=10,
    remove_unused_columns=False,
    push_to_hub=True,
    label_names=["labels"],
    load_best_model_at_end=True,
    no_cuda=True
)



In [11]:
wer = load("wer")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"wer_score": wer_score}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Wer Score
1,4.2118,3.048738,1.063609
2,0.5558,0.882604,0.91716
3,0.3831,0.857486,0.927515
4,1.0105,0.819103,0.893491
5,0.8106,0.817961,0.860947
6,0.4029,0.843766,0.878698
7,0.3621,0.854932,0.847633
8,0.4192,0.817092,0.912722
9,0.4384,0.824324,0.87574
10,0.3426,0.825832,0.868343


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/80/aa/80aa34992b496dd89dc8c7bd0bf705c6b8d048aec1ec62b7da47fecbb963672f/fb21de8fc8f326f71b61b92f92bae9b55f9d53fb8db44e8269ad5e218697f299?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250615T155106Z&X-Amz-Expires=86400&X-Amz-Signature=434e188d7c5aa6307a5f6442563e6005d320ae934d1bdaf00ae1bfe0eacf25d7&X-Amz-SignedHeaders=host&partNumber=35&uploadId=npI5cC61CoqpD_xq5aVKW9lm9tJnmjOpZWcGzitA3Ui3Gd0OkZvQ8mdJCmUW0mynsYqibTBHnREUgKlo2uYxQ12vmHeRro_aYvAMJmBqtABHVas7I2k9uvqEb4vqFhyE&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2426)')))"), '(Request ID: 1b43462e-4836-4850-b673-02b70ec3e59d)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/80/aa/

TrainOutput(global_step=680, training_loss=1.134646477769403, metrics={'train_runtime': 4277.3649, 'train_samples_per_second': 0.631, 'train_steps_per_second': 0.159, 'total_flos': 1262743122124800.0, 'train_loss': 1.134646477769403, 'epoch': 10.0})