# Food Classifier Fine-Tuning


## 1. Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets peft accelerate bitsandbytes tqdm scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import json
import torch
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List
from difflib import get_close_matches

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report
)
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

print("Imports successful")
print(f"CUDA available: {torch.cuda.is_available()}")

Imports successful
CUDA available: True


In [4]:
@dataclass
class Config:
    model_name: str = "Qwen/Qwen2.5-3B-Instruct"
    use_qlora: bool = True

    train_file: str = "/content/drive/MyDrive/train.jsonl"
    test_file: str = "/content/drive/MyDrive/test.jsonl"


    output_dir: str = "./qwen-food-qlora"
    max_length: int = 512

    # Training
    num_epochs: int = 3
    batch_size: int = 4
    gradient_accumulation_steps: int = 8
    learning_rate: float = 2e-4

    # LoRA
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05

    seed: int = 42

config = Config()
torch.manual_seed(config.seed)
np.random.seed(config.seed)
os.makedirs(config.output_dir, exist_ok=True)

print("="*70)
print("CONFIGURATION")
print("="*70)
print(f"Model: {config.model_name}")
print(f"Method: {'QLoRA (4-bit)' if config.use_qlora else 'LoRA (16-bit)'}")
print(f"Epochs: {config.num_epochs}")
print(f"Batch size: {config.batch_size * config.gradient_accumulation_steps}")
print("="*70)

CONFIGURATION
Model: Qwen/Qwen2.5-3B-Instruct
Method: QLoRA (4-bit)
Epochs: 3
Batch size: 32


## 2. Load Model and Tokenizer

In [5]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Tokenizer loaded")

Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer loaded


In [6]:
def format_dataset(sample):
    text = tokenizer.apply_chat_template(
        sample["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=config.max_length,
        return_tensors=None
    )

    labels = tokenized["input_ids"].copy()
    labels = [
        -100 if token_id == tokenizer.pad_token_id else token_id
        for token_id in labels
    ]
    tokenized["labels"] = labels

    return tokenized

In [7]:
print("Loading model...")

if config.use_qlora:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        quantization_config=bnb_config,
        device_map={'': torch.cuda.current_device()} if torch.cuda.is_available() else 'auto',
        trust_remote_code=True
    )
    model = prepare_model_for_kbit_training(model)
    print("✓ Model loaded in 4-bit")
else:
    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    print("✓ Model loaded in 16-bit")

Loading model...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✓ Model loaded in 4-bit


## 3. Prepare Data

In [8]:
train_dataset = load_dataset("json", data_files=config.train_file, split="train")
val_dataset = load_dataset("json", data_files=config.test_file, split="train")

train_dataset = train_dataset.map(format_dataset, remove_columns=["messages"])
val_dataset = val_dataset.map(format_dataset, remove_columns=["messages"])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11919 [00:00<?, ? examples/s]

Map:   0%|          | 0/2031 [00:00<?, ? examples/s]

In [9]:
def classify_ingredients_qwen(prompt: str, model, tokenizer, max_new_tokens: int = 10) -> str:
    messages = [
        {"role": "user", "content": prompt}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.0,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(
        outputs[0][input_ids.shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

In [10]:
print("Sample input_ids length:", len(train_dataset[0]["input_ids"]))
print("Sample labels length:", len(train_dataset[0]["labels"]))
print("First 10 tokens:", train_dataset[0]["input_ids"][:10])
print("Type of input_ids:", type(train_dataset[0]["input_ids"]))

Sample input_ids length: 512
Sample labels length: 512
First 10 tokens: [151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553]
Type of input_ids: <class 'list'>


## 4. Evaluation Functions

In [11]:
import json
import torch
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report
import re

def normalize_label(label: str) -> str:
    if not label or pd.isna(label):
        return ""
    label = str(label).strip().lower()
    # Standardize variants
    if 'contains-gluten' in label:
        return 'contains-gluten'
    elif 'gluten-free' in label:
        return 'gluten-free'
    elif 'contains-lactose' in label:
        return 'contains-lactose'
    elif 'lactose-free' in label:
        return 'lactose-free'
    elif 'non-veg' in label or 'nonveg' in label or 'non veg' in label:
        return 'non-veg'
    elif 'veg' in label:
        return 'veg'
    else:
        return label

def get_task_from_content(content: str) -> str:
    """Extract task type from user message content"""
    if "gluten-free or contains-gluten" in content:
        return "gluten"
    elif "lactose-free or contains-lactose" in content:
        return "lactose"
    elif "veg or non-veg" in content:
        return "veg"
    else:
        return "unknown"

def evaluate_on_chat_test_set(model, tokenizer, test_file: str, max_new_tokens: int = 10):
    print("=" * 80)
    print("EVALUATING MODEL ON CHAT-FORMATTED TEST SET")
    print("=" * 80)

    # Load test data
    test_samples = []
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            test_samples.append(json.loads(line))

    print(f"Loaded {len(test_samples)} chat-formatted test samples.")

    all_preds = []
    all_labels = []
    task_data = {"gluten": {"preds": [], "labels": []},
                 "lactose": {"preds": [], "labels": []},
                 "veg": {"preds": [], "labels": []}}

    errors = []

    for sample in tqdm(test_samples, desc="Evaluating"):
        messages = sample["messages"]
        if len(messages) < 2:
            continue

        user_msg = messages[0]["content"]
        true_label = messages[1]["content"]

        task = get_task_from_content(user_msg)
        if task == "unknown":
            continue

        pred_raw = classify_ingredients_qwen(
            user_msg,
            model,
            tokenizer,
            max_new_tokens=max_new_tokens
        )

        norm_true = normalize_label(true_label)
        norm_pred = normalize_label(pred_raw)

        all_labels.append(norm_true)
        all_preds.append(norm_pred)
        task_data[task]["labels"].append(norm_true)
        task_data[task]["preds"].append(norm_pred)

        if norm_true != norm_pred:
            errors.append({
                "user_msg": user_msg,
                "true": norm_true,
                "pred": norm_pred,
                "pred_raw": pred_raw
            })

    print("\n" + "="*60)
    print("OVERALL RESULTS")
    print("="*60)
    overall_acc = accuracy_score(all_labels, all_preds)
    print(f"Overall Accuracy: {overall_acc:.4f} ({overall_acc*100:.2f}%)")
    print(f"Total evaluated samples: {len(all_labels)}")

    print("\n" + "="*60)
    print("PER-TASK ACCURACY")
    print("="*60)
    for task in ["gluten", "lactose", "veg"]:
        labels = task_data[task]["labels"]
        preds = task_data[task]["preds"]
        if labels:
            acc = accuracy_score(labels, preds)
            print(f"{task.capitalize():>10}: {acc:.4f} ({acc*100:.2f}%) ({len(labels)} samples)")
        else:
            print(f"{task.capitalize():>10}: No samples")

    print("\n" + "="*60)
    print("FULL CLASSIFICATION REPORT")
    print("="*60)
    unique_labels = sorted(set(all_labels))
    print(classification_report(all_labels, all_preds, labels=unique_labels, zero_division=0))


    return {
        "overall_accuracy": overall_acc,
        "per_task": {task: (accuracy_score(task_data[task]["labels"], task_data[task]["preds"])
                           if task_data[task]["labels"] else None)
                    for task in ["gluten", "lactose", "veg"]},
        "errors": errors,
        "total_samples": len(all_labels)
    }

## 5. BEFORE Fine-Tuning Evaluation (Baseline)

In [12]:
eval_results = evaluate_on_chat_test_set(
    model,
    tokenizer,
    test_file="/content/drive/MyDrive/test.jsonl",
    max_new_tokens=10
)

EVALUATING MODEL ON CHAT-FORMATTED TEST SET
Loaded 2031 chat-formatted test samples.


Evaluating:   0%|          | 0/2031 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



OVERALL RESULTS
Overall Accuracy: 0.0965 (9.65%)
Total evaluated samples: 2031

PER-TASK ACCURACY
    Gluten: 0.0739 (7.39%) (677 samples)
   Lactose: 0.0414 (4.14%) (677 samples)
       Veg: 0.1743 (17.43%) (677 samples)

FULL CLASSIFICATION REPORT
                  precision    recall  f1-score   support

 contains-gluten       0.00      0.00      0.00        96
contains-lactose       0.00      0.00      0.00       153
     gluten-free       1.00      0.09      0.16       581
    lactose-free       1.00      0.05      0.10       524
         non-veg       0.38      0.03      0.06       100
             veg       0.88      0.20      0.32       577

       micro avg       0.90      0.10      0.17      2031
       macro avg       0.54      0.06      0.11      2031
    weighted avg       0.81      0.10      0.17      2031



## 6. Configure LoRA and Train

In [13]:
# Configure LoRA
print("Configuring LoRA...")

lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=config.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("✓ LoRA configured")
model.print_trainable_parameters()

Configuring LoRA...
✓ LoRA configured
trainable params: 7,372,800 || all params: 3,093,311,488 || trainable%: 0.2383


In [14]:
# Training arguments
training_args = TrainingArguments(
    output_dir=config.output_dir,
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    warmup_ratio=0.1,
    fp16=True,
    optim="paged_adamw_32bit" if config.use_qlora else "adamw_torch",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    seed=config.seed,
    remove_unused_columns=False,
    gradient_checkpointing=False,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

print("Trainer configured")

Trainer configured


In [15]:
print("\n" + "="*70)
print("STARTING TRAINING")
print("="*70 + "\n")

trainer.train()

print("\n" + "="*70)
print("TRAINING COMPLETED!")
print("="*70)


STARTING TRAINING



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,1.6665,1.357787
100,1.0791,1.000412
150,0.9895,0.953245
200,0.9844,0.930646
250,0.9064,0.91313
300,0.8809,0.901314
350,0.838,0.88954
400,0.8479,0.880631
450,0.838,0.872763
500,0.7859,0.868693



TRAINING COMPLETED!


## 7. AFTER Fine-Tuning Evaluation

In [16]:
final_results = evaluate_on_chat_test_set(
    model, tokenizer, config.test_file, max_new_tokens=10
)

EVALUATING MODEL ON CHAT-FORMATTED TEST SET
Loaded 2031 chat-formatted test samples.


Evaluating:   0%|          | 0/2031 [00:00<?, ?it/s]


OVERALL RESULTS
Overall Accuracy: 0.9685 (96.85%)
Total evaluated samples: 2031

PER-TASK ACCURACY
    Gluten: 0.9823 (98.23%) (677 samples)
   Lactose: 0.9601 (96.01%) (677 samples)
       Veg: 0.9631 (96.31%) (677 samples)

FULL CLASSIFICATION REPORT
                  precision    recall  f1-score   support

 contains-gluten       0.92      0.96      0.94        96
contains-lactose       0.95      0.87      0.91       153
     gluten-free       0.99      0.99      0.99       581
    lactose-free       0.96      0.99      0.97       524
         non-veg       0.90      0.84      0.87       100
             veg       0.97      0.98      0.98       577

        accuracy                           0.97      2031
       macro avg       0.95      0.94      0.94      2031
    weighted avg       0.97      0.97      0.97      2031



## 8. Compare Results

In [17]:
print("\n" + "="*70)
print("BEFORE vs AFTER COMPARISON")
print("="*70)
print(f"\nBaseline (before):  {eval_results['overall_accuracy']:.4f} accuracy")
print(f"Fine-tuned (after): {final_results['overall_accuracy']:.4f} accuracy")
print(f"\nImprovement: {(final_results['overall_accuracy'] - eval_results['overall_accuracy']) * 100:+.2f}%")
print("\n" + "="*70)


BEFORE vs AFTER COMPARISON

Baseline (before):  0.0965 accuracy
Fine-tuned (after): 0.9685 accuracy

Improvement: +87.20%



In [18]:
from google.colab import drive
import shutil


drive.mount('/content/drive')

zip_name = 'food_classifier_model'
print("Creating ZIP archive...")
shutil.make_archive(zip_name, 'zip', './qwen-food-qlora')

shutil.move(f'{zip_name}.zip', '/content/drive/MyDrive/food_classifier_model_best_full_dset.zip')

print("ZIP saved to Google Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Creating ZIP archive...
ZIP saved to Google Drive
