In [20]:
!pip install -q transformers datasets torch torchvision scikit-learn pillow

import torch
from transformers import (
    ViTForImageClassification, 
    ViTImageProcessor,
    Trainer, 
    TrainingArguments
)
import numpy as np
import os
import warnings
from PIL import Image
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset as TorchDataset
from torchvision import transforms

# Silence library warnings so the training logs stay readable.
warnings.filterwarnings('ignore')

# Resolve the accelerator once; later cells reuse `device`.
has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"

print("SETUP COMPLETE")
print(f"Device: {device}")
if has_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_bytes / 1e9:.2f} GB")

# Working directories for checkpoints, artefacts and logs.
for work_dir in ('./models', './outputs', './logs'):
    os.makedirs(work_dir, exist_ok=True)

print("All libraries imported!")

SETUP COMPLETE
Device: cuda
GPU: Tesla T4
Memory: 15.83 GB
All libraries imported!


In [21]:
# Shared registry of per-model metadata/metrics that later cells extend.
metrics_store = {}

# Seed for the subsampling step. The splits below already use a fixed
# random_state, but without a seeded (and deterministically ordered) sample
# the subset they operate on would differ on every run.
SEED = 42
SAMPLE_RATIO = 0.6

print("Loading Food-41...")

data_path = '/kaggle/input/food41/images'

print(f"Dataset path: {data_path}")

image_paths = []
labels = []

# Every sub-directory of data_path is treated as one class; sorting the
# folder names fixes the label -> index assignment.
class_folders = sorted(
    f for f in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, f))
)

print(f"Total classes: {len(class_folders)}")
print(f"First 5 classes: {class_folders[:5]}")

label_to_idx = {label: idx for idx, label in enumerate(class_folders)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

for label_name in class_folders:
    class_path = os.path.join(data_path, label_name)
    # os.listdir returns entries in arbitrary order; sort so that the seeded
    # sample below selects the same files on every run.
    image_files = sorted(
        f for f in os.listdir(class_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    for img_name in image_files:
        image_paths.append(os.path.join(class_path, img_name))
        labels.append(label_to_idx[label_name])

    # Show progress for first few classes
    if len(class_folders) <= 10 or label_name in class_folders[:3]:
        print(f"  {label_name}: {len(image_files)} images")

print(f"\nâœ“ Total images collected: {len(image_paths):,}")

# Draw a reproducible random subset (without replacement) of the collected
# images to keep training time manageable.
sample_size = int(len(image_paths) * SAMPLE_RATIO)
rng = np.random.default_rng(SEED)
indices = rng.choice(len(image_paths), size=sample_size, replace=False)

image_paths = [image_paths[i] for i in indices]
labels = [labels[i] for i in indices]

print(f"Using {len(image_paths):,} images ({SAMPLE_RATIO*100:.0f}% sample)")

# Split dataset: 70% train, then the remaining 30% is halved into val/test.
# Both splits are stratified so class proportions are preserved.
train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    image_paths, labels, test_size=0.3, random_state=42, stratify=labels
)

val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

print(f"Train: {len(train_paths):,} | Val: {len(val_paths):,} | Test: {len(test_paths):,}")

metrics_store['vit'] = {
    'label_to_idx': label_to_idx,
    'idx_to_label': idx_to_label,
    'num_classes': len(class_folders)
}

print("Dataset loaded")

Loading Food-41...
Dataset path: /kaggle/input/food41/images
Total classes: 101
First 5 classes: ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare']
  apple_pie: 1000 images
  baby_back_ribs: 1000 images
  baklava: 1000 images

âœ“ Total images collected: 101,000
Using 60,600 images (60% sample)
Train: 42,420 | Val: 9,090 | Test: 9,090
Dataset loaded


In [None]:
class FoodDataset(TorchDataset):
    """Map-style dataset that loads food images from disk for a ViT model.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of integer class ids, aligned with ``image_paths``.
        processor: Callable invoked as ``processor(image, return_tensors='pt')``
            that returns a mapping containing ``'pixel_values'``.
        augment: When True, apply random crop/flip/colour-jitter/rotation
            before the processor; otherwise only resize to 224x224.

    Each item is a dict with ``'pixel_values'`` (the processor output with its
    leading batch axis removed) and ``'labels'`` (an int64 scalar tensor).
    """

    def __init__(self, image_paths, labels, processor, augment=False):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor
        self.augment = augment

        if augment:
            self.transform = transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(0.2, 0.2, 0.2),
                transforms.RandomRotation(10),
            ])
        else:
            self.transform = transforms.Resize((224, 224))

    def __len__(self):
        return len(self.image_paths)

    def _load_image(self, path):
        """Return the image at ``path`` as RGB, or a blank image if unreadable."""
        try:
            # The context manager closes the underlying file handle; convert()
            # returns a fully loaded copy that stays valid afterwards.
            with Image.open(path) as img:
                return img.convert('RGB')
        except (OSError, ValueError) as exc:
            # Best-effort fallback kept from the original behaviour, but only
            # for I/O and decode failures, and no longer silent: a blank
            # sample still carries its real label, so the user should know.
            print(f"Warning: could not read {path} ({exc}); using a blank image")
            return Image.new('RGB', (224, 224), 'white')

    def __getitem__(self, idx):
        image = self.transform(self._load_image(self.image_paths[idx]))
        encoding = self.processor(image, return_tensors='pt')

        return {
            # squeeze(0) drops only the batch axis added by the processor;
            # a bare squeeze() would also drop any other size-1 dimension.
            'pixel_values': encoding['pixel_values'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

print("ViT...")

# Single source of truth for the pretrained checkpoint name.
VIT_CHECKPOINT = 'google/vit-base-patch16-224'

vit_processor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)

# The classification head is re-sized to the number of food classes, hence
# ignore_mismatched_sizes.
vit_model = ViTForImageClassification.from_pretrained(
    VIT_CHECKPOINT,
    num_labels=len(class_folders),
    ignore_mismatched_sizes=True,
)

# Store the class-name mappings in the model config.
vit_model.config.id2label = idx_to_label
vit_model.config.label2id = label_to_idx

param_count_m = vit_model.num_parameters() / 1e6
print(f"âœ“ ViT loaded ({param_count_m:.1f}M params)")

# Create datasets: augmentation is enabled for the training split only.
vit_train_dataset, vit_val_dataset, vit_test_dataset = (
    FoodDataset(split_paths, split_labels, vit_processor, augment=is_train)
    for split_paths, split_labels, is_train in (
        (train_paths, train_labels, True),
        (val_paths, val_labels, False),
        (test_paths, test_labels, False),
    )
)

print("Datasets created")

In [None]:
def compute_metrics(eval_pred):
    """Compute top-1 accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores with classes along axis 1.

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.asarray(scores).argmax(axis=1)
    accuracy = accuracy_score(references, predicted_classes)
    return {'accuracy': accuracy}

vit_training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir='./models/vit-food',
    save_steps=500,
    save_total_limit=1,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=200,
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluation and best-model selection (by validation accuracy)
    eval_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Logging
    logging_steps=100,
    report_to="none",
    # The dataset yields ready-made dicts, so keep all columns
    remove_unused_columns=False,
)

print(f"Epochs: {vit_training_args.num_train_epochs}")
print(f"Batch size: {vit_training_args.per_device_train_batch_size}")

vit_trainer = Trainer(
    model=vit_model,
    args=vit_training_args,
    train_dataset=vit_train_dataset,
    eval_dataset=vit_val_dataset,
    compute_metrics=compute_metrics,
)

print("\nðŸš€ Training started...\n")
train_result = vit_trainer.train()
train_metrics = train_result.metrics

print("\nâœ“ Training complete!")
print(f"Training time: {train_metrics['train_runtime']:.2f}s")

# Save the model and its processor side by side so the folder is loadable
# on its own.
final_model_dir = './models/vit-food-final'
vit_trainer.save_model(final_model_dir)
vit_processor.save_pretrained(final_model_dir)

# Store metrics
metrics_store['vit'].update(
    train_loss=train_metrics['train_loss'],
    train_time=train_metrics['train_runtime'],
)

print("Model saved")


In [None]:
import shutil
import os

# Fine-tuned model directory and the archive it is packed into.
model_folder = "./models/vit-food-final"
zip_filename = "vit_food_final.zip"

if not os.path.exists(model_folder):
    # Nothing to archive: the training cell has not produced the folder.
    print("Model folder not found! Check the path and try again.")
else:
    # make_archive appends the ".zip" extension itself, so pass the bare name.
    archive_base = zip_filename.replace(".zip", "")
    shutil.make_archive(archive_base, 'zip', model_folder)
    print(f"Zipped model saved as {zip_filename}")


In [None]:
from IPython.display import FileLink
# Display a link to the zipped model; the path is relative to the working directory.
FileLink(r'vit_food_final.zip')


In [None]:
!pip install gradio gdown 


In [23]:
# Sanity check: show the first ten keys of the model's id2label mapping.
print(list(vit_model.config.id2label.keys())[:10])


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [5]:
import gradio as gr
from PIL import Image
import torch
import gdown
import zipfile
import os
from transformers import ViTImageProcessor, ViTForImageClassification


# Download fine-tuned model

drive_url = "https://drive.google.com/uc?id=11sQr1xMlFhgQmE6eLaEH-yTLKfTReBZn"
zip_path = "vit_food_final.zip"
model_dir = "./vit_food_final"

# A bare directory check is not enough: an interrupted extraction leaves
# model_dir behind without a loadable model. Look for the config file that
# from_pretrained reads instead.
model_ready = os.path.isfile(os.path.join(model_dir, "config.json"))

if not model_ready:
    print("Downloading fine-tuned model from Google Drive...")
    downloaded = gdown.download(drive_url, zip_path, quiet=False)
    if downloaded is None or not os.path.isfile(zip_path):
        # Fail here with a clear message rather than with a confusing
        # zipfile error a few lines further down.
        raise RuntimeError(
            f"Could not download the model archive from {drive_url}"
        )

    print("Extracting model...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(model_dir)
    print("Extracted to:", model_dir)
else:
    print("Model folder already exists, skipping download.")

# Load model and processor

device = "cuda" if torch.cuda.is_available() else "cpu"

vit_processor = ViTImageProcessor.from_pretrained(model_dir)
vit_model = ViTForImageClassification.from_pretrained(model_dir).to(device)

# Automatically pulled from config.json
id2label = vit_model.config.id2label
label2id = vit_model.config.label2id

print(f"âœ“ Loaded ViT model with {len(id2label)} labels")


# Prediction Function
def predict_food(image):
    """Predict food class from uploaded image.

    Args:
        image: A PIL image, or an array accepted by ``Image.fromarray``.

    Returns:
        dict mapping a human-readable class name to its probability rounded
        to four decimals, for the five most likely classes (fewer if the
        model has fewer classes), in descending order of probability.
        On any failure, ``{"Error": <message>}`` is returned instead.
    """
    try:
        # Prepare image
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image = image.convert('RGB').resize((224, 224))

        # Prepare inputs
        inputs = vit_processor(image, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        vit_model.to(device)
        vit_model.eval()

        with torch.no_grad():
            outputs = vit_model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

        # Top-5 predictions, capped so topk cannot ask for more entries than
        # there are classes.
        top_k = min(5, probs.shape[-1])
        top_prob, top_idx = torch.topk(probs, top_k)
        results = {}

        # tolist() yields plain Python floats/ints, so rounding happens in
        # double precision and the result really has four decimals (rounding
        # a float32 first and converting afterwards re-introduces the error).
        for prob, idx in zip(top_prob.tolist(), top_idx.tolist()):
            # id2label may be keyed by str or by int depending on how the
            # config was loaded; try both before giving up.
            label_key = id2label.get(str(idx), id2label.get(idx, f"Unknown-{idx}"))

            # Clean and format label
            class_name = str(label_key).replace('_', ' ').title()
            results[class_name] = round(prob, 4)

        print("Predictions:", results)
        return results

    except Exception as e:
        print("Error:", e)
        return {"Error": str(e)}

# Gradio Interface
# Wires predict_food to an image-upload input and a label output that shows
# up to five classes.

demo = gr.Interface(
    fn=predict_food,
    # type="pil" hands predict_food a PIL image.
    inputs=gr.Image(type="pil", label="Upload Food Image"),
    outputs=gr.Label(num_top_classes=5, label="Top 5 Predictions"),
    title="Food-101 Classifier (Fine-Tuned ViT)",
    # The trailing double spaces inside the description are markdown line breaks.
    description="""
    Upload a food image to classify it into one of 101 food categories.  
    **Model:** Fine-tuned ViT (Base Patch-16 224)  
    **Dataset:** Food-101 | Trained using limited Kaggle GPU resources.
    """,
    # examples=examples,
    theme=gr.themes.Soft()
)

print("Gradio interface ready")
# share=True also exposes a public gradio.live URL next to the local one.
# NOTE(review): with debug=True the cell appears to block until interrupted — confirm.
demo.launch(share=True, debug=True)


Model folder already exists, skipping download.
âœ“ Loaded ViT model with 101 labels
Gradio interface ready
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f638edbbb276b5d011.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Predictions: {'Hamburger': 0.9922999739646912, 'Grilled Cheese Sandwich': 0.0010999999940395355, 'Pulled Pork Sandwich': 0.0008999999845400453, 'French Fries': 0.0005000000237487257, 'Falafel': 0.00039999998989515007}
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f638edbbb276b5d011.gradio.live




In [None]:
# # Cell: Gradio Interface
# import gradio as gr
# from PIL import Image
# import torch

# def predict_food(image):
#     """Predict food class from uploaded image"""
#     try:
#         # Prepare image
#         if not isinstance(image, Image.Image):
#             image = Image.fromarray(image)
#         image = image.convert('RGB').resize((224, 224))
        
#         # Get predictions
#         inputs = vit_processor(image, return_tensors="pt")
#         if torch.cuda.is_available():
#             inputs = {k: v.to(device) for k, v in inputs.items()}
#             vit_model.to(device)
        
#         vit_model.eval()
#         with torch.no_grad():
#             outputs = vit_model(**inputs)
#             probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
        
#         # Get top 5 predictions
#         top5_prob, top5_idx = torch.topk(probs, 5)
#         results = {}
#         for prob, idx in zip(top5_prob.cpu().numpy(), top5_idx.cpu().numpy()):
#             class_name = idx_to_label[int(idx)].replace('_', ' ').title()
#             results[class_name] = float(prob)
        
#         return results
#     except Exception as e:
#         return {"Error": str(e)}

# # Prepare examples
# examples = test_paths[:5] if len(test_paths) >= 5 else []

# # Create interface
# demo = gr.Interface(
#     fn=predict_food,
#     inputs=gr.Image(type="pil", label="Upload Food Image"),
#     outputs=gr.Label(num_top_classes=5, label="Top 5 Predictions"),
#     title="Food-41 Classifier",
#     description=f"""
#     Upload a food image to classify it into one of {len(class_folders)} categories.
    
#     **Model:** Vision Transformer (ViT) | **Parameters:** 85.9M
#     **Dataset:** {len(train_paths):,} training images | {len(val_paths):,} validation | {len(test_paths):,} test
#     """,
#     examples=examples if examples else None,
#     theme=gr.themes.Soft()
# )

# print("GRADIO INTERFACE READY")
# demo.launch(share=True, debug=True)