======================================================<br>
0. IMPORTS<br>
======================================================

In [None]:
import os
import random
import time
import itertools
from datetime import datetime
import numpy as np
import pandas as pd
import ast
from PIL import Image
from tqdm.auto import tqdm

PyTorch & Transformers

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification, ViTImageProcessor
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

Scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

======================================================<br>
1. CONFIGURATION & CONSTANTS<br>
======================================================

In [None]:
SEED = 42  # Base seed shared by sampling, splitting and the per-run seeds

# Seed every random number generator this notebook touches with the same value:
# PyTorch (CPU and all CUDA devices), NumPy and Python's `random` module.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

Paths (Adjust as needed for your environment)

In [None]:
# Root folder of the input dataset; the image tree and the metadata JSON both live inside it.
DATA_ROOT = "/kaggle/input/nutrition14"
IMAGES_DIR = os.path.join(DATA_ROOT, "data")  # walked later to collect the image files that really exist
JSON_PATH = os.path.join(DATA_ROOT, "filtered_data.json")  # metadata file loaded with pd.read_json

Output for checkpoints

In [None]:
# Folder that receives this notebook's outputs.
OUTPUT_DIR = "/kaggle/working/nutriscan_pytorch"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok=True makes re-running this cell harmless
OUTPUT_CSV_PATH = os.path.join(OUTPUT_DIR, "experiment_raw_data_M2.csv")  # rewritten after every run of the experiment loop

Experiment Input (For resuming previous runs)<br>
Set to None if this is the first run. Set to path if resuming.

In [None]:
# Results CSV from a previous session. When this path is set and exists, its rows are
# loaded before the experiment loop so finished runs can be skipped; otherwise the
# experiment starts from an empty result list.
INPUT_CSV_PATH = "/kaggle/input/results/experiment_raw_data_M2.csv"
# INPUT_CSV_PATH = None  # use this line instead for a first (non-resumed) run

Hyperparameters

In [None]:
IMAGE_SIZE = 224     # side length (pixels) every image is resized to by the transforms
EPOCHS = 8           # maximum number of epochs per run (early stopping may end a run sooner)
WEIGHT_DECAY = 0.01  # weight decay passed to AdamW
PATIENCE = 3         # early stopping: consecutive epochs without a new best validation loss
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when PyTorch can see one

In [None]:
# Report the selected device so a CPU fallback is noticed before the long experiment starts.
print(f"Device detected: {DEVICE}")

======================================================<br>
2. DATA PROCESSING & MAPPING<br>
======================================================

In [None]:
# Load the dataset metadata; 'total_mass' is forced to float64 instead of leaving
# its dtype to pandas' inference.
print("Loading JSON:", JSON_PATH)
df = pd.read_json(JSON_PATH, dtype={'total_mass': 'float64'})

Ensure label column is a list

In [None]:
def _parse_label(value):
    """Return `value` parsed as a Python literal when it is a string, else unchanged.

    Parsing is best effort: a string that is not a valid literal is returned
    as-is, so a single malformed entry cannot prevent the other rows from being
    converted. Values that are not strings (for example labels that pandas
    already loaded as lists) are passed through untouched.
    """
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval documents for malformed input.
        return value


# Convert row by row. Applying ast.literal_eval to the whole column inside one
# try/except discards every conversion as soon as one element fails.
df['label'] = df['label'].apply(_parse_label)

Filtered Semantic Map (Only the 14 requested classes)

In [None]:
# Generic class -> keywords that select it.
# map_to_generic_class walks this dict in insertion order and returns the first
# class having a keyword that occurs as a SUBSTRING of a lower-cased ingredient,
# so the order of the entries decides ties (a dish with both chicken and rice
# becomes 'CHICKEN'), and short keywords also match inside longer words.
SEMANTIC_MAP = {
    'CHICKEN': ['chicken'],
    'BERRIES': ['berries', 'strawberry'],
    'HIGH_SUGAR_FRUITS': ['fruit', 'apple', 'banana', 'orange'],
    'COOKING_VEGS': ['cucumbers', 'broccoli', 'carrot', 'asparagus'],
    'OATMEAL/CEREALS': ['cereal', 'oatmeal'],
    'PORK': ['pork', 'bacon', 'ham', 'sausage'],
    'BEEF': ['beef', 'steak', 'veal', 'meatball'],
    'PIZZA': ['pizza'],
    'POTATOES': ['potato', 'potatoes', 'fries', 'french fries', 'mashed potatoes'],
    'LEAFY_GREENS': ['salad', 'vegetable', 'lettuce', 'spinach'],
    'EGGS': ['eggs', 'egg', 'omelette', 'scrambled eggs', 'fried egg'],
    'LEGUMES': ['beans', 'lentils', 'chickpeas', 'legume'],
    'RICE': ['rice', 'risotto'],
    'FISH': ['fish', 'salmon', 'tuna', 'cod', 'tilapia']
}

In [None]:
def map_to_generic_class(ingredient_list, semantic_map=None):
    """Map a list of ingredient names to one generic class, or 'OTHER'.

    Classes are tried in the iteration order of the mapping; the first class
    with a keyword that occurs as a substring of any lower-cased ingredient
    wins. Matching is substring based, not whole-word based.

    Args:
        ingredient_list: List of ingredient names. Anything that is not a list
            yields 'OTHER'. Elements that are not strings are ignored instead
            of raising on `.lower()`.
        semantic_map: Optional dict {generic_class: [keyword, ...]}. Defaults
            to the module-level SEMANTIC_MAP.

    Returns:
        The name of the first matching generic class, or 'OTHER' if none matches.
    """
    if not isinstance(ingredient_list, list):
        return 'OTHER'
    if semantic_map is None:
        semantic_map = SEMANTIC_MAP

    # Lower-case once up front instead of once per keyword comparison.
    names = [item.lower() for item in ingredient_list if isinstance(item, str)]

    for generic_class, keywords in semantic_map.items():
        for keyword in keywords:
            if any(keyword in name for name in names):
                return generic_class
    return 'OTHER'

In [None]:
# Assign every row its generic class (or 'OTHER') from its ingredient list.
df['generic_class'] = df['label'].apply(map_to_generic_class)

--- Balancing Strategy ---<br>
1. Filter classes with enough samples

In [None]:
MIN_IMAGES_PER_CLASS = 150  # classes with fewer rows than this are dropped
class_counts = df['generic_class'].value_counts()  # rows per generic class, 'OTHER' included
# Names of the classes that reach the threshold.
class_list = class_counts[class_counts >= MIN_IMAGES_PER_CLASS].index.tolist()

In [None]:
# 'OTHER' is the catch-all bucket, not a target class: leave it out even when it
# is large enough to pass the threshold.
class_list = [name for name in class_list if name != 'OTHER']

Keep the per-class counts of the retained classes (the dataframe rows themselves are selected per class in the undersampling step)

In [None]:
# Row counts restricted to the retained classes, in class_list order.
filtered_counts = class_counts.loc[class_list]

2. Undersample to the size of the smallest class

In [None]:
# Every retained class is cut down to the size of the smallest one.
# NOTE(review): if no class passed the threshold, min() of the empty selection is NaN
# and int() raises ValueError here.
SAMPLES_PER_CLASS = int(filtered_counts.min()) 

In [None]:
# One equally sized, seeded random sample per retained class.
df_list = [
    df[df['generic_class'] == class_name].sample(SAMPLES_PER_CLASS, random_state=SEED)
    for class_name in class_list
]

In [None]:
# Stack the per-class samples, shuffle all rows (frac=1, seeded) and renumber the index.
df_curato = pd.concat(df_list).sample(frac=1, random_state=SEED).reset_index(drop=True)

Encode labels

In [None]:
# Build the categorical view once and derive both the integer codes and the
# code -> class-name lookup from it.
generic_class_cat = df_curato['generic_class'].astype('category')
df_curato['label_code'] = generic_class_cat.cat.codes
class_map = {code: name for code, name in enumerate(generic_class_cat.cat.categories)}
num_classes = len(class_map)

In [None]:
# Summary of the balanced dataset, one line per fact.
dataset_summary = [
    "--- Dataset Prepared ---",
    f"Total classes: {num_classes} (Target: 14)",
    f"Total images: {len(df_curato)}",
    f"Samples per class: {SAMPLES_PER_CLASS}",
    f"Classes: {list(class_map.values())}",
]
for summary_line in dataset_summary:
    print(summary_line)

Split train/val/test (70/15/15)

In [None]:
# Hold out 30% of the rows, then halve that hold-out into validation and test
# (15% each). Both splits are stratified on label_code and seeded.
train_df, temp_df = train_test_split(df_curato, test_size=0.30, stratify=df_curato['label_code'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label_code'], random_state=SEED)

--- Image Path Validation ---<br>
Fixes broken links common in Kaggle datasets

In [None]:
print("Validating image paths...")

# Paths (relative to IMAGES_DIR) of every .png/.jpg/.jpeg file found anywhere
# below the image folder; the extension check ignores case.
all_valid_rel_paths = {
    os.path.relpath(os.path.join(folder, file_name), IMAGES_DIR)
    for folder, _subfolders, file_names in os.walk(IMAGES_DIR)
    for file_name in file_names
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg'))
}

In [None]:
def fix_links(dataframe, valid_paths=None):
    """Normalise the 'image_link' column and drop rows whose image is missing.

    A leading './data/', 'data/' or './' is stripped from each link (only the
    first matching prefix). The cleaned link is kept when it is a member of
    `valid_paths`; otherwise the row is removed.

    The input frame is NOT modified: the work happens on a copy, which also
    avoids assigning columns to a slice of another frame.

    Args:
        dataframe: Frame with an 'image_link' column.
        valid_paths: Optional container of acceptable relative paths. Defaults
            to the module-level `all_valid_rel_paths`.

    Returns:
        A new frame with a fresh 0..n-1 index containing only rows with a valid
        link. 'image_link' holds the cleaned path and 'image_link_cleaned' is
        kept alongside it.
    """
    if valid_paths is None:
        valid_paths = all_valid_rel_paths

    def clean_path(path_str):
        """Strip the first matching known prefix; non-strings become None."""
        if not isinstance(path_str, str):
            return None
        # Longest prefix first so './data/' is not shortened to 'data/...' by './'.
        for prefix in ('./data/', 'data/', './'):
            if path_str.startswith(prefix):
                return path_str[len(prefix):]
        return path_str

    result = dataframe.copy()
    cleaned = [clean_path(link) for link in result['image_link']]
    result['image_link_cleaned'] = cleaned
    result['image_link'] = [
        path if path and path in valid_paths else None
        for path in cleaned
    ]
    return result[result['image_link'].notnull()].reset_index(drop=True)

In [None]:
# Clean the links of each split and drop rows whose image file was not found.
# This happens after the split, so the split sizes can shrink here.
train_df = fix_links(train_df)
val_df   = fix_links(val_df)
test_df  = fix_links(test_df)

In [None]:
# Training rows that remain after rows with missing images were dropped.
print(f"Final Train size: {len(train_df)}")

======================================================<br>
3. DATASET CLASS & TRANSFORMS<br>
======================================================

In [None]:
# Image processor of the "nateraw/food" checkpoint. In the cells shown here only its
# image_mean / image_std are used (for Normalize); resizing is done by torchvision.
processor = ViTImageProcessor.from_pretrained("nateraw/food")

In [None]:
# Training pipeline: resize, light augmentation (random horizontal flip, rotation within
# 15 degrees, mild colour jitter), then tensor conversion and normalisation with
# the processor's mean/std.
train_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

In [None]:
# Evaluation pipeline: same resize and normalisation as training, without any
# random augmentation.
val_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

In [None]:
class NutritionDataset(Dataset):
    """Dataset yielding (image, label) pairs described by a dataframe.

    Each row supplies 'image_link' (path relative to `directory`) and
    'label_code' (the integer class label).
    """

    def __init__(self, df, directory, transform):
        """Keep a re-indexed copy of `df`, the image root folder and the transform."""
        self.transform = transform
        self.dir = directory
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Number of rows, i.e. number of samples."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load sample `idx`: the RGB image (transformed when a transform is set) and its int label."""
        record = self.df.iloc[idx]
        image = Image.open(os.path.join(self.dir, record['image_link'])).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, int(record['label_code'])

In [None]:
# Only the training split gets the augmenting transform; validation and test use
# the deterministic one.
train_ds = NutritionDataset(train_df, directory=IMAGES_DIR, transform=train_transform)
val_ds   = NutritionDataset(val_df, directory=IMAGES_DIR, transform=val_transform)
test_ds  = NutritionDataset(test_df, directory=IMAGES_DIR, transform=val_transform)

In [None]:
# Bundle passed to run_experiment, which unpacks it as (train, val, test).
datasets = (train_ds, val_ds, test_ds)

======================================================<br>
4. TRAINING FUNCTION<br>
======================================================

In [None]:
def set_seed(seed_value):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed_value`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

In [None]:
def run_experiment(lr, batch_size, run_seed, datasets, num_classes):
    """
    Fine-tune a freshly loaded "nateraw/food" ViT classifier for one setting.

    Args:
        lr: Learning rate passed to AdamW.
        batch_size: Batch size of both the training and the validation loader.
        run_seed: Seed applied with set_seed() before loaders and model are built.
        datasets: Tuple (train_ds, val_ds, test_ds); the test split is not used here.
        num_classes: Number of labels of the classification head.

    Returns:
        dict with
            'last_validation_accuracy': validation accuracy of the LAST epoch
                that ran (final state, not the best epoch), and
            'training_time_sec': wall-clock seconds from entering this function
                (setup and model loading included) to the end of training.

    Raises:
        ValueError: if the training or validation dataset is empty. This is
            checked up front instead of failing with a division by zero after
            a full training epoch.
    """
    start_time = time.time()

    # 1. Setup
    print(f"--- Starting Run: LR={lr}, BS={batch_size}, Seed={run_seed} ---")
    set_seed(run_seed)

    train_ds, val_ds, _ = datasets
    if len(train_ds) == 0 or len(val_ds) == 0:
        raise ValueError(
            f"Cannot run experiment on empty data: "
            f"{len(train_ds)} training and {len(val_ds)} validation samples."
        )

    # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
    use_pinned_memory = DEVICE == "cuda"
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=use_pinned_memory)
    # ignore_mismatched_sizes lets the checkpoint load although its head size
    # differs from num_classes.
    model = ViTForImageClassification.from_pretrained(
        "nateraw/food",
        num_labels=num_classes,
        ignore_mismatched_sizes=True
    )
    model.to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()
    # Halve the learning rate after 2 epochs without validation-loss improvement.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    # 2. Training Loop
    last_val_acc = 0.0  # accuracy of the most recent epoch; this is what gets reported
    best_val_loss = float('inf')  # only used for early stopping patience
    patience_counter = 0
    for epoch in range(EPOCHS):
        # Train
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False):
            images, labels = images.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(pixel_values=images)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean of the per-batch losses.
        train_loss = running_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                outputs = model(pixel_values=images)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()
                preds = outputs.logits.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        val_loss = val_loss / len(val_loader)
        last_val_acc = correct / total  # update with current epoch accuracy
        print(f"Epoch {epoch+1} — Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {last_val_acc:.4f}")

        scheduler.step(val_loss)

        # Early Stopping Check
        # Note: patience is tracked on the validation LOSS, while the value
        # reported to the caller is the LAST epoch's accuracy.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    training_duration_sec = time.time() - start_time
    print(f"--- Run Finished: Final (Last) Validation Accuracy = {last_val_acc:.4f} ---")
    return {
        'last_validation_accuracy': last_val_acc,
        'training_time_sec': training_duration_sec
    }

======================================================<br>
5. MASTER EXPERIMENT LOOP<br>
======================================================

In [None]:
# Console banner marking where the experiment output begins.
print("\n========================================")
print("STARTING STATISTICAL EXPERIMENT")
print("========================================")

In [None]:
# Experimental design: every learning rate is crossed with every batch size
# (3 x 3 = 9 combinations) and each combination is repeated NUM_REPETITIONS times.
factors = {
    'lr': [1e-5, 5e-5, 1e-4],
    'batch_size': [16, 32, 64]
}
NUM_REPETITIONS = 15
RESPONSE_VAR_NAME = 'last_validation_accuracy'  # name of the response column in the results

In [None]:
# All (lr, batch_size) pairs; the position of a pair in this list is part of its run seed.
combinations = list(itertools.product(factors['lr'], factors['batch_size']))
total_runs = len(combinations) * NUM_REPETITIONS  # used for progress messages

Load existing results if available

In [None]:
# Results gathered so far, one dict per run. Starts empty unless a CSV from a
# previous session can be read.
all_results = []
try:
    has_previous_results = bool(INPUT_CSV_PATH) and os.path.exists(INPUT_CSV_PATH)
    if not has_previous_results:
        print("Starting new experiment (no valid input file found).")
    else:
        previous_results_df = pd.read_csv(INPUT_CSV_PATH)
        all_results = previous_results_df.to_dict('records')
        print(f"Loaded {len(all_results)} existing results.")
except Exception as e:
    # Best effort: an unreadable file must not block the experiment.
    print(f"Error loading CSV: {e}. Starting fresh.")

In [None]:
run_counter = 0  # counts every planned run, skipped ones included, for progress messages
experiment_start_time = datetime.now()  # reference point for the total-time report

In [None]:
def _matches_run(record, repetition, lr, batch_size):
    """Return True when `record` belongs to the run identified by the three factors."""
    return (record['repetition'] == repetition and
            record['lr'] == lr and
            record['batch_size'] == batch_size)


for rep in range(NUM_REPETITIONS):
    print(f"\n====== REPETITION {rep + 1}/{NUM_REPETITIONS} ======\n")
    repetition_seed = SEED + rep

    # combo_index is the position of the pair in `combinations`; it feeds the run seed.
    for combo_index, (lr_val, bs_val) in enumerate(combinations):
        run_counter += 1

        # Check if run is already done.
        # A failed run is stored with a None response, which comes back as NaN
        # after a CSV round trip. pd.notna rejects both, so failed runs are
        # retried on resume instead of being mistaken for completed ones.
        run_done = any(
            _matches_run(res, rep + 1, lr_val, bs_val)
            and pd.notna(res.get(RESPONSE_VAR_NAME))
            for res in all_results
        )

        if run_done:
            print(f"--- [SKIP] Run {run_counter}/{total_runs} already completed.")
            continue

        # Execute Run
        print(f"--- Running {run_counter}/{total_runs} (Rep {rep+1}, LR={lr_val}, BS={bs_val}) ---")
        # Distinct, reproducible seed per (repetition, combination).
        run_seed = repetition_seed * 1000 + combo_index
        try:
            metrics = run_experiment(
                lr=lr_val,
                batch_size=bs_val,
                run_seed=run_seed,
                datasets=datasets,
                num_classes=num_classes
            )

            result_row = {
                'repetition': rep + 1,
                'lr': lr_val,
                'batch_size': bs_val,
                RESPONSE_VAR_NAME: metrics['last_validation_accuracy'],
                'training_time_sec': metrics['training_time_sec']
            }

        except Exception as e:
            # Record the failure and keep going so one bad run does not end the
            # whole experiment.
            print(f"!!!!!! CRITICAL ERROR in run {run_counter}: {e} !!!!!!")
            torch.cuda.empty_cache()
            result_row = {
                'repetition': rep + 1,
                'lr': lr_val,
                'batch_size': bs_val,
                RESPONSE_VAR_NAME: None,
                'training_time_sec': None
            }

        # Any record still present for this run is an earlier failure (a success
        # would have been skipped above); replace it rather than duplicate it.
        all_results = [
            res for res in all_results
            if not _matches_run(res, rep + 1, lr_val, bs_val)
        ]
        all_results.append(result_row)

        # Save incrementally
        try:
            pd.DataFrame(all_results).to_csv(OUTPUT_CSV_PATH, index=False)
        except Exception as e:
            print(f"Error saving CSV: {e}")

In [None]:
# Closing report: elapsed wall-clock time and where the results were written.
closing_report = (
    "\n========================================",
    "====== EXPERIMENT COMPLETED ======",
    f"Total time: {datetime.now() - experiment_start_time}",
    f"Results saved to: {OUTPUT_CSV_PATH}",
    "========================================",
)
for report_line in closing_report:
    print(report_line)