## Inverse Cooking: Recipe Generation from Food Images

In [None]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import os
from args import get_parser
import pickle
from model import get_model
from torchvision import transforms
from utils.output_utils import prepare_output
from PIL import Image
import time

Set ```data_dir``` to the directory that contains the vocabulary files and the model checkpoint.

In [None]:
# Base directory from which the notebook builds the paths to the vocabulary
# pickles, the model checkpoint and the demo image folder.
data_dir = '../data'

In [None]:
# CUDA is selected only when it is both available and requested through
# `use_gpu`; in every other case the notebook stays on the CPU.
use_gpu = False
if torch.cuda.is_available() and use_gpu:
    device = torch.device('cuda')
    map_loc = None  # handed to torch.load as `map_location`
else:
    device = torch.device('cpu')
    map_loc = 'cpu'

In [None]:
# code below was used to save vocab files so that they can be loaded without Vocabulary class
#ingrs_vocab = pickle.load(open(os.path.join(data_dir, 'final_recipe1m_vocab_ingrs.pkl'), 'rb'))
#ingrs_vocab = [min(w, key=len) if not isinstance(w, str) else w for w in ingrs_vocab.idx2word.values()]
#vocab = pickle.load(open(os.path.join(data_dir, 'final_recipe1m_vocab_toks.pkl'), 'rb')).idx2word
#pickle.dump(ingrs_vocab, open('../demo/ingr_vocab.pkl', 'wb'))
#pickle.dump(vocab, open('../demo/instr_vocab.pkl', 'wb'))

# NOTE(security): pickle.load can execute arbitrary code -- only load vocab
# files that come from a trusted source.
# The context managers close each file handle as soon as its pickle is read.
with open(os.path.join(data_dir, 'ingr_vocab.pkl'), 'rb') as ingr_vocab_file:
    ingrs_vocab = pickle.load(ingr_vocab_file)
with open(os.path.join(data_dir, 'instr_vocab.pkl'), 'rb') as instr_vocab_file:
    vocab = pickle.load(instr_vocab_file)

# Vocabulary sizes are passed to get_model when the network is built.
ingr_vocab_size = len(ingrs_vocab)
instrs_vocab_size = len(vocab)
output_dim = instrs_vocab_size

In [None]:
# Sanity check: show the instruction and ingredient vocabulary sizes.
print (instrs_vocab_size, ingr_vocab_size)

In [None]:
# Build the model, restore its trained weights and report the elapsed time.
t = time.time()

# Replace the process arguments with a single empty entry before get_parser()
# runs (presumably so the notebook kernel's own argv is not parsed -- TODO confirm).
import sys
sys.argv = ['']
del sys

args = get_parser()
args.maxseqlen = 15
args.ingrs_only = False
model = get_model(args, ingr_vocab_size, instrs_vocab_size)

# Restore the trained parameters from the checkpoint stored under data_dir.
model_path = os.path.join(data_dir, 'modelbest.ckpt')
model.load_state_dict(torch.load(model_path, map_location=map_loc))
model.to(device)
model.eval()

# Both flags are cleared on the model before sampling.
model.ingrs_only = False
model.recipe_only = False

print('loaded model')
print("Elapsed time:", time.time() - t)


In [None]:
# Tensor conversion followed by per-channel normalisation with the listed
# mean and std triples; applied to every image before it is fed to the model.
transf_list_batch = [
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]
to_input_transf = transforms.Compose(transf_list_batch)

In [None]:
# Decoding settings: entry i of `greedy` and `beam` is passed to
# model.sample for the i-th generated recipe; `temperature` is shared.
greedy = [True, False, False, False]
numgens = len(greedy)
beam = [-1] * numgens
temperature = 1.0

Set ```use_urls = True``` to get recipes for images in ```demo_urls```. 

You can also set ```use_urls = False``` and get recipes for the images stored in ```data_dir/demo_imgs```.

In [None]:
import requests
from io import BytesIO
import random
from collections import Counter
use_urls = False # set to true to load images from demo_urls instead of those in the demo_imgs folder
show_anyways = False #if True, it will show the recipe even if it's not valid
image_folder = os.path.join(data_dir, 'demo_imgs')

# Ver 4: Remove random to get a fix set of images
if not use_urls:
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # processing order the same on every run and every machine.
    demo_imgs = sorted(os.listdir(image_folder))
    # random.shuffle(demo_imgs)

demo_urls = ['https://food.fnr.sndimg.com/content/dam/images/food/fullset/2013/12/9/0/FNK_Cheesecake_s4x3.jpg.rend.hgtvcom.826.620.suffix/1387411272847.jpeg',
            'https://www.196flavors.com/wp-content/uploads/2014/10/california-roll-3-FP.jpg']

# Either the URL list or the local file names, depending on `use_urls`.
demo_files = demo_urls if use_urls else demo_imgs

In [None]:
import torch
from utils.metrics import softIoU
from utils.metrics import update_error_types, compute_metrics

# Function to convert label indices to one-hot encoded vectors
def label2onehot(labels, pad_value):
    """Collapse a batch of label-index sequences into multi-hot vectors.

    Args:
        labels: 2-D integer index tensor (batch, sequence length). It is used
            as the scatter index, so every value must lie in [0, pad_value].
        pad_value: largest index that can occur; the intermediate one-hot
            encoding has ``pad_value + 1`` classes.

    Returns:
        float32 tensor of shape (batch, pad_value - 1), located on the same
        device as ``labels``. Entry (b, c) is 1 when class ``c + 1`` occurs
        anywhere in row ``b`` of ``labels``; column 0 is always 0.
    """
    inp_ = torch.unsqueeze(labels, 2)
    # Allocate the buffer on the device of `labels`: scatter_ needs the index
    # and the destination on the same device, which a buffer created on an
    # unrelated global device does not guarantee.
    one_hot = torch.zeros(
        labels.size(0),
        labels.size(1),
        pad_value + 1,
        dtype=torch.float32,
        device=labels.device,
    )
    one_hot.scatter_(2, inp_, 1)
    # Max over the sequence axis: a class is set if any position selected it.
    one_hot, _ = one_hot.max(dim=1)
    # Drop the first and the last class columns.
    one_hot = one_hot[:, 1:-1]
    # NOTE(review): after the slice above, column 0 corresponds to original
    # class index 1, which is therefore always cleared -- confirm this is the
    # intended class to mask.
    one_hot[:, 0] = 0
    return one_hot

In [None]:
# Load evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
import pickle

# Load ground truth data
# The path is built from data_dir so that changing data_dir relocates this
# file together with the vocabularies and the checkpoint.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that come from a trusted source.
with open(os.path.join(data_dir, "recipe1m_test.pkl"), "rb") as f:
    test_data = pickle.load(f)

In [None]:
# Index the ground-truth samples by each of their image file names; when a
# file name occurs in several samples, the last sample wins.
imgfile_to_gt = {
    img_filename: sample
    for sample in test_data
    for img_filename in sample["images"]
}

In [None]:
# Fresh accumulators for the evaluation run: generated / reference instruction
# strings, per-generation ingredient IoU values, and zeroed error counters.
predicted_list, ground_truth_list, all_ious = [], [], []
error_types = dict.fromkeys(
    ("tp_i", "fp_i", "fn_i", "tn_i", "tp_all", "fp_all", "fn_all"),
    0,
)

In [None]:
import json

# Collect ground truth and predictions
gt_json_list = []
pred_json_list = []

# The resize / crop pipeline is the same for every image, so build it once.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
])

# Size of the ingredient vocabulary; its last index is used as the pad value
# when the index lists are turned into one-hot vectors.
ingr_vocab_len = len(ingrs_vocab)

# Load and preprocess each image, generate recipes and score them
for img_file in demo_files:

    # Images without a ground-truth sample cannot be scored; skip them.
    gt_sample = imgfile_to_gt.get(img_file)
    if gt_sample is None:
        continue

    if use_urls:
        # Fail fast on network problems instead of hanging or decoding an
        # error page as an image.
        response = requests.get(img_file, timeout=30)
        response.raise_for_status()
        # Convert to RGB like the local-file branch: the normalisation that
        # follows is defined for three channels.
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image_path = os.path.join(image_folder, img_file)
        # convert() returns a new in-memory image, so the file handle can be
        # closed right away.
        with Image.open(image_path) as image_file:
            image = image_file.convert("RGB")

    image_transf = transform(image)
    image_tensor = to_input_transf(image_transf).unsqueeze(0).to(device)

    # Get ground truth ingredients and instructions
    gt_ingrs = gt_sample["ingredients"]
    gt_instrs = gt_sample["instructions"]

    # Ground truth entry
    gt_entry = {
        "image_id": img_file,
        "title": " ".join(gt_sample.get("title", [])),
        "ingredients": gt_ingrs,
        "instructions": gt_instrs,
    }

    # Convert ground truth ingredients to indices using ingrs_vocab; names
    # missing from the vocabulary are dropped. This does not depend on the
    # generation, so it is computed once per image.
    gt_ingr_indices = [
        ingrs_vocab.index(ingr) for ingr in gt_ingrs if ingr in ingrs_vocab
    ]

    # Predicted recipes for current image
    pred_recipes = []
    for i in range(numgens):
        with torch.no_grad():
            outputs = model.sample(
                image_tensor,
                greedy=greedy[i],
                temperature=temperature,
                beam=beam[i],
                true_ingrs=None,
            )

        ingr_ids = outputs["ingr_ids"].cpu().numpy()
        recipe_ids = outputs["recipe_ids"].cpu().numpy()
        # `valid` is not consulted below: every generation is recorded.
        outs, valid = prepare_output(recipe_ids[0], ingr_ids[0], ingrs_vocab, vocab)

        pred_entry = {
            "title": outs["title"],
            "ingredients": outs["ingrs"],
            "instructions": outs["recipe"],
        }
        pred_recipes.append(pred_entry)

        # Without any ground-truth ingredient in the vocabulary there is
        # nothing to score; the prediction is still written to JSON.
        if len(gt_ingr_indices) == 0:
            continue
        pred_ingr_indices = ingr_ids[0]

        predicted_instruction = " ".join(outs["recipe"])
        actual_instruction = " ".join(gt_instrs)
        predicted_list.append(predicted_instruction)
        ground_truth_list.append(actual_instruction)

        # Build (1, n) index tensors directly on `device`, which is where
        # label2onehot allocates its one-hot buffer.
        pred_tensor = torch.as_tensor(
            pred_ingr_indices, dtype=torch.long, device=device
        ).unsqueeze(0)
        gt_tensor = torch.as_tensor(
            gt_ingr_indices, dtype=torch.long, device=device
        ).unsqueeze(0)
        pred_one_hot = label2onehot(pred_tensor, ingr_vocab_len - 1)
        gt_one_hot = label2onehot(gt_tensor, ingr_vocab_len - 1)

        # Print the ingredient names behind the indices for debugging:
        print("Predicted ingredients:", [ingrs_vocab[idx] for idx in pred_ingr_indices if idx < ingr_vocab_len])
        print("Ground truth ingredients:", [ingrs_vocab[idx] for idx in gt_ingr_indices if idx < ingr_vocab_len])

        # Find matching ingredients
        matching_indices = set(pred_ingr_indices) & set(gt_ingr_indices)
        print("Matching ingredients:", [ingrs_vocab[idx] for idx in matching_indices if idx < ingr_vocab_len])

        iou = torch.mean(softIoU(pred_one_hot, gt_one_hot)).item()
        all_ious.append(iou)
        update_error_types(error_types, pred_one_hot, gt_one_hot)

    pred_json = {"image_id": img_file, "recipes": pred_recipes}

    # Keep the entries in memory as well, so the two lists declared at the
    # top actually hold what was collected.
    gt_json_list.append(gt_entry)
    pred_json_list.append(pred_json)

    # Write ground truth to JSON
    with open(f"Image{img_file}_GroundTruth.json", "w", encoding="utf-8") as f:
        json.dump(gt_entry, f, indent=2, ensure_ascii=False)

    # Write predictions to JSON
    with open(f"Image{img_file}_Predicted.json", "w", encoding="utf-8") as f:
        json.dump(pred_json, f, indent=2, ensure_ascii=False)

In [None]:
# Once every sample has been accumulated, ask compute_metrics for the "f1"
# metric; the result is read back from ret_metrics["f1"] when reporting.
ret_metrics = {
    metric_name: []
    for metric_name in ("accuracy", "f1", "jaccard", "f1_ingredients")
}
compute_metrics(ret_metrics, error_types, ["f1"])

In [None]:
# Sentence-level BLEU for every (prediction, reference) pair. nltk expects
# token lists, so both strings are split on whitespace; method1 smoothing is
# applied to each score.
smooth = SmoothingFunction().method1
bleu_scores = [
    sentence_bleu([gt.split()], pred.split(), smoothing_function=smooth)
    for pred, gt in zip(predicted_list, ground_truth_list)
]

In [None]:
# ROUGE-L F-measure for every (prediction, reference) pair, with stemming
# enabled in the scorer.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
rouge_l_scores = [
    scorer.score(gt, pred)["rougeL"].fmeasure
    for pred, gt in zip(predicted_list, ground_truth_list)
]

In [None]:
# Report the averaged metrics, or a notice when nothing could be scored.
if all_ious and predicted_list and ground_truth_list:
    for label, values in (
        ("Mean Ingredient IoU:", all_ious),
        ("Ingredient F1:", ret_metrics["f1"]),
        ("Mean BLEU:", bleu_scores),
        ("Mean ROUGE-L:", rouge_l_scores),
    ):
        print(label, np.mean(values))
else:
    print("No valid predictions or ground truths found for metric calculation.")