In [1]:
# Notebook-wide configuration: everything a reader might want to tune lives here.
IMAGE_SIZE = 224  # handed to get_default_transform when the image transform is built
NUM_CHANNELS = 3  # not referenced by the cells visible in this notebook; kept as-is
LMDB_DIR_PATH = "/mnt/data_ssd/lmdb"  # directory that receives the train/test LMDB files
# Root directory that the BigQuery image-path suffixes are joined onto.
# This name was used further down without ever being defined, so a fresh-kernel
# "Restart & Run All" failed with NameError. The value is the prefix shown in the
# saved `image_path` column outputs (e.g. /mnt/data_ssd/datasets/sparkrecipes/19/000002).
IMAGE_BASE_PATH = "/mnt/data_ssd/datasets/sparkrecipes"
MODEL_NAME = "efficientnet_b3"  # feature-extractor variant; also embedded in the LMDB file names

# Compute Image Embeddings and Persist Them via LMDB

In [2]:
# Notebook extensions:
#   google.cloud.bigquery - provides the %%bigquery cell magic used to load the data
#   lab_black             - presumably auto-formats code cells with black (verify)
#   line_profiler         - profiling magics; not used by the cells visible here
%load_ext google.cloud.bigquery
%load_ext lab_black
%load_ext line_profiler

In [3]:
# Reload imported modules before each cell executes (mode 2 = all modules), so
# edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os

from seefood.features import EfficientNetFeatureExtractor
from seefood.data import LMDBEmbeddingWriter
from seefood.data import ImageDataset
from seefood.data import get_default_transform

import torch

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from PIL import ImageFile

# Tell PIL to load truncated image files instead of raising, so a partially
# written image does not abort the whole embedding run.
ImageFile.LOAD_TRUNCATED_IMAGES = True

Using cache found in /home/mike/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


In [5]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [6]:
# Build the feature extractor for MODEL_NAME and move it onto the selected device.
# NOTE(review): the saved output suggests the weights come from the torch hub cache - confirm.
feature_extractor = EfficientNetFeatureExtractor(MODEL_NAME).to(device)

Using cache found in /home/mike/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master
Using cache found in /home/mike/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


In [7]:
# LMDB locations for the two splits; the model name is part of each file name so
# embeddings from different extractors do not share a path.
TRAIN_DATA_FILE = "/".join((LMDB_DIR_PATH, f"seefood_train_data_{MODEL_NAME}"))
TEST_DATA_FILE = "/".join((LMDB_DIR_PATH, f"seefood_test_data_{MODEL_NAME}"))

## Load and Shuffle Data

In [8]:
%%bigquery df --project zenscr-seefood-dev

-- Join the filtered recipes to the image-path table on recipe_id.
-- image_path is aliased to image_path_suffix; a later cell builds an
-- image_path column from it. The result is stored in the DataFrame `df`.
SELECT recipe_id, title, image_path as image_path_suffix, total_calories
FROM `zenscr-seefood-dev.sparkrecipes.base_filtered`
INNER JOIN `zenscr-seefood-dev.sparkrecipes.image_path`
USING (recipe_id)

In [9]:
def _absolute_image_path(suffix):
    """Join an image-path suffix from BigQuery onto IMAGE_BASE_PATH."""
    return os.path.join(IMAGE_BASE_PATH, suffix)


# Derive the image_path column from image_path_suffix, one row at a time.
df["image_path"] = df["image_path_suffix"].map(_absolute_image_path)

In [10]:
# Preview the joined frame, including the derived image_path column.
df

Unnamed: 0,recipe_id,title,image_path_suffix,total_calories,image_path
0,19,Turkey Stuffed Cabbage,19/000002,155.5,/mnt/data_ssd/datasets/sparkrecipes/19/000002
1,25,Easy Lemon Chicken,25/000001,318.0,/mnt/data_ssd/datasets/sparkrecipes/25/000001
2,257,Chocolate Cinnamon Bread Pudding,257/000004,422.8,/mnt/data_ssd/datasets/sparkrecipes/257/000004
3,274,Cabbage Vegetable Soup,274/000007,165.2,/mnt/data_ssd/datasets/sparkrecipes/274/000007
4,280,Simply Spiced Apple Cider,280/000006,119.6,/mnt/data_ssd/datasets/sparkrecipes/280/000006
...,...,...,...,...,...
1311732,435051,Warm You Up Chicken Tortilla Soup,435051/000014,322.2,/mnt/data_ssd/datasets/sparkrecipes/435051/000014
1311733,435064,hAMBURGER/POTATOE SOUP,435064/000015,422.1,/mnt/data_ssd/datasets/sparkrecipes/435064/000015
1311734,435071,Martha's Spaghetti,435071/000007,344.9,/mnt/data_ssd/datasets/sparkrecipes/435071/000007
1311735,435097,Bean Salad,435097/000003,180.6,/mnt/data_ssd/datasets/sparkrecipes/435097/000003


In [11]:
# Shuffle the rows with a fixed seed so the row order (and everything derived
# from it) is the same on every run. Without random_state, each kernel run
# produced a different order. The seed matches the one used for the train/test split.
df_shuffled = shuffle(df, random_state=42)

In [12]:
# Preview the shuffled frame; rows keep their original index labels.
df_shuffled

Unnamed: 0,recipe_id,title,image_path_suffix,total_calories,image_path
155641,403691,Banana Blueberry Bread,403691/000003,210.7,/mnt/data_ssd/datasets/sparkrecipes/403691/000003
463742,289003,Multi grain and seed spelt bread,289003/000015,110.7,/mnt/data_ssd/datasets/sparkrecipes/289003/000015
1010172,211309,Wet Burrito (Chicken),211309/000006,127.5,/mnt/data_ssd/datasets/sparkrecipes/211309/000006
52934,4862,Algerian salad,4862/000006,76.5,/mnt/data_ssd/datasets/sparkrecipes/4862/000006
887618,358426,Lowfat biscuits,358426/000010,180.5,/mnt/data_ssd/datasets/sparkrecipes/358426/000010
...,...,...,...,...,...
1149955,349128,Crockpot Vegan Leftover Black Bean and Potato ...,349128/000007,182.0,/mnt/data_ssd/datasets/sparkrecipes/349128/000007
424462,79855,Frozen Fruit Sherbert,79855/000012,77.9,/mnt/data_ssd/datasets/sparkrecipes/79855/000012
1290629,92213,QUICK DELICIOUS MEXICAN MEAL,92213/000014,348.8,/mnt/data_ssd/datasets/sparkrecipes/92213/000014
98023,314376,Black Bean Salsa,314376/000008,82.3,/mnt/data_ssd/datasets/sparkrecipes/314376/000008


## Train-Test Split

In [13]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split stable.
# NOTE(review): rows are split individually. If a recipe_id has several images,
# the same recipe may end up in both train and test - confirm whether a grouped
# split is needed.
split_parts = train_test_split(
    df_shuffled[["image_path"]],
    df_shuffled["total_calories"],
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# Re-attach each target series to its features as a `target` column and reset
# the index to 0..n-1, producing one frame per split.
df_train, df_test = (
    features.assign(target=targets).reset_index(drop=True)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

## Write Embeddings to LMDB

In [15]:
BATCH_SIZE = 64  # images per batch fed to the feature extractor
NUM_WORKERS = 4  # DataLoader worker processes used for image loading

transform = get_default_transform(IMAGE_SIZE)


def make_dataloader(frame):
    """Build the DataLoader that feeds images listed in `frame` to the embedding writer.

    Both splits must be loaded with identical settings, so the configuration
    lives here instead of being copy-pasted per split.

    Args:
        frame: DataFrame describing one split (passed straight to ImageDataset
            together with the shared `transform`).

    Returns:
        A torch DataLoader over ImageDataset(frame, transform).
    """
    return torch.utils.data.DataLoader(
        ImageDataset(frame, transform),
        batch_size=BATCH_SIZE,
        # Sequential order: batches follow the frame's row order.
        # NOTE(review): presumably the writer relies on this ordering - confirm.
        shuffle=False,
        num_workers=NUM_WORKERS,
    )


dataloader_train = make_dataloader(df_train)
dataloader_test = make_dataloader(df_test)

embedding_writer = LMDBEmbeddingWriter(feature_extractor, device)
map_size = 10 * 1024**3  # 10 GiB (10737418240 bytes), same value as before

In [16]:
# Run the training dataloader through the embedding writer, targeting
# TRAIN_DATA_FILE with the map_size defined above.
# NOTE(review): LMDBEmbeddingWriter.write lives in seefood.data; its on-disk
# layout and overwrite behavior are not visible here.
embedding_writer.write(TRAIN_DATA_FILE, dataloader_train, map_size)

HBox(children=(FloatProgress(value=0.0, max=13733.0), HTML(value='')))






In [17]:
# Same as the training write, but for the held-out split: the test dataloader
# is written to TEST_DATA_FILE with the same map_size.
embedding_writer.write(TEST_DATA_FILE, dataloader_test, map_size)

HBox(children=(FloatProgress(value=0.0, max=6764.0), HTML(value='')))




