In [1]:
# Side length (in pixels) that images are resized and centre-cropped to
# before being fed to the feature extractor.
IMAGE_SIZE = 224
# Number of colour channels; not referenced by any of the visible cells.
NUM_CHANNELS = 3
# Directory under which the train/test LMDB stores are created.
LMDB_DIR_PATH = "/mnt/lab_lmdb"

# Store Train-Test Data to LMDB

In [2]:
%load_ext google.cloud.bigquery
%load_ext lab_black
%load_ext line_profiler

In [3]:
import lmdb
import pickle
from PIL import Image, ImageFile
import numpy as np
from pympler import asizeof
import torch
from tqdm.notebook import tqdm
from torchvision import datasets, models, transforms
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch.nn as nn

In [4]:
# Target LMDB paths (inside LMDB_DIR_PATH) for the extracted train / test features.
TRAIN_DATA_FILE = f"{LMDB_DIR_PATH}/seefood_train_data"
TEST_DATA_FILE = f"{LMDB_DIR_PATH}/seefood_test_data"

In [5]:
# Run the model on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Explicit CPU handle, used to move extracted features back to host memory.
cpu = torch.device("cpu")

In [6]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter of ``model`` when it is used as a fixed feature extractor.

    Args:
        model: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        feature_extracting: When truthy, ``requires_grad`` is switched off on
            all parameters; when falsy the model is left untouched.

    Returns:
        None. The model is modified in place.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False


# Load MobileNetV2 with its pretrained weights and freeze all parameters:
# the network is only used to produce embeddings, never trained here.
model = models.mobilenet_v2(pretrained=True)
set_parameter_requires_grad(model, True)

## Load Data

In [8]:
%%bigquery df --project zenscr-seefood-dev

SELECT recipe_id, title, image_path, total_calories
FROM `zenscr-seefood-dev.sparkrecipes.base_filtered`
INNER JOIN `zenscr-seefood-dev.sparkrecipes.image_path`
USING (recipe_id)

In [9]:
df

Unnamed: 0,recipe_id,title,image_path,total_calories
0,70865,Fruit Cocktail Dessert/Salad,../../data/images/70865/000001,30.0
1,70865,Fruit Cocktail Dessert/Salad,../../data/images/70865/000010,30.0
2,70865,Fruit Cocktail Dessert/Salad,../../data/images/70865/000011,30.0
3,70865,Fruit Cocktail Dessert/Salad,../../data/images/70865/000012,30.0
4,70865,Fruit Cocktail Dessert/Salad,../../data/images/70865/000013,30.0
...,...,...,...,...
1311732,418865,Banana Bread Muffins,../../data/images/418865/000005,134.9
1311733,418865,Banana Bread Muffins,../../data/images/418865/000006,134.9
1311734,418865,Banana Bread Muffins,../../data/images/418865/000007,134.9
1311735,418865,Banana Bread Muffins,../../data/images/418865/000008,134.9


In [10]:
# Seed the shuffle: the seeded train/test split below selects rows by position,
# so an unseeded shuffle here would make the split (and the index -> image
# mappings exported to BigQuery) differ on every run.
df_shuffled = shuffle(df, random_state=42)

In [11]:
df_shuffled

Unnamed: 0,recipe_id,title,image_path,total_calories
580214,60247,Tofu Noodle Stir Fry Soup,../../data/images/60247/000008,317.2
743496,395614,Lemony Light Cooler,../../data/images/395614/000013,127.7
754228,152304,roast beef/potato/carrot,../../data/images/152304/000007,371.0
438073,67088,Coconut Rice,../../data/images/67088/000015,334.8
23707,90673,Tru's Smashed Cauliflower,../../data/images/90673/000013,129.1
...,...,...,...,...
755052,354257,Hollandaise,../../data/images/354257/000006,38.5
124110,16814,cheesy fried green tomatoes,../../data/images/16814/000007,106.1
762806,363896,Vegan Carrot Parsnip Soup,../../data/images/363896/000003,211.0
1108972,344642,cucumber salad,../../data/images/344642/000015,53.4


## Train-Test Split

In [12]:
# Hold out 33% of the image rows for testing; the fixed random_state makes the
# selected row positions repeatable.
# NOTE(review): the split is per image row, while the frame displayed above has
# several image rows per recipe_id (all sharing one total_calories value), so
# images of the same recipe can land in both train and test. Consider a
# group-aware split keyed on recipe_id if that leakage matters.
X_train, X_test, y_train, y_test = train_test_split(
    df_shuffled[["image_path"]],
    df_shuffled.total_calories,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Re-attach each split's targets as a `target` column and renumber the rows
# 0..N-1; the LMDB reader further down looks records up by this row index.
df_train = X_train.assign(target=y_train).reset_index(drop=True)
df_test = X_test.assign(target=y_test).reset_index(drop=True)

In [14]:
df_train

Unnamed: 0,image_path,target
0,../../data/images/234936/000014,238.7
1,../../data/images/275146/000009,131.9
2,../../data/images/269798/000006,350.4
3,../../data/images/414439/000002,300.6
4,../../data/images/298095/000001,195.2
...,...,...
878858,../../data/images/305994/000006,135.3
878859,../../data/images/302902/000013,128.1
878860,../../data/images/25604/000005,236.0
878861,../../data/images/364713/000014,57.3


In [30]:
# Export the (row index, image_path) mapping for the training split to BigQuery.
# reset_index() turns the 0..N-1 row index into an `index` column; any existing
# table of the same name is replaced.
df_train[["image_path"]].reset_index().to_gbq(
    "sparkrecipes.train_image_mapping",
    if_exists="replace",
    project_id="zenscr-seefood-dev",
)

1it [00:24, 24.22s/it]


In [31]:
# Export the (row index, image_path) mapping for the test split to BigQuery.
# reset_index() turns the 0..N-1 row index into an `index` column; any existing
# table of the same name is replaced.
df_test[["image_path"]].reset_index().to_gbq(
    "sparkrecipes.test_image_mapping",
    if_exists="replace",
    project_id="zenscr-seefood-dev",
)

1it [00:08,  8.73s/it]


## Write Embeddings to LMDB

In [22]:
class FeatureExtractor(nn.Module):
    """Frozen, inference-only wrapper around a backbone's ``features`` sub-module.

    Args:
        model: Any object exposing a ``features`` attribute that is an
            ``nn.Module`` (e.g. torchvision's MobileNetV2).

    Note:
        The wrapped ``features`` module is shared with ``model`` (not copied):
        freezing its parameters and switching it to eval mode therefore also
        affects ``model.features``.
    """

    def __init__(self, model):
        super().__init__()
        self.features = model.features
        for p in self.features.parameters():
            p.requires_grad = False
        # Inference mode: without this, BatchNorm layers normalise with the
        # statistics of the current batch (and keep updating their running
        # averages), so an image's embedding would depend on its batch-mates.
        self.eval()

    def forward(self, x):
        """Return the output of the wrapped ``features`` module for input ``x``."""
        return self.features(x)


class Features:
    """Picklable record holding one feature tensor (as raw bytes) and its target.

    Attributes set by ``__init__``:
        shape: Shape of the original feature tensor.
        dtype: NumPy dtype string of the stored buffer (e.g. ``"<f4"``).
        features: The tensor's contents serialised with ``ndarray.tobytes()``.
        target: The target rounded to the nearest integer value, as a Python number.
    """

    def __init__(self, features, target):
        # detach()/cpu() make the conversion work for tensors that still carry
        # autograd history or live on an accelerator.
        array = features.detach().cpu().numpy()
        self.shape = features.shape
        # Remember the element type so get_features() can decode the buffer
        # correctly instead of assuming float32.
        self.dtype = array.dtype.str
        self.features = array.tobytes()
        self.target = target.round().item()

    def get_features(self):
        """Rebuild the feature tensor from the stored bytes.

        Returns:
            A ``torch.Tensor`` with the original shape and dtype. The tensor
            owns its memory (the immutable byte buffer is copied), so it is
            safe to modify.
        """
        # Records pickled before `dtype` was stored have no such attribute;
        # they were always decoded as float32, so keep that as the fallback.
        dtype = np.dtype(getattr(self, "dtype", np.float32))
        array = np.frombuffer(self.features, dtype=dtype).reshape(self.shape)
        # frombuffer() over `bytes` yields a read-only view; copying avoids
        # torch's non-writable-array warning and undefined behaviour on writes.
        return torch.from_numpy(array.copy())


class ImageDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(transformed RGB image, target)`` pairs.

    Args:
        df: DataFrame with an ``image_path`` column (paths openable by PIL)
            and a ``target`` column.
        transform: Callable applied to each RGB ``PIL.Image`` before it is
            returned.
    """

    def __init__(self, df, transform):
        # Drop the incoming index so that positions 0..N-1 address the rows.
        self.images, self.targets = (
            df[column].reset_index(drop=True) for column in ("image_path", "target")
        )
        self.transform = transform

    def __len__(self):
        return self.images.shape[0]

    def __getitem__(self, idx):
        # convert() returns a new, fully loaded image, so the file handle can
        # be closed before the transform runs.
        with Image.open(self.images.iloc[idx]) as handle:
            rgb = handle.convert("RGB")
        return self.transform(rgb), self.targets.iloc[idx]


# Per-channel mean / std used to normalise the input images.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing: resize, centre-crop to IMAGE_SIZE x IMAGE_SIZE, convert to a
# tensor and normalise each channel.
transform = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD),
    ]
)

# shuffle=False is load-bearing: records are keyed in LMDB by the order in
# which the loader yields them, and that order must match the DataFrame rows.
_loader_options = {"batch_size": 256, "shuffle": False, "num_workers": 4}

dataloader_train = torch.utils.data.DataLoader(
    ImageDataset(df_train, transform), **_loader_options
)

dataloader_test = torch.utils.data.DataLoader(
    ImageDataset(df_test, transform), **_loader_options
)

In [16]:
# Wrap the backbone's feature sub-module and move it to the selected device.
feature_extractor = FeatureExtractor(model).to(device)

In [17]:
# Ask PyTorch's CUDA allocator to release cached memory it is not currently using.
torch.cuda.empty_cache()

In [18]:
def store_to_lmdb(lmdb_filename, dataloader, image_size=IMAGE_SIZE):
    """Run every batch through ``feature_extractor`` and persist the results in LMDB.

    Each sample is stored as a pickled ``Features`` record under an
    8-digit, zero-padded ASCII key that counts samples in the order the
    dataloader yields them (``00000000``, ``00000001``, ...).

    Args:
        lmdb_filename: Path of the LMDB environment to create or open.
        dataloader: Yields ``(images, targets)`` batches; must expose
            ``dataloader.dataset`` with a length. It should not shuffle if
            keys are expected to line up with dataset positions.
        image_size: Unused; kept so existing calls remain valid.

    Side effects:
        Switches the module-level ``feature_extractor`` to eval mode.
    """
    num_images = len(dataloader.dataset)

    # Approximate map size: a per-record budget (payload + key + slack) times
    # the number of records. presumably 300000 bytes is sized for one float32
    # feature map of the backbone — TODO confirm if the model changes.
    # max(..., 1) keeps the size non-zero for an empty dataset.
    map_size = max(num_images, 1) * (
        300000 + asizeof.asizeof(f"{0:08}") + 2048
    )

    # Inference mode, so BatchNorm uses its stored running statistics rather
    # than the statistics of whichever images share the batch.
    feature_extractor.eval()

    index = 0
    with lmdb.open(lmdb_filename, map_size=map_size) as env:
        for images, targets in tqdm(dataloader):
            # Compute features before opening the write transaction so the
            # LMDB write lock is not held during the forward pass; no_grad()
            # guarantees no autograd state is recorded.
            with torch.no_grad():
                features = feature_extractor(images.to(device)).to(cpu)

            # One write transaction per batch.
            with env.begin(write=True) as txn:
                for f, t in zip(features, targets):
                    key = f"{index:08}".encode("ascii")
                    value = Features(f, t)
                    txn.put(key, pickle.dumps(value))
                    index += 1

In [19]:
# Extract features for every training image and write them to the train LMDB.
store_to_lmdb(TRAIN_DATA_FILE, dataloader_train)

HBox(children=(FloatProgress(value=0.0, max=3434.0), HTML(value='')))




In [24]:
# Write to the configured TEST_DATA_FILE (under LMDB_DIR_PATH) instead of a
# hard-coded path, so the test store sits next to the train store and follows
# any change to LMDB_DIR_PATH.
store_to_lmdb(TEST_DATA_FILE, dataloader_test)

HBox(children=(FloatProgress(value=0.0, max=1691.0), HTML(value='')))




## Read Embeddings from LMDB

In [None]:
def read_from_lmdb_multiple_txns(lmdb_filename, df):
    """Load stored features and targets for every row of ``df``, one transaction per row.

    Args:
        lmdb_filename: Path of the LMDB environment to read.
        df: DataFrame whose integer index labels identify the records; label
            ``i`` is looked up under the 8-digit, zero-padded ASCII key of ``i``.

    Returns:
        ``(features, targets)``: two lists in ``df`` row order, holding the
        tensors returned by ``get_features()`` and the stored ``target`` values.

    Raises:
        KeyError: If a row's key is not present in the LMDB environment.
    """
    features = []
    targets = []

    with lmdb.open(lmdb_filename, readonly=True) as env:
        # Iterate the index directly: Series.iteritems() no longer exists in
        # pandas >= 2.0, and only the index labels were ever used.
        for i in df.index:
            key = f"{i:08}".encode("ascii")
            # A fresh read transaction per record, as the function name says.
            with env.begin() as txn:
                raw = txn.get(key)
                if raw is None:
                    raise KeyError(
                        f"no LMDB record for key {key!r} in {lmdb_filename!r}"
                    )
                # SECURITY: pickle.loads executes arbitrary code from the
                # payload; only read LMDB files produced by trusted code.
                record = pickle.loads(raw)
                features.append(record.get_features())
                targets.append(record.target)
    return features, targets

In [None]:
%time features, targets = read_from_lmdb_multiple_txns(TRAIN_DATA_FILE, df_train)

In [None]:
len(features)

In [None]:
features[:5]