In [78]:
import pandas as pd

import torch
import torch.nn as nn
from torchvision import datasets, models, transforms
from PIL import Image
from tqdm.notebook import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [27]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load data

In [17]:
df_base = pd.read_csv("../../data/sparkrecipes_base.csv")
df_images = pd.read_csv("../../data/sparkrecipes_images.csv")

In [43]:
df = df_base.merge(df_images, on="recipe_id", how="inner").drop(["image_url"], axis=1)

In [44]:
df

Unnamed: 0,recipe_id,title,total_calories,url,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9
0,0,Caribbean Black Beans and Rice,479.6,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/0/000001,../../data/images/0/000002,../../data/images/0/000003,../../data/images/0/000004,../../data/images/0/000005,../../data/images/0/000006,../../data/images/0/000007,../../data/images/0/000008,../../data/images/0/000009
1,1,Minestrone Soup,153.1,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/1/000001,../../data/images/1/000002,../../data/images/1/000003,../../data/images/1/000004,../../data/images/1/000005,../../data/images/1/000006,../../data/images/1/000007,../../data/images/1/000008,../../data/images/1/000009
2,2,20-Minute Chicken Creole,269.3,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/2/000001,../../data/images/2/000002,../../data/images/2/000003,../../data/images/2/000004,../../data/images/2/000005,../../data/images/2/000006,../../data/images/2/000007,../../data/images/2/000008,../../data/images/2/000009
3,3,Beef and Vegetable Stir-Fry,245.5,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/3/000001,../../data/images/3/000002,../../data/images/3/000003,../../data/images/3/000004,../../data/images/3/000005,../../data/images/3/000006,../../data/images/3/000007,../../data/images/3/000008,../../data/images/3/000009
4,4,Italian Vegetable Bake,37.2,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/4/000001,../../data/images/4/000002,../../data/images/4/000003,../../data/images/4/000004,../../data/images/4/000005,../../data/images/4/000006,../../data/images/4/000007,../../data/images/4/000008,../../data/images/4/000009


In [53]:
df_melted = pd.melt(df, 
                    id_vars='recipe_id', 
                    value_vars=list(df.columns[1:]),
                    var_name='column', 
                    value_name='value').sort_values(by=["recipe_id", "column"]).reset_index(drop=True)

df_melted.head()

Unnamed: 0,recipe_id,column,value
15,0,image_1,../../data/images/0/000001
20,0,image_2,../../data/images/0/000002
25,0,image_3,../../data/images/0/000003
30,0,image_4,../../data/images/0/000004
35,0,image_5,../../data/images/0/000005


## Create image embeddings

In [105]:
df_all_images = df_melted[df_melted.column.str.startswith("image_")][["recipe_id", "value"]].reset_index(drop=True)
df_all_images.columns = ["recipe_id", "image_path"]

In [106]:
df_all_images

Unnamed: 0,recipe_id,image_path
0,0,../../data/images/0/000001
1,0,../../data/images/0/000002
2,0,../../data/images/0/000003
3,0,../../data/images/0/000004
4,0,../../data/images/0/000005
5,0,../../data/images/0/000006
6,0,../../data/images/0/000007
7,0,../../data/images/0/000008
8,0,../../data/images/0/000009
9,1,../../data/images/1/000001


In [107]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [108]:
class ImageDataset(torch.utils.data.Dataset):
    def __init__(self, df, transform):
        self.images = df["image_path"].reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images.iloc[idx]
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        return image

In [109]:
squeezenet = models.squeezenet1_0(pretrained=True)
set_parameter_requires_grad(squeezenet, True)

In [110]:
class FeatureExtractor(nn.Module):
    def __init__(self, model):
        super(FeatureExtractor, self).__init__()
        self.features = model.features
        self.avg_pool2d = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        x = self.features(x)
        x = self.avg_pool2d(x)
        x = x.squeeze()
        return x

In [111]:
feature_extractor = FeatureExtractor(squeezenet).to(device)

In [112]:
IMAGE_SIZE = 224

data_transforms = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

In [113]:
dataset = ImageDataset(df_all_images, data_transforms)

In [114]:
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=False, num_workers=2
)

In [118]:
embeddings = []
for item in tqdm(dataloader):
    item = item.to(device)
    embeddings.append(feature_extractor(item))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




### Persist results

In [122]:
df_embeddings = df_all_images.join(pd.DataFrame(torch.cat(embeddings).tolist(), index=df_all_images.index)).drop(["image_path"], axis=1)
df_embeddings

Unnamed: 0,recipe_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,0,1.312647,1.745611,0.024549,4.041584,0.282331,0.036875,0.214602,0.412729,2.34834,...,0.264106,0.0,0.105781,0.0,0.230087,0.341734,0.211301,0.220547,0.797521,0.0
1,0,1.421376,1.250723,3.132711,0.144188,2.020523,2.047225,0.132746,0.910985,2.074128,...,2.422223,0.114516,0.764152,0.009001,0.283613,0.788782,0.584368,0.299237,5.853399,2.520212
2,0,0.137828,1.564686,0.667484,0.580247,0.069224,0.124425,0.412704,0.0,1.201348,...,0.642398,0.661652,0.036482,0.0,3.479456,0.154165,1.197968,1.103921,1.237342,0.618344
3,0,0.710551,1.478248,0.963002,0.0,0.085762,0.022721,0.613554,0.31235,0.06478,...,2.035271,0.0,0.0,0.0,1.013479,0.043301,2.850164,0.194671,1.380409,1.058532
4,0,0.024385,0.623414,0.007182,0.294297,0.0,0.062329,0.367889,0.115501,0.225115,...,0.962519,0.438362,0.510686,0.0,4.816212,0.036955,4.948285,2.77277,8.331865,1.83073
5,0,0.817215,0.556298,1.512543,0.165556,7.005704,0.990555,0.019743,1.591008,2.531086,...,0.366615,0.284539,1.128675,0.62917,4.972097,0.327147,3.405946,1.701481,2.117233,0.202062
6,0,0.241889,1.460734,0.6363,0.45606,0.526707,0.0,2.827007,0.0,1.138758,...,0.131064,0.193987,0.52935,0.0,0.714559,0.101676,1.424459,0.442109,2.378555,1.624691
7,0,0.277707,0.300233,0.012929,2.102414,2.345455,0.423612,1.119672,0.878051,2.812878,...,1.953308,1.717288,0.172593,0.0,0.358948,2.084613,5.652442,0.579122,3.709535,1.90563
8,0,0.0,2.177485,0.478392,0.718729,0.0,0.364932,0.482541,0.035491,1.49619,...,0.0,0.467968,0.047027,0.202389,1.01062,0.142592,2.578675,2.130555,0.501882,0.184687
9,1,0.667305,0.307148,0.755098,0.525186,0.0,0.219823,0.429681,2.539841,2.843086,...,0.13779,0.0,0.345843,0.0,1.602077,0.180274,1.83614,0.0,1.361435,0.0


In [121]:
df_embeddings.to_csv("../../data/sparkrecipes_embeddings.csv" ,index=False)