In [123]:
import pandas as pd

import torch
import torch.nn as nn
from torchvision import datasets, models, transforms
from PIL import Image
from tqdm.notebook import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [124]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load data

In [125]:
df_base = pd.read_csv("../../data/sparkrecipes_base.csv")
df_images = pd.read_csv("../../data/sparkrecipes_images.csv")

In [126]:
df = df_base.merge(df_images, on="recipe_id", how="inner").drop(["image_url"], axis=1)

In [127]:
df

Unnamed: 0,recipe_id,title,total_calories,url,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9
0,0,Caribbean Black Beans and Rice,479.6,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/0/000001,../../data/images/0/000002,../../data/images/0/000003,../../data/images/0/000004,../../data/images/0/000005,../../data/images/0/000006,../../data/images/0/000007,../../data/images/0/000008,../../data/images/0/000009
1,1,Minestrone Soup,153.1,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/1/000001,../../data/images/1/000002,../../data/images/1/000003,../../data/images/1/000004,../../data/images/1/000005,../../data/images/1/000006,../../data/images/1/000007,../../data/images/1/000008,../../data/images/1/000009
2,2,20-Minute Chicken Creole,269.3,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/2/000001,../../data/images/2/000002,../../data/images/2/000003,../../data/images/2/000004,../../data/images/2/000005,../../data/images/2/000006,../../data/images/2/000007,../../data/images/2/000008,../../data/images/2/000009
3,3,Beef and Vegetable Stir-Fry,245.5,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/3/000001,../../data/images/3/000002,../../data/images/3/000003,../../data/images/3/000004,../../data/images/3/000005,../../data/images/3/000006,../../data/images/3/000007,../../data/images/3/000008,../../data/images/3/000009
4,4,Italian Vegetable Bake,37.2,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/4/000001,../../data/images/4/000002,../../data/images/4/000003,../../data/images/4/000004,../../data/images/4/000005,../../data/images/4/000006,../../data/images/4/000007,../../data/images/4/000008,../../data/images/4/000009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Angel Food Cake,165.6,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/96/000001,../../data/images/96/000002,../../data/images/96/000003,../../data/images/96/000004,../../data/images/96/000005,../../data/images/96/000006,../../data/images/96/000007,../../data/images/96/000008,../../data/images/96/000009
96,97,Lentil Vegetable Soup,228.7,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/97/000001,../../data/images/97/000002,../../data/images/97/000003,../../data/images/97/000004,../../data/images/97/000005,../../data/images/97/000006,../../data/images/97/000007,../../data/images/97/000008,../../data/images/97/000009
97,98,Tortellini with Vegetables,335.3,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/98/000001,../../data/images/98/000002,../../data/images/98/000003,../../data/images/98/000004,../../data/images/98/000005,../../data/images/98/000006,../../data/images/98/000007,../../data/images/98/000008,../../data/images/98/000009
98,99,Meatless Tacos with Vegetable Protein Crumbles,240.9,https://recipes.sparkpeople.com/recipe-detail....,../../data/images/99/000001,../../data/images/99/000002,../../data/images/99/000003,../../data/images/99/000004,../../data/images/99/000005,../../data/images/99/000006,../../data/images/99/000007,../../data/images/99/000008,../../data/images/99/000009


In [128]:
df_melted = pd.melt(df, 
                    id_vars='recipe_id', 
                    value_vars=list(df.columns[1:]),
                    var_name='column', 
                    value_name='value').sort_values(by=["recipe_id", "column"]).reset_index(drop=True)

df_melted.head()

Unnamed: 0,recipe_id,column,value
0,0,image_1,../../data/images/0/000001
1,0,image_2,../../data/images/0/000002
2,0,image_3,../../data/images/0/000003
3,0,image_4,../../data/images/0/000004
4,0,image_5,../../data/images/0/000005


## Create image embeddings

In [129]:
df_all_images = df_melted[df_melted.column.str.startswith("image_")][["recipe_id", "value"]].reset_index(drop=True)
df_all_images.columns = ["recipe_id", "image_path"]

In [130]:
df_all_images

Unnamed: 0,recipe_id,image_path
0,0,../../data/images/0/000001
1,0,../../data/images/0/000002
2,0,../../data/images/0/000003
3,0,../../data/images/0/000004
4,0,../../data/images/0/000005
...,...,...
895,100,../../data/images/100/000005
896,100,../../data/images/100/000006
897,100,../../data/images/100/000007
898,100,../../data/images/100/000008


In [131]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [132]:
class ImageDataset(torch.utils.data.Dataset):
    def __init__(self, df, transform):
        self.images = df["image_path"].reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images.iloc[idx]
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        return image

In [133]:
squeezenet = models.squeezenet1_0(pretrained=True)
set_parameter_requires_grad(squeezenet, True)

In [134]:
class FeatureExtractor(nn.Module):
    def __init__(self, model):
        super(FeatureExtractor, self).__init__()
        self.features = model.features
        self.avg_pool2d = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        x = self.features(x)
        x = self.avg_pool2d(x)
        x = x.squeeze()
        return x

In [135]:
feature_extractor = FeatureExtractor(squeezenet).to(device)

In [136]:
IMAGE_SIZE = 224

data_transforms = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

In [137]:
dataset = ImageDataset(df_all_images, data_transforms)

In [138]:
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=False, num_workers=2
)

In [139]:
embeddings = []
for item in tqdm(dataloader):
    item = item.to(device)
    embeddings.append(feature_extractor(item))

HBox(children=(IntProgress(value=0, max=90), HTML(value='')))

  "Palette images with Transparency expressed in bytes should be "





### Persist results

In [140]:
df_embeddings = df_all_images.join(pd.DataFrame(torch.cat(embeddings).tolist(), index=df_all_images.index)).drop(["image_path"], axis=1)
df_embeddings

Unnamed: 0,recipe_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,0,1.312647,1.745611,0.024549,4.041584,0.282331,0.036875,0.214602,0.412729,2.348340,...,0.264106,0.000000,0.105781,0.000000,0.230087,0.341734,0.211301,0.220547,0.797521,0.000000
1,0,1.421376,1.250723,3.132711,0.144188,2.020523,2.047225,0.132746,0.910985,2.074128,...,2.422223,0.114516,0.764152,0.009001,0.283613,0.788782,0.584368,0.299237,5.853399,2.520212
2,0,0.137828,1.564686,0.667484,0.580247,0.069224,0.124425,0.412704,0.000000,1.201348,...,0.642398,0.661652,0.036482,0.000000,3.479456,0.154165,1.197968,1.103921,1.237342,0.618344
3,0,0.710551,1.478248,0.963002,0.000000,0.085762,0.022721,0.613554,0.312350,0.064780,...,2.035271,0.000000,0.000000,0.000000,1.013479,0.043301,2.850164,0.194671,1.380409,1.058532
4,0,0.024385,0.623414,0.007182,0.294297,0.000000,0.062329,0.367889,0.115501,0.225115,...,0.962519,0.438362,0.510686,0.000000,4.816212,0.036955,4.948285,2.772770,8.331865,1.830730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,100,0.377971,4.872329,0.575517,0.019348,0.000000,0.027056,1.188223,0.369930,0.873347,...,0.299889,0.000000,0.016181,0.000000,1.827582,0.000000,0.089033,5.332653,0.000000,0.020983
896,100,0.000000,5.397524,0.888348,2.068654,0.000000,0.562460,0.003531,0.986885,0.740844,...,0.000000,0.252174,0.058146,0.000000,0.733451,0.000000,1.381266,2.675857,0.876063,0.596842
897,100,0.159002,2.890098,0.051838,3.359712,0.013939,2.566859,0.154271,5.851393,3.710575,...,8.327135,3.743063,1.816733,1.717501,4.421910,0.036610,2.479548,2.066324,0.590810,0.811048
898,100,0.540162,1.269297,0.829861,1.629488,2.358884,1.293041,0.293311,1.226157,1.973932,...,6.990195,1.385442,0.000000,0.047327,4.372643,3.567914,0.126858,2.700228,3.486959,5.461864


In [141]:
df_embeddings.to_csv("../../data/sparkrecipes_embeddings.csv" ,index=False)