In [1]:
import nltk
import pickle
from collections import Counter
import os
from tqdm import *
import numpy as np
import re
import pandas as pd
import ast
import string

nltk.download('punkt')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelsun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class ArgumentsProxy:
    def __init__(self):
        self.epicurious_path = 'path/to/epicurious'
        self.save_path = '../data/'
        self.suff = ''
        self.threshold_ingrs = 10
        self.threshold_words = 10
        self.maxnuminstrs = 20
        self.maxnumingrs = 20
        self.minnuminstrs = 2
        self.minnumingrs = 2
        self.minnumwords = 20
        self.forcegen = False
    

args = ArgumentsProxy()

In [3]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word, idx=None):
        if idx is None:
            if not word in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1
            return self.idx
        else:
            if not word in self.word2idx:
                self.word2idx[word] = idx
                if idx in self.idx2word.keys():
                    self.idx2word[idx].append(word)
                else:
                    self.idx2word[idx] = [word]

                return idx

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<pad>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [4]:
print ("Loading data...")

## Load data from CSV into pickle files
CSV_PATH = './archive/epicurious_data.csv'
COLS = ['ID','Title','Ingredients','Instructions','Image_Name','Cleaned_Ingredients'] # not using the index or uncleaned Ingredients column in CSV
DTYPES = {
    'ID': 'int',
    'Title': 'str',
    'Ingredients': 'str',
    'Instructions': 'str',
    'Image_Name': 'str',
    'Cleaned_Ingredients': 'str'
}

# Read the CSV into a pandas dataframe for ease of manipulation
dataset_df = pd.read_csv(CSV_PATH)

print("Loaded data.")
print(f"Loaded {dataset_df.shape[0]} recipes from the Epicurious Dataset.")

Loading data...
Loaded data.
Loaded 13501 recipes from the Epicurious Dataset.


In [5]:
dataset_df.head(10)

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."
5,5,Warm Comfort,"['2 chamomile tea bags', '1½ oz. reposado tequ...",Place 2 chamomile tea bags in a heatsafe vesse...,warm-comfort-tequila-chamomile-toddy,"['2 chamomile tea bags', '1½ oz. reposado tequ..."
6,6,Apples and Oranges,"['3 oz. Grand Marnier', '1 oz. Amaro Averna', ...","Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...",apples-and-oranges-spiked-cider,"['3 oz. Grand Marnier', '1 oz. Amaro Averna', ..."
7,7,Turmeric Hot Toddy,"['¼ cup granulated sugar', '¾ tsp. ground turm...","For the turmeric syrup, combine ½ cup hot wate...",turmeric-hot-toddy-claire-sprouse,"['¼ cup granulated sugar', '¾ tsp. ground turm..."
8,8,Instant Pot Lamb Haleem,"['¾ cup assorted dals (such as chana dal, moon...","Combine dals, rice, and barley in a medium bow...",instant-pot-lamb-haleem,"['¾ cup assorted dals (such as chana dal, moon..."
9,9,Spiced Lentil and Caramelized Onion Baked Eggs,"['1 (14.5-ounce) can basic lentil soup, like A...","Place an oven rack in the center of the oven, ...",spiced-lentil-and-caramelized-onion-baked-eggs,"['1 (14.5-ounce) can basic lentil soup, like A..."


In [6]:
dataset_size = dataset_df.shape[0]
print(dataset_size)
train_index = (dataset_size // 7) * 5
val_index = train_index + (dataset_size // 7)
test_index = dataset_size
print("Training samples:", train_index)
print("Validation samples:", val_index - train_index)
print("Test samples:", test_index - val_index)

13501
Training samples: 9640
Validation samples: 1928
Test samples: 1933


In [7]:
# Split the dataset into train, val, and test sets
dataset_df.loc[:, 'Partition'] = ''
dataset_df.iloc[:train_index, dataset_df.columns.get_loc('Partition')] = 'train'
dataset_df.iloc[train_index:val_index, dataset_df.columns.get_loc('Partition')] = 'val'
dataset_df.iloc[val_index:test_index, dataset_df.columns.get_loc('Partition')] = 'test'

# dataset_df.head(10)
# dataset_df.iloc[train_index:val_index].tail(10)
# dataset_df.iloc[val_index:test_index].tail(10)

In [8]:
# Strip the ingredients of measurements and preparation style
def strip_ingredients(ingrs):
    '''
    ingrs: List of ingredients
    Strip every ingredient of its measurements and preparation style. For example, 
        - "1 medium onion, chopped" should be "onion". 
        - "1 cup dry white wine" should be "dry white wine"
    Return the list of stripped ingredients.
    '''
    # Strip the ingredient of words after the punctuation
    stripped_ingrs = []
    fraction_pattern = r"\d+\s*\/\s*\d+"
    parenthesis_pattern = r'\([^)]*\)'
    fracs = "½ ¼ ¾ ⅓ ⅔ ⅕ ⅖ ⅗ ⅘ ⅙ ⅚ ⅛ ⅜ ⅝ ⅞"
    for ingr in ingrs:
        if 'ingredient info' in ingr.lower():
            continue
        clean_ingr = re.sub(parenthesis_pattern, '', ingr)
        stopwords = ["cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons",
                 "tbsp", "tsp", "ounce", "ounces", "oz", "fl oz", "fluid ounce",
                 "pint", "pints", "quart", "quarts", "gallon", "gallons", "pound",
                 "pounds", "lb", "mg", "milligram", "grams", "g", "kg", "kilogram",
                 "medium", "large", "small", "diced", "chopped", "pinch", "pinches",
                 "slice", "slices", "piece", "pieces", "cloves", "clove", "cans", "can",
                 "of", "in", "with", "for", "to", "as", "from", "on", "at", "by", "plus", "sticks", "equipment", "serving"]
        tokenized_ingr = nltk.tokenize.word_tokenize(clean_ingr)
        stripped_tokens = []
        for tok in tokenized_ingr:
            if tok in ";:,":
                break
            if tok.isdigit() or tok.lower() in stopwords or re.match(fraction_pattern, tok) or tok in string.punctuation or tok in fracs:
                continue
            stripped_tokens.append(tok)
        stripped_ingr = ' '.join(stripped_tokens)
        stripped_ingr = stripped_ingr.strip()
        if len(stripped_ingr) == 0:
            continue
        else:
            stripped_ingrs.append(stripped_ingr)
    
    return stripped_ingrs

In [9]:
print(strip_ingredients([
    "1 medium onion, chopped", 
    "1 cup dry white wine", 
    "2 large egg whites",
    "1 pound new potatoes (about 1 inch in diameter)"]))

['onion', 'dry white wine', 'egg whites', 'new potatoes']


In [10]:
# more helper functions
def get_ingredient(det_ingr, replace_dict):
    det_ingr_undrs = det_ingr.lower()
    det_ingr_undrs = ''.join(i for i in det_ingr_undrs if not i.isdigit())

    for rep, char_list in replace_dict.items():
        for c_ in char_list:
            if c_ in det_ingr_undrs:
                det_ingr_undrs = det_ingr_undrs.replace(c_, rep)
    det_ingr_undrs = det_ingr_undrs.strip()
    det_ingr_undrs = det_ingr_undrs.replace(' ', '_')

    return det_ingr_undrs


def get_instruction(instruction, replace_dict, instruction_mode=True):
    instruction = instruction.lower()

    for rep, char_list in replace_dict.items():
        for c_ in char_list:
            if c_ in instruction:
                instruction = instruction.replace(c_, rep)
        instruction = instruction.strip()
    # remove sentences starting with "1.", "2.", ... from the targets
    if len(instruction) > 0 and instruction[0].isdigit() and instruction_mode:
        instruction = ''
    return instruction

def update_counter(list_, counter_toks, istrain=False):
    for sentence in list_:
        tokens = nltk.tokenize.word_tokenize(sentence)
        if istrain:
            counter_toks.update(tokens)

In [11]:
replace_dict_ingrs = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
replace_dict_instrs = {'and': ['&', "'n"], '': ['#', '[', ']']}
id2im = {}

ingrs_file = args.save_path + 'allingrs_count.pkl'
instrs_file = args.save_path + 'allwords_count.pkl'

#####
# 1. Count words in dataset and clean
#####
if os.path.exists(ingrs_file) and os.path.exists(instrs_file) and not args.forcegen:
    print ("loading pre-extracted word counters")
    counter_ingrs = pickle.load(open(args.save_path + 'allingrs_count.pkl', 'rb'))
    counter_toks = pickle.load(open(args.save_path + 'allwords_count.pkl', 'rb'))
else:
    counter_ingrs = Counter()
    counter_toks = Counter()
    counter_ingrs_raw = Counter()

    for i, row in dataset_df.iterrows():
        # add an entry to the id -> image dictionary
        id2im[i] = row['Image_Name']

        # get the instructions for this recipe
        instrs: str = row['Instructions']
        ingrs: str = row['Cleaned_Ingredients']

        # split the recipe into a list of instructions (list of words)
        acc_len = 0 # cumulative num of words
        instrs_list = []
        if isinstance(instrs, float):
            continue
        for instr in instrs.split('\n'):
            instr = get_instruction(instr, replace_dict_instrs)
            if len(instr) > 0:
                instrs_list.append(instr)
                acc_len += len(instr.split(' '))
            
        # convert the cleaned ingredients into a Python list
        ingrs_list = ast.literal_eval(ingrs)
        ingrs_list = strip_ingredients(ingrs_list)
        filtered_ingrs = []
        for j, ingr in enumerate(ingrs_list):
            if len(ingr.split(' ')) > 0:
                filtered_ingr = get_ingredient(ingr, replace_dict_ingrs)
                filtered_ingrs.append(filtered_ingr)

        # discard recipes with too few or too many ingredients or instruction words
        if len(filtered_ingrs) < args.minnumingrs or len(filtered_ingrs) >= args.maxnumingrs \
            or len(instrs_list) < args.minnuminstrs or len(instrs_list) >= args.maxnuminstrs \
            or acc_len < args.minnumwords:
            continue

        # tokenize sentences and update counter
        update_counter(instrs_list, counter_toks, istrain=row['Partition'] == 'train')
        title = nltk.tokenize.word_tokenize(row['Title'].lower())
        if row['Partition'] == 'train':
            counter_toks.update(title)
        if row['Partition'] == 'train':
            counter_ingrs.update(filtered_ingrs)

In [12]:
print(counter_toks.most_common(100))

[('.', 113505), (',', 104995), ('and', 73943), ('the', 50860), ('to', 44703), ('a', 42693), ('in', 32666), ('with', 31551), ('until', 24082), ('minutes', 18961), ('of', 18240), ('add', 13094), ('heat', 12174), (';', 12108), ('over', 11826), ('1', 11452), ('bowl', 11398), ('about', 10759), ('salt', 10664), ('into', 9964), ('on', 9765), ('2', 8542), ('(', 8109), (')', 8106), ('oil', 7993), ('or', 7985), ('for', 7878), ('large', 7742), ('cook', 7470), ('is', 7367), ('transfer', 6665), ('medium', 6639), ('water', 6521), ('then', 6426), ('mixture', 6405), ('let', 6030), ('pan', 5918), ('pepper', 5906), ('oven', 5652), ('baking', 5646), ('at', 5529), ('remaining', 5419), ('from', 5370), ('place', 5266), ('it', 5159), ('sugar', 4933), ('butter', 4920), ('season', 4891), ('cover', 4797), ('cup', 4770), ('cool', 4719), ('top', 4698), ('if', 4621), ('stir', 4594), ('stirring', 4485), ('small', 4453), ('3', 4446), ('be', 4399), ('skillet', 4334), ('using', 4151), ('dough', 4108), ('ahead', 4098),

In [13]:
print(counter_ingrs.most_common(100))

[('kosher_salt', 3376), ('garlic', 2010), ('sugar', 1717), ('unsalted_butter', 1687), ('olive_oil', 1484), ('extra-virgin_olive_oil', 1232), ('freshly_ground_black_pepper', 1204), ('eggs', 998), ('salt', 977), ('fresh_lemon_juice', 971), ('all-purpose_flour', 931), ('vegetable_oil', 791), ('water', 561), ('onion', 537), ('vanilla_extract', 512), ('honey', 475), ('baking_powder', 448), ('heavy_cream', 447), ('whole_milk', 421), ('fresh_lime_juice', 397), ('granulated_sugar', 370), ('egg', 359), ('ground_cinnamon', 351), ('baking_soda', 320), ('crushed_red_pepper_flakes', 292), ('scallions', 266), ('ground_cumin', 259), ('finely_grated_lemon_zest', 254), ('carrots', 253), ('red_onion', 252), ('butter', 251), ('dry_white_wine', 248), ('dijon_mustard', 245), ('egg_yolks', 243), ('flaky_sea_salt', 229), ('coarse_kosher_salt', 229), ('shallot', 224), ('cornstarch', 214), ('light_brown_sugar', 214), ('mayonnaise', 211), ('red_wine_vinegar', 208), ('lemon', 205), ('soy_sauce', 205), ('cayenne_

In [14]:
# some helper functions...

def cluster_ingredients(counter_ingrs):
    '''
    Cluster similar ingredients (e.g. oil, cheese, pasta).
    '''
    mydict = dict()
    mydict_ingrs = dict()

    for k, v in counter_ingrs.items():

        w1 = k.split('_')[-1]
        w2 = k.split('_')[0]
        lw = [w1, w2]
        if len(k.split('_')) > 1:
            w3 = k.split('_')[0] + '_' + k.split('_')[1]
            w4 = k.split('_')[-2] + '_' + k.split('_')[-1]

            lw = [w1, w2, w4, w3]

        gotit = 0
        for w in lw:
            if w in counter_ingrs.keys():
                # check if its parts are
                parts = w.split('_')
                if len(parts) > 0:
                    if parts[0] in counter_ingrs.keys():
                        w = parts[0]
                    elif parts[1] in counter_ingrs.keys():
                        w = parts[1]
                if w in mydict.keys():
                    mydict[w] += v
                    mydict_ingrs[w].append(k)
                else:
                    mydict[w] = v
                    mydict_ingrs[w] = [k]
                gotit = 1
                break
        if gotit == 0:
            mydict[k] = v
            mydict_ingrs[k] = [k]

    return mydict, mydict_ingrs

def remove_plurals(counter_ingrs, ingr_clusters):
    del_ingrs = []

    for k, v in counter_ingrs.items():

        if len(k) == 0:
            del_ingrs.append(k)
            continue

        gotit = 0
        if k[-2:] == 'es':
            if k[:-2] in counter_ingrs.keys():
                counter_ingrs[k[:-2]] += v
                ingr_clusters[k[:-2]].extend(ingr_clusters[k])
                del_ingrs.append(k)
                gotit = 1

        if k[-1] == 's' and gotit == 0:
            if k[:-1] in counter_ingrs.keys():
                counter_ingrs[k[:-1]] += v
                ingr_clusters[k[:-1]].extend(ingr_clusters[k])
                del_ingrs.append(k)
    for item in del_ingrs:
        del counter_ingrs[item]
        del ingr_clusters[item]
    return counter_ingrs, ingr_clusters


In [15]:
# Cluster ingredients

# manually add missing entries for better clustering
base_words = ['peppers', 'tomato', 'spinach_leaves', 'turkey_breast', 'lettuce_leaf',
                'chicken_thighs', 'milk_powder', 'bread_crumbs', 'onion_flakes',
                'red_pepper', 'pepper_flakes', 'juice_concentrate', 'cracker_crumbs', 'hot_chili',
                'seasoning_mix', 'dill_weed', 'pepper_sauce', 'sprouts', 'cooking_spray', 'cheese_blend',
                'basil_leaves', 'pineapple_chunks', 'marshmallow', 'chile_powder',
                'cheese_blend', 'corn_kernels', 'tomato_sauce', 'chickens', 'cracker_crust',
                'lemonade_concentrate', 'red_chili', 'mushroom_caps', 'mushroom_cap', 'breaded_chicken',
                'frozen_pineapple', 'pineapple_chunks', 'seasoning_mix', 'seaweed', 'onion_flakes',
                'bouillon_granules', 'lettuce_leaf', 'stuffing_mix', 'parsley_flakes', 'chicken_breast',
                'basil_leaves', 'baguettes', 'green_tea', 'peanut_butter', 'green_onion', 'fresh_cilantro',
                'breaded_chicken', 'hot_pepper', 'dried_lavender', 'white_chocolate',
                'dill_weed', 'cake_mix', 'cheese_spread', 'turkey_breast', 'chucken_thighs', 'basil_leaves',
                'mandarin_orange', 'laurel', 'cabbage_head', 'pistachio', 'cheese_dip',
                'thyme_leave', 'boneless_pork', 'red_pepper', 'onion_dip', 'skinless_chicken', 'dark_chocolate',
                'canned_corn', 'muffin', 'cracker_crust', 'bread_crumbs', 'frozen_broccoli',
                'philadelphia', 'cracker_crust', 'chicken_breast']

for base_word in base_words:

    if base_word not in counter_ingrs.keys():
        counter_ingrs[base_word] = 1

counter_ingrs, cluster_ingrs = cluster_ingredients(counter_ingrs)
print("Counter Ingredients")
print(Counter(counter_ingrs).most_common(100))
print("Cluster Ingredients")
print(list(cluster_ingrs.items())[:100])

Counter Ingredients
[('salt', 5593), ('oil', 4577), ('sugar', 3426), ('butter', 2622), ('pepper', 2602), ('garlic', 2428), ('fresh', 1809), ('flour', 1685), ('vinegar', 1398), ('leaves', 1324), ('onion', 1256), ('cream', 1186), ('eggs', 1132), ('seeds', 1073), ('water', 986), ('vanilla', 937), ('egg', 884), ('milk', 872), ('cheese', 833), ('chicken', 664), ('lemon', 655), ('parsley', 652), ('finely', 646), ('tomatoes', 637), ('ginger', 631), ('honey', 583), ('cinnamon', 551), ('baking_powder', 469), ('potatoes', 458), ('cilantro', 450), ('mustard', 441), ('onions', 439), ('thyme', 431), ('a', 425), ('scallions', 404), ('chocolate', 368), ('bread', 363), ('dry', 347), ('chiles', 337), ('celery', 334), ('carrots', 333), ('yogurt', 330), ('crushed', 324), ('baking_soda', 324), ('soy_sauce', 323), ('``', 315), ('rice', 312), ('paprika', 305), ('shallot', 291), ('almonds', 285), ('chile', 284), ('cumin', 282), ('chives', 276), ('parmesan', 268), ('rosemary', 267), ('mushrooms', 265), ('shal

In [16]:
counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)
print("Counter Ingredients")
print(Counter(counter_ingrs).most_common(100))
print("Cluster Ingredients")
print(list(cluster_ingrs.items())[:100])

Counter Ingredients
[('salt', 5593), ('oil', 4577), ('sugar', 3426), ('pepper', 2749), ('butter', 2622), ('garlic', 2428), ('egg', 2016), ('fresh', 1809), ('onion', 1695), ('flour', 1685), ('vinegar', 1398), ('leaves', 1324), ('cream', 1186), ('seeds', 1073), ('water', 986), ('vanilla', 937), ('tomato', 890), ('milk', 872), ('cheese', 835), ('lemon', 766), ('chicken', 687), ('parsley', 652), ('finely', 646), ('ginger', 631), ('chile', 621), ('honey', 583), ('cinnamon', 551), ('potato', 549), ('shallot', 548), ('baking_powder', 469), ('carrot', 466), ('scallion', 454), ('cilantro', 450), ('mustard', 441), ('thyme', 431), ('a', 425), ('chocolate', 368), ('bread', 363), ('dry', 347), ('celery', 334), ('yogurt', 330), ('crushed', 324), ('baking_soda', 324), ('soy_sauce', 323), ('lime', 321), ('``', 315), ('rice', 312), ('paprika', 305), ('green', 305), ('orange', 300), ('almonds', 285), ('cumin', 282), ('chives', 276), ('parmesan', 268), ('rosemary', 267), ('mushrooms', 265), ('mayonnaise'

In [17]:
print("Counter tokens:", len(counter_toks))
print("Counter ingredients:", len(counter_ingrs))
words = [word for word, cnt in counter_toks.items() if cnt >= args.threshold_words]
ingrs = {word: cnt for word, cnt in counter_ingrs.items() if cnt >= args.threshold_ingrs}
print("Total words:", len(words))
print(words)
print("Total ingredients:", len(ingrs))
print(ingrs)

Counter tokens: 15251
Counter ingredients: 2794
Total words: 3650
['preheat', 'oven', 'to', '400°f', 'and', 'line', 'a', 'rimmed', 'baking', 'sheet', 'with', 'parchment', '.', 'in', 'large', 'bowl', ',', 'whisk', 'the', 'egg', 'whites', 'until', 'foamy', '(', 'there', 'shouldn', '’', 't', 'be', 'any', 'liquid', ')', 'add', 'potatoes', 'toss', 'they', 're', 'well', 'coated', 'then', 'transfer', 'strainer', 'or', 'colander', 'let', 'excess', 'drain', 'season', 'salt', 'pepper', 'herbs', 'scatter', 'on', 'make', 'sure', 'not', 'touching', 'roast', 'are', 'very', 'crispy', 'tender', 'when', 'poked', 'knife', '15', '20', 'minutes', 'depending', 'size', 'of', 'serve', 'place', 'rack', 'middle', ';', '400°', 'bring', 'evaporated', 'milk', 'whole', 'bare', 'simmer', 'saucepan', 'over', 'medium', 'heat', 'garlic', 'powder', 'onion', 'paprika', '1', 'tsp', 'working', 'batches', 'three', 'cheddar', 'all', 'cream', 'cheese', 'meanwhile', 'pot', 'generously', 'salted', 'water', 'boil', 'it', 'shoul

In [18]:
# Create a vocabulary
vocab_toks = Vocabulary()
vocab_toks.add_word('<start>')
vocab_toks.add_word('<end>')
vocab_toks.add_word('<eoi>') # end of recipe

# Add the words to the vocabulary.
for i, word in enumerate(words):
    vocab_toks.add_word(word)
vocab_toks.add_word('<pad>')

3654

In [19]:
# Ingredient vocab
# Create a vocab wrapper for ingredients
vocab_ingrs = Vocabulary()
idx = vocab_ingrs.add_word('<end>')
# this returns the next idx to add words to
# Add the ingredients to the vocabulary.
for k, _ in ingrs.items():
    for ingr in cluster_ingrs[k]:
        idx = vocab_ingrs.add_word(ingr, idx)
    idx += 1
_ = vocab_ingrs.add_word('<pad>', idx)

print("Total ingr vocabulary size: {}".format(len(vocab_ingrs)))
print("Total token vocabulary size: {}".format(len(vocab_toks)))

Total ingr vocabulary size: 451
Total token vocabulary size: 3654


In [20]:
dataset = {'train': [], 'val': [], 'test': []}

######
# 2. Tokenize and build dataset based on vocabularies.
######
IMAGE_DIR = '../archive/Food Images/'
for i, row in dataset_df.iterrows():
    instrs_list = []
    ingrs_list = []
    images_list = []

    # retrieve pre-detected ingredients for this entry
    labels = []

    ingrs = row['Cleaned_Ingredients']
    ingrs = ast.literal_eval(ingrs)
    ingrs = strip_ingredients(ingrs)
    filtered_ingrs = []
    for j, ingr in enumerate(ingrs):
        if len(ingr.split(' ')) > 0:
            filtered_ingr = get_ingredient(ingr, replace_dict_ingrs)
            filtered_ingrs.append(filtered_ingr)

    for j, ingr in enumerate(filtered_ingrs):
        if len(ingr) > 0:
            filtered_ingr_undrs = get_ingredient(ingr, replace_dict_ingrs)
            ingrs_list.append(filtered_ingr_undrs)
            label_idx = vocab_ingrs(filtered_ingr_undrs)
            if label_idx is not vocab_ingrs('<pad>') and label_idx not in labels:
                labels.append(label_idx)

    # get raw text for instructions of this entry

    # get all instructions for this recipe
    instrs = row['Instructions']
    acc_len = 0
    if isinstance(instrs, float):
        continue
    for instr in instrs.split('\n'):
        instr = get_instruction(instr, replace_dict_instrs)
        if len(instr) > 0:
            acc_len += len(instr.split(' '))
            instrs_list.append(instr)

    # we discard recipes with too many or too few ingredients or instruction words
    if len(labels) < args.minnumingrs or len(instrs_list) < args.minnuminstrs \
            or len(instrs_list) >= args.maxnuminstrs or len(labels) >= args.maxnumingrs \
            or acc_len < args.minnumwords:
        continue

    # if an image path exists, append it to the images list
    if len(id2im[i]) > 0:
        images_list.append(id2im[i])

    # tokenize sentences
    toks = []
    
    for instr in instrs_list:
        tokens = nltk.tokenize.word_tokenize(instr)
        toks.append(tokens)

    title = nltk.tokenize.word_tokenize(row['Title'].lower())
    # print("creating new entry")
    newentry = {'id': i, 'instructions': instrs_list, 'tokenized': toks,
                'ingredients': ingrs_list, 'images': images_list, 'title': title} # NOTE: 'images' => list[str]
    dataset[row['Partition']].append(newentry)

print('Dataset size:')
total_size = 0
for split in dataset.keys():
    split_size = len(dataset[split])
    total_size += split_size
    print(split, ':', split_size)
print("total size :", total_size)

Dataset size:
train : 7879
val : 1541
test : 1604
total size : 11024
