In [1]:
pip install kaggle



In [2]:
# get api key from kaggle settings, save kaggle.json to the files
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# recipe ingredients dataset to train standardization model
!kaggle datasets download kaggle/recipe-ingredients-dataset
!unzip recipe-ingredients-dataset.zip

# dataset to be used on by model to get standardized recipe lists
!kaggle datasets download pes12017000148/food-ingredients-and-recipe-dataset-with-images
!unzip food-ingredients-and-recipe-dataset-with-images.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: Food Images/Food Images/penang-fried-rice-noodles-368909.jpg  
  inflating: Food Images/Food Images/penang-rice-salad-232028.jpg  
  inflating: Food Images/Food Images/penne-alla-vodka-106042.jpg  
  inflating: Food Images/Food Images/penne-rigate-with-mixed-greens-and-pine-nuts-232975.jpg  
  inflating: Food Images/Food Images/penne-with-almond-pesto-and-green-beans-56389804.jpg  
  inflating: Food Images/Food Images/penne-with-broccoli-rabe-walnuts-and-pecorino-240085.jpg  
  inflating: Food Images/Food Images/penne-with-garrotxa-serrano-ham-and-sun-dried-tomatoes-51201410.jpg  
  inflating: Food Images/Food Images/penne-with-green-olives-and-feta-241865.jpg  
  inflating: Food Images/Food Images/penne-with-grilled-eggplant-and-radicchio-sauce-239051.jpg  
  inflating: Food Images/Food Images/penne-with-grilled-zucchini-ricotta-salata-and-mint-239062.jpg  
  inflating: Food Images/Food Images/penne-with-haz

In [3]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import json

In [4]:
# open unzipped files
f_test = open('test.json')
f_train = open('train.json')
data = json.load(f_train) + json.load(f_test)

In [5]:
ingredients = set()

# iterate through all ingredients in each recipe
for recipe in data:
  ings = [ing for ing in recipe['ingredients']]
  for ing in ings:
    # use set to avoid duplicate entries
    ingredients.add(ing)

# convert to list for easier operations
ingredients = list(ingredients)

print(f"length of ingredients list: {len(ingredients)}")
print(f"first 10 ingredients: {ingredients[:10]}")

length of ingredients list: 7137
first 10 ingredients: ['salad oil', 'pork sausages', 'knorr cilantro minicub', 'Pace Salsa', 'chile powder', 'vanilla', 'sesame salt', 'horse gram', 'king salmon', 'greater galangal']


In [6]:
# set of adjectives to look out for
mods = set()
mods1 = ['baked', 'blanched', 'blackened', 'braised', 'breaded', 'broiled', 'caramelized', 'charred', 'fermented', 'fried',
         'glazed', 'infused', 'marinated', 'poached', 'roasted', 'sauteed', 'seared', 'smoked', 'whipped']
mods2 = ['diced', 'battered', 'blackened', 'blanched', 'blended', 'boiled', 'boned', 'braised', 'brewed', 'broiled',
           'browned', 'butterflied', 'candied', 'canned', 'caramelized', 'charred', 'chilled', 'chopped', 'clarified',
           'condensed', 'creamed', 'crystalized', 'curdled', 'cured', 'curried', 'dehydrated', 'deviled', 'diluted',
           'dredged', 'drenched', 'dried', 'drizzled', 'dry roasted', 'dusted', 'escalloped', 'evaporated', 'fermented',
           'filled', 'folded', 'freeze dried', 'fricaseed', 'fried', 'glazed', 'granulated', 'grated', 'griddled', 'grilled',
           'hardboiled', 'homogenized', 'kneaded', 'malted', 'mashed', 'minced', 'mixed', 'medium', 'small', 'large',
           'packed', 'pan-fried', 'parboiled', 'parched', 'pasteurized', 'peppered', 'pickled', 'powdered', 'preserved',
           'pulverized', 'pureed', 'redolent', 'reduced', 'refrigerated', 'chilled', 'roasted', 'rolled', 'salted',
           'saturated', 'scalded', 'scorched', 'scrambled', 'seared', 'seasoned', 'shredded', 'skimmed', 'sliced',
           'slivered', 'smothered', 'soaked', 'soft-boiled', 'hard-boiled', 'stewed', 'stuffed', 'toasted', 'whipped',
           'wilted', 'wrapped']
# remove duplicates
for adj_list in [mods1, mods2]:
    for mod in adj_list:
        mods.add(mod)

# convert to list
mods = list(mods)

In [7]:
# set of units to look out for
units = set()

units1 = ['l', 'dl', 'milliliter', 'liter', 'deciliter', 'teaspoon', 't.', 'tsp.',
             'milliliters', 'liters', 'deciliters', 'teaspoons', 't.', 'tsp.',
            'tablespoon', 'T.', 'tbsp.', 'ounce', 'fl oz', 'cup', 'c.', 'pint', 'pt.',
            'tablespoons', 'ounces', 'fl ozs', 'cups', 'pints', 'quarts', 'gallons', 'grams', 'kilograms',
            'quart', 'qt.', 'gallon', 'gal', 'mg', 'milligram', 'g', 'gram', 'kg', 'kilogram', 'milligrams',
            'pound', 'lb', 'ounce', 'oz', 'count', 'pints', 'quarts', 'cups', 'tablespoons',
            'pounds', 'lbs', 'ounces', 'units', 'drops', 'tsps.', 'tbsps.', 'Ts.', 'ts.',
            'teaspoons', 'dash', 'pinch', 'drop', 'dram', 'smidgeon', 'dashes', 'pinches', 'drops',
             'drams', 'smidgeons', ]

for unit_list in [units1]:
    for unit in unit_list:
        units.add(unit)

units = list(units)

In [8]:
print(len(mods))
print(len(units))

101
62


In [9]:
# set of quantities to look out for, in both string and integer representations
quantities = {
    "1/2": 0.5,
    "1/4": 0.25,
    "1/3": 0.333,
    "2/3": 0.666,
    "3/4": 0.75,
    "half": 0.5,
    "third": 0.333,
    "quarter": 0.25,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5,
    "6": 6,
    "7": 7,
    "8": 8,
    "9": 9,
    "10": 10,
    "11": 11,
    "12": 12,
    "a dozen": 12,
    "a baker's dozen": 13,
    "two dozen": 24,
    "three dozen": 36,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
    "ten": 10,
    "eleven": 11,
    "twelve": 12
}

In [10]:
# build the training data
# generate ingredient data
def generate_data(num):
  # lists of length num
  X = [0] * num
  Y = [0] * num

  for i in range(num):
    # pick random quantity from list (either string or integer form)
    rnd_quantity_str, rnd_quantity_int = random.choice(list(quantities.items()))

    # pick random unit
    rnd_unit = random.choice(units)

    # only pick random mod 1/3 of time
    choose_mod = random.choice([None, None, True])

    # pick random modifier if to be used (if selected true in statement above)
    rnd_mod = random.choice(mods)

    # pick random ingredient
    rnd_ing = random.choice(ingredients)

    # no unit 1/5 of time (ex: 5 pickles, chopped) use 'count' as placeholder
    no_unit = random.choice([False, False, False, False, True])

    if no_unit:
      rnd_unit = 'count'

    # output Y, words separated into dictionary
    if choose_mod:
      # ex: {"qty": 36, "unit": "count", "item": "eggs", "mod": "scrambled"}
      Y[i] = f'{{ quantity: {rnd_quantity_int} , unit: {rnd_unit} , item: {rnd_ing} , mod: {rnd_mod} }}'
    else:
      # ex: "mod": None
      Y[i] = f'{{ quantity: {rnd_quantity_int} , unit: {rnd_unit} , item: {rnd_ing} , mod: {None} }}'

    # input X, in original form (ex: "3 dozen scrambled eggs")

    # some ingredients have modifications at end (ex: 5 eggs, scrambled vs 3 dozen scrambled eggs)
    # 1/3 of time there is a modification, 1/2 of time the modification is end (1/6)
    rnd_end_mod = random.choice([False, True])

    if choose_mod:
      # no units
      if no_unit:
        # modification at end or in middle of phrase
        if rnd_end_mod:
          # ex: 3 eggs, scrambled
          X[i] = f'{rnd_quantity_str} {rnd_ing} , {rnd_mod}'
        else:
          # ex: 3 scrambled eggs
          X[i] = f'{rnd_quantity_str} {rnd_mod} {rnd_ing}'
      else:
        # add unit compared to previous
        if rnd_end_mod:
          # ex: 3 cups eggs, scrambled
          X[i] = f'{rnd_quantity_str} {rnd_unit} {rnd_ing} , {rnd_mod}'
        else:
          # ex: 3 cups scrambled eggs
          X[i] = f'{rnd_quantity_str} {rnd_unit} {rnd_mod} {rnd_ing}'
    else:
      # no modification
      if no_unit:
        # ex: 3 eggs
        X[i] = f'{rnd_quantity_str} {rnd_ing}'
      else:
        # ex: 3 cups eggs
        X[i] = f'{rnd_quantity_str} {rnd_unit} {rnd_ing}'

  return ((X, Y))

In [11]:
# run some examples
N_TRAINING = 100_000

x, y = generate_data(N_TRAINING)

for i in range(15):
    print(f"{x[i]}  ==>  {y[i]}")

10 quickcooking grits  ==>  { quantity: 10 , unit: count , item: quickcooking grits , mod: None }
ten carrot sticks  ==>  { quantity: 10 , unit: count , item: carrot sticks , mod: None }
third smidgeon cod fillets  ==>  { quantity: 0.333 , unit: smidgeon , item: cod fillets , mod: None }
three dozen ounce corn syrup  ==>  { quantity: 36 , unit: ounce , item: corn syrup , mod: None }
six quart pears  ==>  { quantity: 6 , unit: quart , item: pears , mod: None }
third laurel leaves  ==>  { quantity: 0.333 , unit: count , item: laurel leaves , mod: None }
half dashes Niçoise olives , candied  ==>  { quantity: 0.5 , unit: dashes , item: Niçoise olives , mod: candied }
1/2 quart tabbouleh  ==>  { quantity: 0.5 , unit: quart , item: tabbouleh , mod: None }
quarter drams frozen cod fillets  ==>  { quantity: 0.25 , unit: drams , item: frozen cod fillets , mod: None }
2/3 lbs Kraft Miracle Whip Dressing  ==>  { quantity: 0.666 , unit: lbs , item: Kraft Miracle Whip Dressing , mod: None }
twelve 

In [12]:
# seq2seq model in this case
# word tokenization
print(f"{len(ingredients) + len(mods) + len(units) + len(quantities)} unique words used for training")
# round up to 7400
VOCAB_SIZE = 7400

max_len = -1
for i in range(N_TRAINING):
    x_len = len(x[i].split())
    y_len = len(y[i].split())
    if x_len > max_len:
        max_len = x_len
    if y_len > max_len:
        max_len = y_len

print(f"Longest number of words in input or output: {max_len}")
MAX_LENGTH = 30 # round up to be safe

# text vectorization using keras functions
# output_sequence_length are all same length by adding padding
tv_layer_x = keras.layers.TextVectorization(VOCAB_SIZE, standardize=None, split='whitespace', output_sequence_length=MAX_LENGTH)
tv_layer_y = keras.layers.TextVectorization(VOCAB_SIZE, standardize=None, split='whitespace', output_sequence_length=MAX_LENGTH)

# use on training data
tv_layer_x.adapt(x)
# add start token and end token to data
tv_layer_y.adapt([f"SOS {seq} EOS" for seq in y])

print(tv_layer_x.get_vocabulary()[:25])
print("")
print(tv_layer_y.get_vocabulary()[:25])

7336 unique words used for training
Longest number of words in input or output: 24
['', '[UNK]', ',', 'dozen', 'a', 'two', 'three', '1/2', '2/3', 'third', 'half', 'one', "baker's", '3', 'six', '1', '3/4', 'ten', '4', '5', '7', '10', 'five', '8', '1/4']

['', '[UNK]', ',', '}', '{', 'unit:', 'quantity:', 'mod:', 'item:', 'SOS', 'EOS', 'None', 'count', '12', '0.5', '1', '10', '0.333', '5', '6', '0.25', '8', '2', '4', '9']


In [13]:
# setting up inputs for model
train_size = int(0.9 * N_TRAINING)

encoder_input_train = tf.convert_to_tensor(x[:train_size])
encoder_input_val = tf.convert_to_tensor(x[train_size:])

# decoder input is what we want it to predict at the end of that timestep
decoder_input_train = tf.convert_to_tensor([f"SOS {seq}" for seq in y[:train_size]])
decoder_input_val = tf.convert_to_tensor([f"SOS {seq}" for seq in y[train_size:]])

# use layer since need integer tokens instead of string
decoder_targets_train = tv_layer_y([f"{seq} EOS" for seq in y[:train_size]])
decoder_targets_val = tv_layer_y([f"{seq} EOS" for seq in y[train_size:]])

print(f"encoder input train shape: {encoder_input_train.shape}")
print(f"decoder input train shape: {decoder_input_train.shape}")
print(f"decoder targets train shape: {decoder_targets_train.shape}")

print(f"encoder input train example: {encoder_input_train[0].numpy()}")
print(f"decoder input train example: {decoder_input_train[0].numpy()}")
print(f"decoder target train example: {decoder_targets_train[0].numpy()}")

encoder input train shape: (90000,)
decoder input train shape: (90000,)
decoder targets train shape: (90000, 30)
encoder input train example: b'10 quickcooking grits'
decoder input train example: b'SOS { quantity: 10 , unit: count , item: quickcooking grits , mod: None }'
decoder target train example: [   4    6   16    2    5   12    2    8 3698  662    2    7   11    3
   10    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [14]:
# MODEL BACKGROUND INFO by simon

  # recurrent neural network
    # different amounts of input data
    # feedback loop allows using sequencial data (past data)
    # input is both current and previous data
    # problem: long sequences cause vanishing gradient descent (too long causes weights to become super big or small)

  # LSTM (long short term memory)
    # avoids exploding/vanishing gradient problem
      # instead of using the same feedback loop, split into long term and short term memories
    # forget gate: short term memory output determines what percent of long term memory is remembered (sigmoid)
    # new long term memory is input to determine new short term memory to pass on

  # word embedding and word2vec
    # neural network assigns numbers to words based on context
      #better than randomly assign as similar words have similar embeddings
    # to include more context
      # continuous bag of words: uses surrounding words to predict what goes in the middle
      # skip gram: use middle word to predict surrounding words
    # negative sampling: randomly selecting subset of words to not predict, optimizes training by reducing number of words

   #seq2seq (uses all of the above)
    # encoder: turns input into collection of long and short term memories (cell and hidden states)
      # embedding layer to tokenize phrase into words and their embeddings
      # layers of LSTM
      # last cell and hidden states are called context vector
    # decoder:
      # input is context vector, decodes into output sentence
      # new sets of LSTM
      # lead to fully connected layer (basic neural network)
        # softmax function picks output words

In [15]:
# setting up seq2seq model
# turn strings into numbers and add dimensionality for better representation as input to computer
encoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)

EMBED_SIZE = 96

encoder_input_ids = tv_layer_x(encoder_inputs)
decoder_input_ids = tv_layer_y(decoder_inputs)

print(f"encoder input id shape: {encoder_input_ids.shape}")
print(f"encoder input id type: {encoder_input_ids.dtype}")

# input dim. x output dim. ==> (7000, 64)
encoder_embedding_layer = keras.layers.Embedding(VOCAB_SIZE, EMBED_SIZE, mask_zero=True)
decoder_embedding_layer = keras.layers.Embedding(VOCAB_SIZE, EMBED_SIZE, mask_zero=True)

# keras functional api
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

print(f"encoder embeddings shape: {encoder_embeddings.shape}")
print(f"decoder embeddings shape: {decoder_embeddings.shape}")

encoder input id shape: (None, 30)
encoder input id type: <dtype: 'int64'>
encoder embeddings shape: (None, 30, 96)
decoder embeddings shape: (None, 30, 96)


In [16]:
# set up encoder
encoder = keras.layers.Bidirectional(keras.layers.LSTM(256, return_state=True))

# discard outputs, keep cell and hidden states
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# short term memory is 0 and 2, long term memory is 1 and 3
encoder_state = [tf.concat(encoder_state[::2], axis=-1), tf.concat(encoder_state[1::2], axis=-1)]

In [17]:
# set up decoder
# should be twice the size as encoder as bidirectional is concatenated (forward + backward = 2 * forward)
decoder = keras.layers.LSTM(512, return_sequences=True)

# decoder input is encoder states
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [18]:
# set up dense layer to make predictions
# softmax gives probability of all words, choose the highest as next word
output_layer = keras.layers.Dense(VOCAB_SIZE, activation="softmax")
Y_probabilities = output_layer(decoder_outputs)

In [19]:
# compile model
model = keras.Model(inputs=[encoder_inputs,
                    decoder_inputs],
                    outputs=[Y_probabilities])
model.compile(loss="sparse_categorical_crossentropy",  # sparse because each word token gets its own number (not one-hot)
              optimizer="nadam",
              metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 text_vectorization (TextVe  (None, 30)                   0         ['input_1[0][0]']             
 ctorization)                                                                                     
                                                                                                  
 input_2 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 embedding (Embedding)       (None, 30, 96)               710400    ['text_vectorization[0][0]

In [20]:
# turns the model.summary() into a picture
# from keras.utils.vis_utils import plot_model
# plot_model(model, to_file = 'model.png', show_shapes = True, show_dtype = True)

In [21]:
EPOCHS = 11
model.fit((encoder_input_train, decoder_input_train),
          decoder_targets_train,
          epochs = EPOCHS,
          validation_data = ((encoder_input_val, decoder_input_val), decoder_targets_val))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.src.callbacks.History at 0x7886a020f250>

In [39]:
model.save("ingredientsmodel.keras", save_format="keras")

In [26]:
testmodel = tf.keras.models.load_model("ingredientsmodel.keras")

In [None]:
#model.export("test.tf")

In [None]:
#testmodel = tf.saved_model.load("test.tf")

In [36]:
# ingredient string to json formatted dictionary representation
def convert(ingredient):
  # translation is blank at first
  translation = ""
  for word_id in range(MAX_LENGTH):
    # string to be translated on encoder input
    x_enc = np.array([ingredient])
    # string to be translated on decoder input
    x_dec = np.array(["SOS " + translation])
    # softmax probability distribution for predicted next word
    y_probs = testmodel.predict((x_enc, x_dec), verbose = 0)[0, word_id]
    # choose highest probability word
    predicted_id = np.argmax(y_probs)
    # find word based on id
    predicted_word = tv_layer_y.get_vocabulary()[predicted_id]
    # if word is EOS, means reached max length or end
    if predicted_word == "EOS":
      break
    # else, add to prediction sequence and loop again for next word
    translation += " " + predicted_word
  return translation.strip()


In [40]:
print(convert("2 cucumbers"))
print(convert("2 tablespoons olive oil"))
print(convert("3 cups of rice"))
print(convert("2 sliced chicken breast"))

{ quantity: 2 , unit: count , item: cucumbers , mod: None }
{ quantity: 2 , unit: tablespoons , item: olive oil , mod: None }
{ quantity: 3 , unit: cups , item: recipe mix , mod: None }
{ quantity: 2 , unit: count , item: chicken drumsticks , mod: sliced }


In [None]:
# IGNORE THE REST

In [None]:
df = pd.read_csv("Food Ingredients and Recipe Dataset with Image Name Mapping.csv")
df.head()
# the images are in a separate folder
# indexes should match to the recipes

df.columns

In [None]:
print(convert("4 pounds jasmine rice"))

In [None]:
import ast

ingredients_column = []

for recipe_ingredients in df['Ingredients'].items():
  # list has quotes around it, need to convert from string to list
  recipe_ingredients = ast.literal_eval(recipe_ingredients[1])
  # make a list of the ingredients for each recipe
  ingredients_list = []
  for ingredient in recipe_ingredients:
    if len(ingredient) != 0:
      print(ingredient)
      standardized = convert(ingredient)
      print(standardized)
      if standardized.find("item: ") > -1:
        if (standardized.find(", mod: ")) > -1:
          standardized = standardized[standardized.index("item: ") + 6 : standardized.index(" , mod: ")]
        else:
          standardized = standardized[standardized.index("item: ") + 6]
        ingredients_list.append(standardized.strip())
  ingredients_column.append({'Recipe Ingredients': ingredients_list})

In [None]:
ingredients = pd.DataFrame(ingredients_column)
ingredients.head()

In [None]:
df.to_csv('standardized_ingredients.csv')
from google.colab import files
files.download("standardized_ingredients.csv")