# Tutorials Utilized for this Notebook:


*   Word2Vec with ingredients: https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook
*   Gensim Word2Vec tutorial: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial

**Code that has been taken from these tutorials has been labeled in the cells**



In [230]:
from gensim.models import word2vec
import numpy as np
import pandas as pd
from time import time
import re
import json

In [231]:
# method to do top-n words calculations
def topWordsCalculations(model_to_use, words):
  for word in words:
    top10 = model_to_use.wv.most_similar(word)
    print('word=', word)
    print('output:', top10)
    print('top-1 average:', top10[0][1])
    print('top-5 average:', sum([w[1] for w in top10[0:5]])/5)
    print('top-10 average:', sum([w[1] for w in top10])/10)
    print('\n')

# Data Loading / Processing

In [232]:
path_to_data = '/content/recipes.json'

# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)
train_recipes = json.load(open(path_to_data,'r'))

raw_ingredients = list()

for recipe in train_recipes:
    for ingredient in recipe[u'ingredients']:
        raw_ingredients.append(ingredient.strip())

raw_ingredients

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles',
 'plain flour',
 'ground pepper',
 'salt',
 'tomatoes',
 'ground black pepper',
 'thyme',
 'eggs',
 'green tomatoes',
 'yellow corn meal',
 'milk',
 'vegetable oil',
 'eggs',
 'pepper',
 'salt',
 'mayonaise',
 'cooking oil',
 'green chilies',
 'grilled chicken breasts',
 'garlic powder',
 'yellow onion',
 'soy sauce',
 'butter',
 'chicken livers',
 'water',
 'vegetable oil',
 'wheat',
 'salt',
 'black pepper',
 'shallots',
 'cornflour',
 'cayenne pepper',
 'onions',
 'garlic paste',
 'milk',
 'butter',
 'salt',
 'lemon juice',
 'water',
 'chili powder',
 'passata',
 'oil',
 'ground cumin',
 'boneless chicken skinless thigh',
 'garam masala',
 'double cream',
 'natural yogurt',
 'bay leaf',
 'plain flour',
 'sugar',
 'butter',
 'eggs',
 'fresh ginger root',
 'salt',
 'ground cinnamon',
 'milk',
 'vanilla extract',
 'ground ginger',
 'po

In [233]:
import collections
word_counts = collections.Counter(raw_ingredients)
word_counts.most_common()

[('salt', 18049),
 ('onions', 7972),
 ('olive oil', 7972),
 ('water', 7457),
 ('garlic', 7380),
 ('sugar', 6434),
 ('garlic cloves', 6237),
 ('butter', 4848),
 ('ground black pepper', 4785),
 ('all-purpose flour', 4632),
 ('pepper', 4438),
 ('vegetable oil', 4385),
 ('eggs', 3388),
 ('soy sauce', 3296),
 ('kosher salt', 3113),
 ('green onions', 3078),
 ('tomatoes', 3058),
 ('large eggs', 2948),
 ('carrots', 2814),
 ('unsalted butter', 2782),
 ('ground cumin', 2747),
 ('extra-virgin olive oil', 2747),
 ('black pepper', 2627),
 ('milk', 2263),
 ('chili powder', 2036),
 ('oil', 1970),
 ('red bell pepper', 1939),
 ('purple onion', 1896),
 ('scallions', 1891),
 ('grated parmesan cheese', 1886),
 ('sesame oil', 1773),
 ('corn starch', 1757),
 ('ginger', 1755),
 ('baking powder', 1738),
 ('jalapeno chilies', 1730),
 ('dried oregano', 1707),
 ('chopped cilantro fresh', 1698),
 ('fresh lemon juice', 1679),
 ('diced tomatoes', 1624),
 ('fresh parsley', 1604),
 ('minced garlic', 1583),
 ('chicken

In [234]:
# list of 5 words to test (to do calculations on)
test_words = ['fresh basil', 'cilantro', 'kalamata olives', 'black beans', 'garam masala']

In [235]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)
symbols = list()

for recipe in train_recipes:
    for ingredient in recipe['ingredients']:
        if re.match("\(|@|\$\?", ingredient.lower()):
            symbols.append(ingredient)
len(symbols)
counts_symbols = collections.Counter(symbols)
counts_symbols.most_common(20)

[('(    oz.) tomato sauce', 11),
 ('(   oz.) tomato paste', 9),
 ('(14.5 oz.) diced tomatoes', 3),
 ('(15 oz.) refried beans', 3),
 ('(10 oz.) frozen chopped spinach', 3),
 ('(10 oz.) frozen chopped spinach, thawed and squeezed dry', 2),
 ('(14 oz.) sweetened condensed milk', 2)]

In [236]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)
sentences = list()
# one hot ingredients


for recipt in train_recipes:
    clean_recipt = list()
    for ingredient in recipt['ingredients']:
        # remove this description from the ingredients
        # minimal preprocessing
        ingredient =  re.sub(r'\(.*oz.\)|crushed|crumbles|ground|minced|powder|chopped|sliced',
                             '',
                             ingredient)
        clean_recipt.append(ingredient.strip())
    sentences.append(clean_recipt)

len(sentences)

39774

# Word2Vec Model 1

In [237]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)

# Set values for NN parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # 50% of the corpus
num_workers = 4       # Number of CPUs
context = 10          # window size
downsampling = 1e-3   # threshold for configuring which
                      # higher-frequency words are randomly downsampled

# Initialize and train the model
model = word2vec.Word2Vec(workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

In [238]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab:', time() - t)

Time to build vocab: 0.17156410217285156


In [239]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model.train(sentences, total_examples=model.corpus_count, epochs=10, report_delay=1)

print('Time to train the model:', time() - t)

Time to train the model: 9.646828889846802


## Top-1, Top-5, Top-10 for model 1

In [240]:
topWordsCalculations(model, test_words)

word= fresh basil
output: [('fresh basil leaves', 0.7428125143051147), ('basil leaves', 0.7322972416877747), ('chees fresh mozzarella', 0.6709271669387817), ('penne', 0.6659830808639526), ('Italian turkey sausage', 0.6659452319145203), ('olive oil flavored cooking spray', 0.647891104221344), ('bow-tie pasta', 0.6407570838928223), ('pesto', 0.6385067701339722), ('yellow squash', 0.6304804086685181), ('spinach leaves', 0.6240914463996887)]
top-1 average: 0.7428125143051147
top-5 average: 0.6955930471420289
top-10 average: 0.6659692049026489


word= cilantro
output: [('fresh cilantro', 0.8856534361839294), ('cilantro leaves', 0.858186662197113), ('cilantro fresh', 0.7167872786521912), ('fresh coriander', 0.6562798023223877), ('cilantro stems', 0.6503143310546875), ('sweet corn', 0.5807778239250183), ('serrano peppers', 0.5773773789405823), ('chile', 0.5482127666473389), ('hot chili', 0.5370160937309265), ('Mexican oregano', 0.5146968960762024)]
top-1 average: 0.8856534361839294
top-5 aver

In [241]:
model.wv.most_similar(['eggs', 'milk', 'vanilla extract'])

[('evaporated milk', 0.8159870505332947),
 ('pastry', 0.7985823750495911),
 ('softened butter', 0.7826281189918518),
 ('beaten eggs', 0.780570924282074),
 ('melted butter', 0.772914707660675),
 ('semi-sweet chocolate morsels', 0.7638320922851562),
 ('shortening', 0.7602747082710266),
 ('single crust pie', 0.7598553895950317),
 ('pie crust', 0.7565968632698059),
 ('whole wheat pastry flour', 0.7527552247047424)]

# Word2Vec Model 2 - Decreased Window Size by 1/2

In [242]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)

# Set values for NN parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # 50% of the corpus
num_workers = 4       # Number of CPUs
context = 5          # window size
downsampling = 1e-3   # threshold for configuring which
                      # higher-frequency words are randomly downsampled

# Initialize and train the model
model2 = word2vec.Word2Vec(workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

In [243]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model2.build_vocab(sentences, progress_per=10000)

print('Time to build vocab:', time() - t)

Time to build vocab: 0.23670148849487305


In [244]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model2.train(sentences, total_examples=model2.corpus_count, epochs=10, report_delay=1)

print('Time to train the model:', time() - t)

Time to train the model: 7.734004497528076


## Top-1, Top-5, Top-10 for model 2

In [245]:
topWordsCalculations(model2, test_words)

word= fresh basil
output: [('fresh basil leaves', 0.7621052861213684), ('basil leaves', 0.7479035258293152), ('linguine', 0.7066890001296997), ('bow-tie pasta', 0.7018566131591797), ('chees fresh mozzarella', 0.6827179789543152), ('penne', 0.6805809140205383), ('pesto', 0.662975549697876), ('yellow squash', 0.6602261066436768), ('spinach leaves', 0.6521127820014954), ('fusilli', 0.6481409668922424)]
top-1 average: 0.7621052861213684
top-5 average: 0.7202544808387756
top-10 average: 0.6905308723449707


word= cilantro
output: [('fresh cilantro', 0.8399322032928467), ('cilantro leaves', 0.8198192715644836), ('cilantro fresh', 0.6991016268730164), ('chile', 0.6756662130355835), ('serrano peppers', 0.6597104668617249), ('cooked brown rice', 0.6166772842407227), ('cilantro stems', 0.606759786605835), ('fresh coriander', 0.6028186082839966), ('hot chili', 0.6027405261993408), ('sweet corn', 0.5893930792808533)]
top-1 average: 0.8399322032928467
top-5 average: 0.738845956325531
top-10 average

In [246]:
model2.wv.most_similar(['eggs', 'milk', 'vanilla extract'])

[('low-fat milk', 0.8713571429252625),
 ('evaporated milk', 0.8594532012939453),
 ('melted butter', 0.8495365381240845),
 ('softened butter', 0.8274590373039246),
 ('pastry', 0.8174067735671997),
 ('shortening', 0.8144609332084656),
 ('single crust pie', 0.7883224487304688),
 ('medium eggs', 0.7871817350387573),
 ('lemon extract', 0.7867909669876099),
 ('golden syrup', 0.7859845757484436)]

# Word2Vec Model 3 - Added learning rate of 0.005

In [247]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)

# Set values for NN parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # 50% of the corpus
num_workers = 4       # Number of CPUs
context = 10          # window size
downsampling = 1e-3   # threshold for configuring which
                      # higher-frequency words are randomly downsampled

# Initialize and train the model
model3 = word2vec.Word2Vec(workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, alpha=0.005)

In [248]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model3.build_vocab(sentences, progress_per=10000)

print('Time to build vocab:', time() - t)

Time to build vocab: 0.23292899131774902


In [249]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model3.train(sentences, total_examples=model3.corpus_count, epochs=10, report_delay=1)

print('Time to train the model:', time() - t)

Time to train the model: 10.170792818069458


## Top-1, Top-5, Top-10 for model 3

In [250]:
topWordsCalculations(model3, test_words)

word= fresh basil
output: [('balsamic vinegar', 0.9886016249656677), ('fresh basil leaves', 0.9833713173866272), ('freshly  pepper', 0.9761232733726501), ('linguine', 0.9750982522964478), ('capers', 0.9738101959228516), ('pitted kalamata olives', 0.9735633730888367), ('fresh oregano', 0.9724856615066528), ('spaghetti', 0.9699956774711609), ('extra-virgin olive oil', 0.9664415121078491), ('fat free less sodium chicken broth', 0.966299295425415)]
top-1 average: 0.9886016249656677
top-5 average: 0.9794009327888489
top-10 average: 0.9745790183544158


word= cilantro
output: [('fresh cilantro', 0.9871524572372437), ('lime', 0.9787974953651428), ('lime juice', 0.9727945327758789), ('lime wedges', 0.9701160788536072), ('cilantro fresh', 0.9577324986457825), ('cilantro leaves', 0.954762876033783), ('chicken breasts', 0.9540164470672607), ('jalapeno chilies', 0.9519700407981873), ('white onion', 0.9512802362442017), ('chipotle paste', 0.9457230567932129)]
top-1 average: 0.9871524572372437
top-5

In [251]:
model3.wv.most_similar(['eggs', 'milk', 'vanilla extract'])

[('buttermilk', 0.9875501990318298),
 ('unbaked pie crusts', 0.982410192489624),
 ('baking', 0.9808518886566162),
 ('shortening', 0.980090320110321),
 ('melted butter', 0.9729699492454529),
 ('anise extract', 0.9702609181404114),
 ('cornmeal', 0.9696202278137207),
 ('evaporated milk', 0.9695850610733032),
 ('farmer cheese', 0.9665543437004089),
 ('self rising flour', 0.9648932218551636)]

# Word2Vec Model 4 - Decreased Window Size (window = 5), Learning Rate Added (lr = 0.005)

In [252]:
# code taken from:  https://www.kaggle.com/code/ccorbi/word2vec-with-ingredients/notebook (word2vec with ingredients)

# Set values for NN parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # 50% of the corpus
num_workers = 4       # Number of CPUs
context = 5          # window size
downsampling = 1e-3   # threshold for configuring which
                      # higher-frequency words are randomly downsampled

# Initialize and train the model
model4 = word2vec.Word2Vec(workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, alpha=0.005)

In [253]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model4.build_vocab(sentences, progress_per=10000)

print('Time to build vocab:', time() - t)

Time to build vocab: 0.22496700286865234


In [254]:
# code taken from: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial (gensim word2vec tutorial)
t = time()

model4.train(sentences, total_examples=model4.corpus_count, epochs=10, report_delay=1)

print('Time to train the model:', time() - t)

Time to train the model: 9.091677188873291


## Top-1, Top-5, Top-10 for model 4

In [255]:
topWordsCalculations(model4, test_words)

word= fresh basil
output: [('balsamic vinegar', 0.9876619577407837), ('extra-virgin olive oil', 0.9849947094917297), ('capers', 0.9828872680664062), ('fresh basil leaves', 0.979954719543457), ('fresh oregano', 0.9786477088928223), ('linguine', 0.9735621213912964), ('arborio rice', 0.9734840393066406), ('spaghetti', 0.9728142023086548), ('flat leaf parsley', 0.9714289307594299), ('freshly  pepper', 0.9708184003829956)]
top-1 average: 0.9876619577407837
top-5 average: 0.9828292727470398
top-10 average: 0.9776254057884216


word= cilantro
output: [('fresh cilantro', 0.9888460040092468), ('lime', 0.9884642362594604), ('lime juice', 0.9845849275588989), ('lime wedges', 0.9755030870437622), ('cilantro fresh', 0.9707152247428894), ('chicken breasts', 0.9647049903869629), ('jalapeno chilies', 0.9620205163955688), ('cilantro leaves', 0.9571397304534912), ('white onion', 0.9512550234794617), ('boneless skinless chicken breasts', 0.9496047496795654)]
top-1 average: 0.9888460040092468
top-5 averag

In [256]:
model4.wv.most_similar(['eggs', 'milk', 'vanilla extract'])

[('buttermilk', 0.9928349256515503),
 ('shortening', 0.9897841811180115),
 ('baking', 0.9897537231445312),
 ('cornmeal', 0.9829892516136169),
 ('pecans', 0.9821597933769226),
 ('baking soda', 0.9802330136299133),
 ('vanilla', 0.9801535606384277),
 ('melted butter', 0.9798804521560669),
 ('white cornmeal', 0.9791784882545471),
 ('confectioners sugar', 0.9790623188018799)]