<a href="https://colab.research.google.com/github/map222/Kibbeh/blob/master/ingredient_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingredient recommender



## Setup:


### Google drive
First, mount files from Google drive (to copy the file to your share folder, click [this link](https://drive.google.com/drive/folders/1fh5C0Wlda0QMzBXqOj6znhS8SlsCZm9O?usp=sharing)):

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
folder_loc = r'/content/gdrive/My Drive/Colab Notebooks/data/recipes/'

### Hugging Face

In [2]:
from huggingface_hub import notebook_login
notebook_login()

ModuleNotFoundError: ignored

### Imports

In [None]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict, Counter

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from ast import literal_eval
fs = 22
font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : fs}

matplotlib.rc('font', **font)
plt.rc('xtick', labelsize=fs-6)
plt.rc('ytick', labelsize=fs-6)

## Load recipes

In [None]:
test = pd.read_csv(folder_loc + 'RecipeNLG_dataset.csv', nrows = 5, sep = ',', index_col = 0)

In [None]:
test.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [None]:
%%time

recipes_pdf = pd.read_csv(folder_loc + 'RecipeNLG_dataset.csv', sep = ',',usecols = ['title', 'NER'], converters={"NER": literal_eval})

CPU times: user 46 s, sys: 3.96 s, total: 50 s
Wall time: 58.8 s


In [None]:
recipes_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   title   object
 1   NER     object
dtypes: object(2)
memory usage: 34.0+ MB


## Pre-processing

### Stemming

In [None]:
def pre_process(token_list):
  return [word.rstrip('s').lower() for word in token_list]
recipes_pdf['NER'] = recipes_pdf['NER'].apply(pre_process)

In [None]:
recipes_pdf['NER'].head()

0    [brown sugar, milk, vanilla, nut, butter, bite...
1    [beef, chicken breast, cream of mushroom soup,...
2    [frozen corn, cream cheese, butter, garlic pow...
3    [chicken, chicken gravy, cream of mushroom sou...
4    [peanut butter, graham cracker crumb, butter, ...
Name: NER, dtype: object

In [None]:
recipes_pdf['title'].value_counts().head(2000).tail()

Tuna Ball                   59
Easy Coconut Cake           59
Lemon Poppy Seed Muffins    59
Tomato Pudding              59
Mrs. Field'S Cookies        59
Name: title, dtype: int64

In [None]:
recipes_pdf['sorted_char'] = recipes_pdf['NER'].apply(lambda row: ''.join(sorted(''.join(row).replace(' ', ''))))


In [None]:
recipes_pdf['sorted_char'].value_counts().head()

                                                  574
aaaaabbdeeefggggiiikklllllmnnooprrrrsstttuuuvw    449
aefggiklllmorstu                                  336
aaegrrstuw                                        311
aaabbdeeefggggiikklllmnooprrrrsstttuuuw           307
Name: sorted_char, dtype: int64

In [None]:
recipes_pdf.loc[recipes_pdf['sorted_char'] == 'aaabbdeeefggggiikklllmnooprrrrsstttuuuw'].head()

Unnamed: 0,title,NER,sorted_char
175,Mom'S Pancakes,"[flour, baking powder, salt, sugar, egg, milk,...",aaabbdeeefggggiikklllmnooprrrrsstttuuuw
6031,Homemade Pancakes,"[flour, salt, sugar, egg, baking powder, milk,...",aaabbdeeefggggiikklllmnooprrrrsstttuuuw
7300,Plain Biscuits,"[flour, baking powder, sugar, salt, butter, mi...",aaabbdeeefggggiikklllmnooprrrrsstttuuuw
11873,Picklelets,"[egg, milk, baking powder, butter, sugar, flou...",aaabbdeeefggggiikklllmnooprrrrsstttuuuw
18121,Pancakes,"[baking powder, flour, sugar, salt, milk, egg,...",aaabbdeeefggggiikklllmnooprrrrsstttuuuw


In [None]:
recipes_pdf = recipes_pdf.drop_duplicates(subset = 'sorted_char')
recipes_pdf.shape

(1976599, 3)

Removed 200k recipes

## Build W2V model

In [None]:
%%time
recipe_w2v = Word2Vec(recipes_pdf['NER'])

CPU times: user 2min 31s, sys: 970 ms, total: 2min 32s
Wall time: 1min 28s


In [None]:
recipe_w2v.most_similar('fish sauce', topn=10)

[('asian fish sauce', 0.7992143630981445),
 ('rice noodle', 0.7358037829399109),
 ('nuoc', 0.7349714040756226),
 ('lemon gra', 0.727480411529541),
 ('mani', 0.7231417298316956),
 ('sweet soy sauce', 0.7184907793998718),
 ('stalk lemongra', 0.7184152603149414),
 ('ketjap mani', 0.7102898359298706),
 ('stalks lemongra', 0.710004448890686),
 ('rice vermicelli', 0.7045285701751709)]

In [None]:
recipe_w2v.most_similar('apple', topn=20)

[('tart apple', 0.9103819131851196),
 ('granny smith apple', 0.8886793255805969),
 ('green apple', 0.8739408254623413),
 ('golden delicious apple', 0.8227909803390503),
 ('cooking apple', 0.82215815782547),
 ('gala apple', 0.7559868693351746),
 ('peeled apple', 0.7551245093345642),
 ('baking apple', 0.754971444606781),
 ('red apple', 0.7516821622848511),
 ('sweet apple', 0.7216936349868774),
 ('apple -', 0.7083069086074829),
 ('red delicious apple', 0.6939435005187988),
 ('gala', 0.6930819749832153),
 ('braeburn apple', 0.6748573780059814),
 ('delicious apple', 0.6615157127380371),
 ('unpeeled apple', 0.6543431282043457),
 ('crisp apple', 0.6522817015647888),
 ('apple slice', 0.6502648591995239),
 ('fuji apple', 0.628289520740509),
 ('sweet-tart apple', 0.6156004667282104)]

In [None]:
# number of ingredients
len(recipe_w2v.wv.vocab)

31124

In [None]:
ingredient_count = Counter([token for row in recipes_pdf['NER'].values for token in row])

In [None]:
def calc_cooccurrence(ingredient: str,
                      candidates,
                      recipes):
  ''' Calc how often the top ingredient co-occurs with the candidates
    - also removes candidates that are re-phrase of ingredient (e.g. "beef" and "ground beef")
    ingredient: str name of an ingredient ("apple")
    candidates: potential other ingredients ("orange")
    recipes: iterable of possible ingredients
  '''


  co_count = {}
  for candidate in candidates:
    co_count[candidate] = sum([candidate in recipe and ingredient in recipe for recipe in recipes])
  return co_count

In [None]:
def get_fusion_ingredients(ingredient: str,
                           recipe_model, #gensim model
                           recipes, #iterable of recipes
                           ingredient_count: dict,
                           max_candidates = 20,
                           min_occurence_factor = 100 # minimum number of recipes an ingredient has to be in
                           ):

  ingredient_recipes = recipes.loc[recipes.apply(lambda row: ingredient in row)]

  ingredient_candidates = recipe_model.most_similar(ingredient, topn=50) # get top similar ingredients
  candidate_names = list(zip(*ingredient_candidates))[0]
  pruned_candidates = [candidate for candidate in candidate_names if ingredient not in candidate][:max_candidates] # clean up candidates to remove duplicates (e.g. "gala apple")
  cooccurrence_counts = calc_cooccurrence(ingredient, candidate_names, ingredient_recipes) # get counts for normalization
  # final score for sorting: similarity / how often co-occur / total occurences
  min_occurences = max(cooccurrence_counts.values()) / min_occurence_factor
  print(min_occurences)
  freq_norm_candidates = {candidate[0]: candidate[1] / (cooccurrence_counts[candidate[0]]+1) / ingredient_count[candidate[0]] for candidate in ingredient_candidates if candidate[0] in pruned_candidates and cooccurrence_counts[candidate[0]] > min_occurences}
  top_candidates = sorted([(k,v) for k,v in freq_norm_candidates.items()], key=lambda x: x[1])[-5:]
  return top_candidates, cooccurrence_counts, ingredient_candidates # return multiple for debugging

In [None]:
%%time
get_fusion_ingredients('orange', recipe_w2v, recipes_pdf['NER'], ingredient_count)

37.84
CPU times: user 1.91 s, sys: 93.3 ms, total: 2 s
Wall time: 1.89 s


([('red grapefruit', 1.608362360179704e-05),
  ('grapefruit juice', 1.6227832766752586e-05),
  ('kumquat', 2.9104959653233033e-05),
  ('clementine', 2.997461284399448e-05),
  ('marmalade', 4.245900077584349e-05)],
 {'tangerine': 96,
  'clementine': 56,
  'kumquat': 60,
  'grapefruit': 178,
  'red grapefruit': 119,
  'pink grapefruit': 152,
  'valencia orange': 5,
  'grapefruit juice': 51,
  'fresh orange': 9,
  'orange section': 23,
  'tangerine juice': 16,
  'mandarin': 10,
  'sweet orange': 6,
  'pomegranate': 90,
  'fresh orange section': 0,
  'orange slice': 113,
  'pomegranate juice': 108,
  'orange juiced': 7,
  'fresh orange zest': 11,
  'orange rind strip': 3,
  'orange zest': 881,
  'marmalade': 39,
  'black grape': 14,
  'zest': 9,
  'orange blossom honey': 27,
  'persimmon': 15,
  'pear': 224,
  'pomegranate aril': 11,
  'sweet white wine': 9,
  'orange rind': 305,
  'fresh juice': 4,
  'freshly squeezed orange juice': 222,
  'orange flower': 39,
  'mandarin orange': 117,
  

In [None]:
a,b,c = get_fusion_ingredients('apple', recipe_w2v, recipes_pdf['NER'], ingredient_count,20)

21.82


In [None]:
a

[('cranberrie', 1.523270798268965e-08),
 ('apricot', 2.300174697387224e-07),
 ('pear', 2.959156797837719e-07),
 ('fresh cranberrie', 3.0331627957046705e-07),
 ('bartlett', 1.2924371311070563e-05)]

In [None]:
fish_sauce = get_fusion_ingredients('fish sauce', recipe_w2v, recipes_pdf['NER'],ingredient_count, 10)

6.76


In [None]:
fish_sauce[0]

[('rice vermicelli', 1.458651284006565e-05),
 ('fresh lemongra', 1.4904556259032546e-05),
 ('sweet soy sauce', 0.00011279290100468946),
 ('mani', 0.00011893778451179203),
 ('ketjap mani', 0.00019862691161349848)]

In [None]:
fish_sauce[2]

[('asian fish sauce', 0.7992143630981445),
 ('rice noodle', 0.7358037829399109),
 ('nuoc', 0.7349714040756226),
 ('lemon gra', 0.727480411529541),
 ('mani', 0.7231417298316956),
 ('sweet soy sauce', 0.7184907793998718),
 ('stalk lemongra', 0.7184152603149414),
 ('ketjap mani', 0.7102898359298706),
 ('stalks lemongra', 0.710004448890686),
 ('rice vermicelli', 0.7045285701751709),
 ('fresh lemongra', 0.7023175954818726),
 ('lemongra', 0.700231671333313),
 ('nuoc nam', 0.6911463141441345),
 ('lime leaf', 0.6892822980880737),
 ('red chili paste', 0.6854043006896973),
 ('fresh galangal', 0.6847156882286072),
 ('bird chile', 0.6795110106468201),
 ('lime leave', 0.6791095733642578),
 ('fishsauce', 0.677825927734375),
 ('red curry', 0.6775223016738892),
 ('shrimp paste', 0.6735805869102478),
 ('tamarind juice', 0.672806441783905),
 ('oyster sauce', 0.6725000739097595),
 ('red bird', 0.6667814254760742),
 ('cilantro root', 0.6667782068252563),
 ('ﬁsh sauce', 0.6667441725730896),
 ('lemongrass s

In [None]:
a,b,c = get_fusion_ingredients('carrot', recipe_w2v, recipes_pdf['NER'], ingredient_count, 10)

23.51


In [None]:
a

[('bone', 3.4262489299384916e-05),
 ('beef bone', 6.591575503591108e-05),
 ('kale leave', 7.309549723339305e-05),
 ('chunk', 8.086041287258938e-05),
 ('soup bone', 0.00023302498216531717)]

In [None]:
a,b,c = get_fusion_ingredients('celery', recipe_w2v, recipes_pdf['NER'], ingredient_count, 10, 200)

20.97


In [None]:
a

[('kielbasa', 1.2254227779191268e-06),
 ('fresh green bean', 4.0977841990005564e-06),
 ('sweet pea', 5.708070176332871e-06),
 ('head of cabbage', 8.651156025192663e-06),
 ('red potato', 2.1824589695292288e-05)]

In [None]:
ingredient_count['kielbasa'], ingredient_count['rutabaga']

(1680, 356)

In [None]:
b['kielbasa'], b['rutabaga']

(204, 33)

In [None]:
c

[('stalks celery', 0.9442585706710815),
 ('celery stalk', 0.8846400380134583),
 ('stalk celery', 0.854391872882843),
 ('stalks of celery', 0.7531700730323792),
 ('fresh celery', 0.6134802103042603),
 ('celery rib', 0.5167121291160583),
 ('stalk of celery', 0.5093518495559692),
 ('celery heart', 0.49992477893829346),
 ('celery top', 0.47260695695877075),
 ('cabbage', 0.4697415828704834),
 ('celery root', 0.46823692321777344),
 ('head green cabbage', 0.46069324016571045),
 ('head of cabbage', 0.45221322774887085),
 ('green cabbage', 0.44685351848602295),
 ('celery salt', 0.4338985085487366),
 ('sweet pea', 0.4328030049800873),
 ('celery seed', 0.42702776193618774),
 ('new potato', 0.42619502544403076),
 ('fresh green bean', 0.4241206645965576),
 ('celery flake', 0.42285841703414917),
 ('kielbasa', 0.4220356047153473),
 ('rib of celery', 0.42201733589172363),
 ('head cabbage', 0.41616204380989075),
 ('red potato', 0.41484180092811584),
 ('wild rice', 0.4129950702190399),
 ('white turnip',

In [None]:
recipes_pdf.loc[(recipes_pdf['NER'].apply(lambda row: 'celery' in row)) & (recipes_pdf['NER'].apply(lambda row: 'kielbasa' in row)), 'title'].value_counts()

Jambalaya                         7
Kielbasa Bean Soup                4
Bean Soup                         3
Kielbasa Soup                     2
Sausage Stew                      2
                                 ..
Oven Baked Split Pea Soup         1
Turkey And Sausage Jambalaya      1
Kielbasa And Lentil Soup          1
Creole Stuffed Shrimp             1
Incredible 20 Minute Bean Soup    1
Name: title, Length: 184, dtype: int64