In [1]:
import ast
import json
from pathlib import Path
from typing import List, Callable, Dict, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import engine as ng

# Root directory for artifacts produced by this notebook (JSON dumps, graph files).
data_path = Path("data")
# Root directory holding the downloaded source datasets that the notebook reads.
raw_data_path = Path("raw_data")

# Food.com

### Data summary

In [3]:
# Locations of the two Food.com CSV files under raw_data/foodcom/archive.
original_data_base_path = raw_data_path / "foodcom" / "archive"
raw_recipe_path = original_data_base_path / "RAW_recipes.csv"
raw_interactions_path = original_data_base_path / "RAW_interactions.csv"

In [3]:
# Load the recipe table (one row per recipe) and preview its first rows.
raw_recipe = pd.read_csv(raw_recipe_path)
raw_recipe.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
# Number of rows in the recipe table.
print(len(raw_recipe))

231637


In [6]:
# Load the interaction table (one row per user rating/review) and preview it.
raw_interactions = pd.read_csv(raw_interactions_path)
raw_interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [6]:
# Number of rows in the interaction table.
print(len(raw_interactions))

1132367


In [10]:
# Attach every interaction to its recipe. The recipe table's "id" column is
# renamed first so both frames share the "recipe_id" join key; the inner join
# keeps only recipes that have at least one interaction.
joint_information = (
    raw_recipe
    .rename(columns={"id": "recipe_id"})
    .merge(raw_interactions, on="recipe_id", how="inner")
)
joint_information.head()

Unnamed: 0,name,recipe_id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id,date,rating,review
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,4470,2006-02-18,5,I used an acorn squash and recipe#137681 Swee...
1,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,593927,2010-08-21,5,This was a nice change. I used butternut squas...
2,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,178427,2011-12-05,5,Excellent recipe! I used butternut squash and ...
3,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,28603,2002-08-19,0,"Have not tried this, but it sounds delicious. ..."
4,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,346277,2006-08-27,5,This recipe was wonderful. Instead of using t...


In [36]:
# Per-recipe rating statistics, computed from a single groupby:
#   recipe_mean  - mean of the "rating" column
#   recipe_count - number of interaction rows for the recipe
by_recipe = joint_information.groupby("recipe_id")
mean_per_recipe = by_recipe["rating"].mean().rename("recipe_mean").reset_index()
grouped2 = by_recipe["recipe_id"].count().rename("recipe_count").reset_index()
grouped = mean_per_recipe.merge(grouped2, on="recipe_id")

# Bring the statistics back onto the joined table and keep one row per recipe
# (the first interaction row of each recipe survives drop_duplicates).
result = (
    joint_information
    .merge(grouped, on="recipe_id")
    .loc[:, ["recipe_id", "name", "recipe_mean", "recipe_count"]]
    .drop_duplicates(["recipe_id"])
)
result.head()
# grouped2.head()

Unnamed: 0,recipe_id,name,recipe_mean,recipe_count
0,137739,arriba baked winter squash mexican style,5.0,3
3,31490,a bit different breakfast pizza,3.5,4
7,112140,all in the kitchen chili,4.0,1
8,59389,alouette potatoes,4.5,2
10,44061,amish tomato ketchup for canning,5.0,1


In [37]:
# Compare the number of recipes with the number of distinct recipe names:
# a smaller second number means several recipe ids share the same name.
print(len(result))
print(len(result.drop_duplicates(["name"])))
# List the rows whose name was already seen on an earlier row, sorted by name.
print(result[result.duplicated(["name"])].sort_values(by="name"))
# Show every recipe carrying one specific repeated name as an example.
print(result[result["name"] == "10 bean soup"].sort_values(by="name"))

231637
230186
         recipe_id                                name  recipe_mean   
6124        470575                        10 bean soup         5.00  \
9186        313237                  3 bean baked beans         4.00   
9187        258846                        3 bean salad         4.00   
9247        323810  3 ingredient peanut butter cookies         3.40   
9285        197894              3 pepper cheese spread         5.00   
...            ...                                 ...          ...   
1131964     315535     zucchini chocolate chip muffins         5.00   
1131974      15267               zucchini cilantro dip         4.00   
1132001     323797                 zucchini lime bread         3.00   
1132002     292835                  zucchini oat bread         0.00   
1131370      11217              zucchini salsa  canned         4.15   

         recipe_count  
6124                5  
9186                1  
9187                1  
9247               10  
9285         

In [38]:
# Rank recipes by mean rating, breaking ties by number of ratings (both descending).
result.sort_values(by=["recipe_mean", "recipe_count"], ascending=False)

Unnamed: 0,recipe_id,name,recipe_mean,recipe_count
180730,55309,caprese salad tomatoes italian marinated toma...,5.0,52
110018,24768,berry cream cheese coffee cake,5.0,37
584857,166669,kittencal s caesar tortellini salad,5.0,36
596096,121941,kittencal s soft white baguette style bread,5.0,36
624551,62754,linda s special potato salad,5.0,32
...,...,...,...,...
1132002,292835,zucchini oat bread,0.0,1
1132177,88280,zuppa de clams or mussels,0.0,1
1132215,26261,zuppa inglese,0.0,1
1132266,16940,zurichgeschnetzeltes or diced chicken,0.0,1


In [41]:
# Add the rating statistics to the original recipe table. The outer join keeps
# recipes even if they have no statistics; the duplicate "name" column coming
# from `result` is suffixed "_ignore" so the original "name" stays untouched.
raw_recipe2 = raw_recipe.merge(
    result.rename(columns={"recipe_id": "id"}),
    how="outer",
    on="id",
    suffixes=["", "_ignore"],
)
raw_recipe2.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,name_ignore,recipe_mean,recipe_count
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,arriba baked winter squash mexican style,5.0,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,a bit different breakfast pizza,3.5,4
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,all in the kitchen chili,4.0,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,alouette potatoes,4.5,2
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,amish tomato ketchup for canning,5.0,1


### Saving Information to Json

In [43]:

def save_foodcom_json(raw_recipe: pd.DataFrame, path: Path):
    """Serialize Food.com recipes to a JSON file keyed by recipe name.

    Parameters
    ----------
    raw_recipe:
        Frame providing the columns ``name``, ``ingredients``, ``steps``,
        ``recipe_mean`` and ``recipe_count``. ``ingredients`` and ``steps``
        must hold Python list literals stored as text, e.g. ``"['salt', 'water']"``.
    path:
        Destination file; it is created or truncated.

    Raises
    ------
    ValueError, SyntaxError
        If an ``ingredients`` or ``steps`` cell is not a plain Python literal.

    Notes
    -----
    The output is a dict keyed by recipe name, so when several rows share a
    name only the last one is kept.
    """
    info_json = {}
    columns = ["name", "ingredients", "steps", "recipe_mean", "recipe_count"]

    for _, row in raw_recipe[columns].iterrows():
        recipe = row["name"]
        # literal_eval only accepts literals, so text coming from the CSV can
        # never execute code the way eval() would.
        ingredients = ast.literal_eval(row["ingredients"])
        instructions = ast.literal_eval(row["steps"])
        info_json[recipe] = {
            "nombre": recipe,
            "rating": row["recipe_mean"],
            "cantidad_comentarios": row["recipe_count"],
            "ingredientes": [{"nombre": ingredient} for ingredient in ingredients],
            "instrucciones": [
                {"orden": order, "instruccion": instruction}
                for order, instruction in enumerate(instructions)
            ],
        }

    # Context manager guarantees the file is flushed and closed even on error.
    with path.open("w") as handle:
        json.dump(info_json, handle)

# Write the recipe table enriched with rating statistics to the Food.com JSON dump.
foodcom_json_path = data_path / "graphs" / "foodcom" / "food.com.recipe.json"
save_foodcom_json(raw_recipe2, foodcom_json_path)

### Process json information

In [4]:
from engine.processors import extract_ingredients_with_modifiers_nltk_grammar as normalize_ingredient

def process_foodcom_json(foodcom_recipes_json, path: Path, ingredient_normalizer: Callable[[str], List[Tuple[str,str]]]):
    """Normalize the ingredients of every recipe and write the result as JSON.

    Parameters
    ----------
    foodcom_recipes_json:
        Mapping of recipe name to a dict with the keys ``rating``,
        ``cantidad_comentarios``, ``ingredientes`` (list of dicts with a
        ``nombre`` key) and ``instrucciones``.
    path:
        Destination file; it is created or truncated.
    ingredient_normalizer:
        Called with each raw ingredient name; returns a list of
        ``(ingredient, modifier)`` pairs. One raw ingredient may therefore
        yield zero, one or several normalized entries.
    """
    new_foodcom_recipes = {}
    for recipe, info in foodcom_recipes_json.items():
        normalized_ingredients = [
            {"nombre": new_ingredient, "modificador": modifier}
            for ingredient in info["ingredientes"]
            for new_ingredient, modifier in ingredient_normalizer(ingredient["nombre"])
        ]
        new_foodcom_recipes[recipe] = {
            "nombre": recipe,
            "rating": info["rating"],
            "cantidad_comentarios": info["cantidad_comentarios"],
            "ingredientes": normalized_ingredients,
            "instrucciones": info["instrucciones"],
        }

    # Context manager guarantees the file is flushed and closed even on error.
    with path.open(mode="w") as handle:
        json.dump(new_foodcom_recipes, handle)

# Read the raw Food.com JSON dump, normalize its ingredients and write the
# normalized dump next to it.
foodcom_json_path = data_path / "graphs" / "foodcom" / "food.com.recipe.json"
foodcom_recipes_norm_path = data_path / "graphs" / "foodcom" / "food.com.recipe_normalized.json"
# Open via a context manager so the input handle is closed once parsed.
with foodcom_json_path.open() as recipes_file:
    foodcom_recipes_json = json.load(recipes_file)
process_foodcom_json(foodcom_recipes_json, foodcom_recipes_norm_path, normalize_ingredient)

In [23]:
# Load the normalized Food.com recipes; from here on `foodcom_recipes_json`
# refers to the normalized dump.
foodcom_recipes_norm_path = data_path / "graphs" / "foodcom" / "food.com.recipe_normalized.json"
# Open via a context manager so the handle is closed once parsed.
with foodcom_recipes_norm_path.open() as recipes_file:
    foodcom_recipes_json = json.load(recipes_file)

### Building Graphs

- Full bipartite graph of ingredients and recipes
- Ingredient graph
- Recipe graph

In [3]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph, build_ingredient_recipe_graph, build_recipe_graph
from engine.ranking import get_recipe_ranking

graph_base_path = data_path / "graphs" / "foodcom"


# Full bipartite graph — takes about 30 seconds
# G8 = build_ingredient_recipe_graph(foodcom_recipes_json)
# export_graph(G8, graph_base_path / "bipartite_recipe_ingredient.graphml")

# Full ingredient graph — takes 17-20 minutes
# G7 = build_ingredient_graph(foodcom_recipes_json)
# nx.write_graphml(G7, graph_base_path / "ingredient_node_weighted.graphml")

# Full recipe graph — TOO EXPENSIVE
# G6 = build_recipe_graph(foodcom_recipes_json)
# nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")

# Reduced bipartite graph, built from the first `take_first` entries of the
# recipe ranking. Observed runtimes by take_first:
#   100000 -> 17 seconds
#   10000  -> 1.7 seconds
#   1000   -> 0.4 seconds
take_first = 5000
top_ranked = get_recipe_ranking(foodcom_recipes_json)[:take_first]
# Element 1 of each ranking entry is used as the recipe key.
recipe_relevant = {entry[1] for entry in top_ranked}
relevant_recipes_json = {
    name: info
    for name, info in foodcom_recipes_json.items()
    if name in recipe_relevant
}
G = build_ingredient_recipe_graph(relevant_recipes_json)
export_graph(G, graph_base_path / f"bipartite_recipe_ingredient_reduced_{take_first}.graphml")

In [4]:
from engine.graph_io import import_graph
# Full Graph
# Load the full recipe-ingredient graph from disk and check it is bipartite.
# NOTE(review): the export that writes this file is commented out in the
# graph-building cell above, so this presumably relies on a file left by an
# earlier run — confirm it exists before a fresh Run All.
graph_base_path = data_path / "graphs" / "foodcom"
B = import_graph(graph_base_path / "bipartite_recipe_ingredient.graphml")
assert nx.is_bipartite(B)

In [None]:
from engine.graph_construction import build_recipe_recipe_graph
graph_base_path = data_path / "graphs" / "foodcom"

# TOO EXPENSIVE
# NOTE(review): despite the label above, the two statements below are live
# code; running this cell (e.g. via Run All) builds the recipe-recipe graph
# from whatever `B` currently holds.
G6 = build_recipe_recipe_graph(B)
nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")

### Building reduced graphs

In [3]:
from engine.graph_construction import build_recipe_recipe_graph
graph_base_path = data_path / "graphs" / "foodcom"

# NOTE(review): this cell defines neither `B` nor `take_first`; it uses
# whatever the kernel currently holds. The timings below refer to reduced
# graphs, so it presumably expects the reduced `B` loaded by the
# "Reduced Graph" cell — confirm the intended execution order.
# Observed runtimes by number of recipes / weight threshold:
# Works 5 seconds: 1000
# Works 30 seconds: 5000, weight_threshold=0.4
# Works 5 minutes: 5000, weight_threshold=0.1
# TOO EXPENSIVE: 10000
G6 = build_recipe_recipe_graph(B, weight_threshold=0.4)
nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted_reduced_{take_first}.graphml")


In [2]:
from engine.graph_io import import_graph
# Reduced Graph
# Load the reduced recipe-ingredient graph exported for `take_first` recipes
# and check it is bipartite. This rebinds `B`, replacing any full graph
# loaded earlier.
take_first = 5000
graph_base_path = data_path / "graphs" / "foodcom"
B = import_graph(graph_base_path / f"bipartite_recipe_ingredient_reduced_{take_first}.graphml")
assert nx.is_bipartite(B)

In [5]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph
from engine.ranking import get_ingredient_cutoff

# Select ingredient nodes of `B` with get_ingredient_cutoff (cutoff 0.90) and
# show how many were kept plus the first ten.
greatest_ingredients = get_ingredient_cutoff(B, ingredient_cutoff_percentage=0.90)
print(len(greatest_ingredients))
print(greatest_ingredients[:10])

# Works 1 minute
# The "_ingredient" suffix is stripped from the node ids before they are passed
# as the ingredient filter (presumably the recipe JSON uses the bare names —
# confirm). The call returns two graphs, exported as the Jaccard- and
# PMI-weighted ingredient graphs.
G_jaccard, G_pmi = build_ingredient_graph(foodcom_recipes_json, ingredients=[x.removesuffix("_ingredient") for x in greatest_ingredients])
export_graph(G_jaccard, graph_base_path / "ingredient_node_reduced_jaccard_weighted.graphml")
export_graph(G_pmi, graph_base_path / "ingredient_node_reduced_pmi_weighted.graphml")


521
['salt_ingredient', 'pepper_ingredient', 'onion_ingredient', 'oil_ingredient', 'butter_ingredient', 'sugar_ingredient', 'egg_ingredient', 'flour_ingredient', 'water_ingredient', 'clove_ingredient']


### Recipe Recipe Semantic Embedding

In [3]:
# Merge every archive found in data/foodcom.sematic into one key -> array dict.
foodcom_semantic_dict = {}
# Sorted so that, should two archives share a key, the winner does not depend
# on the platform-specific order returned by iterdir().
for archive_path in sorted((data_path / "foodcom.sematic").iterdir()):
    # np.load on an archive returns a file-backed mapping. Copy its arrays into
    # the dict in place (instead of rebuilding the whole dict each iteration)
    # and close the file rather than leaking one handle per archive.
    with np.load(archive_path) as archive:
        foodcom_semantic_dict.update(archive)
print(len(foodcom_semantic_dict))

230185


In [4]:
from engine.ranking import get_edge_similarity_vector_ranking, exponential_similarity
from engine.graph_construction import build_weighted_graph_from_edge_list
from engine.graph_io import export_graph

# Names that have a semantic vector.
l = list(foodcom_semantic_dict.keys())
print("Vector amount:", len(l))
# Recipe names present in the graph: recipe node ids with the "_recipe" suffix removed.
r = [node.removesuffix("_recipe") for node in B.nodes if node.endswith("_recipe")]

print("Relevant Example:", r[0])
print("Relevant amount:", len(r))
# Recipes that are both in the graph and have a vector.
shared_names = set(l).intersection(r)
print("Relevant Intersection:", len(shared_names))
l_relevant = list(shared_names)

similarity_cutoff = 0.9

relevant_vectors = {name: foodcom_semantic_dict[name] for name in l_relevant}
edge_ranking = get_edge_similarity_vector_ranking(relevant_vectors, exponential_similarity)
# Element 0 of each ranked edge is compared against the cutoff; weaker edges are dropped.
strong_edges = [edge for edge in edge_ranking if edge[0] >= similarity_cutoff]
G = build_weighted_graph_from_edge_list(strong_edges)
graph_base_path = data_path / "graphs" / "foodcom"
export_graph(G, graph_base_path / f"recipe_node_semantic_weighted_reduced_{len(l_relevant)}_sim_cutoff_{similarity_cutoff}.graphml")


Vector amount: 230185
Relevant Example: how i got my family to eat spinach  spinach casserole
Relevant amount: 5000
Relevant Intersection: 5000


In [6]:
from engine.recipe_ingredient_bipartite_queries import return_available_recipes_given_ingredients, return_ingredient_given_query, return_ingredients_given_recipe, return_recipe_given_query


# Smoke test of the four query helpers against the currently loaded graph `B`.
# Ingredients returned for the text query "egg".
ingredients = return_ingredient_given_query("egg", B)
print(ingredients)
# Recipes returned for a given set of ingredients.
recipes = return_available_recipes_given_ingredients(["egg", "flour", "pineapple"], B, True)
print(recipes)
# Ingredients of one specific recipe.
recipe_ingredients = return_ingredients_given_recipe("zucchini pineapple wheat bread", B)
print(recipe_ingredients)
# Recipes returned for the text query "bread" (rebinds `recipes`).
recipes = return_recipe_given_query("bread", B)
print(recipes)

['egg', 'eggnog', 'egg roll', 'egg wash', 'egg yolk', 'egglands', 'eggplant', 'eggshell', 'taleggio', 'egg bread', 'egg matzo', 'egg white', "eggland's", 'jumbo egg', 'omega egg', 'quail egg', 'egg beater', 'egg noodle', 'egg tomato', 'egg-whites', 'soy eggnog', 'dairy eggnog', 'veggie shred', 'baby eggplant', 'chocolate egg', 'egg roll wrap', 'veggie burger', 'egg roll crepe', 'egg substitute', 'egg tortellini', 'free-range egg', 'globe eggplant', 'extra-large egg', 'veggie crumbles', 'egg roll wrapper', 'egg rotini pasta', 'egg sandwich bun', 'egg-roll wrapper', 'eggnog ice cream', 'wheat egg noodle', 'medium egg noodle', 'liquid egg product', 'spinach egg noodle', 'egg replacer powder', 'chocolate easter egg', 'liquid egg substitute', 'miniature chocolate egg', "yve's veggie ground round", 'parmigiano-reggiano cheese']
['zucchini pineapple wheat bread', 'zucchini pineapple loaf cake', 'zucchini pineapple cake', 'zucchini pineapple bread', 'zucchini kahlua bread', 'zucchini bread wit

In [18]:
import re

# Define the output text string
output_text = """
Epoch 1/40 1000/1000 [==============================] - 476s 472ms/step - loss: 7.4538e-05 - encoder_attr_loss: 3.5448e-06 - decoder_attr_loss: 7.4538e-05 - val_loss: 7.4247e-05 - val_encoder_attr_loss: 3.5407e-06 - val_decoder_attr_loss: 7.4247e-05
Epoch 2/40 1000/1000 [==============================] - 474s 474ms/step - loss: 7.4497e-05 - encoder_attr_loss: 3.5429e-06 - decoder_attr_loss: 7.4497e-05 - val_loss: 7.4208e-05 - val_encoder_attr_loss: 3.5389e-06 - val_decoder_attr_loss: 7.4208e-05
Epoch 3/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4459e-05 - encoder_attr_loss: 3.5411e-06 - decoder_attr_loss: 7.4459e-05 - val_loss: 7.4173e-05 - val_encoder_attr_loss: 3.5372e-06 - val_decoder_attr_loss: 7.4173e-05
Epoch 4/40 1000/1000 [==============================] - 518s 518ms/step - loss: 7.4425e-05 - encoder_attr_loss: 3.5395e-06 - decoder_attr_loss: 7.4425e-05 - val_loss: 7.4141e-05 - val_encoder_attr_loss: 3.5356e-06 - val_decoder_attr_loss: 7.4141e-05
Epoch 5/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4395e-05 - encoder_attr_loss: 3.5379e-06 - decoder_attr_loss: 7.4395e-05 - val_loss: 7.4111e-05 - val_encoder_attr_loss: 3.5341e-06 - val_decoder_attr_loss: 7.4111e-05
Epoch 6/40 1000/1000 [==============================] - 489s 489ms/step - loss: 7.4367e-05 - encoder_attr_loss: 3.5365e-06 - decoder_attr_loss: 7.4367e-05 - val_loss: 7.4085e-05 - val_encoder_attr_loss: 3.5327e-06 - val_decoder_attr_loss: 7.4085e-05
Epoch 7/40 1000/1000 [==============================] - 524s 524ms/step - loss: 7.4342e-05 - encoder_attr_loss: 3.5351e-06 - decoder_attr_loss: 7.4342e-05 - val_loss: 7.4061e-05 - val_encoder_attr_loss: 3.5314e-06 - val_decoder_attr_loss: 7.4061e-05
Epoch 8/40 1000/1000 [==============================] - 518s 518ms/step - loss: 7.4319e-05 - encoder_attr_loss: 3.5339e-06 - decoder_attr_loss: 7.4319e-05 - val_loss: 7.4039e-05 - val_encoder_attr_loss: 3.5302e-06 - val_decoder_attr_loss: 7.4039e-05
Epoch 9/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4298e-05 - encoder_attr_loss: 3.5327e-06 - decoder_attr_loss: 7.4298e-05 - val_loss: 7.4019e-05 - val_encoder_attr_loss: 3.5291e-06 - val_decoder_attr_loss: 7.4019e-05
Epoch 10/40 1000/1000 [==============================] - 473s 473ms/step - loss: 7.4279e-05 - encoder_attr_loss: 3.5316e-06 - decoder_attr_loss: 7.4279e-05 - val_loss: 7.4001e-05 - val_encoder_attr_loss: 3.5280e-06 - val_decoder_attr_loss: 7.4001e-05
Epoch 11/40 1000/1000 [==============================] - 471s 470ms/step - loss: 7.4261e-05 - encoder_attr_loss: 3.5306e-06 - decoder_attr_loss: 7.4261e-05 - val_loss: 7.3985e-05 - val_encoder_attr_loss: 3.5271e-06 - val_decoder_attr_loss: 7.3985e-05
Epoch 12/40 1000/1000 [==============================] - 471s 471ms/step - loss: 7.4245e-05 - encoder_attr_loss: 3.5297e-06 - decoder_attr_loss: 7.4245e-05 - val_loss: 7.3970e-05 - val_encoder_attr_loss: 3.5262e-06 - val_decoder_attr_loss: 7.3970e-05
Epoch 13/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4231e-05 - encoder_attr_loss: 3.5288e-06 - decoder_attr_loss: 7.4231e-05 - val_loss: 7.3956e-05 - val_encoder_attr_loss: 3.5253e-06 - val_decoder_attr_loss: 7.3956e-05
Epoch 14/40 1000/1000 [==============================] - 517s 517ms/step - loss: 7.4218e-05 - encoder_attr_loss: 3.5280e-06 - decoder_attr_loss: 7.4218e-05 - val_loss: 7.3943e-05 - val_encoder_attr_loss: 3.5245e-06 - val_decoder_attr_loss: 7.3943e-05
Epoch 15/40 1000/1000 [==============================] - 474s 474ms/step - loss: 7.4206e-05 - encoder_attr_loss: 3.5272e-06 - decoder_attr_loss: 7.4206e-05 - val_loss: 7.3932e-05 - val_encoder_attr_loss: 3.5238e-06 - val_decoder_attr_loss: 7.3932e-05
Epoch 16/40 1000/1000 [==============================] - 514s 514ms/step - loss: 7.4195e-05 - encoder_attr_loss: 3.5265e-06 - decoder_attr_loss: 7.4195e-05 - val_loss: 7.3922e-05 - val_encoder_attr_loss: 3.5231e-06 - val_decoder_attr_loss: 7.3922e-05
Epoch 17/40 1000/1000 [==============================] - 476s 476ms/step - loss: 7.4185e-05 - encoder_attr_loss: 3.5258e-06 - decoder_attr_loss: 7.4185e-05 - val_loss: 7.3912e-05 - val_encoder_attr_loss: 3.5225e-06 - val_decoder_attr_loss: 7.3912e-05
Epoch 18/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4176e-05 - encoder_attr_loss: 3.5252e-06 - decoder_attr_loss: 7.4176e-05 - val_loss: 7.3903e-05 - val_encoder_attr_loss: 3.5219e-06 - val_decoder_attr_loss: 7.3903e-05
Epoch 19/40 1000/1000 [==============================] - 473s 473ms/step - loss: 7.4167e-05 - encoder_attr_loss: 3.5246e-06 - decoder_attr_loss: 7.4167e-05 - val_loss: 7.3895e-05 - val_encoder_attr_loss: 3.5213e-06 - val_decoder_attr_loss: 7.3895e-05
Epoch 20/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4160e-05 - encoder_attr_loss: 3.5241e-06 - decoder_attr_loss: 7.4160e-05 - val_loss: 7.3888e-05 - val_encoder_attr_loss: 3.5208e-06 - val_decoder_attr_loss: 7.3888e-05
Epoch 21/40 1000/1000 [==============================] - 473s 473ms/step - loss: 7.4153e-05 - encoder_attr_loss: 3.5236e-06 - decoder_attr_loss: 7.4153e-05 - val_loss: 7.3881e-05 - val_encoder_attr_loss: 3.5203e-06 - val_decoder_attr_loss: 7.3881e-05
Epoch 22/40 1000/1000 [==============================] - 471s 471ms/step - loss: 7.4146e-05 - encoder_attr_loss: 3.5231e-06 - decoder_attr_loss: 7.4146e-05 - val_loss: 7.3875e-05 - val_encoder_attr_loss: 3.5198e-06 - val_decoder_attr_loss: 7.3875e-05
Epoch 23/40 1000/1000 [==============================] - 469s 469ms/step - loss: 7.4140e-05 - encoder_attr_loss: 3.5227e-06 - decoder_attr_loss: 7.4140e-05 - val_loss: 7.3870e-05 - val_encoder_attr_loss: 3.5194e-06 - val_decoder_attr_loss: 7.3870e-05
Epoch 24/40 1000/1000 [==============================] - 476s 476ms/step - loss: 7.4135e-05 - encoder_attr_loss: 3.5223e-06 - decoder_attr_loss: 7.4135e-05 - val_loss: 7.3865e-05 - val_encoder_attr_loss: 3.5190e-06 - val_decoder_attr_loss: 7.3865e-05
Epoch 25/40 1000/1000 [==============================] - 486s 486ms/step - loss: 7.4130e-05 - encoder_attr_loss: 3.5219e-06 - decoder_attr_loss: 7.4130e-05 - val_loss: 7.3860e-05 - val_encoder_attr_loss: 3.5186e-06 - val_decoder_attr_loss: 7.3860e-05
Epoch 26/40 1000/1000 [==============================] - 523s 523ms/step - loss: 7.4125e-05 - encoder_attr_loss: 3.5215e-06 - decoder_attr_loss: 7.4125e-05 - val_loss: 7.3856e-05 - val_encoder_attr_loss: 3.5183e-06 - val_decoder_attr_loss: 7.3856e-05
Epoch 27/40 1000/1000 [==============================] - 520s 520ms/step - loss: 7.4121e-05 - encoder_attr_loss: 3.5211e-06 - decoder_attr_loss: 7.4121e-05 - val_loss: 7.3852e-05 - val_encoder_attr_loss: 3.5179e-06 - val_decoder_attr_loss: 7.3852e-05
Epoch 28/40 1000/1000 [==============================] - 475s 475ms/step - loss: 7.4117e-05 - encoder_attr_loss: 3.5208e-06 - decoder_attr_loss: 7.4117e-05 - val_loss: 7.3848e-05 - val_encoder_attr_loss: 3.5176e-06 - val_decoder_attr_loss: 7.3848e-05
Epoch 29/40 1000/1000 [==============================] - 516s 516ms/step - loss: 7.4114e-05 - encoder_attr_loss: 3.5205e-06 - decoder_attr_loss: 7.4114e-05 - val_loss: 7.3845e-05 - val_encoder_attr_loss: 3.5173e-06 - val_decoder_attr_loss: 7.3845e-05
Epoch 30/40 1000/1000 [==============================] - 518s 518ms/step - loss: 7.4111e-05 - encoder_attr_loss: 3.5202e-06 - decoder_attr_loss: 7.4111e-05 - val_loss: 7.3842e-05 - val_encoder_attr_loss: 3.5171e-06 - val_decoder_attr_loss: 7.3842e-05
Epoch 31/40 1000/1000 [==============================] - 521s 521ms/step - loss: 7.4108e-05 - encoder_attr_loss: 3.5200e-06 - decoder_attr_loss: 7.4108e-05 - val_loss: 7.3839e-05 - val_encoder_attr_loss: 3.5168e-06 - val_decoder_attr_loss: 7.3839e-05
"""

# Regular expressions for one line of the pasted training log.
# - `number_pattern` accepts plain and scientific notation ("0.5", "7.4e-05",
#   "1.2E+03"); the previous pattern needed at least two digits after the
#   decimal point and rejected a "+" exponent.
# - `\b` anchors the metric name so "loss:" cannot match inside "val_loss:" or
#   "encoder_attr_loss:" (an underscore is a word character, so there is no
#   word boundary before "loss" in those names).
number_pattern = r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?"
epoch_regex = r"Epoch (\d+)/"
loss_regex = rf"\bloss: ({number_pattern})"
val_loss_regex = rf"\bval_loss: ({number_pattern})"

# Parsed history: one entry per log line where the metric was found, in log order.
json_vals = {
    "loss": [],
    "val_loss": [],
}

for line in output_text.splitlines():
    # The triple-quoted log starts with an empty line; skip blank lines instead
    # of reporting three bogus "No match" messages for them.
    if not line.strip():
        continue

    # Search for each value using its regular expression
    epoch_match = re.search(epoch_regex, line)
    loss_match = re.search(loss_regex, line)
    val_loss_match = re.search(val_loss_regex, line)

    # Extract the epoch, loss and val_loss values from the match objects
    if epoch_match:
        epoch_value = int(epoch_match.group(1))
        print("Epoch:", epoch_value)
    else:
        print("No match found for epoch value.")

    if loss_match:
        loss_value = float(loss_match.group(1))
        json_vals["loss"].append(loss_value)
        print("Training loss:", loss_value)
    else:
        print("No match found for training loss value.")

    if val_loss_match:
        val_loss_value = float(val_loss_match.group(1))
        json_vals["val_loss"].append(val_loss_value)
        print("Validation loss:", val_loss_value)
    else:
        print("No match found for validation loss value.")

import json

# Context manager guarantees the history file is flushed and closed.
with open("data/real_training_history.json", mode="w") as history_file:
    json.dump(json_vals, history_file)


No match found for epoch value.
No match found for training loss value.
No match found for validation loss value.
Epoch: 1
Training loss: 7.4538e-05
Validation loss: 7.4247e-05
Epoch: 2
Training loss: 7.4497e-05
Validation loss: 7.4208e-05
Epoch: 3
Training loss: 7.4459e-05
Validation loss: 7.4173e-05
Epoch: 4
Training loss: 7.4425e-05
Validation loss: 7.4141e-05
Epoch: 5
Training loss: 7.4395e-05
Validation loss: 7.4111e-05
Epoch: 6
Training loss: 7.4367e-05
Validation loss: 7.4085e-05
Epoch: 7
Training loss: 7.4342e-05
Validation loss: 7.4061e-05
Epoch: 8
Training loss: 7.4319e-05
Validation loss: 7.4039e-05
Epoch: 9
Training loss: 7.4298e-05
Validation loss: 7.4019e-05
Epoch: 10
Training loss: 7.4279e-05
Validation loss: 7.4001e-05
Epoch: 11
Training loss: 7.4261e-05
Validation loss: 7.3985e-05
Epoch: 12
Training loss: 7.4245e-05
Validation loss: 7.397e-05
Epoch: 13
Training loss: 7.4231e-05
Validation loss: 7.3956e-05
Epoch: 14
Training loss: 7.4218e-05
Validation loss: 7.3943e-05


In [16]:
import re

# Define the output text string
output_text = "Epoch 30/40 1000/1000 [==============================] - 518s 518ms/step - loss: 7.4111e-05 - encoder_attr_loss: 3.5202e-06 - decoder_attr_loss: 7.4111e-05 - val_loss: 7.3842e-05 - val_encoder_attr_loss: 3.5171e-06 - val_decoder_attr_loss: 7.3842e-05"

# Regular expressions for the epoch, loss and val_loss values.
# - `number_pattern` accepts plain and scientific notation ("0.5", "7.4e-05",
#   "1.2E+03"); the previous pattern needed at least two digits after the
#   decimal point, so on "loss: 0.5 - val_loss: 0.25" it skipped the training
#   loss and returned 0.25 instead.
# - `\b` anchors the metric name so "loss:" cannot match inside "val_loss:" or
#   "encoder_attr_loss:" (an underscore is a word character, so there is no
#   word boundary before "loss" in those names).
number_pattern = r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?"
epoch_regex = r"Epoch (\d+)/"
loss_regex = rf"\bloss: ({number_pattern})"
val_loss_regex = rf"\bval_loss: ({number_pattern})"

# Search for each value using its regular expression
epoch_match = re.search(epoch_regex, output_text)
loss_match = re.search(loss_regex, output_text)
val_loss_match = re.search(val_loss_regex, output_text)

# Extract the epoch, loss and val_loss values from the match objects
if epoch_match:
    epoch_value = int(epoch_match.group(1))
    print("Epoch:", epoch_value)
else:
    print("No match found for epoch value.")

if loss_match:
    loss_value = float(loss_match.group(1))
    print("Training loss:", loss_value)
else:
    print("No match found for training loss value.")

if val_loss_match:
    val_loss_value = float(val_loss_match.group(1))
    print("Validation loss:", val_loss_value)
else:
    print("No match found for validation loss value.")

Epoch: 30
Training loss: 7.4111e-05
Validation loss: 7.3842e-05


# Recipe NLG

In [3]:
# Load the full RecipeNLG dataset CSV and preview its first rows.
raw_recipenlg_csv = pd.read_csv(raw_data_path / "recipenlg" / "dataset" / "full_dataset.csv")
raw_recipenlg_csv.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
# Number of rows in the RecipeNLG table.
print(len(raw_recipenlg_csv))

2231142


In [4]:
def save_recipenlg_json(recipe_nlg: pd.DataFrame, path: Path):
    """Serialize RecipeNLG recipes to a JSON file keyed by recipe title.

    Parameters
    ----------
    recipe_nlg:
        Frame providing the columns ``title``, ``NER``, ``directions`` and
        ``ingredients``. The last three must hold list literals stored as
        text, e.g. ``'["brown sugar", "milk"]'``.
    path:
        Destination file; it is created or truncated.

    Raises
    ------
    ValueError, SyntaxError
        If a ``NER``, ``directions`` or ``ingredients`` cell is not a plain
        Python literal.

    Notes
    -----
    Ingredient names (from ``NER``) and instructions are lower-cased; the full
    ingredient lines are stored unchanged. The output is a dict keyed by
    title, so when several rows share a title only the last one is kept.
    """
    info_json = {}

    for _, row in recipe_nlg[["title", "NER", "directions", "ingredients"]].iterrows():
        recipe = row["title"]
        # literal_eval only accepts literals, so text coming from the CSV can
        # never execute code the way eval() would.
        ingredients = ast.literal_eval(row["NER"])
        instructions = ast.literal_eval(row["directions"])
        full_ingredients = ast.literal_eval(row["ingredients"])
        info_json[recipe] = {
            "nombre": recipe,
            "ingredientes": [{"nombre": ingredient.lower()} for ingredient in ingredients],
            "instrucciones": [
                {"orden": order, "instruccion": instruction.lower()}
                for order, instruction in enumerate(instructions)
            ],
            "ingredientes_completos": full_ingredients,
        }

    # Context manager guarantees the file is flushed and closed even on error.
    with path.open("w") as handle:
        json.dump(info_json, handle)

# Write the RecipeNLG table to its JSON dump.
recipenlg_json_path = data_path / "graphs" / "recipenlg" / "recipenlg.recipe.json"
save_recipenlg_json(raw_recipenlg_csv, recipenlg_json_path)

In [3]:
# Load the RecipeNLG JSON dump written by save_recipenlg_json.
recipenlg_json_path = data_path / "graphs" / "recipenlg" / "recipenlg.recipe.json"
# Open via a context manager so the handle is closed once parsed.
with recipenlg_json_path.open() as recipes_file:
    recipenlg_recipes_json = json.load(recipes_file)

In [6]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph, build_ingredient_recipe_graph, build_recipe_graph

graph_base_path = data_path / "graphs" / "recipenlg"

# Build and export the full RecipeNLG recipe-ingredient graph.
# Works 127 seconds
G8 = build_ingredient_recipe_graph(recipenlg_recipes_json)
export_graph(G8, graph_base_path / "bipartite_recipe_ingredient.graphml")

# NOTE(review): the commented-out calls below still pass foodcom_recipes_json
# although they would write into the recipenlg folder — presumably a copy-paste
# leftover; switch to recipenlg_recipes_json before enabling them.
# Works ?? minutes
# G7 = build_ingredient_graph(foodcom_recipes_json)
# nx.write_graphml(G7, graph_base_path / "ingredient_node_weighted.graphml")

# TOO EXPENSIVE
# G6 = build_recipe_graph(foodcom_recipes_json)
# nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")


In [3]:
from engine.graph_io import import_graph

# Load the RecipeNLG recipe-ingredient graph from disk and check it is
# bipartite. This rebinds `B`, replacing any Food.com graph loaded earlier.
graph_base_path = data_path / "graphs" / "recipenlg"
B = import_graph(graph_base_path / "bipartite_recipe_ingredient.graphml")
assert nx.is_bipartite(B)

In [None]:
from engine.graph_construction import build_recipe_recipe_graph
graph_base_path = data_path / "graphs" / "recipenlg"

# TOO EXPENSIVE
# NOTE(review): despite the label above, the two statements below are live
# code; running this cell (e.g. via Run All) builds the recipe-recipe graph
# from whatever `B` currently holds.
G6 = build_recipe_recipe_graph(B)
nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")

In [5]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph
from engine.ranking import get_ingredient_cutoff

# Select ingredient nodes of `B` with get_ingredient_cutoff (cutoff 0.85) and
# show how many were kept plus the first ten.
greatest_ingredients = get_ingredient_cutoff(B, ingredient_cutoff_percentage=0.85)
print(len(greatest_ingredients))
print(greatest_ingredients[:10])

# Works 10 minutes
# The "_ingredient" suffix is stripped from the node ids before they are passed
# as the ingredient filter. The call returns two graphs, exported as the
# Jaccard- and PMI-weighted ingredient graphs.
# NOTE(review): `graph_base_path` is not set in this cell; it uses the value
# left by a previously run cell — confirm it points at the recipenlg folder.
G_jaccard, G_pmi = build_ingredient_graph(recipenlg_recipes_json, ingredients=[x.removesuffix("_ingredient") for x in greatest_ingredients])
export_graph(G_jaccard, graph_base_path / "ingredient_node_reduced_jaccard_weighted.graphml")
export_graph(G_pmi, graph_base_path / "ingredient_node_reduced_pmi_weighted.graphml")

757
['salt_ingredient', 'sugar_ingredient', 'butter_ingredient', 'garlic_ingredient', 'flour_ingredient', 'onion_ingredient', 'eggs_ingredient', 'water_ingredient', 'milk_ingredient', 'olive oil_ingredient']


In [None]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph
from engine.ranking import get_recipe_cutoff
from engine.graph_construction import build_recipe_recipe_graph
graph_base_path = data_path / "graphs" / "recipenlg"

# Select recipes of `B` with get_recipe_cutoff (cutoff 0.8) and print how many
# were kept, the smallest entry, and the first ten entries.
# Works 8 minutes
greatest_recipes = get_recipe_cutoff(B, recipe_cutoff_percentage=0.8)
print(len(greatest_recipes))
print(min(greatest_recipes))
for x in greatest_recipes[:10]:
    print(x)

# Restrict `B` to the selected recipes plus every node whose "type" attribute
# is "ingredient", then save that subgraph. Element 1 of each entry is used as
# the recipe node id (presumably entries are (score, node) pairs — confirm).
# Works ??
B1 = B.subgraph([x[1] for x in greatest_recipes] + [x for x in B if B.nodes[x]["type"] == "ingredient"])
nx.write_graphml(B1, graph_base_path / f"reduced_bipartite_recipe_ingredient.graphml")

# Build the recipe-recipe graph from the reduced bipartite graph and save it.
# With 1000000 nodes stopped at 80 minutes
G6 = build_recipe_recipe_graph(B1)
nx.write_graphml(G6, graph_base_path / f"reduced_recipe_node_weighted.graphml")