In [1]:
import engine as ng
from pathlib import Path
import pandas as pd
import json
from typing import List, Callable, Dict, Tuple
import networkx as nx
import numpy as np

data_path = Path("data")
raw_data_path = Path("raw_data")

In [2]:
def get_ingredient_cutoff(bipartite_graph: nx.Graph, ingredient_cutoff_percentage = 0.9) -> List[str]:
    """
    Returns the ingredients that represents the `ingredient_cutoff_percentage` of the total, sorted by occurrence
    """
    ingredient_with_occurrence = [(len(list(bipartite_graph.neighbors(ingredient))) , ingredient) for ingredient in bipartite_graph.nodes if bipartite_graph.nodes[ingredient]["type"] == "ingredient"]
    ingredient_with_occurrence = sorted(ingredient_with_occurrence, reverse=True)

    # Extract the ingredient names and occurrence values
    occurrences = [x[0] for x in ingredient_with_occurrence]
    ingredients = [x[1] for x in ingredient_with_occurrence]
    cumulative_occurrences = np.cumsum(occurrences)

    cutoff_index = -1
    total = cumulative_occurrences[-1]
    for i, cumsum in enumerate(cumulative_occurrences):
        if cumsum / total >= ingredient_cutoff_percentage:
            cutoff_index = i
            break
    
    return ingredients[:cutoff_index]


# Food.com

In [3]:
original_data_base_path = raw_data_path / "foodcom" / "archive"
raw_recipe_path = original_data_base_path / "RAW_recipes.csv"
raw_interactions_path = original_data_base_path / "RAW_interactions.csv"

In [4]:
raw_recipe = pd.read_csv(raw_recipe_path)
raw_recipe.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [7]:
print(len(raw_recipe))

231637


In [5]:
raw_interactions = pd.read_csv(raw_interactions_path)
raw_interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [6]:
print(len(raw_interactions))

1132367


In [7]:

def save_foodcom_json(raw_recipe: pd.DataFrame, path: Path):
    info_json = {}

    for i, row in raw_recipe[["name", "ingredients", "steps"]].iterrows():
        recipe = row["name"]
        ingredients = eval(row["ingredients"])
        instructions = eval(row["steps"])
        info_json[recipe] = {
            "nombre": recipe,
            "ingredientes": [{"nombre": ingredient, } for ingredient in ingredients],
            "instrucciones": [ {"orden": i, "instruccion": instruction, } for i, instruction in enumerate(instructions)],
        }

    json.dump(info_json, path.open("+w"))

foodcom_json_path = data_path / "graphs" / "foodcom" / "food.com.recipe.json"
save_foodcom_json(raw_recipe, foodcom_json_path)

In [8]:
from engine.processors import extract_ingredients_with_modifiers_nltk_grammar as normalize_ingredient

def process_foodcom_json(foodcom_recipes_json, path: Path, ingredient_normalizer: Callable[[str], List[Tuple[str,str]]]):
    new_foodcom_recipes = { }
    for recipe, info in foodcom_recipes_json.items():
        new_foodcom_recipes[recipe] = {
            "nombre": recipe,
            "ingredientes": [ 
                {
                    "nombre": new_ingredient, 
                    "modificador": modifier
                } for ingredient in info["ingredientes"]
                    for new_ingredient, modifier in ingredient_normalizer(ingredient["nombre"])
            ],
            "instrucciones": info["instrucciones"],
        }
    json.dump(new_foodcom_recipes, path.open(mode="+w"))

foodcom_recipes_norm_path = data_path / "graphs" / "foodcom" / "food.com.recipe_normalized.json"
foodcom_recipes_json = json.load(foodcom_json_path.open())
process_foodcom_json(foodcom_recipes_json, foodcom_recipes_norm_path, normalize_ingredient )

In [3]:
foodcom_recipes_norm_path = data_path / "graphs" / "foodcom" / "food.com.recipe_normalized.json"
foodcom_recipes_json = json.load(foodcom_recipes_norm_path.open())

In [4]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph, build_ingredient_recipe_graph, build_recipe_graph

graph_base_path = data_path / "graphs" / "foodcom"

# Works 30 seconds
G8 = build_ingredient_recipe_graph(foodcom_recipes_json)
export_graph(G8, graph_base_path / "bipartite_recipe_ingredient.graphml")

# Works 17-20 minutes
# G7 = build_ingredient_graph(foodcom_recipes_json)
# nx.write_graphml(G7, graph_base_path / "ingredient_node_weighted.graphml")

# TOO EXPENSIVE
# G6 = build_recipe_graph(foodcom_recipes_json)
# nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")


In [4]:
from engine.graph_io import import_graph

graph_base_path = data_path / "graphs" / "foodcom"
B = import_graph(graph_base_path / "bipartite_recipe_ingredient.graphml")
assert nx.is_bipartite(B)


In [5]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph

greatest_ingredients = get_ingredient_cutoff(B, ingredient_cutoff_percentage=0.90)
print(len(greatest_ingredients))
print(greatest_ingredients[:10])

# Works 1 minute
G_jaccard, G_pmi = build_ingredient_graph(foodcom_recipes_json, ingredients=[x.removesuffix("_ingredient") for x in greatest_ingredients])
export_graph(G_jaccard, graph_base_path / "ingredient_node_reduced_jaccard_weighted.graphml")
export_graph(G_pmi, graph_base_path / "ingredient_node_reduced_pmi_weighted.graphml")


521
['salt_ingredient', 'pepper_ingredient', 'onion_ingredient', 'oil_ingredient', 'butter_ingredient', 'sugar_ingredient', 'egg_ingredient', 'flour_ingredient', 'water_ingredient', 'clove_ingredient']


In [6]:
from engine.recipe_ingredient_bipartite_queries import return_available_recipes_given_ingredients, return_ingredient_given_query, return_ingredients_given_recipe, return_recipe_given_query


ingredients = return_ingredient_given_query("egg", B)
print(ingredients)
recipes = return_available_recipes_given_ingredients(["egg", "flour", "pineapple"], B, True)
print(recipes)
recipe_ingredients = return_ingredients_given_recipe("zucchini pineapple wheat bread", B)
print(recipe_ingredients)
recipes = return_recipe_given_query("bread", B)
print(recipes)

['egg', 'eggnog', 'egg roll', 'egg wash', 'egg yolk', 'egglands', 'eggplant', 'eggshell', 'taleggio', 'egg bread', 'egg matzo', 'egg white', "eggland's", 'jumbo egg', 'omega egg', 'quail egg', 'egg beater', 'egg noodle', 'egg tomato', 'egg-whites', 'soy eggnog', 'dairy eggnog', 'veggie shred', 'baby eggplant', 'chocolate egg', 'egg roll wrap', 'veggie burger', 'egg roll crepe', 'egg substitute', 'egg tortellini', 'free-range egg', 'globe eggplant', 'extra-large egg', 'veggie crumbles', 'egg roll wrapper', 'egg rotini pasta', 'egg sandwich bun', 'egg-roll wrapper', 'eggnog ice cream', 'wheat egg noodle', 'medium egg noodle', 'liquid egg product', 'spinach egg noodle', 'egg replacer powder', 'chocolate easter egg', 'liquid egg substitute', 'miniature chocolate egg', "yve's veggie ground round", 'parmigiano-reggiano cheese']
['zucchini pineapple wheat bread', 'zucchini pineapple loaf cake', 'zucchini pineapple cake', 'zucchini pineapple bread', 'zucchini kahlua bread', 'zucchini bread wit

# Recipe NLG

In [3]:
raw_recipenlg_csv = pd.read_csv(raw_data_path / "recipenlg" / "dataset" / "full_dataset.csv")
raw_recipenlg_csv.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
print(len(raw_recipenlg_csv))

2231142


In [4]:
def save_recipenlg_json(recipe_nlg: pd.DataFrame, path: Path):
    info_json = {}

    for i, row in recipe_nlg[["title", "NER", "directions", "ingredients"]].iterrows():
        recipe = row["title"]
        ingredients = eval(row["NER"])
        instructions = eval(row["directions"])
        full_ingredients = eval(row["ingredients"])
        info_json[recipe] = {
            "nombre": recipe,
            "ingredientes": [{"nombre": ingredient.lower(), } for ingredient in ingredients],
            "instrucciones": [ {"orden": i, "instruccion": instruction.lower(), } for i, instruction in enumerate(instructions)],
            "ingredientes_completos": full_ingredients
        }

    json.dump(info_json, (path).open("+w"))

recipenlg_json_path = data_path / "graphs" / "recipenlg" / "recipenlg.recipe.json"
save_recipenlg_json(raw_recipenlg_csv, recipenlg_json_path)

In [3]:
recipenlg_json_path = data_path / "graphs" / "recipenlg" / "recipenlg.recipe.json"
recipenlg_recipes_json = json.load(recipenlg_json_path.open())

In [6]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph, build_ingredient_recipe_graph, build_recipe_graph

graph_base_path = data_path / "graphs" / "recipenlg"

# Works 127 seconds
G8 = build_ingredient_recipe_graph(recipenlg_recipes_json)
export_graph(G8, graph_base_path / "bipartite_recipe_ingredient.graphml")

# Works ?? minutes
# G7 = build_ingredient_graph(foodcom_recipes_json)
# nx.write_graphml(G7, graph_base_path / "ingredient_node_weighted.graphml")

# TOO EXPENSIVE
# G6 = build_recipe_graph(foodcom_recipes_json)
# nx.write_graphml(G6, graph_base_path / f"recipe_node_weighted.graphml")


In [4]:
from engine.graph_io import import_graph

graph_base_path = data_path / "graphs" / "recipenlg"
B = import_graph(graph_base_path / "bipartite_recipe_ingredient.graphml")
assert nx.is_bipartite(B)

In [5]:
from engine.graph_io import export_graph
from engine.graph_construction import build_ingredient_graph

greatest_ingredients = get_ingredient_cutoff(B, ingredient_cutoff_percentage=0.85)
print(len(greatest_ingredients))
print(greatest_ingredients[:10])

# Works 10 minutes
G_jaccard, G_pmi = build_ingredient_graph(recipenlg_recipes_json, ingredients=[x.removesuffix("_ingredient") for x in greatest_ingredients])
export_graph(G_jaccard, graph_base_path / "ingredient_node_reduced_jaccard_weighted.graphml")
export_graph(G_pmi, graph_base_path / "ingredient_node_reduced_pmi_weighted.graphml")

757
['salt_ingredient', 'sugar_ingredient', 'butter_ingredient', 'garlic_ingredient', 'flour_ingredient', 'onion_ingredient', 'eggs_ingredient', 'water_ingredient', 'milk_ingredient', 'olive oil_ingredient']
