In [None]:
import nltk
from pathlib import Path
# Directory holding the raw food.com dump, relative to the notebook location.
base_path = Path("..") / ".." / "raw_data" / "foodcom"
# JSON file with the raw (not yet normalized) recipes.
recipes_path = base_path.joinpath("food.com.recipe.json")

In [None]:
import json
# Load the whole raw recipe file into memory as Python objects.
with recipes_path.open() as raw_recipes_file:
    recipes = json.load(raw_recipes_file)

In [None]:
from typing import List, Tuple

# Chunk grammar: an "I" (ingredient) chunk is any run of zero or more <ADJ>
# tokens immediately followed by one or more <NOUN> tokens.
simple_grammar = """
I: {<ADJ>*<NOUN>+}
"""

# Shared instances, built once and reused by every call to normalize().
lemmatizer = nltk.WordNetLemmatizer()
parser = nltk.RegexpParser(simple_grammar)

def normalize(ingredient: str) -> List[Tuple[str, str]]:
    """
    Extract every ingredient, together with its modifiers, from a description.

    The description is lower-cased, split on whitespace and POS-tagged with the
    universal tagset. Each chunk labelled "I" by the module-level ``parser``
    (adjacent adjectives followed by nouns) yields one result: its nouns are
    lemmatized as nouns and joined into the ingredient name, and its adjectives
    are lemmatized as adjectives and joined into the modifier.

    Args:
        ingredient: Must be a short description of ingredients.

    Returns:
        A list of ``(name, modifier)`` tuples, one per chunk, in the order the
        chunks appear. ``modifier`` is ``""`` when the chunk has no adjectives.
        When no chunk is found, a single ``(ingredient, "")`` tuple carrying the
        original, unmodified description is returned.

    ## Examples (intended results; actual output depends on the POS tagger):
    - normalize("the prepared pizza crust") => [('pizza crust', 'prepared')]
    - normalize("salt and pepper") => [('salt', ''), ('pepper', '')]
    - normalize("finely chopped onion") => [('onion', '')]
    - normalize("diced tomatoes") => [('tomato', '')]
    - normalize("tomato paste") => [('tomato paste', '')]
    - normalize("apple cider vinegar") => [('apple cider vinegar', '')]
    - normalize("fresh cilantro leaves") => [('cilantro leaf', 'fresh')]
    - normalize("black pepper") => [('pepper', 'black')]
    """

    def filter_tree(tree, tag):
        """Collect, depth-first, the children of every subtree labelled ``tag``."""
        # Leaves are plain (word, tag) tuples rather than trees: nothing to
        # collect. An explicit type check replaces a bare ``except`` so that
        # unrelated errors are no longer silently swallowed.
        if not isinstance(tree, nltk.Tree):
            return []
        if tree.label() == tag:
            return [list(tree)]
        current = []
        for node in tree:
            current.extend(filter_tree(node, tag))
        return current

    def reassemble_ingredient(parsed: List[Tuple[str, str]]) -> str:
        """Join the singular (noun-lemmatized) form of every noun in the chunk."""
        return " ".join(lemmatizer.lemmatize(x, pos="n") for x, y in parsed if y == "NOUN")

    def get_modifiers(parsed: List[Tuple[str, str]]) -> str:
        """Join the adjective-lemmatized form of every adjective in the chunk."""
        return " ".join(lemmatizer.lemmatize(x, pos="a") for x, y in parsed if y == "ADJ")

    tokens = [x.lower() for x in ingredient.split()]
    if not tokens:
        # Empty or whitespace-only description: nothing to tag or parse, so
        # return the same fallback the no-chunk case produces.
        return [(ingredient, "")]

    pos = nltk.pos_tag(tokens, tagset="universal")
    chunks = filter_tree(parser.parse(pos), "I")
    if not chunks:
        # No adjective/noun group was recognised: keep the description as is.
        return [(ingredient, "")]
    return [(reassemble_ingredient(chunk), get_modifiers(chunk)) for chunk in chunks]
    
# Quick visual check of normalize() on a few representative descriptions.
for example_description in (
    "the prepared pizza crust",
    "salt and pepper",
    "finely chopped onion",
    "diced tomatoes",
    "juicy ripe tomato",
    "tomato paste",
    "apple cider vinegar",
    "fresh cilantro leaves",
    "black pepper",
):
    print(normalize(example_description))


In [None]:
# Build the normalized dataset: for every recipe, each raw ingredient
# description is expanded into the (name, modifier) pairs that normalize()
# extracts from it.
new_recipes = {}
for recipe, info in recipes.items():
    new_recipes[recipe] = {
        "nombre": recipe,
        "ingredientes": [
            {
                "nombre": new_ingredient,
                "modificador": modifier,
            }
            for ingredient in info["ingredientes"]
            for new_ingredient, modifier in normalize(ingredient["nombre"])
        ],
    }

recipes_norm_path = base_path / "food.com.recipe_normalized.json"
# Use a context manager so the file is flushed and closed deterministically;
# passing a bare ``open()`` result to json.dump leaves the handle dangling.
# json.dump emits ASCII-only text by default, so the explicit encoding only
# makes the output independent of the platform locale.
with recipes_norm_path.open(mode="w", encoding="utf-8") as normalized_file:
    json.dump(new_recipes, normalized_file)

# TODO

- [ ] Ordenar por frecuencia y seleccionar hasta un punto las recetas.
- [ ] Hacer TF-IDF con las recetas sin normalizar y seleccionar las mejores.
- [ ] Visualizar los resultados y seleccionar el mejor conjunto de ingredientes.
