In [2]:
import json
from pathlib import Path
import networkx as nx
from itertools import combinations

# Location of the normalized food.com recipe dump, relative to this notebook.
json_path = Path("..", "..", "raw_data", "foodcom", "food.com.recipe_normalized.json")

# read_text() opens and closes the file itself, so no handle is left dangling;
# the encoding is pinned so the result does not depend on the platform default.
recipes_json = json.loads(json_path.read_text(encoding="utf-8"))

print("Total Recipes", len(recipes_json))
# Total number of ingredient entries summed over every recipe.
print("Sum of edge list", sum(len(recipe["ingredientes"]) for recipe in recipes_json.values()))

# Directory that the generated graph files are written to.
graph_path = Path("..", "..", "raw_data", "graphs", "foodcom")

Total Recipes 230186
Sum of edge list 2168650


In [3]:
def build_recipe_graph(json: dict, max_recipes_per_ingredient: float = 1e4) -> nx.Graph:
    """Build a recipe-to-recipe graph linking recipes that share an ingredient.

    Two recipes are connected when at least one counted ingredient name
    (the ``"nombre"`` field) appears in both of them.  The ``weight``
    attribute of each edge is the number of counted ingredients the two
    recipes have in common.

    Note: the parameter name ``json`` shadows the ``json`` module inside this
    function; it is kept unchanged for compatibility with existing callers.

    Args:
        json: Mapping of recipe id to a record whose ``"ingredientes"`` entry
            is a list of dicts, each carrying a ``"nombre"`` key.
        max_recipes_per_ingredient: Ingredients listed by this many recipes or
            more are skipped entirely; they contribute neither edges nor
            weight.  Each ingredient generates one pair per two recipes that
            use it, so this bounds the quadratic growth of the pair loop.

    Returns:
        An undirected graph whose nodes are recipe ids.  Recipes that share
        no counted ingredient with any other recipe do not appear as nodes.
    """
    graph = nx.Graph()

    # ingredient name -> ids of the recipes that list it, gathered in one pass.
    recipes_by_ingredient = {}
    for recipe, record in json.items():
        for item in record["ingredientes"]:
            recipes_by_ingredient.setdefault(item["nombre"], set()).add(recipe)

    for recipes in recipes_by_ingredient.values():
        if len(recipes) >= max_recipes_per_ingredient:
            continue
        for first, second in combinations(recipes, 2):
            # Keep a fixed endpoint order (larger id first) for every pair.
            if first < second:
                first, second = second, first
            # Explicit membership test instead of a bare ``except``: a kernel
            # interrupt or an unrelated error is no longer silently swallowed.
            if graph.has_edge(first, second):
                graph[first][second]["weight"] += 1
            else:
                graph.add_edge(first, second, weight=1)

    return graph

def build_ingredient_graph(json: dict) -> nx.Graph:
    """Build an ingredient-to-ingredient graph weighted by Jaccard similarity.

    Two ingredient names (the ``"nombre"`` field) are connected when at least
    one recipe lists both.  The edge ``weight`` is

        |recipes with both| / |recipes with either|

    Instead of testing every possible ingredient pair, co-occurrences are
    counted recipe by recipe, and the union size is derived as
    ``|A| + |B| - |A and B|``; the resulting edges and weights are the same.

    Note: the parameter name ``json`` shadows the ``json`` module inside this
    function; it is kept unchanged for compatibility with existing callers.

    Args:
        json: Mapping of recipe id to a record whose ``"ingredientes"`` entry
            is a list of dicts, each carrying a ``"nombre"`` key.

    Returns:
        An undirected graph whose nodes are ingredient names.  Ingredients
        that never share a recipe with another ingredient do not appear.
    """
    graph = nx.Graph()

    ingredient_ids = {}   # ingredient name -> dense integer id (first-seen order)
    recipe_totals = {}    # ingredient id -> number of recipes listing it
    pair_counts = {}      # (smaller id, larger id) -> number of recipes listing both

    for record in json.values():
        # A set, so an ingredient repeated within one recipe is counted once.
        ids_in_recipe = set()
        for item in record["ingredientes"]:
            ids_in_recipe.add(ingredient_ids.setdefault(item["nombre"], len(ingredient_ids)))

        # Sorting integer ids gives every pair one canonical key without
        # requiring the ingredient names themselves to be orderable.
        ordered_ids = sorted(ids_in_recipe)
        for ingredient_id in ordered_ids:
            recipe_totals[ingredient_id] = recipe_totals.get(ingredient_id, 0) + 1
        for pair in combinations(ordered_ids, 2):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Dicts keep insertion order, so a name's position in this list is its id.
    names = list(ingredient_ids)
    print("Start to build graph", len(names))

    for (left, right), common_recipes in pair_counts.items():
        all_recipes = recipe_totals[left] + recipe_totals[right] - common_recipes
        graph.add_edge(names[left], names[right], weight=common_recipes / all_recipes)

    return graph

def build_ingredient_recipe_graph(json: dict, bipartite=True) -> nx.Graph:
    """Build a graph linking every recipe to the ingredients it lists.

    Ingredient nodes carry ``type="ingredient"`` and recipe nodes carry
    ``type="recipe"``.  When ``bipartite`` is true, the labels get the
    suffixes ``" (ingredient)"`` and ``" (recipe)"`` so that a recipe id and
    an ingredient name with the same text stay two distinct nodes; otherwise
    the raw labels are used unchanged.

    Args:
        json: Mapping of recipe id to a record whose ``"ingredientes"`` entry
            is a list of dicts, each carrying a ``"nombre"`` key.
        bipartite: Whether to append the disambiguating label suffixes.

    Returns:
        An undirected graph with one edge per recipe/ingredient pairing.
    """
    ingredient_tag = " (ingredient)" if bipartite else ""
    recipe_tag = " (recipe)" if bipartite else ""

    # Every distinct ingredient name used anywhere in the data.
    unique_names = set()
    for record in json.values():
        unique_names.update(item["nombre"] for item in record["ingredientes"])

    graph = nx.Graph()
    # Ingredients first, then recipes, then the links between them.
    graph.add_nodes_from((name + ingredient_tag for name in unique_names), type="ingredient")
    graph.add_nodes_from((recipe + recipe_tag for recipe in json), type="recipe")
    graph.add_edges_from(
        (recipe + recipe_tag, item["nombre"] + ingredient_tag)
        for recipe, record in json.items()
        for item in record["ingredientes"]
    )

    return graph


In [5]:
# Bipartite recipe-ingredient graph. Reported runtime: about 30 seconds.
G8 = build_ingredient_recipe_graph(recipes_json)
nx.write_graphml(G8, graph_path.joinpath("bipartite_recipe_ingredient.graphml"))

In [None]:
# Ingredient-ingredient graph. Reported runtime when this note was written: about 17-20 minutes.
G7 = build_ingredient_graph(recipes_json)
nx.write_graphml(G7, graph_path.joinpath("ingredient_node_weighted.graphml"))

In [None]:
# TOO EXPENSIVE: building this graph consumes a very large amount of memory.
# Most of what this graph represents is also available in the bipartite graph.
G6 = build_recipe_graph(recipes_json)
# Plain string literal: the original f-string had no placeholders to format.
nx.write_graphml(G6, graph_path / "recipe_node_weighted.graphml")