In [1]:
import networkx
import json
import collections

In [2]:
# Load data from the scrape
# data = json.load(open('../bbc_ingredients/bbc_ingredients.2.json', 'r'));
data = json.load(open('../bbc_ingredients/bbc_ingredients.20150426_0911.json', 'r'));

In [3]:
ingredient_recipe_data = []
recipe_data = []

for element in data:
    if element['item_type'] == 'IngredientRecipeItem':
        ingredient_recipe_data.append(element)
    else:
        recipe_data.append(element)

KeyError: 'item_type'

In [23]:
def add_hypernodes(G, recipe_data):
    # Create all nodes in the graph
    for recipe in recipe_data:
        G.add_node(
            recipe['url'],
            recipe)
        G.node[recipe['url']]['type'] = 'hypernode'

In [29]:
def add_hyperedges(G, recipe_data):
    # Create all edges between nodes
    for recipe in recipe_data:
        recipe_url = recipe['url']
        assert G.has_node(recipe_url)
        for ingredient_ids, ingredient_text in zip(recipe.get('ingredient_urls', []), recipe.get('ingredients', [])):
            ingredient_ids = list(set(ingredient_ids))
            hyperedge_sources = [ingredient_ids]
            # If the line only contains one ingredient, just create 1 hyperedge
            if len(ingredient_ids) > 1:
                if ingredient_text.find(' or ') >= 0:
                    # If the line is an 'OR' of ingredient, create one hyperedge
                    pass
                else:
                    # Else, create one hyperedge for each ingredient.
                    hyperedge_sources = map(lambda x : [x], ingredient_ids)
            for hyperedge_source in hyperedge_sources:
                hyperedge_name = 'E(%s)' % ', '.join(hyperedge_source)
                if not G.has_node(hyperedge_name):
                    G.add_node(
                        hyperedge_name,
                        {
                            'sources': hyperedge_source,
                            'targets': []
                        })
                    G.node[hyperedge_name]['type'] = 'hyperedge'
                    # Connect the hyperedge to the source
                    for ingredient_id in hyperedge_source:
                        assert G.has_node(ingredient_id)
                        G.add_edge(ingredient_id, hyperedge_name)
                # Connect to the target
                G.node[hyperedge_name]['targets'].append(recipe_url)
                G.add_edge(hyperedge_name, recipe_url)

In [30]:
def create_ingredient_recipe_list(ingredient_recipe_data):
    # Group together the recipes of the ingredients
    ingredient_recipe_list = collections.defaultdict(set)
    for ingredient_recipe in ingredient_recipe_data:
        # Find the ingredient node
        ingredient_url = ingredient_recipe['ingredient_url']
        recipe_url = ingredient_recipe['recipe_url']
        ingredient_recipe_list[ingredient_url].add(recipe_url)
    return ingredient_recipe_list

In [31]:
def split_pseudo_ingredient_hypernodes(G, ingredient_recipe_data):
    # Split the ingredient node based on the different recipes
    ingredient_recipe_list = create_ingredient_recipe_list(ingredient_recipe_data)
    for ingredient_url, recipe_urls in ingredient_recipe_list.iteritems():
    #     print 'Ingredient: %s, recipes: %s' % (ingredient_url, ', '.join(recipe_urls))
        assert G.has_node(ingredient_url)
        assert G.in_degree(ingredient_url) == 0

        # Extract the outgoing and incoming edges
        out_edges = G.out_edges(ingredient_url)
        for recipe_url in recipe_urls:
            assert G.has_node(recipe_url), 'Unable to find node %s' % recipe_url
    #         print 'Ingredient: %s, recipe: %s' % (ingredient_url, recipe_url)
            if 'ingredient_url' not in G.node[recipe_url]:
                G.node[recipe_url]['ingredient_url'] = []
            # Annotate the recipe node with the ingredient url
            G.node[recipe_url]['ingredient_url'].append(ingredient_url)

            if len(G.node[recipe_url]['ingredient_url']) > 1:
                print 'Warning: recipe %s is shared (%s) ' % (recipe_url, ', '.join(G.node[recipe_url]['ingredient_url'])) 

            # Connect outgoing nodes
            for _, hyperedge in out_edges:
                G.add_edge(recipe_url, hyperedge)
        # Remove the ingredient node
        G.remove_node(ingredient_url)

In [32]:
G = networkx.DiGraph()
add_hypernodes(G, recipe_data)
print 'Nodes: %d, edges: %d' % (G.number_of_nodes(), G.number_of_edges())
add_hyperedges(G, recipe_data)
print 'Nodes: %d, edges: %d' % (G.number_of_nodes(), G.number_of_edges())
# Save check. The graph should be acyclic
assert len(list(networkx.algorithms.cycles.simple_cycles(G))) == 0

 Nodes: 9514, edges: 0
 Nodes: 10667, edges: 99913


In [4]:
# Split the pseudo-ingredient hypernodes
G_ingredient_split = G.copy()
split_pseudo_ingredient_hypernodes(G_ingredient_split, ingredient_recipe_data)
print 'Nodes: %d, edges: %d' % (G_ingredient_split.number_of_nodes(), G_ingredient_split.number_of_edges())
assert len(list(networkx.algorithms.cycles.simple_cycles(G_ingredient_split))) == 0

NameError: name 'G' is not defined

In [5]:
def remove_simple_hyperedges(G):
    # Removes hyperedges with one source and one target
    for x, d in G.nodes(data=True):
        if d['type'] == 'hyperedge':
            in_edges = G.in_edges[x]
            out_edges = G.out_edges[x]
            assert len(in_edges) > 0
            assert len(out_edges) > 0
            if len(in_edges) == 1 and len(out_edges) == 1:
                # Remove hyperedge
                G.add_edge(in_edges[0][0], out_edges[0][1], {'type': 'hyperedge'})
                G.remove_node(x)

In [11]:
G_compressed = G_ingredient_split.copy()
remove_simple_hyperedges(G_compressed)
print 'Nodes: %d, edges: %d' % (G_compressed.number_of_nodes(), G_compressed.number_of_edges())
# Save check. The graph should be acyclic
assert len(list(networkx.algorithms.cycles.simple_cycles(G_compressed))) == 0

False

In [54]:
# Remove some odd nodes
def remove_food_node(G, id):
    out_edges = G.out_edges[id]
    in_edges = G.in_edges[id]
    # Remove the node from the graph
    G.remove_node(id)
    # Remove all outgoing hyperedges, iff no other node is connected
    for _, hyperedge in out_edges:
        if G.in_degree(hyperedge) == 0:
            G.remove_node(hyperedge)
    # Remove all incoming hyperedges
    for _, hyperedge in in_edges:
        G.remove_node(hyperedge)

remove_food_node(G, 'zest')
remove_food_node(G, 'http://www.bbc.co.uk/food/quick_suppers')


TypeError: 'instancemethod' object has no attribute '__getitem__'