In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.util import ngrams
from pymongo import MongoClient

In [2]:
client = MongoClient()
db = client.recipes

# Parser

In [3]:
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    for line in f:
        word = line.strip()
        spanish_stopwords.add(word)

In [4]:
all_ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [5]:
ingredients = all_ingredients_graph.nodes()
ingredients = [i for i in ingredients if i not in spanish_stopwords]

In [6]:
ingredients[:10]

['mantequilla extra sequias',
 'piñones frito',
 'puré papa',
 'cereal de sésamo arabiga',
 'sorbete cocos',
 'castanas jarabes',
 'queso bries',
 'ajonjolies negro tostado',
 'cerdo espín mar',
 'ajonjolí tostado']

In [7]:
all_techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [8]:
techniques = all_techniques_graph.nodes()
techniques = [t for t in techniques if t not in spanish_stopwords]

In [9]:
techniques[:10]

['rótir a presionando',
 'barbacoa presionar',
 'licue',
 'horno presionar',
 'diamantamos',
 'sofreímos a presione',
 'guisando rojeando',
 'secado',
 'sofríe a presion',
 'cueza en caldee blanco']

In [10]:
def trim(s):
    return ' '.join(s.split())

In [11]:
def my_ngrams(s):
    ngrms = []
    tokens = nltk.word_tokenize(s)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [12]:
# %%time

# db.cookpad_raw.drop()

# count = 0
# rows = []

# for filename in os.listdir('/media/antonio/WD1T/datasets-recipes/cookpad/'):
#     with open('/media/antonio/WD1T/datasets-recipes/cookpad/' + filename) as f:
#         row = {
#             '_id': '',
#             'title': '',
#             'cuisine': '',
#             'year': 0,
#             'ingredients': set(),
#             'techniques': set(),
#         }
#         ide = filename.split('traditional_recipe_')[1].split('.')[0]
#         soup = BeautifulSoup(f.read(), 'html.parser')
#         cuisine = trim(soup.select(
#             'div#main_contents div.recipe-show div#editor section.intro div[data-field-group] div.metadata__field \
#              span[data-field-name="cuisine"]')[0].text.lower()
#         )
#         if 'españa' not in cuisine:
#             continue
#         ingreds = set()
#         elems = soup.select(
#             'div#main_contents section#ingredients div.ingredient-list ol li.ingredient span.ingredient__attribute--name'
#         )
#         ingrs = list(map(lambda x: trim(x.text.lower()), elems))
#         for ingr in ingrs:
#             ngrms = my_ngrams(ingr)
#             ngrms.reverse()
#             for ngrm in ngrms:
#                 if ngrm in ingredients:
#                     ingreds.add(ngrm)
#                     all_ingredients_graph.node[ngrm]['count'] += 1
#                     break
#         if not ingreds:
#             continue
#         techns = set()
#         elems = soup.select(
#             'div#main_contents section#steps ol li.step div.step__description \
#              div[itemprop="recipeInstructions"] p.step__text'
#         )
#         steps = list(map(lambda x: trim(x.text.lower()), elems))
#         for step in steps:
#             used_ngrams = set()
#             ngrms = my_ngrams(step)
#             ngrms.reverse()
#             for ngrm in ngrms:
#                 if ngrm in techniques and all(map(lambda x: ngrm not in x, used_ngrams)):
#                     techns.add(ngrm)
#                     all_techniques_graph.node[ngrm]['count'] += 1
#                     used_ngrams.add(ngrm)
#         if not techns:
#             continue
#         time = soup.select('time[itemprop="datePublished"]')[0]['datetime']
#         year = time.split('-')[0]
#         title = trim(soup.select(
#             'div#main_contents div.recipe-show div#editor section.intro h1.recipe-title')[0].text.lower()
#         )
#         row['_id'] = ide
#         row['title'] = title
#         row['cuisine'] = cuisine
#         row['year'] = year
#         row['ingredients'] = list(ingreds)
#         row['techniques'] = list(techns)
#         rows.append(row)

#         count += 1
#         if count % 100 == 0:
#             db.cookpad_raw.insert_many(rows)
#             rows = []
#             print(count, 'rows inserted')
# db.cookpad_raw.insert_many(rows)
# rows = []
# print(count, 'rows inserted')

# CPU times: user 1h 4min 21s, sys: 1.68 s, total: 1h 4min 22s
# Wall time: 1h 4min 21s

In [13]:
db.cookpad_raw.count()

7975

In [14]:
# nx.write_gexf(all_ingredients_graph, 'data/cookpad_ingredients_lexicon_1.gexf')

In [15]:
# nx.write_gexf(all_techniques_graph, 'data/cookpad_techniques_lexicon_1.gexf')

In [16]:
ingredients_graph = nx.read_gexf('data/cookpad_ingredients_lexicon_1.gexf')

In [17]:
techniques_graph = nx.read_gexf('data/cookpad_techniques_lexicon_1.gexf')

# Only ingredients and techniques found in Cookpad's recipes

In [18]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [19]:
def del_node_and_link_neighbors(g, n):
    neighbors = g.neighbors(n)
    g.remove_node(n)
    if len(neighbors) > 1:
        add_edge(g, neighbors[0], neighbors[1])

In [20]:
for n in ingredients_graph.nodes():
    if ingredients_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(ingredients_graph, n)

In [21]:
len(ingredients_graph)

1630

In [22]:
ingredients_graph.number_of_edges()

523

In [23]:
nx.number_connected_components(ingredients_graph)

1107

In [24]:
nx.write_gexf(ingredients_graph, 'data/cookpad_ingredients_lexicon_2.gexf')

In [25]:
ingredients_graph = nx.read_gexf('data/cookpad_ingredients_lexicon_2.gexf')

In [26]:
for n in techniques_graph.nodes():
    if techniques_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(techniques_graph, n)

In [27]:
len(techniques_graph)

361

In [28]:
techniques_graph.number_of_edges()

252

In [29]:
nx.number_connected_components(techniques_graph)

109

In [30]:
nx.write_gexf(techniques_graph, 'data/cookpad_techniques_lexicon_2.gexf')

In [31]:
techniques_graph = nx.read_gexf('data/cookpad_techniques_lexicon_2.gexf')

# Representatives

In [32]:
# def value(g, n, r):
#     ms = [m for m, rels in g[n].items() for rel in rels.values() if rel['label'] == r]
#     return ms[0]

def value(g, n, r):
    ms = [y for x, y, d in g.edges_iter(data=True) if x == n and d['label'] == r]
    return ms[0]

In [33]:
ingredients_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in syns:
        dat = ingredients_graph.node[ingr]
        if dat['count'] > max_count:
            max_ingr = ingr
            max_count = dat['count']
        total_count += dat['count']
    ingredients_multidigraph.add_node(max_ingr, {'repr_count': total_count})
    for ingr in syns:
        ingredients_multidigraph.add_node(ingr, ingredients_graph.node[ingr])
        ingredients_multidigraph.add_edge(ingr, max_ingr, label='repr')

In [34]:
len(ingredients_multidigraph)

1630

In [35]:
ingredients_multidigraph.number_of_edges()

1630

In [36]:
nx.write_gexf(ingredients_multidigraph, 'data/cookpad_ingredients_lexicon_3.gexf')

In [37]:
ingredients_multidigraph = nx.read_gexf('data/cookpad_ingredients_lexicon_3.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [38]:
techniques_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in syns:
        dat = techniques_graph.node[tech]
        if dat['count'] > max_count:
            max_tech = tech
            max_count = dat['count']
        total_count += dat['count']
    techniques_multidigraph.add_node(max_tech, {'repr_count': total_count})
    for tech in syns:
        techniques_multidigraph.add_node(tech, techniques_graph.node[tech])
        techniques_multidigraph.add_edge(tech, max_tech, label='repr')

In [39]:
len(techniques_multidigraph)

361

In [40]:
techniques_multidigraph.number_of_edges()

361

In [41]:
nx.write_gexf(techniques_multidigraph, 'data/cookpad_techniques_lexicon_3.gexf')

In [42]:
techniques_multidigraph = nx.read_gexf('data/cookpad_techniques_lexicon_3.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Superclasses

In [43]:
def add_superclass(g, n, sc):
    if sc not in g:
        g.add_node(sc, {'sc_count': 1})
        g.add_edge(sc, sc, label='repr')
        g.add_edge(sc, sc, label='superclass')
    elif 'sc_count' not in g.node[sc]:
        g.node[sc]['sc_count'] = 1
    else:
        g.node[sc]['sc_count'] += 1
    g.add_edge(n, sc, label='superclass')

In [44]:
for ingr in ingredients_multidigraph.nodes():
    representantive = value(ingredients_multidigraph, ingr, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in ingredients]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(ingredients_multidigraph, ingr, sc)

In [45]:
len(ingredients_multidigraph)

1656

In [46]:
ingredients_multidigraph.number_of_edges()

3312

In [47]:
nx.write_gexf(ingredients_multidigraph, 'data/cookpad_ingredients_lexicon_4.gexf')

In [48]:
ingredients_multidigraph = nx.read_gexf('data/cookpad_ingredients_lexicon_4.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [49]:
for tech in techniques_multidigraph.nodes():
    representantive = value(techniques_multidigraph, tech, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in techniques]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(techniques_multidigraph, tech, sc)

In [50]:
len(techniques_multidigraph)

371

In [51]:
techniques_multidigraph.number_of_edges()

742

In [52]:
nx.write_gexf(techniques_multidigraph, 'data/cookpad_techniques_lexicon_4.gexf')

In [53]:
techniques_multidigraph = nx.read_gexf('data/cookpad_techniques_lexicon_4.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Types of ingredient

In [54]:
with open('data/spanish_ingredients_type.pickle', 'rb') as f:
    type_dict = pickle.load(f)

In [55]:
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    if ingr in type_dict:
        dat['type'] = type_dict[ingr]
    else:
        dat['type'] = 'unknown'

In [56]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1656
Classified: 404 (24%)
Not classified: 1252 (76%)


In [57]:
ambiguous = []

for syns in nx.connected_components(all_ingredients_graph):
    types = []
    if any(map(lambda x: x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown', syns)):
        types = [ingredients_multidigraph.node[x]['type']
                 for x in syns
                     if x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown']
    else:
        for ingr in syns:
            if ingr in type_dict:
                types.append(type_dict[ingr])
            else:
                for ngrm in my_ngrams(ingr):
                    if ngrm in type_dict:
                        types.append(type_dict[ngrm])
    if not types:
        typ = 'other'
    elif len(set(types)) == 1:
        typ = types[0]
    else:
        c = Counter(types)
        most_common_list = c.most_common()
        typ = most_common_list[0][1]
        if most_common_list[1][1] == typ:
            ambiguous.append((syns, most_common_list))
    for ingr in syns:
        if ingr in ingredients_multidigraph:
            ingredients_multidigraph.node[ingr]['type'] = typ

for syns in ambiguous:
    if any(map(lambda x: x in ingredients_multidigraph, syns)):
        print(syns)
        print()

In [58]:
len(ambiguous)

239

In [59]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'other':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1656
Classified: 1304 (79%)
Not classified: 352 (21%)


In [60]:
nx.write_gexf(ingredients_multidigraph, 'data/cookpad_ingredients_lexicon_5.gexf')

In [61]:
ingredients_multidigraph = nx.read_gexf('data/cookpad_ingredients_lexicon_5.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

# Cuisine

In [62]:
with open('data/spanish_ingredients_cuisine.pickle', 'rb') as f:
    cuisine_dict = pickle.load(f)

In [63]:
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    if ingr in cuisine_dict:
        dat['cuisine'] = cuisine_dict[ingr]
    else:
        dat['cuisine'] = 'unknown'

In [64]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['cuisine'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1656
Classified: 1578 (95%)
Not classified: 78 (5%)


In [65]:
ambiguous = []

for syns in nx.connected_components(all_ingredients_graph):
    cuisines = []
    if any(map(lambda x: x in ingredients_multidigraph and ingredients_multidigraph.node[x]['cuisine'] != 'unknown', syns)):
        cuisines = [ingredients_multidigraph.node[x]['cuisine']
                    for x in syns
                        if x in ingredients_multidigraph and ingredients_multidigraph.node[x]['cuisine'] != 'unknown']
    else:
        for ingr in syns:
            if ingr in cuisine_dict:
                cuisines.append(cuisine_dict[ingr])
            else:
                for ngrm in my_ngrams(ingr):
                    if ngrm in cuisine_dict:
                        cuisines.append(cuisine_dict[ngrm])
    if not cuisines:
        cuisine = 'other'
    elif len(set(cuisines)) == 1:
        cuisine = cuisines[0]
    else:
        c = Counter(cuisines)
        most_common_list = c.most_common()
        cuisine = most_common_list[0][1]
        if most_common_list[1][1] == cuisine:
            ambiguous.append((syns, most_common_list))
    for ingr in syns:
        if ingr in ingredients_multidigraph:
            ingredients_multidigraph.node[ingr]['cuisine'] = cuisine

for syns in ambiguous:
    if any(map(lambda x: x in ingredients_multidigraph, syns)):
        print(syns)
        print()

In [66]:
len(ambiguous)

183

In [67]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['cuisine'] == 'other':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1656
Classified: 1652 (100%)
Not classified: 4 (0%)


In [68]:
nx.write_gexf(ingredients_multidigraph, 'data/cookpad_ingredients_lexicon_6.gexf')

In [69]:
nx.write_gexf(techniques_multidigraph, 'data/cookpad_techniques_lexicon_6.gexf')

In [70]:
ingredients_multidigraph = nx.read_gexf('data/cookpad_ingredients_lexicon_6.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [71]:
techniques_multidigraph = nx.read_gexf('data/cookpad_techniques_lexicon_6.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# More databases

In [72]:
# Example
print('aceite de oliva' in ingredients_multidigraph)
print(value(ingredients_multidigraph, 'aceite de oliva', 'repr'))
print(value(ingredients_multidigraph, 'aceite de oliva', 'superclass'))
print(ingredients_multidigraph.node['aceite de oliva']['type'])
print(ingredients_multidigraph.node['aceite de oliva']['cuisine'])
print()
print('olla a presión' in techniques_multidigraph)
print(value(techniques_multidigraph, 'olla a presión', 'repr'))
print(value(techniques_multidigraph, 'olla a presión', 'superclass'))

True
aceite de oliva
aceite
spicies_and_condimients
western

True
olla a presión
presión


In [74]:
# %%time

# db.cookpad_representatives.drop() # representative ingredients and techniques
# db.cookpad_superclasses.drop() # superclasses of ingredients and techniques
# db.cookpad_cuisines.drop() # cuisines of ingredients, representative techniques
# db.cookpad_types.drop() # types of ingredients, representative techniques

# count = 0
# representatives_rows = []
# superclasses_rows = []
# cuisines_rows = []
# types_rows = []

# for r in db.cookpad_raw.find():
#     row = dict(r)
#     row['ingredients'] = [value(ingredients_multidigraph, x, 'repr') for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     representatives_rows.append(row)
    
#     row = dict(r)
#     row['ingredients'] = [value(ingredients_multidigraph, x, 'superclass') for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'superclass') for x in r['techniques']]
#     superclasses_rows.append(row)
    
#     row = dict(r)
#     row['ingredients'] = [ingredients_multidigraph.node[x]['cuisine'] for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     cuisines_rows.append(row)
    
#     row = dict(r)
#     row['ingredients'] = [ingredients_multidigraph.node[x]['type'] for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     types_rows.append(row)

#     count += 1
#     if count % 100 == 0:
#         db.cookpad_representatives.insert_many(representatives_rows)
#         db.cookpad_superclasses.insert_many(superclasses_rows)
#         db.cookpad_cuisines.insert_many(cuisines_rows)
#         db.cookpad_types.insert_many(types_rows)
#         representatives_rows = []
#         superclasses_rows = []
#         cuisines_rows = []
#         types_rows = []
#         print(count, 'rows inserted')
# db.cookpad_representatives.insert_many(representatives_rows)
# db.cookpad_superclasses.insert_many(superclasses_rows)
# db.cookpad_cuisines.insert_many(cuisines_rows)
# db.cookpad_types.insert_many(types_rows)
# representatives_rows = []
# superclasses_rows = []
# cuisines_rows = []
# types_rows = []
# print(count, 'rows inserted')
    
# # CPU times: user 5min 4s, sys: 40 ms, total: 5min 4s
# # Wall time: 5min 4s