In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from nltk.util import ngrams
from pymongo import MongoClient

In [2]:
client = MongoClient()
db = client.recipes

# Parser

In [3]:
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    for line in f:
        word = line.strip()
        spanish_stopwords.add(word)

In [4]:
all_ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [5]:
ingredients = all_ingredients_graph.nodes()
ingredients = [i for i in ingredients if i not in spanish_stopwords]

In [6]:
ingredients[:10]

['besameles de camaron',
 'chocolate rallado',
 'cereales sesamo garrofin',
 'láminas de champinón',
 'cereal alegrias americanas',
 'alga alga alga nori sequias',
 'hinojos fresco',
 'tés verde',
 'vinagre komes',
 'alga noris secas']

In [7]:
all_techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [8]:
techniques = all_techniques_graph.nodes()
techniques = [t for t in techniques if t not in spanish_stopwords]

In [9]:
techniques[:10]

['retencion calorizado',
 'al hornagueamos presionado',
 'sofreír a presionado',
 'al grilla presionado',
 'cocidos al vapora',
 'productos horneados a presion',
 'cocinando vaciando',
 'en jiangsu jugamos rui',
 'repostado',
 'salteando de microondas']

In [10]:
def trim(s):
    return ' '.join(s.split())

In [11]:
def my_ngrams(s):
    ngrms = []
    tokens = nltk.word_tokenize(s)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [13]:
# %%time

# db.elbulli_raw.drop()

# count = 0
# rows = []

# path = 'data/recipes/elbulli/'
# for folder in os.listdir(path):
#     for filename_number in sorted(map(lambda x: int(x[:-4]), os.listdir(path + folder))):
#         filename = str(filename_number) + '.dat'
#         with open(path + folder + '/' + filename) as f:
#             row = {
#                 '_id': '',
#                 'title': '',
#                 'year': 0,
#                 'ingredients': set(),
#                 'techniques': set(),
#             }
#             ingreds = set()
#             techns = set()
#             for line in f:
#                 line = line.lower().strip()
#                 if line.startswith('num'):
#                     ide = line.split('=')[1]
#                 elif line.startswith('&titol='):
#                     title = line.split('=')[1]
#                 elif line.startswith('&any'):
#                     year = int(line.split('=')[1])
#                 elif line.startswith('&ingredientselaboracio'):
#                     elems = line.split('=')[1].split('#')
#                     ingrs = map(trim, elems)
#                     for ingr in ingrs:
#                         ngrms = my_ngrams(ingr)
#                         ngrms.reverse()
#                         for ngrm in ngrms:
#                             if ngrm in ingredients:
#                                 ingreds.add(ngrm)
#                                 all_ingredients_graph.node[ngrm]['count'] += 1
#                                 break
#                 elif line.startswith('&descripcioelaboracio') or \
#                      line.startswith('&acabatipresentacio') or \
#                      line.startswith('&titolelaboracio'):
#                     elems = line.split('=')[1].split('#')
#                     steps = map(trim, elems)
#                     for step in steps:
#                         used_ngrams = set()
#                         ngrms = my_ngrams(step)
#                         ngrms.reverse()
#                         for ngrm in ngrms:
#                             if ngrm in techniques and all(map(lambda x: ngrm not in x, used_ngrams)):
#                                 techns.add(ngrm)
#                                 all_techniques_graph.node[ngrm]['count'] += 1
#                                 used_ngrams.add(ngrm)
#             row['_id'] = ide
#             row['title'] = title.lower()
#             row['year'] = year
#             row['ingredients'] = list(ingreds)
#             row['techniques'] = list(techns)
#             rows.append(row)
            
#             count += 1
#             if count % 100 == 0:
#                 db.elbulli_raw.insert_many(rows)
#                 rows = []
#                 print(count, 'rows inserted')
# db.elbulli_raw.insert_many(rows)
# rows = []
# print(count, 'rows inserted')

# CPU times: user 26min 38s, sys: 560 ms, total: 26min 38s
# Wall time: 26min 37s

In [14]:
db.elbulli_raw.count()

1214

In [15]:
# nx.write_gexf(all_ingredients_graph, 'data/elbulli_ingredients_lexicon_1.gexf')

In [16]:
# nx.write_gexf(all_techniques_graph, 'data/elbulli_techniques_lexicon_1.gexf')

In [17]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_1.gexf')

In [18]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_1.gexf')

# Only ingredients and techniques found in elbulli's recipes

In [19]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [20]:
def del_node_and_link_neighbors(g, n):
    neighbors = g.neighbors(n)
    g.remove_node(n)
    if len(neighbors) > 1:
        add_edge(g, neighbors[0], neighbors[1])

In [21]:
for n in ingredients_graph.nodes():
    if ingredients_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(ingredients_graph, n)

In [22]:
len(ingredients_graph)

1504

In [23]:
ingredients_graph.number_of_edges()

197

In [24]:
nx.number_connected_components(ingredients_graph)

1307

In [25]:
nx.write_gexf(ingredients_graph, 'data/elbulli_ingredients_lexicon_2.gexf')

In [26]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_2.gexf')

In [27]:
for n in techniques_graph.nodes():
    if techniques_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(techniques_graph, n)

In [28]:
len(techniques_graph)

224

In [29]:
techniques_graph.number_of_edges()

138

In [30]:
nx.number_connected_components(techniques_graph)

86

In [31]:
nx.write_gexf(techniques_graph, 'data/elbulli_techniques_lexicon_2.gexf')

In [32]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_2.gexf')

# Representatives

In [33]:
# def value(g, n, r):
#     ms = [m for m, rels in g[n].items() for rel in rels.values() if rel['label'] == r]
#     return ms[0]

def value(g, n, r):
    ms = [y for x, y, d in g.edges_iter(data=True) if x == n and d['label'] == r]
    return ms[0]

In [34]:
ingredients_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in syns:
        dat = ingredients_graph.node[ingr]
        if dat['count'] > max_count:
            max_ingr = ingr
            max_count = dat['count']
        total_count += dat['count']
    ingredients_multidigraph.add_node(max_ingr, {'repr_count': total_count})
    for ingr in syns:
        ingredients_multidigraph.add_node(ingr, ingredients_graph.node[ingr])
        ingredients_multidigraph.add_edge(ingr, max_ingr, label='repr')

In [35]:
len(ingredients_multidigraph)

1504

In [36]:
ingredients_multidigraph.number_of_edges()

1504

In [37]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_3.gexf')

In [38]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_3.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [39]:
techniques_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in syns:
        dat = techniques_graph.node[tech]
        if dat['count'] > max_count:
            max_tech = tech
            max_count = dat['count']
        total_count += dat['count']
    techniques_multidigraph.add_node(max_tech, {'repr_count': total_count})
    for tech in syns:
        techniques_multidigraph.add_node(tech, techniques_graph.node[tech])
        techniques_multidigraph.add_edge(tech, max_tech, label='repr')

In [40]:
len(techniques_multidigraph)

224

In [41]:
techniques_multidigraph.number_of_edges()

224

In [42]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_3.gexf')

In [43]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_3.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Superclasses

In [44]:
def add_superclass(g, n, sc):
    if sc not in g:
        g.add_node(sc, {'sc_count': 1})
        g.add_edge(sc, sc, label='repr')
        g.add_edge(sc, sc, label='superclass')
    elif 'sc_count' not in g.node[sc]:
        g.node[sc]['sc_count'] = 1
    else:
        g.node[sc]['sc_count'] += 1
    g.add_edge(n, sc, label='superclass')

In [45]:
for ingr in ingredients_multidigraph.nodes():
    representantive = value(ingredients_multidigraph, ingr, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in ingredients]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(ingredients_multidigraph, ingr, sc)

In [46]:
len(ingredients_multidigraph)

1594

In [47]:
ingredients_multidigraph.number_of_edges()

3188

In [48]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_4.gexf')

In [49]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_4.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [50]:
for tech in techniques_multidigraph.nodes():
    representantive = value(techniques_multidigraph, tech, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in techniques]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(techniques_multidigraph, tech, sc)

In [51]:
len(techniques_multidigraph)

230

In [52]:
techniques_multidigraph.number_of_edges()

460

In [53]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_4.gexf')

In [54]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_4.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Types of ingredient

In [55]:
with open('data/spanish_ingredients_type.pickle', 'rb') as f:
    type_dict = pickle.load(f)

In [56]:
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    if ingr in type_dict:
        dat['type'] = type_dict[ingr]
    else:
        dat['type'] = 'unknown'

In [57]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1594
Classified: 287 (18%)
Not classified: 1307 (82%)


In [59]:
ambiguous = []

for syns in nx.connected_components(all_ingredients_graph):
    types = []
    if any(map(lambda x: x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown', syns)):
        types = [ingredients_multidigraph.node[x]['type']
                 for x in syns
                     if x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown']
    else:
        for ingr in syns:
            if ingr in type_dict:
                types.append(type_dict[ingr])
            else:
                for ngrm in my_ngrams(ingr):
                    if ngrm in type_dict:
                        types.append(type_dict[ngrm])
    if not types:
        typ = 'other'
    elif len(set(types)) == 1:
        typ = types[0]
    else:
        c = Counter(types)
        most_common_list = c.most_common()
        typ = most_common_list[0][1]
        if most_common_list[1][1] == typ:
            ambiguous.append((syns, most_common_list))
    for ingr in syns:
        if ingr in ingredients_multidigraph:
            ingredients_multidigraph.node[ingr]['type'] = typ

for syns in ambiguous:
    if any(map(lambda x: x in ingredients_multidigraph, syns)):
        print(syns)
        print()

In [60]:
len(ambiguous)

238

In [61]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'other':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1594
Classified: 1241 (78%)
Not classified: 353 (22%)


In [62]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_5.gexf')

In [63]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_5.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

# Cuisine

In [64]:
with open('data/spanish_ingredients_cuisine.pickle', 'rb') as f:
    cuisine_dict = pickle.load(f)

In [65]:
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    if ingr in cuisine_dict:
        dat['cuisine'] = cuisine_dict[ingr]
    else:
        dat['cuisine'] = 'unknown'

In [66]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['cuisine'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1594
Classified: 1535 (96%)
Not classified: 59 (4%)


In [67]:
def my_ngrams(ingredient):
    ngrms = []
    tokens = nltk.word_tokenize(ingredient)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [68]:
ambiguous = []

for syns in nx.connected_components(all_ingredients_graph):
    cuisines = []
    if any(map(lambda x: x in ingredients_multidigraph and ingredients_multidigraph.node[x]['cuisine'] != 'unknown', syns)):
        cuisines = [ingredients_multidigraph.node[x]['cuisine']
                    for x in syns
                        if x in ingredients_multidigraph and ingredients_multidigraph.node[x]['cuisine'] != 'unknown']
    else:
        for ingr in syns:
            if ingr in cuisine_dict:
                cuisines.append(cuisine_dict[ingr])
            else:
                for ngrm in my_ngrams(ingr):
                    if ngrm in cuisine_dict:
                        cuisines.append(cuisine_dict[ngrm])
    if not cuisines:
        cuisine = 'other'
    elif len(set(cuisines)) == 1:
        cuisine = cuisines[0]
    else:
        c = Counter(cuisines)
        most_common_list = c.most_common()
        cuisine = most_common_list[0][1]
        if most_common_list[1][1] == cuisine:
            ambiguous.append((syns, most_common_list))
    for ingr in syns:
        if ingr in ingredients_multidigraph:
            ingredients_multidigraph.node[ingr]['cuisine'] = cuisine

for syns in ambiguous:
    if any(map(lambda x: x in ingredients_multidigraph, syns)):
        print(syns)
        print()

In [69]:
len(ambiguous)

134

In [70]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['cuisine'] == 'other':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b, '({}%)'.format(round(b * 100 / a)))
print('Not classified:', c, '({}%)'.format(round(c * 100 / a)))

Total ingredients: 1594
Classified: 1587 (100%)
Not classified: 7 (0%)


In [71]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_6.gexf')

In [72]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_6.gexf')

In [73]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_6.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [74]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_6.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# More databases

In [75]:
# Example
print('aceite de oliva' in ingredients_multidigraph)
print(value(ingredients_multidigraph, 'aceite de oliva', 'repr'))
print(value(ingredients_multidigraph, 'aceite de oliva', 'superclass'))
print(ingredients_multidigraph.node['aceite de oliva']['type'])
print(ingredients_multidigraph.node['aceite de oliva']['cuisine'])
print()
print('cocción al vacío' in techniques_multidigraph)
print(value(techniques_multidigraph, 'cocción al vacío', 'repr'))
print(value(techniques_multidigraph, 'cocción al vacío', 'superclass'))

True
aceite de oliva
aceite
spicies_and_condimients
western

True
vaciar
vaciar


In [76]:
# %%time

# db.elbulli_representatives.drop() # representative ingredients and techniques
# db.elbulli_superclasses.drop() # superclasses of ingredients and techniques
# db.elbulli_types.drop() # types of ingredients, representative techniques
# db.elbulli_cuisines.drop() # cuisines of ingredients, representative techniques

# for r in db.elbulli_raw.find():
#     row = dict(r)
    
#     row['ingredients'] = [value(ingredients_multidigraph, x, 'repr') for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     db.elbulli_representatives.insert_one(row)
    
#     row['ingredients'] = [value(ingredients_multidigraph, x, 'superclass') for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'superclass') for x in r['techniques']]
#     db.elbulli_superclasses.insert_one(row)
    
#     row['ingredients'] = [ingredients_multidigraph.node[x]['type'] for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     db.elbulli_types.insert_one(row)
    
#     row['ingredients'] = [ingredients_multidigraph.node[x]['cuisine'] for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     db.elbulli_cuisines.insert_one(row)
    
# CPU times: user 1min 2s, sys: 208 ms, total: 1min 2s
# Wall time: 1min 4s

CPU times: user 1min 42s, sys: 224 ms, total: 1min 42s
Wall time: 1min 43s
