In [2]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from nltk.util import ngrams
from pymongo import MongoClient

In [3]:
client = MongoClient()
# client.drop_database('recipes')
db = client.recipes

# Parser

In [4]:
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    for line in f:
        word = line.strip()
        spanish_stopwords.add(word)

In [5]:
# def sublist(a, b):
#     res = False
#     for i in range(len(b)-len(a)+1):
#         if b[i:i+len(a)] == a:
#             res = True
#             break
#     return res

# def sublist(a, b):
#     str_a = str(a)[1:-1]
#     str_b = str(b)[1:-1]
#     return str_a in str_b

def sublist(a, b):
    return any(a == b[i:i+len(a)] for i in range(len(b)-len(a)+1))

In [6]:
all_ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [7]:
ingredients = all_ingredients_graph.nodes()
ingredients.sort(key=lambda x: len(x.split()), reverse=True)
ingredients = [i for i in ingredients if i not in spanish_stopwords]

In [8]:
ingredients[:10]

['nubes de metiles de purés de frutas de la pasiones',
 'espumas frías de whiskyes soures de frutas de la pasiones',
 'nube de metil de puré de fruta de la pasión',
 'espuma fría de whisky sour de fruta de la pasión',
 'aguas de la cocciones de los pies de corderos',
 'salsa de jugo de pollo al aceite de ajo',
 'aguas de la cocciones de los pies de cerdos',
 'agua de la cocción de los pie de cerdo',
 'agua de la cocción de los pies de cordero',
 'agua de la cocción de los pie de cordero']

In [9]:
ingredients[-10:]

['brick',
 'pistolas',
 'glutamato',
 'mortadelas',
 'aceite',
 'harina',
 'komes',
 'lyomango',
 'pajeles',
 'corvinas']

In [10]:
all_techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [11]:
techniques = all_techniques_graph.nodes()
techniques.sort(key=lambda x: len(x.split()), reverse=True)
techniques = [t for t in techniques if t not in spanish_stopwords]

In [12]:
techniques[:10]

['hielo y salando para enfriando',
 'cocinar en a la cazuela',
 'hielo y salar para enfriar',
 'cocinando en a la cazuela',
 'cocinado en a la cazuela',
 'cocinado de retención del calorizado',
 'cocinar de retención del calorizar',
 'cocinando de retención del calorizando',
 'hielo y salado para enfriado',
 'cocina de retención del calor']

In [13]:
techniques[-10:]

['rouelle',
 'mencionar',
 'encurtidos',
 'presionar',
 'kao',
 'crudos',
 'jiang',
 'rotavapor',
 'glaseando',
 'mechar']

In [14]:
# %%time

# db.elbulli_raw.drop()

# count = 0
# rows = []

# path = 'data/recipes/elbulli/'
# for folder in os.listdir(path):
#     for filename_number in sorted(map(lambda x: int(x[:-4]), os.listdir(path + folder))):
#         filename = str(filename_number) + '.dat'
#         with open(path + folder + '/' + filename) as f:
#             row = {
#                 '_id': '',
#                 'title': '',
#                 'year': 0,
#                 'ingredients': set(),
#                 'techniques': set(),
#             }
#             i_text = ''
#             t_text = ''
#             for line in f:
#                 line = line.strip()
#                 if line.startswith('num'):
#                     row['_id'] = line.split('=')[1]
#                 elif line.startswith('&titol='):
#                     row['title'] = line.split('=')[1]
#                 elif line.startswith('&any'):
#                     row['year'] = int(line.split('=')[1])
#                 elif line.startswith('&ingredientselaboracio'):
#                     equals_index = line.index('=')
#                     i_text += line[equals_index + 1:].lower() + ' - '
#                 elif line.startswith('&descripcioelaboracio') or \
#                      line.startswith('&acabatipresentacio') or \
#                      line.startswith('&titolelaboracio'):
#                     equals_index = line.index('=')
#                     t_text += line[equals_index + 1:].lower() + ' - '
#             i_text_tokens = nltk.word_tokenize(i_text)
#             for ingr in ingredients:
#                 ingr_tokens = nltk.word_tokenize(ingr)
#                 if sublist(ingr_tokens, i_text_tokens):
#                     all_ingredients_graph.node[ingr]['count'] += 1
#                     row['ingredients'].add(ingr)
#                     i_text = i_text.replace(ingr, '')
#                     i_text_tokens = nltk.word_tokenize(i_text)
#             row['ingredients'] = list(row['ingredients'])
#             t_text_tokens = nltk.word_tokenize(t_text)
#             for tech in techniques:
#                 tech_tokens = nltk.word_tokenize(tech)
#                 if sublist(tech_tokens, t_text_tokens):
#                     all_techniques_graph.node[tech]['count'] += 1
#                     row['techniques'].add(tech)
#                     t_text = t_text.replace(tech, '')
#                     t_text_tokens = nltk.word_tokenize(t_text)
#             row['techniques'] = list(row['techniques'])
#             rows.append(row)
            
#             count += 1
#             if count % 100 == 0:
#                 db.elbulli_raw.insert_many(rows)
#                 rows = []
#                 print(count, 'rows inserted')
# db.elbulli_raw.insert_many(rows)
# rows = []
# print(count, 'rows inserted')

# CPU times: user 1h 26min 4s, sys: 796 ms, total: 1h 26min 4s
# Wall time: 1h 26min 3s

In [15]:
# nx.write_gexf(all_ingredients_graph, 'data/elbulli_ingredients_lexicon_1.gexf')

In [16]:
# nx.write_gexf(all_techniques_graph, 'data/elbulli_techniques_lexicon_1.gexf')

In [17]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_1.gexf')

In [18]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_1.gexf')

# Only ingredients found in elbulli's recipes

In [19]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [20]:
def del_node_and_link_neighbors(g, n):
    neighbors = g.neighbors(n)
    g.remove_node(n)
    if len(neighbors) > 1:
        add_edge(g, neighbors[0], neighbors[1])

In [21]:
for n in ingredients_graph.nodes():
    if ingredients_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(ingredients_graph, n)

In [22]:
len(ingredients_graph)

1614

In [23]:
ingredients_graph.number_of_edges()

227

In [24]:
nx.number_connected_components(ingredients_graph)

1387

In [25]:
nx.write_gexf(ingredients_graph, 'data/elbulli_ingredients_lexicon_2.gexf')

In [26]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_2.gexf')

In [27]:
for n in techniques_graph.nodes():
    if techniques_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(techniques_graph, n)

In [28]:
len(techniques_graph)

181

In [29]:
techniques_graph.number_of_edges()

99

In [30]:
nx.number_connected_components(techniques_graph)

82

In [31]:
nx.write_gexf(techniques_graph, 'data/elbulli_techniques_lexicon_2.gexf')

In [32]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_2.gexf')

# Representatives

In [33]:
# def value(g, n, r):
#     ms = [m for m, rels in g[n].items() for rel in rels.values() if rel['label'] == r]
#     return ms[0]

def value(g, n, r):
    ms = [y for x, y, d in g.edges_iter(data=True) if x == n and d['label'] == r]
    return ms[0]

In [34]:
ingredients_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in syns:
        dat = ingredients_graph.node[ingr]
        if dat['count'] > max_count:
            max_ingr = ingr
            max_count = dat['count']
        total_count += dat['count']
    ingredients_multidigraph.add_node(max_ingr, {'repr_count': total_count})
    for ingr in syns:
        ingredients_multidigraph.add_node(ingr, ingredients_graph.node[ingr])
        ingredients_multidigraph.add_edge(ingr, max_ingr, label='repr')

In [35]:
len(ingredients_multidigraph)

1614

In [36]:
ingredients_multidigraph.number_of_edges()

1614

In [37]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_3.gexf')

In [38]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_3.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [39]:
techniques_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in syns:
        dat = techniques_graph.node[tech]
        if dat['count'] > max_count:
            max_tech = tech
            max_count = dat['count']
        total_count += dat['count']
    techniques_multidigraph.add_node(max_tech, {'repr_count': total_count})
    for tech in syns:
        techniques_multidigraph.add_node(tech, techniques_graph.node[tech])
        techniques_multidigraph.add_edge(tech, max_tech, label='repr')

In [40]:
len(techniques_multidigraph)

181

In [41]:
techniques_multidigraph.number_of_edges()

181

In [42]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_3.gexf')

In [43]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_3.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Superclasses

In [44]:
def add_superclass(g, n, sc):
    if sc not in g:
        g.add_node(sc, {'sc_count': 1})
        g.add_edge(sc, sc, label='repr')
        g.add_edge(sc, sc, label='superclass')
    elif 'sc_count' not in g.node[sc]:
        g.node[sc]['sc_count'] = 1
    else:
        g.node[sc]['sc_count'] += 1
    g.add_edge(n, sc, label='superclass')

In [45]:
for ingr in ingredients_multidigraph.nodes():
    representantive = value(ingredients_multidigraph, ingr, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in ingredients]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(ingredients_multidigraph, ingr, sc)

In [46]:
len(ingredients_multidigraph)

1682

In [47]:
ingredients_multidigraph.number_of_edges()

3364

In [48]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_4.gexf')

In [49]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_4.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [50]:
for tech in techniques_multidigraph.nodes():
    representantive = value(techniques_multidigraph, tech, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in techniques]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(techniques_multidigraph, tech, sc)

In [51]:
len(techniques_multidigraph)

188

In [52]:
techniques_multidigraph.number_of_edges()

376

In [53]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_4.gexf')

In [54]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_4.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Types of ingredient

In [55]:
with open('data/spanish_ingredients_type.pickle', 'rb') as f:
    type_dict = pickle.load(f)

In [56]:
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    if ingr in type_dict:
        dat['type'] = type_dict[ingr]
    else:
        dat['type'] = 'unknown'

In [57]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b)
print('Not classified:', c)

Total ingredients: 1682
Classified: 296
Not classified: 1386


In [58]:
def my_ngrams(ingredient):
    ngrms = []
    tokens = nltk.word_tokenize(ingredient)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [59]:
ambiguous = []

for syns in nx.connected_components(all_ingredients_graph):
    types = []
    if any(map(lambda x: x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown', syns)):
        types = [ingredients_multidigraph.node[x]['type']
                 for x in syns
                     if x in ingredients_multidigraph and ingredients_multidigraph.node[x]['type'] != 'unknown']
    else:
        for ingr in syns:
            if ingr in type_dict:
                types.append(type_dict[ingr])
            else:
                for ngrm in my_ngrams(ingr):
                    if ngrm in type_dict:
                        types.append(type_dict[ngrm])
    if not types:
        typ = 'unknown'
    elif len(set(types)) == 1:
        typ = types[0]
    else:
        c = Counter(types)
        most_common_list = c.most_common()
        typ = most_common_list[0][1]
        if most_common_list[1][1] == typ:
            ambiguous.append((syns, most_common_list))
    for ingr in syns:
        if ingr in ingredients_multidigraph:
            ingredients_multidigraph.node[ingr]['type'] = typ

for syns in ambiguous:
    if any(map(lambda x: x in ingredients_multidigraph, syns)):
        print(syns)
        print()

In [60]:
len(ambiguous)

209

In [64]:
a = b = c = 0
for ingr, dat in ingredients_multidigraph.nodes_iter(data=True):
    a += 1
    if dat['type'] == 'unknown':
        c += 1
    else:
        b += 1
print('Total ingredients:', a)
print('Classified:', b)
print('Not classified:', c)

Total ingredients: 1682
Classified: 1283
Not classified: 399


In [None]:
BUSCAR MAS LISTAS DE INGREDIENTES!!!! BUSCAR SI LAS PROPIAS PAGINAS DE DONDE YO SAQUE MI LISTA DE INGREDIENTES CONTIENE INFO SOBRE SU CATEGORIA

# Cuisine

In [None]:
with open('data/spanish_ingredients_cuisine.pickle', 'rb') as f:
    cuisine_dict = pickle.load(f)