In [1]:
import os
from collections import Counter

import networkx as nx
import nltk
from pymongo import MongoClient

In [2]:
client = MongoClient()
# client.drop_database('recipes')
db = client.recipes

# Parser

In [3]:
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    for line in f:
        word = line.strip()
        spanish_stopwords.add(word)

In [4]:
# def sublist(a, b):
#     res = False
#     for i in range(len(b)-len(a)+1):
#         if b[i:i+len(a)] == a:
#             res = True
#             break
#     return res

# def sublist(a, b):
#     str_a = str(a)[1:-1]
#     str_b = str(b)[1:-1]
#     return str_a in str_b

def sublist(a, b):
    return any(a == b[i:i+len(a)] for i in range(len(b)-len(a)+1))

In [5]:
ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [6]:
ingredients = ingredients_graph.nodes()
ingredients.sort(key=lambda x: len(x.split()), reverse=True)
ingredients = [i for i in ingredients if i not in spanish_stopwords]

In [7]:
ingredients[:10]

['espuma fría de whisky sour de fruta de la pasión',
 'nubes de metiles de purés de frutas de la pasiones',
 'nube de metil de puré de fruta de la pasión',
 'espumas frías de whiskyes soures de frutas de la pasiones',
 'polvo de gelatina fría de zumo de kumquat liofilizada',
 'gelatinas calientes de caldos de pancetas ibéricas entreveradas ahumadas',
 'agua de la cocción de los pies de cordero',
 'salsas de jugos de pollos al aceites de ajos',
 'agua de la cocción de los pies de cerdo',
 'aceites de atunes en conservas de aceites de olivas']

In [8]:
ingredients[-10:]

['gelatina',
 'mentaiko',
 'tuétano',
 'sequías',
 'basas',
 'almendras',
 'gumbos',
 'nitrógeno',
 'membrillos',
 'vainillina']

In [9]:
techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [10]:
techniques = techniques_graph.nodes()
techniques.sort(key=lambda x: len(x.split()), reverse=True)
techniques = [t for t in techniques if t not in spanish_stopwords]

In [11]:
techniques[:10]

['cocinado de retención del calorizado',
 'hielo y sal para enfriar',
 'hielo y salado para enfriado',
 'cocinado en a la cazuela',
 'cocinar en a la cazuela',
 'hielo y salar para enfriar',
 'cocinando en a la cazuela',
 'cocinar de retención del calorizar',
 'hielo y salando para enfriando',
 'cocina de retención del calor']

In [12]:
techniques[-10:]

['sifflets',
 'conserva',
 'brunoisse',
 'duro',
 'braseada',
 'repostar',
 'rebozar',
 'asar',
 'conchado',
 'espesando']

In [13]:
# %%time

# db.elbulli_raw.drop()

# count = 0
# rows = []

# path = 'data/recipes/elbulli/'
# for folder in os.listdir(path):
#     for filename_number in sorted(map(lambda x: int(x[:-4]), os.listdir(path + folder))):
#         filename = str(filename_number) + '.dat'
#         with open(path + folder + '/' + filename) as f:
#             row = {
#                 '_id': '',
#                 'title': '',
#                 'year': 0,
#                 'ingredients': set(),
#                 'techniques': set(),
#             }
#             i_text = ''
#             t_text = ''
#             for line in f:
#                 line = line.strip()
#                 if line.startswith('num'):
#                     row['_id'] = line.split('=')[1]
#                 elif line.startswith('&titol='):
#                     row['title'] = line.split('=')[1]
#                 elif line.startswith('&any'):
#                     row['year'] = int(line.split('=')[1])
#                 elif line.startswith('&ingredientselaboracio'):
#                     equals_index = line.index('=')
#                     i_text += line[equals_index + 1:].lower() + ' - '
#                 elif line.startswith('&descripcioelaboracio') or \
#                      line.startswith('&acabatipresentacio') or \
#                      line.startswith('&titolelaboracio'):
#                     equals_index = line.index('=')
#                     t_text += line[equals_index + 1:].lower() + ' - '
#             i_text_tokens = nltk.word_tokenize(i_text)
#             for ingr in ingredients:
#                 ingr_tokens = nltk.word_tokenize(ingr)
#                 if sublist(ingr_tokens, i_text_tokens):
#                     ingredients_graph.node[ingr]['count'] += 1
#                     row['ingredients'].add(ingr)
#                     i_text = i_text.replace(ingr, '')
#                     i_text_tokens = nltk.word_tokenize(i_text)
#             row['ingredients'] = list(row['ingredients'])
#             t_text_tokens = nltk.word_tokenize(t_text)
#             for tech in techniques:
#                 tech_tokens = nltk.word_tokenize(tech)
#                 if sublist(tech_tokens, t_text_tokens):
#                     techniques_graph.node[tech]['count'] += 1
#                     row['techniques'].add(tech)
#                     t_text = t_text.replace(tech, '')
#                     t_text_tokens = nltk.word_tokenize(t_text)
#             row['techniques'] = list(row['techniques'])
#             rows.append(row)
            
#             count += 1
#             if count % 100 == 0:
#                 db.elbulli_raw.insert_many(rows)
#                 rows = []
#                 print(count, 'rows inserted')
# db.elbulli_raw.insert_many(rows)
# rows = []
# print(count, 'rows inserted')

# CPU times: user 1h 26min 4s, sys: 796 ms, total: 1h 26min 4s
# Wall time: 1h 26min 3s

In [14]:
# nx.write_gexf(ingredients_graph, 'data/elbulli_ingredients_lexicon_1.gexf')

In [15]:
# nx.write_gexf(techniques_graph, 'data/elbulli_techniques_lexicon_1.gexf')

In [16]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_1.gexf')

In [17]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_1.gexf')

# Only ingredients found in elbulli's recipes

In [18]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [19]:
def del_node_and_link_neighbors(g, n):
    neighbors = g.neighbors(n)
    g.remove_node(n)
    if len(neighbors) > 1:
        add_edge(g, neighbors[0], neighbors[1])

In [20]:
for n in ingredients_graph.nodes():
    if ingredients_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(ingredients_graph, n)

In [21]:
len(ingredients_graph)

1614

In [22]:
ingredients_graph.number_of_edges()

233

In [23]:
nx.number_connected_components(ingredients_graph)

1381

In [24]:
nx.write_gexf(ingredients_graph, 'data/elbulli_ingredients_lexicon_2.gexf')

In [25]:
ingredients_graph = nx.read_gexf('data/elbulli_ingredients_lexicon_2.gexf')

In [26]:
for n in techniques_graph.nodes():
    if techniques_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(techniques_graph, n)

In [27]:
len(techniques_graph)

181

In [28]:
techniques_graph.number_of_edges()

101

In [29]:
nx.number_connected_components(techniques_graph)

80

In [30]:
nx.write_gexf(techniques_graph, 'data/elbulli_techniques_lexicon_2.gexf')

In [31]:
techniques_graph = nx.read_gexf('data/elbulli_techniques_lexicon_2.gexf')

# Representatives

In [32]:
# def get_related_node(g, n, r):
#     ms = [m for m, rels in g[n].items() for rel in rels.values() if rel['label'] == r]
#     return ms[0]

def get_related_node(g, n, r):
    ms = [y for x, y, d in g.edges_iter(data=True) if x == n and d['label'] == r]
    return ms[0]

In [33]:
ingredients_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in syns:
        dat = ingredients_graph.node[ingr]
        if dat['count'] > max_count:
            max_ingr = ingr
            max_count = dat['count']
        total_count += dat['count']
    ingredients_multidigraph.add_node(max_ingr, {'repr_count': total_count})
    for ingr in syns:
        ingredients_multidigraph.add_node(ingr, ingredients_graph.node[ingr])
        ingredients_multidigraph.add_edge(ingr, max_ingr, label='repr')

In [34]:
len(ingredients_multidigraph)

1614

In [35]:
ingredients_multidigraph.number_of_edges()

1614

In [36]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_3.gexf')

In [37]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_3.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [38]:
techniques_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in syns:
        dat = techniques_graph.node[tech]
        if dat['count'] > max_count:
            max_tech = tech
            max_count = dat['count']
        total_count += dat['count']
    techniques_multidigraph.add_node(max_tech, {'repr_count': total_count})
    for tech in syns:
        techniques_multidigraph.add_node(tech, techniques_graph.node[tech])
        techniques_multidigraph.add_edge(tech, max_tech, label='repr')

In [39]:
len(techniques_multidigraph)

181

In [40]:
techniques_multidigraph.number_of_edges()

181

In [41]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_3.gexf')

In [42]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_3.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# Superclasses

In [43]:
def add_superclass(g, n, sc):
    if sc not in g:
        g.add_node(sc, {'sc_count': 1})
        g.add_edge(sc, sc, label='repr')
        g.add_edge(sc, sc, label='superclass')
    elif 'sc_count' not in g.node[sc]:
        g.node[sc]['sc_count'] = 1
    else:
        g.node[sc]['sc_count'] += 1
    g.add_edge(n, sc, label='superclass')

In [44]:
for ingr in ingredients_multidigraph.nodes():
    representantive = get_related_node(ingredients_multidigraph, ingr, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in ingredients]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(ingredients_multidigraph, ingr, sc)

In [45]:
len(ingredients_multidigraph)

1682

In [46]:
ingredients_multidigraph.number_of_edges()

3364

In [47]:
nx.write_gexf(ingredients_multidigraph, 'data/elbulli_ingredients_lexicon_4.gexf')

In [48]:
ingredients_multidigraph = nx.read_gexf('data/elbulli_ingredients_lexicon_4.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [49]:
for tech in techniques_multidigraph.nodes():
    representantive = get_related_node(techniques_multidigraph, tech, 'repr')
    tokens = nltk.word_tokenize(representantive)
    candidates = [token for token in tokens if token in techniques]
    if not candidates:
        sc = tokens[0]
    else:
        sc = candidates[0]
    add_superclass(techniques_multidigraph, tech, sc)

In [50]:
len(techniques_multidigraph)

188

In [51]:
techniques_multidigraph.number_of_edges()

376

In [52]:
nx.write_gexf(techniques_multidigraph, 'data/elbulli_techniques_lexicon_4.gexf')

In [53]:
techniques_multidigraph = nx.read_gexf('data/elbulli_techniques_lexicon_4.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)