In [5]:
import os

import networkx as nx
import nltk
from pymongo import MongoClient

In [6]:
# Connect using pymongo's defaults (no host/port is passed here).
client = MongoClient()
# Destructive reset switch: uncomment to drop the whole 'recipes' database.
# client.drop_database('recipes')
# Handle to the 'recipes' database used by the cells below.
db = client.recipes

In [3]:
# Build the stopword set from a plain-text file with one word per line;
# surrounding whitespace (including the newline) is stripped from each entry.
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as stopwords_file:
    spanish_stopwords.update(raw_line.strip() for raw_line in stopwords_file)

In [4]:
def sublist(a, b):
    """Return True if sequence ``a`` occurs as a contiguous run inside ``b``.

    Every window of ``len(a)`` consecutive items of ``b`` is compared with
    ``a`` using ``==``; the scan stops at the first match.  An empty ``a``
    matches any ``b``, and an ``a`` longer than ``b`` never matches.
    """
    width = len(a)
    last_start = len(b) - width
    return any(b[start:start + width] == a for start in range(last_start + 1))

In [5]:
# Ingredient lexicon graph: its nodes are ingredient names, and connected
# nodes are treated as synonyms of each other further down in the notebook.
ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [6]:
# Order the names from most to fewest words, so that the matching loop (kept
# commented out further down) tries long multi-word names before the shorter
# names they contain.
# sorted() replaces the in-place .sort(): Graph.nodes() is only a plain list on
# networkx 1.x, while newer releases return a NodeView that has no .sort().
# Both sorts are stable, so the resulting order is the same.
ingredients = sorted(
    ingredients_graph.nodes(),
    key=lambda name: len(name.split()),
    reverse=True,
)
# Discard names that are nothing more than a Spanish stopword.
ingredients = [i for i in ingredients if i not in spanish_stopwords]

In [7]:
# Head of the list: the names with the most words come first.
ingredients[:10]

['espuma fría de whisky sour de fruta de la pasión',
 'nube de metil de puré de fruta de la pasión',
 'nubes de metiles de purés de frutas de las pasiones',
 'espumas frías de whiskyes soures de frutas de las pasiones',
 'salsa de jugo de pollo al aceite de ajo',
 'polvos de gelatinas frías de zumos de kumquates liofilizadas',
 'agua de la cocción de los pies de cerdo',
 'agu de la cocción de el pie de cerdo',
 'polvo de gelatina fría de zumo de kumquat liofilizada',
 'aceite de atún en conserva de aceite de oliva']

In [8]:
# Tail of the list: single-word names end up last.
ingredients[-10:]

['karelas',
 'crackeres',
 'raya',
 'emperador',
 'soba',
 'ananás',
 'kobe',
 'peras',
 'tahinas',
 'jazmines']

In [9]:
# Technique lexicon graph: its nodes are technique names, and connected
# nodes are treated as synonyms of each other further down in the notebook.
techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [10]:
# Order the names from most to fewest words, so that the matching loop (kept
# commented out further down) tries long multi-word names before the shorter
# names they contain.
# sorted() replaces the in-place .sort(): Graph.nodes() is only a plain list on
# networkx 1.x, while newer releases return a NodeView that has no .sort().
# Both sorts are stable, so the resulting order is the same.
techniques = sorted(
    techniques_graph.nodes(),
    key=lambda name: len(name.split()),
    reverse=True,
)
# Discard names that are nothing more than a Spanish stopword.
techniques = [t for t in techniques if t not in spanish_stopwords]

In [11]:
# Head of the list: the names with the most words come first.
techniques[:10]

['hielo y salando para enfriando',
 'cocinando en a la cazuela',
 'hielo y salado para enfriado',
 'cocinar en a la cazuela',
 'cocinado en a la cazuela',
 'cocina de retención del calor',
 'cocinando de retención del calorizando',
 'cocinado de retención del calorizado',
 'cocinar de retención del calorizar',
 'hielo y salar para enfriar']

In [12]:
# Tail of the list: single-word names end up last.
techniques[-10:]

['ganshao',
 'marinado',
 'marcado',
 'sofrenar',
 'rouelle',
 'mirepoix',
 'itamemono',
 'sifón',
 'dentado',
 'rotando']

In [13]:
# %%time

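# NOTE: this whole cell is intentionally left commented out. It is the one-off
# extraction pass over the raw .dat recipe files (about 1h 26min, see the
# timing at the bottom). It fills db.elbulli_aux and increments the per-node
# 'count' attribute of both lexicon graphs, which the commented-out
# nx.write_gexf cells below save as the *_lexicon_6.gexf files.
# Uncomment it (and those write cells) only to rebuild these artifacts.

# count = 0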
# rows = []

# path = 'data/recipes/elbulli/'
# for folder in os.listdir(path):
#     for filename_number in sorted(map(lambda x: int(x[:-4]), os.listdir(path + folder))):
#         filename = str(filename_number) + '.dat'
#         with open(path + folder + '/' + filename) as f:
#             row = {
#                 '_id': '',
#                 'title': '',
#                 'year': 0,
#                 'ingredients': set(),
#                 'techniques': set(),
#             }
#             i_text = ''
#             t_text = ''
#             for line in f:
#                 line = line.strip()
#                 if line.startswith('num'):
#                     row['_id'] = line.split('=')[1]
#                 elif line.startswith('&titol='):
#                     row['title'] = line.split('=')[1]
#                 elif line.startswith('&any'):
#                     row['year'] = int(line.split('=')[1])
#                 elif line.startswith('&ingredientselaboracio'):
#                     equals_index = line.index('=')
#                     i_text += line[equals_index + 1:].lower() + ' - '
#                 elif line.startswith('&descripcioelaboracio') or \
#                      line.startswith('&acabatipresentacio') or \
#                      line.startswith('&titolelaboracio'):
#                     equals_index = line.index('=')
#                     t_text += line[equals_index + 1:].lower() + ' - '
#             i_text_tokens = nltk.word_tokenize(i_text)
#             for ingr in ingredients:
#                 ingr_tokens = nltk.word_tokenize(ingr)
#                 if (sublist(ingr_tokens, i_text_tokens)):
#                     row['ingredients'].add(ingr)
#                     i_text = i_text.replace(ingr, '')
#                     i_text_tokens = nltk.word_tokenize(i_text)
#             row['ingredients'] = list(row['ingredients'])
#             t_text_tokens = nltk.word_tokenize(t_text)
#             for tech in techniques:
#                 tech_tokens = nltk.word_tokenize(tech)
#                 if (sublist(tech_tokens, t_text_tokens)):
#                     row['techniques'].add(tech)
#                     t_text = t_text.replace(tech, '')
#                     t_text_tokens = nltk.word_tokenize(t_text)
#             row['techniques'] = list(row['techniques'])
#             rows.append(row)
#             for ingr in row['ingredients']:
#                 ingredients_graph.node[ingr]['count'] += 1
#             for tech in row['techniques']:
#                 techniques_graph.node[tech]['count'] += 1
            
#             count += 1
#             if count % 100 == 0:
#                 db.elbulli_aux.insert_many(rows)
#                 rows = []
#                 print(count, 'rows inserted')
# db.elbulli_aux.insert_many(rows)
# rows = []
# print(count, 'rows inserted')

# # CPU times: user 1h 26min 4s, sys: 796 ms, total: 1h 26min 4s
# # Wall time: 1h 26min 3s

In [14]:
# nx.write_gexf(ingredients_graph, 'data/spanish_ingredients_lexicon_6.gexf')

In [15]:
# nx.write_gexf(techniques_graph, 'data/spanish_techniques_lexicon_6.gexf')

In [7]:
# Reload the ingredient lexicon from the "_6" file, i.e. the one written by the
# commented-out nx.write_gexf cell above; the cells below read a 'count'
# attribute from every node of this graph.
ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_6.gexf')

In [8]:
# Reload the technique lexicon from the "_6" file, i.e. the one written by the
# commented-out nx.write_gexf cell above; the cells below read a 'count'
# attribute from every node of this graph.
techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_6.gexf')

In [9]:
# Map every ingredient name with a positive 'count' to the representative of
# its synonym group (a connected component of the lexicon graph): the member
# of the group with the highest 'count'.
repr_ingredients_dict = {}

for syns in nx.connected_components(ingredients_graph):
    # Read each member's 'count' node attribute once.
    counts = {ingr: ingredients_graph.node[ingr]['count'] for ingr in syns}
    # Only names with a positive count get an entry; a group in which no name
    # has a positive count contributes nothing.
    used = {ingr: n for ingr, n in counts.items() if n > 0}
    if not used:
        continue
    # Highest count wins.  Ties are broken alphabetically instead of by the
    # iteration order of the component, which for a set of strings can change
    # from one kernel run to the next and made the mapping non-reproducible.
    max_ingr = min(used, key=lambda ingr: (-used[ingr], ingr))
    for ingr in used:
        repr_ingredients_dict[ingr] = max_ingr

In [10]:
# Spot check: a variant resolves to the representative of its synonym group.
repr_ingredients_dict['aceite de olivas']

'aceite de oliva'

In [11]:
# Number of ingredient names with a positive count (the keys of the mapping).
len(repr_ingredients_dict)

1609

In [12]:
# Map every technique name with a positive 'count' to the representative of
# its synonym group (a connected component of the lexicon graph): the member
# of the group with the highest 'count'.
repr_techniques_dict = {}

for syns in nx.connected_components(techniques_graph):
    # Read each member's 'count' node attribute once.
    counts = {tech: techniques_graph.node[tech]['count'] for tech in syns}
    # Only names with a positive count get an entry; a group in which no name
    # has a positive count contributes nothing.
    used = {tech: n for tech, n in counts.items() if n > 0}
    if not used:
        continue
    # Highest count wins.  Ties are broken alphabetically instead of by the
    # iteration order of the component, which for a set of strings can change
    # from one kernel run to the next and made the mapping non-reproducible.
    max_tech = min(used, key=lambda tech: (-used[tech], tech))
    for tech in used:
        repr_techniques_dict[tech] = max_tech

In [13]:
# Number of technique names with a positive count (the keys of the mapping).
len(repr_techniques_dict)

181

In [14]:
# Spot check: a variant resolves to the representative of its synonym group.
repr_techniques_dict['al horno']

'horno'

In [18]:
# Number of distinct representatives, i.e. ingredient synonym groups that
# contain at least one name with a positive count.
len(set(repr_ingredients_dict.values()))

1370

In [19]:
# Number of distinct representatives, i.e. technique synonym groups that
# contain at least one name with a positive count.
len(set(repr_techniques_dict.values()))

84

In [22]:
# Rebuild the 'elbulli' collection from 'elbulli_aux', replacing every matched
# ingredient/technique name by the representative of its synonym group.
# The target collection is dropped first, so re-running this cell does not
# insert duplicates.
db.drop_collection('elbulli')

count = 0
rows = []

for row in db.elbulli_aux.find():
    # Sets collapse several synonyms found in one recipe into one entry;
    # sorting gives every stored list a stable order.
    repr_ingredients = {repr_ingredients_dict[ingr] for ingr in row['ingredients']}
    row['ingredients'] = sorted(repr_ingredients)
    repr_techniques = {repr_techniques_dict[tech] for tech in row['techniques']}
    row['techniques'] = sorted(repr_techniques)
    rows.append(row)

    count += 1
    # Write in batches of 100 documents.
    if count % 100 == 0:
        db.elbulli.insert_many(rows)
        rows = []
        print(count, 'rows inserted')
# Flush the last partial batch.  insert_many() rejects an empty list, so it
# must be skipped when the source is empty or its size is a multiple of 100.
if rows:
    db.elbulli.insert_many(rows)
    rows = []
print(count, 'rows inserted')

100 rows inserted
200 rows inserted
300 rows inserted
400 rows inserted
500 rows inserted
600 rows inserted
700 rows inserted
800 rows inserted
900 rows inserted
1000 rows inserted
1100 rows inserted
1200 rows inserted
1214 rows inserted
