In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

In [2]:
client = MongoClient()
db = client.recipes

# Parser

In [3]:
english_stopwords = set(stopwords.words('english'))

In [4]:
all_ingredients_graph = nx.read_gexf('data/english_ingredients_lexicon_5.gexf')

In [5]:
ingredients = all_ingredients_graph.nodes()
ingredients = [i for i in ingredients if i not in english_stopwords]

In [6]:
ingredients[:10]

['yelloweye bean',
 'date sirups',
 'blackberry bush passover wines',
 'mulato chile',
 'vanillas extracts gelatoes',
 'chantilly cream rice',
 'peanuts oil',
 'non dairy milks',
 'low-fat milks',
 'almonds cordial']

In [7]:
all_techniques_graph = nx.read_gexf('data/english_techniques_lexicon_5.gexf')

In [8]:
techniques = all_techniques_graph.nodes()
techniques = [t for t in techniques if t not in english_stopwords]

In [9]:
techniques[:10]

['broasting',
 'low temperature',
 'boiling rice',
 'rolled double boiled',
 'soften',
 'knock',
 'iced the puck',
 'pressured',
 'planking',
 'dough sheet']

In [10]:
def trim(s):
    return ' '.join(s.split())

In [11]:
def my_ngrams(s):
    ngrms = []
    tokens = nltk.word_tokenize(s)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [12]:
# # %%time

# def insert_documents(docs):
#     try:
#         db.epicurious_raw.insert_many(docs, ordered=False)
#     except BulkWriteError as e:
#         pass

# # db.epicurious_raw.drop()

# count = 0
# rows = []

# # path = 'data/recipes/epicurious/'
# path = '/media/antonio/WD1T/datasets-recipes/epicurious/'
# for folder_number in sorted(map(int, os.listdir(path))):
#     folder = str(folder_number)
#     for filename_number in sorted(map(lambda x: int(x[:-5]), os.listdir(path + folder))):
#         filename = str(filename_number) + '.html'
#         with open(path + folder + '/' + filename) as f:
#             row = {
#                 '_id': '',
#                 'title': '',
#                 'year': 0,
#                 'ingredients': set(),
#                 'techniques': set(),
#             }
#             soup = BeautifulSoup(f.read(), 'html.parser')
#             year = 0
#             date = soup.select('meta[itemprop=datePublished]')
#             if date:
#                 year = date[0]['content'].split('-')[0]
#             if not year:
#                 continue
#             ingreds = set()
#             elems = soup.select(
#                 'div.recipe-content div.ingredients-info ul.ingredients li.ingredient'
#             )
#             ingrs = list(map(lambda x: trim(x.text.lower()), elems))
#             for ingr in ingrs:
#                 ngrms = my_ngrams(ingr)
#                 ngrms.reverse()
#                 for ngrm in ngrms:
#                     if ngrm in ingredients:
#                         ingreds.add(ngrm)
#                         # all_ingredients_graph.node[ngrm]['count'] += 1
#                         break
#             if not ingreds:
#                 continue
#             techns = set()
#             elems = soup.select(
#                 'div.recipe-content div.instructions ol.preparation-steps li.preparation-step'
#             )
#             steps = list(map(lambda x: trim(x.text.lower()), elems))
#             for step in steps:
#                 used_ngrams = set()
#                 ngrms = my_ngrams(step)
#                 ngrms.reverse()
#                 for ngrm in ngrms:
#                     if ngrm in techniques and all(map(lambda x: ngrm not in x, used_ngrams)):
#                         techns.add(ngrm)
#                         # all_techniques_graph.node[ngrm]['count'] += 1
#                         used_ngrams.add(ngrm)
#             if not techns:
#                 continue
#             ide = str(filename_number)
#             title = soup.select('meta[property=og:title]')[0]['content']
#             row['_id'] = ide
#             row['title'] = title
#             row['year'] = year
#             row['ingredients'] = list(ingreds)
#             row['techniques'] = list(techns)
#             rows.append(row)

#             count += 1
#             if count % 100 == 0:
#                 insert_documents(rows)
#                 rows = []
#                 print(count, 'rows inserted')
# insert_documents(rows)
# rows = []
# print(count, 'rows inserted')

In [13]:
db.epicurious_raw.count()

24324

In [14]:
for r in db.epicurious_raw.find():
    for i in r['ingredients']:
        all_ingredients_graph.node[i]['count'] += 1
    for t in r['techniques']:
        all_techniques_graph.node[t]['count'] += 1

In [15]:
nx.write_gexf(all_ingredients_graph, 'data/epicurious_ingredients_lexicon_1.gexf')

In [16]:
nx.write_gexf(all_techniques_graph, 'data/epicurious_techniques_lexicon_1.gexf')

In [17]:
ingredients_graph = nx.read_gexf('data/epicurious_ingredients_lexicon_1.gexf')

In [18]:
techniques_graph = nx.read_gexf('data/epicurious_techniques_lexicon_1.gexf')

# Only ingredients and techniques found in Epicurious's recipes

In [19]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [20]:
def del_node_and_link_neighbors(g, n):
    neighbors = g.neighbors(n)
    g.remove_node(n)
    if len(neighbors) > 1:
        add_edge(g, neighbors[0], neighbors[1])

In [21]:
for n in ingredients_graph.nodes():
    if ingredients_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(ingredients_graph, n)

In [22]:
len(ingredients_graph)

2961

In [23]:
ingredients_graph.number_of_edges()

496

In [24]:
nx.number_connected_components(ingredients_graph)

2465

In [25]:
nx.write_gexf(ingredients_graph, 'data/epicurious_ingredients_lexicon_2.gexf')

In [26]:
ingredients_graph = nx.read_gexf('data/epicurious_ingredients_lexicon_2.gexf')

In [27]:
for n in techniques_graph.nodes():
    if techniques_graph.node[n]['count'] == 0:
        del_node_and_link_neighbors(techniques_graph, n)

In [28]:
len(techniques_graph)

335

In [29]:
techniques_graph.number_of_edges()

159

In [30]:
nx.number_connected_components(techniques_graph)

176

In [31]:
nx.write_gexf(techniques_graph, 'data/epicurious_techniques_lexicon_2.gexf')

In [32]:
techniques_graph = nx.read_gexf('data/epicurious_techniques_lexicon_2.gexf')

# Representatives

In [33]:
# def value(g, n, r):
#     ms = [m for m, rels in g[n].items() for rel in rels.values() if rel['label'] == r]
#     return ms[0]

def value(g, n, r):
    ms = [y for x, y, d in g.edges_iter(data=True) if x == n and d['label'] == r]
    return ms[0]

In [34]:
ingredients_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in syns:
        dat = ingredients_graph.node[ingr]
        if dat['count'] > max_count:
            max_ingr = ingr
            max_count = dat['count']
        total_count += dat['count']
    ingredients_multidigraph.add_node(max_ingr, {'repr_count': total_count})
    for ingr in syns:
        ingredients_multidigraph.add_node(ingr, ingredients_graph.node[ingr])
        ingredients_multidigraph.add_edge(ingr, max_ingr, label='repr')

In [35]:
len(ingredients_multidigraph)

2961

In [36]:
ingredients_multidigraph.number_of_edges()

2961

In [37]:
nx.write_gexf(ingredients_multidigraph, 'data/epicurious_ingredients_lexicon_3.gexf')

In [38]:
ingredients_multidigraph = nx.read_gexf('data/epicurious_ingredients_lexicon_3.gexf')
ingredients_multidigraph = nx.MultiDiGraph(ingredients_multidigraph)

In [39]:
techniques_multidigraph = nx.MultiDiGraph()

for syns in nx.connected_components(techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in syns:
        dat = techniques_graph.node[tech]
        if dat['count'] > max_count:
            max_tech = tech
            max_count = dat['count']
        total_count += dat['count']
    techniques_multidigraph.add_node(max_tech, {'repr_count': total_count})
    for tech in syns:
        techniques_multidigraph.add_node(tech, techniques_graph.node[tech])
        techniques_multidigraph.add_edge(tech, max_tech, label='repr')

In [40]:
len(techniques_multidigraph)

335

In [41]:
techniques_multidigraph.number_of_edges()

335

In [42]:
nx.write_gexf(techniques_multidigraph, 'data/epicurious_techniques_lexicon_3.gexf')

In [43]:
techniques_multidigraph = nx.read_gexf('data/epicurious_techniques_lexicon_3.gexf')
techniques_multidigraph = nx.MultiDiGraph(techniques_multidigraph)

# No superclasses, types of ingredients or cuisine

# More databases

In [44]:
# Example
print('extra virgin olive oil' in ingredients_multidigraph)
print(value(ingredients_multidigraph, 'extra virgin olive oil', 'repr'))
print()
print('simmering' in techniques_multidigraph)
print(value(techniques_multidigraph, 'simmering', 'repr'))

True
extra-virgin olive oil

True
simmer


In [46]:
# %%time

# db.epicurious_representatives.drop() # representative ingredients and techniques

# count = 0
# representatives_rows = []

# for r in db.epicurious_raw.find():
#     row = dict(r)
#     row['ingredients'] = [value(ingredients_multidigraph, x, 'repr') for x in r['ingredients']]
#     row['techniques'] = [value(techniques_multidigraph, x, 'repr') for x in r['techniques']]
#     representatives_rows.append(row)

#     count += 1
#     if count % 100 == 0:
#         db.epicurious_representatives.insert_many(representatives_rows)
#         representatives_rows = []
#         print(count, 'rows inserted')
# db.epicurious_representatives.insert_many(representatives_rows)
# representatives_rows = []
# print(count, 'rows inserted')
    
# # CPU times: user 18min 14s, sys: 928 ms, total: 18min 15s
# # Wall time: 18min 36s