In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

In [2]:
# Connect using the client's default settings (no host/port is passed) and
# select the `recipes` database, which the import cell writes to.
client = MongoClient()
db = client['recipes']

# Parser

In [3]:
# English stop words as a set; used below to drop lexicon entries that are
# plain stop words.
english_stopwords = {word for word in stopwords.words('english')}

In [4]:
# Ingredient lexicon stored as a GEXF graph. Its node ids are used below as the
# ingredient names, and the import cell increments each node's 'count' attribute.
all_ingredients_graph = nx.read_gexf('data/english_ingredients_lexicon_5.gexf')

In [5]:
# Ingredient names are the lexicon graph's node ids, minus plain stop words.
ingredients = [
    name
    for name in all_ingredients_graph.nodes()
    if name not in english_stopwords
]

In [6]:
ingredients[:10]

['mustard sauce cress',
 'corianders leaf',
 'wild raspberries bushes',
 'quarks',
 'bean curds mayonnaise',
 'spanish cheese',
 'gras pea',
 'honey acetum dressings',
 'aduki moth bean',
 'citruss paradisis peels']

In [7]:
# Technique lexicon stored as a GEXF graph. Its node ids are used below as the
# technique names, and the import cell increments each node's 'count' attribute.
all_techniques_graph = nx.read_gexf('data/english_techniques_lexicon_5.gexf')

In [8]:
# Technique names are the lexicon graph's node ids, minus plain stop words.
techniques = [
    name
    for name in all_techniques_graph.nodes()
    if name not in english_stopwords
]

In [9]:
techniques[:10]

['vacuum filling',
 'microwave oven',
 'nitrogen',
 'bain marie',
 'air cook frying',
 'deglazed',
 'chop',
 'barbecue partial cooking',
 'chill',
 'food drying roast']

In [10]:
def trim(s):
    """Return `s` with every whitespace run collapsed to a single space.

    Leading and trailing whitespace is removed as well, so a string made
    only of whitespace becomes the empty string.
    """
    words = s.split()
    return ' '.join(words)

In [None]:
def my_ngrams(s):
    """Return every word n-gram of `s` as a space-joined string.

    The text is tokenized with `nltk.word_tokenize`. N-grams are listed by
    increasing size (all 1-grams, then all 2-grams, ... up to the full token
    sequence) and, within one size, from left to right. An input with no
    tokens yields an empty list.
    """
    tokens = nltk.word_tokenize(s)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [None]:
%%time

def insert_documents(docs):
    """Bulk-insert `docs` into `db.epicurious_raw` on a best-effort basis.

    Parameters
    ----------
    docs : list of dict
        Documents to insert. An empty batch is a no-op.

    Returns
    -------
    None

    The insert is unordered (`ordered=False`). A `BulkWriteError` is not
    re-raised, so the import keeps going, but the number of failed documents
    is printed instead of being silently discarded.
    """
    # insert_many raises on an empty batch; this happens when the final flush
    # comes right after a full batch was written (or nothing was collected).
    if not docs:
        return
    try:
        db.epicurious_raw.insert_many(docs, ordered=False)
    except BulkWriteError as e:
        failed = (e.details or {}).get('writeErrors', [])
        print(len(failed), 'of', len(docs), 'documents failed to insert')

# Start from an empty collection; every run re-imports all recipes.
# NOTE(review): the lexicon graphs' 'count' attributes are NOT reset here, so
# re-running this cell without reloading the graphs adds to earlier counts.
db.epicurious_raw.drop()

count = 0
rows = []

# Root of the scraped pages, laid out as <root>/<folder number>/<recipe number>.html.
# Set EPICURIOUS_PATH to use another copy (e.g. 'data/recipes/epicurious/').
path = os.environ.get('EPICURIOUS_PATH', '/media/antonio/WD1T/datasets-recipes/epicurious/')

# The lexicons are lists and are probed once per n-gram of every ingredient
# line and preparation step; sets make each probe a constant-time lookup.
ingredient_lexicon = set(ingredients)
technique_lexicon = set(techniques)

# networkx 2.4 removed `Graph.node` in favour of `Graph.nodes`; use whichever
# per-node attribute mapping this installation provides.
ingredient_attrs = (
    all_ingredients_graph.node if hasattr(all_ingredients_graph, 'node')
    else all_ingredients_graph.nodes
)
technique_attrs = (
    all_techniques_graph.node if hasattr(all_techniques_graph, 'node')
    else all_techniques_graph.nodes
)

for folder_number in sorted(map(int, os.listdir(path))):
    folder = str(folder_number)
    folder_path = os.path.join(path, folder)
    # File names are '<number>.html'; strip the 5-character extension so the
    # files are visited in numeric rather than lexicographic order.
    for filename_number in sorted(int(name[:-5]) for name in os.listdir(folder_path)):
        filename = str(filename_number) + '.html'
        # Read eagerly so the file handle is released before parsing.
        with open(os.path.join(folder_path, filename)) as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')

        # Publication year = first '-'-separated field of the datePublished
        # meta tag. A page without that tag is skipped instead of aborting
        # the whole import with an IndexError.
        date_tags = soup.select('meta[itemprop=datePublished]')
        year = date_tags[0].get('content', '').split('-')[0] if date_tags else ''
        if not year:
            continue

        # --- Ingredients: one lexicon entry per ingredient line ---
        ingreds = set()
        elems = soup.select(
            'div.recipe-content div.ingredients-info ul.ingredients li.ingredient'
        )
        for elem in elems:
            ingr = trim(elem.text.lower())
            # my_ngrams lists the longest n-grams last, so scan backwards and
            # keep only the first (longest) lexicon hit of the line.
            for ngrm in reversed(my_ngrams(ingr)):
                if ngrm in ingredient_lexicon:
                    ingreds.add(ngrm)
                    ingredient_attrs[ngrm]['count'] += 1
                    break
        if not ingreds:
            continue

        # --- Techniques: every lexicon entry found in each preparation step ---
        techns = set()
        elems = soup.select(
            'div.recipe-content div.instructions ol.preparation-steps li.preparation-step'
        )
        for elem in elems:
            step = trim(elem.text.lower())
            used_ngrams = set()
            for ngrm in reversed(my_ngrams(step)):
                if ngrm not in technique_lexicon:
                    continue
                # Skip a technique whose text is contained in one already
                # matched in this step (matches are found longest first).
                if any(ngrm in used for used in used_ngrams):
                    continue
                techns.add(ngrm)
                technique_attrs[ngrm]['count'] += 1
                used_ngrams.add(ngrm)
        if not techns:
            continue

        # The attribute value is quoted: an unquoted `og:title` is not a valid
        # CSS identifier and is rejected by soupsieve-based Beautiful Soup.
        # A missing title falls back to '' rather than raising IndexError.
        title_tags = soup.select('meta[property="og:title"]')
        title = title_tags[0].get('content', '') if title_tags else ''

        # NOTE(review): `_id` is the file number alone and `year` is kept as a
        # string; confirm both are what downstream readers expect.
        row = {
            '_id': str(filename_number),
            'title': title,
            'year': year,
            'ingredients': list(ingreds),
            'techniques': list(techns),
        }
        rows.append(row)

        count += 1
        # Write to MongoDB in batches of 100 recipes.
        if count % 100 == 0:
            insert_documents(rows)
            rows = []
            print(count, 'rows inserted')

# Flush the last partial batch. insert_many rejects an empty list, so skip the
# call when the previous batch already wrote everything.
if rows:
    insert_documents(rows)
rows = []
print(count, 'rows inserted')