In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

In [2]:
# Open a MongoDB connection with pymongo's defaults (no host/port is passed)
# and select the `recipes` database that the cells below write to and read from.
client = MongoClient()
db = client.recipes

# Parser

In [3]:
# NLTK's English stop-word list as a set; used below to filter stop words out of
# the ingredient and technique node lists.
english_stopwords = set(stopwords.words('english'))

In [4]:
# Load the ingredients lexicon graph from its GEXF file; its nodes are used
# below as the ingredient vocabulary.
all_ingredients_graph = nx.read_gexf('data/english_ingredients_lexicon_5.gexf')

In [5]:
# Ingredient vocabulary: every node of the ingredients graph that is not an
# English stop word.
ingredients = [
    node
    for node in all_ingredients_graph.nodes()
    if node not in english_stopwords
]

In [6]:
# Preview the first ten entries of the ingredient vocabulary.
ingredients[:10]

['mustard sauce cress',
 'corianders leaf',
 'wild raspberries bushes',
 'quarks',
 'bean curds mayonnaise',
 'spanish cheese',
 'gras pea',
 'honey acetum dressings',
 'aduki moth bean',
 'citruss paradisis peels']

In [7]:
# Load the techniques lexicon graph from its GEXF file; its nodes are used
# below as the technique vocabulary.
all_techniques_graph = nx.read_gexf('data/english_techniques_lexicon_5.gexf')

In [8]:
# Technique vocabulary: every node of the techniques graph that is not an
# English stop word.
techniques = [
    node
    for node in all_techniques_graph.nodes()
    if node not in english_stopwords
]

In [9]:
# Preview the first ten entries of the technique vocabulary.
techniques[:10]

['vacuum filling',
 'microwave oven',
 'nitrogen',
 'bain marie',
 'air cook frying',
 'deglazed',
 'chop',
 'barbecue partial cooking',
 'chill',
 'food drying roast']

In [10]:
def trim(s):
    """Return ``s`` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as a consequence of splitting
    on whitespace and re-joining the pieces.
    """
    words = s.split()
    return ' '.join(words)

In [11]:
def my_ngrams(s):
    """Return every word n-gram of ``s`` as a space-joined string.

    ``s`` is tokenized with ``nltk.word_tokenize``; n-grams are produced for
    every size from 1 up to the number of tokens, ordered by increasing size
    and, within a size, by position in the text.
    """
    tokens = nltk.word_tokenize(s)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [None]:
%%time

def insert_documents(docs):
    """Best-effort bulk insert of ``docs`` into ``db.epicurious_raw``.

    Parameters
    ----------
    docs : list of dict
        Documents to insert. An empty list is a no-op, because
        ``insert_many`` rejects empty input.

    Notes
    -----
    The insert is unordered, so one failing document does not stop the rest
    of the batch. Duplicate-key failures (error code 11000) are ignored
    silently -- presumably they come from re-running the notebook against a
    collection that already holds the same ``_id`` values. Any other write
    error is reported on stdout instead of being swallowed; nothing is
    re-raised, so the calling loop keeps going.
    """
    if not docs:
        return
    try:
        db.epicurious_raw.insert_many(docs, ordered=False)
    except BulkWriteError as e:
        write_errors = (e.details or {}).get('writeErrors', [])
        unexpected = [err for err in write_errors if err.get('code') != 11000]
        if unexpected:
            print(
                len(unexpected),
                'documents failed to insert; first error:',
                unexpected[0].get('errmsg'),
            )

# db.epicurious_raw.drop()

# Walk every downloaded recipe page, extract year / title / ingredients /
# techniques, and store the result in MongoDB in batches of 100 documents.

count = 0
rows = []

# Membership is tested once per n-gram, so use sets (O(1) look-ups) instead of
# scanning the `ingredients` / `techniques` lists each time.
ingredient_lexicon = set(ingredients)
technique_lexicon = set(techniques)

# path = 'data/recipes/epicurious/'
# NOTE(review): machine-specific absolute path; point it at your local copy.
path = '/media/antonio/WD1T/datasets-recipes/epicurious/'

# Only purely numeric directory names are treated as recipe folders, so stray
# entries do not abort the whole run with a ValueError.
folder_numbers = sorted(
    int(name) for name in os.listdir(path) if name.isdecimal()
)
for folder_number in folder_numbers:
    folder = str(folder_number)
    # Same idea for files: keep only `<number>.html`.
    filename_numbers = sorted(
        int(name[:-5])
        for name in os.listdir(path + folder)
        if name.endswith('.html') and name[:-5].isdecimal()
    )
    for filename_number in filename_numbers:
        filename = str(filename_number) + '.html'
        # Read the page and release the file handle before the slow parsing.
        with open(path + folder + '/' + filename) as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Publication year: skip the page when the meta tag, its `content`
        # attribute, or the year part is missing (indexing [0] blindly would
        # raise IndexError before the emptiness check could run).
        date_tags = soup.select('meta[itemprop=datePublished]')
        year = date_tags[0].get('content', '').split('-')[0] if date_tags else ''
        if not year:
            continue

        # Ingredients: for each ingredient line keep the longest n-gram found
        # in the lexicon (n-grams come smallest-first, hence the reversal).
        ingreds = set()
        elems = soup.select(
            'div.recipe-content div.ingredients-info ul.ingredients li.ingredient'
        )
        for elem in elems:
            ingr = trim(elem.text.lower())
            for ngrm in reversed(my_ngrams(ingr)):
                if ngrm in ingredient_lexicon:
                    ingreds.add(ngrm)
                    # NOTE(review): `Graph.node` is the older networkx
                    # spelling; newer releases only offer `Graph.nodes[...]`
                    # -- confirm against the installed version.
                    all_ingredients_graph.node[ngrm]['count'] += 1
                    break
        if not ingreds:
            continue

        # Techniques: per preparation step keep every lexicon n-gram that is
        # not a substring of a (longer) n-gram already matched in that step.
        techns = set()
        elems = soup.select(
            'div.recipe-content div.instructions ol.preparation-steps li.preparation-step'
        )
        for elem in elems:
            step = trim(elem.text.lower())
            used_ngrams = set()
            for ngrm in reversed(my_ngrams(step)):
                if ngrm in technique_lexicon and all(
                    ngrm not in used for used in used_ngrams
                ):
                    techns.add(ngrm)
                    all_techniques_graph.node[ngrm]['count'] += 1
                    used_ngrams.add(ngrm)
        if not techns:
            continue

        # The attribute value is quoted because it contains a colon, which
        # newer BeautifulSoup selector parsing rejects when unquoted. A
        # missing tag falls back to an empty title instead of raising.
        title_tags = soup.select('meta[property="og:title"]')
        title = title_tags[0].get('content', '') if title_tags else ''

        row = {
            '_id': str(filename_number),
            'title': title,
            'year': year,  # kept as the string taken from the page
            'ingredients': list(ingreds),
            'techniques': list(techns),
        }
        rows.append(row)

        count += 1
        if count % 100 == 0:
            insert_documents(rows)
            rows = []
            print(count, 'rows inserted')

# Flush the last partial batch; skip the call when nothing is pending because
# `insert_many` rejects an empty list.
if rows:
    insert_documents(rows)
    rows = []
print(count, 'rows inserted')

In [18]:
# Collect the `_id` of every stored recipe as an integer.
ids = [int(doc['_id']) for doc in db.epicurious_raw.find()]

In [19]:
# Number of recipe documents currently stored in the collection.
len(ids)

2800

In [20]:
# Sort in place so the smallest / largest stored ids can be inspected below.
ids.sort()

In [21]:
# Last ten ids of the list (the largest ones once `ids` has been sorted).
ids[-10:]

[4038, 4039, 4041, 4043, 4060, 4061, 4062, 4063, 4064, 4065]

In [22]:
# First ten ids of the list (the smallest ones once `ids` has been sorted).
ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [24]:
# Show the most recently assigned `folder` (set inside the folder loops).
folder

'4000'

In [23]:
# Debug aid: print, in numeric order, the HTML file names of the fifth folder
# only (index 4 of the numerically sorted folder list).
numeric_folders = sorted(int(name) for name in os.listdir(path))
for folder_number in numeric_folders[4:5]:
    folder = str(folder_number)
    file_numbers = sorted(int(name[:-5]) for name in os.listdir(path + folder))
    for filename_number in file_numbers:
        filename = '{}.html'.format(filename_number)
        print(filename)

4000.html
4001.html
4002.html
4003.html
4004.html
4005.html
4006.html
4007.html
4008.html
4009.html
4010.html
4011.html
4012.html
4013.html
4014.html
4015.html
4016.html
4017.html
4018.html
4019.html
4020.html
4021.html
4022.html
4023.html
4024.html
4025.html
4026.html
4027.html
4028.html
4029.html
4030.html
4031.html
4032.html
4033.html
4034.html
4035.html
4036.html
4038.html
4039.html
4041.html
4042.html
4043.html
4060.html
4061.html
4062.html
4063.html
4064.html
4065.html
4066.html
4067.html
4068.html
4069.html
4070.html
4071.html
4072.html
4073.html
4074.html
4075.html
4076.html
4077.html
4078.html
4079.html
4080.html
4082.html
4083.html
4084.html
4085.html
4086.html
4087.html
4088.html
4089.html
4090.html
4091.html
4092.html
4093.html
4094.html
4095.html
4096.html
4097.html
4098.html
4099.html
4101.html
4102.html
4103.html
4104.html
4105.html
4106.html
4107.html
4108.html
4109.html
4110.html
4111.html
4112.html
4113.html
4114.html
4115.html
4116.html
4117.html
4118.html
4119.html
