In [28]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

In [2]:
# Connect to MongoDB with the driver's default settings and select the
# `recipes` database that the rest of the notebook writes to.
client = MongoClient()
db = client['recipes']

# Parser

In [3]:
# NLTK's English stopword list, stored as a set so the lexicon filters below
# can test membership cheaply.
english_stopwords = set(stopwords.words('english'))

In [4]:
# Load the ingredients lexicon graph from its GEXF file; its nodes are the
# ingredient names matched against recipe text further down.
all_ingredients_graph = nx.read_gexf('data/english_ingredients_lexicon_5.gexf')

In [5]:
# Ingredient names are the graph's nodes, minus any entry that is itself an
# English stopword.
ingredients = [
    name
    for name in all_ingredients_graph.nodes()
    if name not in english_stopwords
]

In [6]:
# Preview the first ten ingredient names kept after stopword filtering.
ingredients[:10]

['chantilly creams of wheat berries',
 'sour chantilly creams',
 'snake beans',
 'oxheart tomatoes chutneys',
 'orange roughies',
 'crabs apple',
 'amla',
 'wheats berriess beers',
 'lemon myrtle',
 'fresnoes pepper']

In [7]:
# Load the techniques lexicon graph from its GEXF file; its nodes are the
# technique names matched against recipe directions further down.
all_techniques_graph = nx.read_gexf('data/english_techniques_lexicon_5.gexf')

In [8]:
# Technique names are the graph's nodes, minus any entry that is itself an
# English stopword.
techniques = [
    name
    for name in all_techniques_graph.nodes()
    if name not in english_stopwords
]

In [9]:
# Preview the first ten technique names kept after stopword filtering.
techniques[:10]

['airing cooking frying',
 'engine cook',
 'drying blind baked',
 'tataki',
 'prepared',
 'anti griddle',
 'extracted',
 'vacuum flask browning',
 'al dente',
 'food drying blind baking']

In [10]:
def trim(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing whitespace is removed as well, because
    ``str.split()`` with no arguments drops empty fields.
    """
    words = s.split()
    return ' '.join(words)

In [11]:
def my_ngrams(technique):
    """Return every contiguous word n-gram of ``technique`` as a string.

    The text is tokenized with ``nltk.word_tokenize``; each n-gram's tokens
    are joined with single spaces. Results are ordered by n-gram size,
    shortest first (all 1-grams, then all 2-grams, ... up to the full token
    sequence), and left to right within each size.
    """
    tokens = nltk.word_tokenize(technique)
    phrases = []
    for size in range(1, len(tokens) + 1):
        for gram in ngrams(tokens, size):
            phrases.append(' '.join(gram))
    return phrases

In [None]:
%%time

def insert_documents(docs):
    """Bulk-insert ``docs`` into ``db.allrecipes_raw`` on a best-effort basis.

    The insert is unordered (``ordered=False``), so a failing document does
    not prevent the remaining ones from being attempted.

    Args:
        docs: list of recipe dicts to insert. An empty list is a no-op.
    """
    # insert_many() rejects an empty document list, which would otherwise
    # crash the final flush whenever the last batch happens to be empty.
    if not docs:
        return
    try:
        db.allrecipes_raw.insert_many(docs, ordered=False)
    except BulkWriteError as e:
        # Duplicate-key errors (code 11000) are presumably what the original
        # silent `pass` was meant to tolerate (re-running over recipes already
        # stored). Stay best-effort, but surface any other failure.
        unexpected = [
            err for err in e.details.get('writeErrors', [])
            if err.get('code') != 11000
        ]
        if unexpected:
            print(len(unexpected), 'documents failed to insert;',
                  'first error:', unexpected[0].get('errmsg'))

# db.allrecipes_raw.drop()

count = 0
rows = []

# Lexicon membership is tested once per candidate n-gram, so use sets
# (constant-time lookup) instead of scanning the lexicon lists every time.
ingredient_lexicon = set(ingredients)
technique_lexicon = set(techniques)

# path = 'data/recipes/allrecipes/'
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
path = '/media/antonio/WD1T/datasets-recipes/allrecipes/'
# Folder names and file stems are parsed as integers so they are visited in
# numeric rather than lexical order.
for folder_number in sorted(map(int, os.listdir(path))):
    folder = str(folder_number)
    for filename_number in sorted(map(lambda x: int(x[:-5]), os.listdir(path + folder))):
        filename = str(filename_number) + '.html'
        # Read the page and release the file handle before the slow parsing.
        with open(path + folder + '/' + filename) as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Year: smallest year prefix among the review creation dates.
        # NOTE(review): the value stays a string (text before the first '-');
        # confirm whether consumers expect an int before changing it.
        dates = soup.select('section#reviews article.reviewer-info meta[itemprop=dateCreated]')
        year = 0
        if dates:
            year = min(d['content'].split('-')[0] for d in dates)
        if not year:
            continue

        # Ingredients: for each ingredient line keep the first lexicon match.
        # my_ngrams() lists n-grams shortest first, so the reversed list tries
        # the longest phrases first.
        ingreds = set()
        elems = soup.select(
            'section.recipe-ingredients li.checkList__line span[itemprop=ingredients]'
        )
        ingrs = [trim(e.text.lower()) for e in elems]
        for ingr in ingrs:
            ngrms = my_ngrams(ingr)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in ingredient_lexicon:
                    ingreds.add(ngrm)
                    all_ingredients_graph.node[ngrm]['count'] += 1
                    break
        if not ingreds:
            continue

        # Techniques: a step may contribute several matches, but an n-gram is
        # ignored when it is a substring of one already accepted in that step.
        techns = set()
        elems = soup.select(
            'section.recipe-directions li.step span.recipe-directions__list--item'
        )
        steps = [trim(e.text.lower()) for e in elems]
        for step in steps:
            used_ngrams = set()
            ngrms = my_ngrams(step)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in technique_lexicon and all(ngrm not in used for used in used_ngrams):
                    techns.add(ngrm)
                    all_techniques_graph.node[ngrm]['count'] += 1
                    used_ngrams.add(ngrm)
        if not techns:
            continue

        recipe_info = soup.find('recipe-signup')
        if recipe_info is None:
            # Without this tag there is no id/title to store; skip the page
            # rather than aborting the whole import with a TypeError.
            print('skipping', folder + '/' + filename, '(no recipe-signup tag)')
            continue
        ide = recipe_info['data-id']
        title = recipe_info['data-title']
        row = {
            '_id': ide,
            'title': title,
            'year': year,
            'ingredients': list(ingreds),
            'techniques': list(techns),
        }
        rows.append(row)

        count += 1
        # Flush to MongoDB in batches of 100 recipes.
        if count % 100 == 0:
            insert_documents(rows)
            rows = []
            print(count, 'rows inserted')
# Flush the final partial batch; skip the call when nothing is pending,
# because insert_many() does not accept an empty list.
if rows:
    insert_documents(rows)
rows = []
print(count, 'rows inserted')

100 rows inserted
200 rows inserted
300 rows inserted
400 rows inserted
500 rows inserted
600 rows inserted
700 rows inserted
800 rows inserted
900 rows inserted
1000 rows inserted
1100 rows inserted
1200 rows inserted
1300 rows inserted
1400 rows inserted
1500 rows inserted
1600 rows inserted
1700 rows inserted
1800 rows inserted
1900 rows inserted
2000 rows inserted
2100 rows inserted
2200 rows inserted
2300 rows inserted
2400 rows inserted
2500 rows inserted
2600 rows inserted
2700 rows inserted
2800 rows inserted
2900 rows inserted
3000 rows inserted
3100 rows inserted
3200 rows inserted
3300 rows inserted
3400 rows inserted
3500 rows inserted
3600 rows inserted
3700 rows inserted
3800 rows inserted
3900 rows inserted
4000 rows inserted
4100 rows inserted
4200 rows inserted
4300 rows inserted
4400 rows inserted
4500 rows inserted
4600 rows inserted
4700 rows inserted
4800 rows inserted
4900 rows inserted
5000 rows inserted
5100 rows inserted
5200 rows inserted
5300 rows inserted
54

In [13]:
# Collect the numeric ids of all stored recipes. The projection requests only
# `_id`, so the ingredient/technique lists are not fetched just to be discarded.
ids = [int(r['_id']) for r in db.allrecipes_raw.find({}, {'_id': 1})]

In [14]:
# Number of recipe ids read back from the collection.
len(ids)

6762

In [17]:
# Sort in place (ascending) so the slices below show the smallest and largest ids.
ids.sort()

In [18]:
# Ten largest ids (the list was sorted in the previous cell).
ids[-10:]

[13551, 13552, 13553, 13554, 13555, 13556, 13557, 13558, 13559, 13560]

In [19]:
# Ten smallest ids.
ids[0:10]

[6663, 6664, 6665, 6666, 6667, 6668, 6669, 6670, 6671, 6672]

In [27]:
# `folder` is the loop variable left over from the import cell, i.e. the last
# folder the import loop reached.
folder

'13000'

In [26]:
# Total number of documents currently stored in the collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) — confirm the installed version before switching.
db.allrecipes_raw.count()

6762