In [28]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

In [2]:
# Open a MongoDB connection (MongoClient is called with no arguments, so the
# driver's default connection settings apply) and select the `recipes` database.
client = MongoClient()
db = client.recipes

# Parser

In [3]:
# NLTK's English stopword list as a set; used below to drop lexicon entries
# that are plain stopwords.
english_stopwords = set(stopwords.words('english'))

In [4]:
# Load the ingredients lexicon graph from GEXF. Its node names are the
# ingredient strings that recipe text is matched against further down.
all_ingredients_graph = nx.read_gexf('data/english_ingredients_lexicon_5.gexf')

In [5]:
# Ingredient lexicon: every node name of the ingredients graph that is not an
# English stopword.
ingredients = [
    node_name
    for node_name in all_ingredients_graph.nodes()
    if node_name not in english_stopwords
]

In [6]:
ingredients[:10]

['chantilly creams of wheat berries',
 'sour chantilly creams',
 'snake beans',
 'oxheart tomatoes chutneys',
 'orange roughies',
 'crabs apple',
 'amla',
 'wheats berriess beers',
 'lemon myrtle',
 'fresnoes pepper']

In [7]:
# Load the techniques lexicon graph from GEXF. Its node names are the
# technique strings that recipe directions are matched against further down.
all_techniques_graph = nx.read_gexf('data/english_techniques_lexicon_5.gexf')

In [8]:
# Technique lexicon: every node name of the techniques graph that is not an
# English stopword.
techniques = [
    node_name
    for node_name in all_techniques_graph.nodes()
    if node_name not in english_stopwords
]

In [9]:
techniques[:10]

['airing cooking frying',
 'engine cook',
 'drying blind baked',
 'tataki',
 'prepared',
 'anti griddle',
 'extracted',
 'vacuum flask browning',
 'al dente',
 'food drying blind baking']

In [10]:
def trim(s):
    """Collapse every run of whitespace in `s` to one space and strip both ends."""
    # str.split() with no separator splits on any whitespace run and discards
    # leading/trailing whitespace, so re-joining normalises the spacing.
    words = s.split()
    single_space = ' '
    return single_space.join(words)

In [11]:
def my_ngrams(technique):
    """Return every word n-gram of `technique` as a space-joined string.

    The text is tokenised with `nltk.word_tokenize`. The result lists all
    1-grams first, then all 2-grams, and so on up to the single n-gram that
    spans the whole token sequence; within one size the n-grams keep their
    left-to-right order.
    """
    words = nltk.word_tokenize(technique)
    return [
        ' '.join(gram)
        for size in range(1, len(words) + 1)
        for gram in ngrams(words, size)
    ]

In [None]:
%%time

def insert_documents(docs):
    """Bulk-insert `docs` into `db.allrecipes_raw` on a best-effort basis.

    The insert is unordered, so one failing document does not stop the rest
    of the batch from being attempted. Duplicate-key failures are ignored;
    any other per-document write error is reported on stdout instead of
    being silently dropped.

    Args:
        docs: iterable of documents (dicts) to insert. May be empty.

    Returns:
        None.
    """
    docs = list(docs)
    if not docs:
        # insert_many() refuses an empty batch, which would otherwise make the
        # final flush fail whenever there is nothing left to write.
        return
    try:
        db.allrecipes_raw.insert_many(docs, ordered=False)
    except BulkWriteError as e:
        # Code 11000 is MongoDB's duplicate-key error; presumably these come
        # from re-running the import over already stored recipes, so they
        # stay non-fatal. Anything else is surfaced to the reader.
        unexpected = [
            err for err in e.details.get('writeErrors', [])
            if err.get('code') != 11000
        ]
        if unexpected:
            print(len(unexpected), 'documents failed to insert, e.g.:', unexpected[:3])

# db.allrecipes_raw.drop()

# Walk the scraped allrecipes dump (numeric folders containing `<number>.html`
# files), extract year / ingredients / techniques from every page and store
# the result in MongoDB in batches of 100 via insert_documents().
count = 0
rows = []

# The lexicons are lists; membership is tested once per n-gram of every
# ingredient line and direction step, so look them up in sets instead of
# scanning the lists each time.
ingredient_lexicon = set(ingredients)
technique_lexicon = set(techniques)

# path = 'data/recipes/allrecipes/'
path = '/media/antonio/WD1T/datasets-recipes/allrecipes/'
for folder_number in sorted(map(int, os.listdir(path))):
    folder = str(folder_number)
    folder_path = os.path.join(path, folder)
    # File names are `<number>.html`; stripping the 5-character suffix lets
    # them be processed in numeric rather than lexicographic order.
    for filename_number in sorted(map(lambda x: int(x[:-5]), os.listdir(folder_path))):
        filename = str(filename_number) + '.html'
        # Only the read needs the file handle; release it before parsing.
        with open(os.path.join(folder_path, filename)) as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Year: earliest review date found on the page. Pages without any
        # review date are skipped.
        # NOTE(review): the year is kept as the string taken from the
        # `content` attribute (e.g. '2005'), not converted to int -- confirm
        # that downstream consumers expect a string.
        dates = soup.select('section#reviews article.reviewer-info meta[itemprop=dateCreated]')
        year = 0
        if dates:
            year = min(date['content'].split('-')[0] for date in dates)
        if not year:
            continue

        # Ingredients: for each ingredient line keep only the first lexicon
        # hit, trying the longest n-grams first (my_ngrams() returns them
        # shortest first, hence the reversal).
        ingreds = set()
        elems = soup.select(
            'section.recipe-ingredients li.checkList__line span[itemprop=ingredients]'
        )
        for elem in elems:
            ingr = trim(elem.text.lower())
            for ngrm in reversed(my_ngrams(ingr)):
                if ngrm in ingredient_lexicon:
                    ingreds.add(ngrm)
                    all_ingredients_graph.node[ngrm]['count'] += 1
                    break
        if not ingreds:
            continue

        # Techniques: a step may contribute several techniques, but an n-gram
        # is ignored when it is a substring of one already accepted for the
        # same step (so a shorter match inside a longer one is not counted).
        techns = set()
        elems = soup.select(
            'section.recipe-directions li.step span.recipe-directions__list--item'
        )
        for elem in elems:
            step = trim(elem.text.lower())
            used_ngrams = set()
            for ngrm in reversed(my_ngrams(step)):
                if ngrm in technique_lexicon and all(ngrm not in used for used in used_ngrams):
                    techns.add(ngrm)
                    all_techniques_graph.node[ngrm]['count'] += 1
                    used_ngrams.add(ngrm)
        if not techns:
            continue

        # Id and title come from the <recipe-signup> tag; a page without it
        # cannot be stored, so skip it instead of aborting the whole run.
        recipe_info = soup.find('recipe-signup')
        if recipe_info is None:
            continue
        row = {
            '_id': recipe_info['data-id'],
            'title': recipe_info['data-title'],
            'year': year,
            'ingredients': list(ingreds),
            'techniques': list(techns),
        }
        rows.append(row)

        count += 1
        if count % 100 == 0:
            insert_documents(rows)
            rows = []
            print(count, 'rows inserted')

# Flush the last partial batch; there is nothing to send when the total is an
# exact multiple of the batch size (or zero).
if rows:
    insert_documents(rows)
    rows = []
print(count, 'rows inserted')

In [46]:
# Collect the id of every stored recipe as an int. Only `_id` is requested
# from the server because no other field of the documents is read here.
ids = [int(doc['_id']) for doc in db.allrecipes_raw.find({}, {'_id': 1})]

In [47]:
len(ids)

22381

In [48]:
ids.sort()

In [49]:
ids[-10:]

[54968, 54969, 54970, 54975, 54976, 54981, 54982, 54989, 54990, 54994]

In [50]:
ids[0:10]

[6663, 6664, 6665, 6666, 6667, 6668, 6669, 6670, 6671, 6672]

In [51]:
folder

'54000'

In [26]:
db.allrecipes_raw.count()

6762

In [44]:
# Debugging aid: print the file names of the folder at sorted position 54,
# starting from its 701st file in numeric order.
for folder_number in sorted(int(name) for name in os.listdir(path))[54:55]:
    folder = str(folder_number)
    file_numbers = sorted(int(name[:-5]) for name in os.listdir(path + folder))
    for filename_number in file_numbers[700:]:
        filename = str(filename_number) + '.html'
        print(filename)

54720.html
54721.html
54722.html
54723.html
54724.html
54725.html
54726.html
54727.html
54728.html
54729.html
54730.html
54731.html
54732.html
54733.html
54734.html
54735.html
54736.html
54737.html
54738.html
54739.html
54740.html
54741.html
54742.html
54743.html
54744.html
54745.html
54746.html
54747.html
54748.html
54749.html
54750.html
54751.html
54752.html
54753.html
54754.html
54755.html
54756.html
54757.html
54758.html
54759.html
54760.html
54761.html
54762.html
54763.html
54764.html
54765.html
54766.html
54767.html
54768.html
54769.html
54770.html
54771.html
54772.html
54773.html
54774.html
54775.html
54776.html
54777.html
54778.html
54779.html
54780.html
54781.html
54782.html
54783.html
54784.html
54785.html
54786.html
54787.html
54788.html
54789.html
54790.html
54791.html
54792.html
54793.html
54794.html
54795.html
54796.html
54797.html
54798.html
54799.html
54800.html
54801.html
54802.html
54803.html
54804.html
54805.html
54806.html
54807.html
54808.html
54809.html
54810.html

In [39]:
path

'/media/antonio/WD1T/datasets-recipes/allrecipes/'

In [42]:
sorted(map(int, os.listdir(path)))[54:]

[54000,
 55000,
 56000,
 57000,
 58000,
 59000,
 60000,
 61000,
 62000,
 63000,
 64000,
 65000,
 66000,
 67000,
 68000,
 69000,
 70000,
 71000,
 72000,
 73000,
 74000,
 75000,
 76000,
 77000,
 78000,
 79000,
 80000,
 81000,
 82000,
 83000,
 84000,
 85000,
 86000,
 87000,
 88000,
 89000,
 90000,
 91000,
 92000,
 93000,
 94000,
 95000,
 96000,
 97000,
 98000,
 99000,
 100000,
 101000,
 102000,
 103000,
 104000,
 105000,
 106000,
 107000,
 108000,
 109000,
 110000,
 111000,
 112000,
 113000,
 114000,
 115000,
 116000,
 117000,
 118000,
 119000,
 120000,
 121000,
 122000,
 123000,
 124000,
 125000,
 126000,
 127000,
 128000,
 129000,
 130000,
 131000,
 132000,
 133000,
 134000,
 135000,
 136000,
 137000,
 138000,
 139000,
 140000,
 141000,
 142000,
 143000,
 144000,
 145000,
 146000,
 147000,
 148000,
 149000,
 150000,
 151000,
 152000,
 153000,
 154000,
 155000,
 156000,
 157000,
 158000,
 159000,
 160000,
 161000,
 162000,
 163000,
 164000,
 165000,
 166000,
 167000,
 168000,
 169000,
 1