In [1]:
from bs4 import BeautifulSoup
import requests
import urllib
import os
import re
from urllib.request import urlopen

import pandas as pd
import numpy as np

In [59]:
def make_soup(url):
    '''
    INPUT: url string
    OUTPUT: BeautifulSoup object
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [18]:
soup1 = make_soup('https://minimalistbaker.com/recipes/')
posts = soup1.find_all('a', class_="entry-title-link")
links = [post.get("href") for post in posts]


In [10]:
def get_recipe_links(url):
    '''
    Gets all links to recipe post on one archival page.
    Returns list of strings containing the URLs
    '''
    soup = make_soup(url)
    posts = soup.find_all('a', class_="entry-title-link")
    links = [post.get("href") for post in posts]
    return links

Psuedocode for getting all recipe posts

- for every page
    - get the links for the posts on the page 
    - for every post on a page
        - get the text body of that post
        - save it into an array / dataframe
    

In [119]:
def grab_body(url):
    '''
    INPUT: blog post url
    OUTPUT: list of sentence strings
    '''
    soup = make_soup(url)
    art = soup.article
    recipe = art.find_all('div', class_="wprm-recipe-container")
    recipe_start = str(recipe[0])[:30]
    idx = str(art).index(recipe_start)
    data = str(art)[:idx]
    data = re.sub(r'<.*?>', '', data)
    data = re.sub('\xa0', '', data)
    data = re.sub('\n', ' ', data)
    return data, soup.title.text
    


In [129]:
def get_bodies_on_page(url):
    df = pd.DataFrame(columns=['title', 'body', 'url'])
    links = get_recipe_links(url)
    for link in links:
        try:
            body, title = grab_body(link)
            A = pd.DataFrame([[title, body, link]], columns=['title', 'body', 'url'])
            df = pd.concat([df,A], axis=0)
        except:
            continue
    return df

In [133]:
def scrape_mb(url):
    data = pd.DataFrame(columns=['title','body','url'])
    data = pd.concat([data, get_bodies_on_page(url)], axis=0)
    for n in range(2, 15):
        print(n)
        url = 'https://minimalistbaker.com/recipes/page/{}/'.format(n)
        df = get_bodies_on_page(url)
        data = pd.concat([data, df], axis=0)
    return data

In [134]:
data = scrape_mb('https://minimalistbaker.com/recipes/')
data.head()

2
3
4
5
6
7
8
9
10
11
12
13
14


Unnamed: 0,title,body,url
0,Fluffy Vegan Scrambled Eggs | Minimalist Baker...,Fluffy Vegan Scrambled Eggs If you’ve ever w...,https://minimalistbaker.com/recipes/
0,Vegan Buttercream Frosting Recipe | Minimalist...,How to Make Vegan Buttercream Frosting If you...,https://minimalistbaker.com/recipes/
0,Baked Crispy Peanut Tofu | Minimalist Baker Re...,Baked Crispy Peanut Tofu Tofu can either be ...,https://minimalistbaker.com/recipes/
0,Easy 1-Pot Vegan Tom Yum Soup | Minimalist Bak...,Lazy 1-Pot Vegan Tom Yum Soup A while back i...,https://minimalistbaker.com/recipes/
0,1-Bowl Chocolate Banana Baked Oatmeal | Minima...,1-Bowl Chocolate Chip Banana Baked Oatmeal Y...,https://minimalistbaker.com/recipes/


In [135]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223 entries, 0 to 0
Data columns (total 3 columns):
title    223 non-null object
body     223 non-null object
url      223 non-null object
dtypes: object(3)
memory usage: 7.0+ KB


In [98]:
links = get_recipe_links('https://minimalistbaker.com/recipes/page/13/')

#body, title = grab_body(links[1])
soup = make_soup(links[1])
art = soup.article
recipe = art.find_all('div', class_="wprm-recipe-container")
print(type(recipe))
recipe_start = str(recipe)[:30]
recipe_start

<class 'bs4.element.ResultSet'>


'[]'

In [55]:
df = pd.read_csv('/Users/clairemason/Galvanize/Capstone/capstone/data/minimal_bake.csv')
df = df[['title','body']]

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 946 entries, 0 to 945
Data columns (total 2 columns):
title    946 non-null object
body     946 non-null object
dtypes: object(2)
memory usage: 14.9+ KB


In [57]:
df.body.iloc[0]

'Fluffy Vegan Scrambled Eggs\n\n\nIf you’ve ever wanted to make a vegan version of scrambled eggs but craved something a little more special than tofu, this is the recipe for you.\nPerhaps you’re looking for a more “eggy” texture and flavor than tofu, want to cut back on eggs, are sensitive to soy/tofu, or just want to change up your breakfast routine.\nIn any case, let me show you how easy it is to make these fluffy, vegan scrambled eggs!\n\nThis recipe is inspired by the Just Egg product everyone’s been talking about. I was so excited to see an egg product that looked (and supposedly tasted) like the real thing! The only thing I didn’t love was the addition of canola oil, which some health experts believe to be highly processed and inflammatory in the body.\nSo, I wanted to take a swing at my own version and am pretty excited about the results.\n\n\nThe base of this 9-ingredient recipe is soaked split mung beans, a.k.a. moong dal.\nIt’s a soft legume that’s easy to digest and happens

In [16]:
from nltk import sent_tokenize

In [49]:
def split_sentences(body):
    sents = sent_tokenize(body)
    new = ''
    for i, sent in enumerate(sents):
        if '\n' in sent:
            new += sents.pop(i)
    new = [line for line in new.split('\n') if line and len(line)>1]        
    sents += new
    return sents

In [50]:
sents_series = pd.Series([split_sentences(df.body.iloc[i]) for i in range(len(df))])

In [51]:
tot = 0
for i in sents_series:
    tot += len(i)
print(tot)

27176


In [71]:
def get_recipe_instructions()
    soup = make_soup('https://minimalistbaker.com/30-minute-potato-cauliflower-red-lentil-curry/')
    recipe = soup.find_all('div', class_="wprm-recipe-instruction-text")

    instructions = [instruct.text for instruct in recipe]
    ins_str = []
    for instruct in instructions:
        for sent in sent_tokenize(instruct):
            ins_str.append(sent)

    return ins_str