Code used to scrape the recipes

In [6]:
import re
import requests
import os.path
import time
import pickle
import numpy as np
from bs4 import BeautifulSoup as bs

In [7]:
def get_Recipe_Base_HTML():
    """Download the recipe library index page and cache it as ``recipes.html``.

    The page at http://mc6help.tripod.com/RecipeLibrary/RecipeLibrary.htm is
    fetched with ``requests`` and its text is written to ``recipes.html`` in
    the current working directory. If that file already exists, the website
    is not contacted and the file is not overwritten.

    Returns:
        None.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status. No file is created in
            any of those cases.
    """
    if os.path.exists('recipes.html'):  # a cached copy exists; do not scrape again
        print("folder already here")
        return

    # Fetch BEFORE opening the output file. Opening first would leave an empty
    # recipes.html behind whenever the request fails, and the existence check
    # above would then skip the download on every later run.
    response = requests.get(
        "http://mc6help.tripod.com/RecipeLibrary/RecipeLibrary.htm",
        timeout=30,  # without a timeout requests can block indefinitely
    )
    response.raise_for_status()  # never cache an HTTP error page as the index

    with open('recipes.html', 'w') as f:
        f.write(response.text)  # save the page source for later parsing
    return

In [8]:
# Make sure recipes.html is on disk; the function skips the download when the file already exists.
get_Recipe_Base_HTML()

folder already here


In [9]:
def get_HTML_extensions(filename = 'recipes.html'):
    """Collect the link texts found in a saved copy of the recipe index page.

    Args:
        filename: path of the saved HTML file to parse.

    Returns:
        A list with the text of every tag that carries an ``href`` attribute,
        in document order, keeping only texts longer than three characters.
        On the recipe site these texts are the names of the recipe text files.
    """
    with open(filename, 'r') as handle:
        markup = handle.read()

    parsed = bs(markup, "html.parser")  # parse with the built-in HTML parser

    # Blank or very short link texts are not recipe files, so drop them.
    return [tag.text for tag in parsed.find_all(href=True) if len(tag.text) > 3]

In [21]:
# Link texts collected from the saved index page (recipes.html by default);
# per the get_HTML_extensions docstring these are the recipe text-file names.
type_list = get_HTML_extensions()

In [34]:
def get_categories():
    ''' Build human-readable category titles and cache them in a pickle.

    If categories.pickle exists in the current working directory its contents
    are loaded and returned. Otherwise the link texts returned by
    get_HTML_extensions() are turned into titles (for example
    "AllCookieRecipes.txt" becomes "All Cookie Recipes"), saved to
    categories.pickle, and returned.

    Returns:
        list of str, one title per link text, in the same order.
    '''
    if os.path.exists('categories.pickle'):  # cached result available
        print("folder already here: returning contents")
        # NOTE: unpickling runs arbitrary code from the file; only load a
        # categories.pickle that this notebook wrote itself.
        with open('categories.pickle','rb') as f:
            return pickle.load(f)

    # No cache yet: derive the titles from the scraped link texts.
    print("folder not here yet: creating contents")
    type_list = get_HTML_extensions()

    categories = []
    for name in type_list:
        # Drop the '.txt' suffix only when it is really there, instead of
        # blindly cutting the last four characters off every link text.
        if name.lower().endswith('.txt'):
            name = name[:-4]
        # Put a space in front of every capital letter that follows a word
        # character: "AllCookieRecipes" -> "All Cookie Recipes".
        name = re.sub(r"(?<=\w)([A-Z])", r" \1", name)
        # Separate a trailing "and" that is glued to the previous word:
        # "Breadsand Muffins" -> "Breads and Muffins". The lookbehind requires
        # a non-space character so an "and" that is already a separate word
        # does not pick up a second space.
        # Known limitation: a word that merely ends in "and" is split as well.
        name = re.sub(r"(?<=\S)(and )", r" \1", name)
        categories.append(name)

    with open('categories.pickle','wb') as f:
        pickle.dump(categories,f)  # cache for later runs

    return categories

In [36]:
# Category titles: read from categories.pickle when it exists, otherwise built from the scraped link texts and cached.
categories = get_categories()

folder not here yet: creating contents


In [15]:
def get_recipes():
    ''' Download the text of every recipe file and cache the result in a pickle.

    If recipes.pickle exists in the current working directory the website is
    not contacted; the saved contents are loaded and returned. Otherwise each
    name returned by get_HTML_extensions() is requested from
    http://mc6help.tripod.com/RecipeLibrary/, the response texts are saved to
    recipes.pickle, and the list is returned.

    Returns:
        list of str, one response text per link name, in the same order.

    Raises:
        requests.RequestException: if a request fails or times out. Nothing
            is pickled in that case, so the next call starts over.
    '''
    if os.path.exists('recipes.pickle'):  # cached result available
        print("folder already here: returning contents")
        # NOTE: unpickling runs arbitrary code from the file; only load a
        # recipes.pickle that this notebook wrote itself.
        with open('recipes.pickle','rb') as f:
            return pickle.load(f)

    print("folder not here yet: creating contents")
    # Look the link names up here rather than reading a module-level
    # variable, so the function also works on a freshly restarted kernel.
    type_list = get_HTML_extensions()

    text_data = []  # one entry per recipe file
    for i in type_list:
        time.sleep(.25)  # pause between requests to go easy on the server
        contents = requests.get(
            f"http://mc6help.tripod.com/RecipeLibrary/{i}",
            timeout=30,  # without a timeout requests can block indefinitely
        )
        # NOTE(review): the HTTP status is not checked, so the body of an
        # error page would be stored like a recipe file - confirm every link
        # resolves before tightening this.
        text_data.append(contents.text)

    # Written only after every download succeeded, so a partial scrape is
    # never cached.
    with open('recipes.pickle','wb') as f:
        pickle.dump(text_data,f)

    return text_data

In [16]:
# Recipe texts, one string per scraped link: read from recipes.pickle when it exists, otherwise downloaded and cached.
recipes = get_recipes()

folder already here: returning contents


In [17]:
# Number of recipe text files scraped, i.e. how many recipe 'categories' we have to work with.
len(recipes)

84

Let's check out what a random title and its list of recipes look like!

In [38]:
# np.random.randint excludes its upper bound, so len(recipes) already yields
# an index in [0, len(recipes) - 1]. The previous "+ 1" could produce
# len(recipes) itself and raise IndexError on the lookups below.
i = np.random.randint(len(recipes))
print(f"Title: {categories[i]}\n")
print(f"List of Recipes:\n{recipes[i]}")

Title: All Cookie Recipes

List of Recipes:
* Exported from MasterCook *

                     Acorns

Recipe By     : 
Serving Size  : 0     Preparation Time : 0:00
Categories    : 
  Amount  Measure       Ingredient -- Preparation Method
--------  ------------  --------------------------------
       1           cup  butter -- melted
     3/4           cup  brown sugar -- firmly packed
   1 1/2          cups  pecans, chopped fine -- *
   2 1/2          cups  all-purpose flour -- sifted
     1/2      teaspoon  baking powder
       1           cup  semi-sweet chocolate chips

* divided in 3/4 cup portions This is an easy yet elegant butter-pecan cookie shaped to resemble an acorn and dipped in melted chocolate chips and chopped pecans. Preheat oven to 375. In a large bowl, beat together butter, brown sugar, 3/4 cup chopped pecans and vanilla on medium speed until well blended. Add flour and baking powder and mix well, using low speed. Shape dough into 1-inch balls. Slig