In [1]:
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os.path
import pickle
import re # imports regular expressions
from collections import Counter
import collections
import operator
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
import inflect

In [2]:
def get_saved_recipes():
    '''Load the cached recipe data from ``recipes.pickle``.

    Returns:
        The unpickled object when ``recipes.pickle`` exists in the current
        working directory; otherwise ``None`` (after printing a hint).

    Note:
        The printed messages say "folder", but the check is on a single
        pickle file. ``pickle.load`` executes arbitrary code, so only load
        files you created yourself.
    '''
    pickle_path = 'recipes.pickle'

    # Guard clause: no cache on disk yet, so only point the user at the scraper.
    if not os.path.exists(pickle_path):
        print("folder not here yet.  Run contents from 'Scraper.ipynb' "
              "to create the folder")
        return None

    print("folder already here: returning contents")
    with open(pickle_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [3]:
# Load the cached recipe data; this is None when 'recipes.pickle' is missing.
recipes = get_saved_recipes()

folder already here: returning contents


In [4]:
def get_saved_categories():
    '''Load the cached category data from ``categories.pickle``.

    Returns:
        The unpickled object when ``categories.pickle`` exists in the
        current working directory; otherwise ``None`` (after printing a
        hint). Nothing is scraped or created here.

    Note:
        The printed messages say "folder", but the check is on a single
        pickle file. ``pickle.load`` executes arbitrary code, so only load
        files you created yourself.
    '''
    pickle_path = 'categories.pickle'

    # Guard clause: no cache on disk yet, so only point the user at the scraper.
    if not os.path.exists(pickle_path):
        print("folder not here yet.  Run contents from 'Scraper.ipynb' "
              "to create the folder")
        return None

    print("folder already here: returning contents")
    with open(pickle_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [5]:
# Load the cached category data and record how many entries it holds.
# len() raises TypeError if 'categories.pickle' was missing and None came back.
categories = get_saved_categories()
c_len = len(categories)

folder already here: returning contents


In [32]:
def split_category_to_recipes(category):
    '''Split the raw text of one recipe category into individual recipes.

    Args:
        category: string holding every recipe of a single category.

    Returns:
        A list of strings, one per recipe, in order of appearance. Each
        string is the text found between a "* Exported from MasterCook *"
        header and the following "Nutr. Assoc. : <digit>" marker. The list
        is empty when no such pair is present.
    '''
    # The pattern is only meant to separate one recipe from the next, not to
    # give a clean cut-off. The trailing digit group is matched but discarded.
    recipe_boundaries = re.compile(r"\* Exported from MasterCook \*(.+?)Nutr\. Assoc\. : (\d+?)", re.DOTALL)
    return [found.group(1) for found in recipe_boundaries.finditer(category)]

In [33]:
def get_recipe_info(recipe):
    '''Pull the title, serving size and raw ingredient text out of a recipe.

    Args:
        recipe: string containing a single recipe.

    Returns:
        A ``(title, serving_size, ingredients_batch)`` tuple of strings:
        * title: the first run of text that starts with an ASCII letter,
          up to the next newline, tab, form feed or vertical tab;
        * serving_size: the digits following the first "Serving Size  :"
          (an empty string if no digits follow it);
        * ingredients_batch: the text after the first line of 32 dashes,
          up to the first run of four newline/carriage-return characters.

    Raises:
        AttributeError: if the recipe contains no ASCII letter.
        IndexError: if the serving-size or ingredients pattern is absent.
    '''
    first_title = re.search(r"([A-Za-z]{1}[^\r\n\t\f\v]*)", recipe)
    serving_sizes = re.findall(r"Serving Size  :\s*(\d*)", recipe)
    ingredient_batches = re.findall(r"--------------------------------(.+?)[\n\r]{4}", recipe, re.S)

    # Only the first occurrence of each field is used.
    return first_title.group(0), serving_sizes[0], ingredient_batches[0]
    

In [34]:
def save_recipe_info():
    '''Return the parsed info for every recipe, caching it in a pickle.

    If ``primary_recipe_info.pickle`` exists it is loaded and returned
    unchanged. Otherwise every entry of the notebook-level ``recipes`` is
    split into recipes, each recipe is parsed with ``get_recipe_info``, and
    the resulting list is pickled to that file before being returned.

    Returns:
        A flat list with one ``get_recipe_info`` result per recipe, in
        category order (or whatever object the existing pickle holds).
    '''
    cache_path = 'primary_recipe_info.pickle'

    # Cache hit: reuse the previous parse instead of redoing the work.
    if os.path.exists(cache_path):
        print("folder already here: returning contents")
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    print("folder not here yet: creating contents")

    # First split every category, then parse the recipes of each in turn.
    recipes_by_category = [split_category_to_recipes(text) for text in recipes]
    recipe_info = [
        get_recipe_info(single_recipe)
        for category_recipes in recipes_by_category
        for single_recipe in category_recipes
    ]

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(recipe_info, cache_file)

    return recipe_info

In [35]:
# Load the parsed recipe info, building and caching it first when
# 'primary_recipe_info.pickle' does not exist yet.
r_info = save_recipe_info()

folder already here: returning contents


In [38]:
def create_recipe_df(recipes, category_names=None, broken_recipes=None):
    '''Build a pandas dataframe with one row per parsed recipe.

    Each entry of ``recipes`` is split into individual recipes with
    ``split_category_to_recipes`` and every recipe is parsed with
    ``get_recipe_info``. The ingredients are left as one raw string.

    Args:
        recipes: iterable of strings, one block of raw text per category.
        category_names: iterable of category labels, paired positionally
            with ``recipes``. Defaults to the notebook-level ``categories``
            variable, which is what this function always used before.
        broken_recipes: iterable of row labels to drop after the frame is
            built. Defaults to the row numbers found to be unusable in the
            original dataset (tips/tricks or badly formatted entries).
            Pass an empty iterable to keep every row, e.g. when running on
            a different or smaller dataset where those row numbers do not
            apply.

    Returns:
        A dataframe with columns 'category', 'title', 'serving size' and
        'ingredients batch', re-indexed from 0 after the drop.

    Raises:
        KeyError: if a label in ``broken_recipes`` is not a row of the frame.
    '''
    if category_names is None:
        # Backward-compatible fallback to the notebook global.
        category_names = categories

    if broken_recipes is None:
        # Row numbers of recipes that broke later cleaning code; on
        # investigation many are just tips/tricks and the rest are poorly
        # formatted. They are specific to the originally scraped dataset.
        broken_recipes = {21, 67, 715, 803, 1109, 2150, 2347, 2776,
                          3355, 3356, 3419, 3420, 4785, 4786, 4787,
                          4788, 4792, 4800, 5086, 5089, 5092, 5162, 5228}

    # The DataFrame constructor below wants {'col1': list, 'col2': list, ...}.
    category_list = []
    title_list = []
    serving_size_list = []
    ingredients_batch_list = []

    split_rs = [split_category_to_recipes(r) for r in recipes]

    # NOTE: zip stops at the shorter input, so categories without a matching
    # name (or names without a matching category) are silently skipped.
    for category, category_name in zip(split_rs, category_names):
        for recipe in category:
            title, serving_size, ingredients_batch = get_recipe_info(recipe)

            category_list.append(category_name)
            title_list.append(title)
            serving_size_list.append(serving_size)
            ingredients_batch_list.append(ingredients_batch)

    df = pd.DataFrame({'category': category_list,
                       'title': title_list,
                       'serving size': serving_size_list,
                       'ingredients batch': ingredients_batch_list})

    rows_to_drop = sorted(broken_recipes)
    if rows_to_drop:
        df = df.drop(rows_to_drop)

    return df.reset_index(drop=True)

In [45]:
def save_recipe_df(df,name):
    '''Pickle ``df`` to the file at path ``name``.

    Args:
        df: the object to serialise (the recipe dataframe in this notebook).
        name: path of the pickle file; it is created or overwritten.

    Returns:
        None.
    '''
    print('Pickling dataframe')

    with open(name, 'wb') as out_file:
        pickle.dump(df, out_file)

    return None

In [46]:
# Build the recipe dataframe and persist it to 'recipe_df.pickle'.
df = create_recipe_df(recipes)
save_recipe_df(df,'recipe_df.pickle')

Pickling dataframe


In [47]:
def get_recipe_df(name):
    '''Load and return the pickled recipe dataframe stored at ``name``.

    Args:
        name: path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        ``pickle.load`` executes arbitrary code, so only load files you
        created yourself.
    '''
    print('Unpickling dataframe')

    with open(name, 'rb') as in_file:
        return pickle.load(in_file)