In [1]:
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os.path
import pickle
import re # imports regular expressions
from collections import Counter
import collections
import operator
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
import inflect

In [2]:
def get_saved_recipes():
    '''Load the recipes stored in 'recipes.pickle' (current directory).

    Returns the unpickled object when the file exists; otherwise prints a
    hint about how to create it and returns None.
    '''
    if not os.path.exists('recipes.pickle'):
        # nothing has been saved yet: point the user at the scraper notebook
        print("folder not here yet.  Run contents from 'Scraper.ipynb' "
              "to create the folder")
        return None

    print("folder already here: returning contents")
    with open('recipes.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)  # hand back the saved contents

In [3]:
# Load the pickled recipe data; get_saved_recipes() returns None when
# 'recipes.pickle' does not exist yet.
recipes = get_saved_recipes()

folder already here: returning contents


In [4]:
def get_saved_categories():
    '''Load the category values stored in 'categories.pickle'.

    Returns the unpickled object when the file exists in the current
    directory; otherwise prints a hint about how to create it and
    returns None.
    '''
    if not os.path.exists('categories.pickle'):
        # no saved file: nothing is generated here, the user is only told
        # which notebook produces it
        print("folder not here yet.  Run contents from 'Scraper.ipynb' "
              "to create the folder")
        return None

    print("folder already here: returning contents")
    with open('categories.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)  # hand back the saved contents

In [5]:
# Load the pickled categories. get_saved_categories() returns None when
# 'categories.pickle' is missing, in which case len() below raises TypeError.
categories = get_saved_categories()
c_len = len(categories)  # number of categories loaded

folder already here: returning contents


In [6]:
def split_category_to_recipes(category):
    '''Split the raw text of one recipe category into its recipes.

    Parameters
    ----------
    category : str
        Text of a single category, containing zero or more recipes.

    Returns
    -------
    list of str
        One string per recipe: everything between the
        "* Exported from MasterCook *" banner and the following
        "Nutr. Assoc. : <digit>" marker (both markers excluded).
        Empty list when no recipe is found.
    '''
    # The pattern is not meant to give a 'clean' cut-off, only to separate
    # one recipe from the next.  findall() yields (recipe_text, digit) pairs
    # because the pattern has two groups; only the text is kept.
    one_recipe_pattern = re.compile(r"\* Exported from MasterCook \*(.+?)Nutr\. Assoc\. : (\d+?)", re.DOTALL)
    # Search the argument itself (the previous version searched a global
    # named `r`, which only worked when the caller's loop variable was `r`).
    return [recipe_text
            for recipe_text, _nutr_digit in one_recipe_pattern.findall(category)]

In [7]:
# Split every element of `recipes` into its individual recipe strings, so
# that split_rs[k] is the list of recipes found in recipes[k].
# NOTE(review): this loop leaves `r` bound at module level; verify nothing
# relies on that global before turning the loop into a comprehension.
split_rs = []
for r in recipes:
    split_rs.append(split_category_to_recipes(r))

In [8]:
 # check and see how many total recipes we have
sum(len(category_recipes) for category_recipes in split_rs)

5559

In [9]:
def get_recipe_info(recipe):
    '''Extract the title, serving size and ingredient block of one recipe.

    Takes the text of a single recipe and returns the tuple
    (title, serving_size, ingredients_batch), all as strings:
      * title             - first run of text that starts with a letter
      * serving_size      - digits following "Serving Size  :"
      * ingredients_batch - text between the 32-dash rule and the first
                            run of four newline/carriage-return characters
    '''
    # only the first title-like match is needed
    title = re.search(r"([A-Za-z]{1}[^\r\n\t\f\v]*)", recipe).group(0)

    serving_matches = re.findall(r"Serving Size  :\s*(\d*)", recipe)
    serving_size = serving_matches[0]

    batch_matches = re.findall(
        r"--------------------------------(.+?)[\n\r]{4}", recipe, re.S)
    ingredients_batch = batch_matches[0]

    return title, serving_size, ingredients_batch
    

In [10]:
# Print the title of every recipe, category by category.
for category in split_rs:
    for recipe in category:
        name, _serving_size, _ingredients_batch = get_recipe_info(recipe)
        print(name)

Almond Liqueur
Cafe Mexicano
Coffee Liqueur
Chartreuse Cocktail
Whiskey Sour
Sangria Blanco
Raspberry Liqueur
Pina Colada
Orange Liqueur
Mexican Tea Punch
Margarita Sunrise
Margarita
Irish Coffee
Hot Buttered Rum
Hazelnut Liqueur
Glogg
Frozen Daiquiri
Bloody Mary
Amaretto Coffee
Berry Liqueur
Black Russian
Mango Margarita
Sparkling Berry Champagne
Caipirinha
Barbecue Pecans
Brandied Fruit Balls
Cajun Chex Party Mix
Champagne Batter
Candied Cashews
Devilish Eggs
Deviled Egg Slices, Pk
Debbie's Spiced Pecans
Curried Pecans
Artichoke Ham Bites
Curried Meat Balls, Pk
Cucumber Rye Surprises, Pk
Cucumber Rings
Crunchy Chocolate-Coconut Balls
Fruit Kabobs
Fried Won Tongs or Won Tong Soup
Fried Tortilla Chips
Five-Spice Appetizer Meatballs
Festive Snack Mix, Pk
Ella's Divine Date Rum Balls
Dolmas
Dip For Sausage Balls
Crispy Pecan Logs, Pk
Cranberry Coconut Fruit Balls
Crab-Papaya Appetizer, Pk
Crab And Avocado Cocktail
Corn 'n Bacon Sticks, Pk
Copenhagens
Cocktail Beer Ball
Chocolate Covered 

Chili Con Carne 2
Chili Con Carne
Chili Bean Soup
Chili Base
Chile Caribe
Chicken Chili
Cheap Cheap Chili
Carroll Shelby's Chili
Pierre's Chili
Pegleg Shorty's Chili
Pedernales River Chili
Pat's Special Diet Chili
Out-Of-The-Ordinary Chili
Oakwood Feed Store Chili
Numero Uno Chili
New Mexico Red Bean Chili
Nevada Cowboy Chili
Nevada Annie's Cowboy Chili
Nevada Annie's Champion Chili
Neiman Marcus Chili Blanco
Navajo Green Chili
My Skyline Chili
Murray's Girlfriend's Cincinnati Chili
Murray's Cincinnati Chili
Margo Knudson's Chili
Mad Mike's Chili
Low Fat Lentil Chili
Lo Cal Chili Con Carne
Lewis And Clark's White Chili
Lbj Pedernales River Chili
Last Minute Chili
L. J's Chili
Kahlua Chili
Joe Cooper's Chili
Jenny's Chili
Jeanne Owen's Chili Con Carne
Jay's Chili
Jay Pennington's Just Plain Good Chili
Jack's Easy Cincy Chili
Hunan Style Salmon Chili
Hot Red Chili
Headquarters Chili
Ham Flannagan's Va. Champ Chili
Half-Hour Chili
Ground Turkey And Black-Bean Chili
Gringo Chili
Green Chil

Corn Tortilla with Fresh Flower Petals
Black Bean and Rice Burritos
Healthy Chili
Crab Quesadillas
Country Corn & Southwestern Beef Salsa
South-of-the-Border Chicken Salad
Arroz Con Pollo
Apple Crunch Muffins
Spicy Apricot Oat Muffins
Pumpkin Oat Muffins
Pumpkin Muffins
Pecan Cinnamon Muffins
Parmesan Herb Muffins
Oatmeal Muffins
Molasses Refrigerator Muffins
Spicy Mandarin Muffins
High-Protein Muffins
Heirloom Raisin Muffins
Dilly Zucchini Ricotta Muffins
Date Or Raisin Bran Muffins
Cranberry Muffins
Corn Meal Muffins
Bran Muffins #3
Bran Muffins #2
Banana-Nut Muffins
Basic Muffins And Variations
Blueberry Muffins
Bran Muffins #1
Bran Cereal Muffins
Banana Muffin Surprise
Bacon And Onion Muffins
Quick and Easy Herb Rolls
Blueberry Buttermilk Muffins
Zesty Corn Muffins
Brides Lunch Punch
Dots Punch
Egg Cream
Fresh Fruit Frappe
Homemade Rootbeer
Orange-Cranberry Tea
Hot Chocolate Float
Strawberry Lemonade
Southwest Smoothie
Six Hour Root Beer
Simple And Delicious Lemonade
Punch #1
Punch

Surprise Tomato Egg Basket
Polenta Pie with Fresh Mushrooms & Spinach
Buff Puff
Pilaf-Stuffed Peppers
Spanish Bean Frittata
Tangy Pear 'n Pecan Noodle Salad
Baked Acorn Squash
Baked Vegetables au Gratin
Spinach, Mushroom, and Mozzarella Wrap
Black Bean and Rice Burritos
Plain Omelet with Potato and Onion
Carrot Thurum (Grated Carrot Curry)
Middle Eastern Sandwiches
Linguine and Spinach Pesto
Creamed Spinach on Toast
Eggplant Manicotti
Healthy Chili
Green and White Lasagne
Asian-Style Vegetable Stir Fry
Zucchini Cheese Casserole
Vietnamese Spring Rolls
Creamed Mushrooms on Toast with Hard-Boiled Eggs
Double Pepper Pizza
Giant Spinach Shells
Toasted Couscous with Almonds and Raisins
Chinese Vegetables
Fresh Mushroom, Black Bean and Green Chili Melt
Cauliflower Pakora
Fresh Mushroom, Onion and Walnut Stuffing
Green Bananas in Balsamic-Olive Vinaigrette
Crisp Fried Noodles and Chili Vegetables
Chilled Avocado Soup
Falafel
Szechwan Noodles
Garden Vegetable Kebabs
Roasted Veggie and Cheese P

In [11]:
def create_recipe_df():
    '''Assemble one row per recipe into a pandas DataFrame.

    Walks the module-level `split_rs` and `categories` in parallel, parses
    each recipe with get_recipe_info(), and returns a DataFrame with the
    columns 'category', 'title', 'serving size' and 'ingredients batch'.
    The ingredients are left unparsed for now (whatever get_recipe_info
    returns as the ingredient batch).
    '''
    # pandas wants the data as {'col1': list(), 'col2': list(), ...}
    columns = {'category': [],
               'title': [],
               'serving size': [],
               'ingredients batch': []}

    for category_recipes, category_name in zip(split_rs, categories):
        for recipe_text in category_recipes:
            title, serving_size, ingredients_batch = get_recipe_info(recipe_text)

            # one entry per column for this recipe
            columns['category'].append(category_name)
            columns['title'].append(title)
            columns['serving size'].append(serving_size)
            columns['ingredients batch'].append(ingredients_batch)

    return pd.DataFrame(columns)

In [12]:
# Build the recipe table and keep a handle on its 'ingredients batch' column,
# which later cells index as b[i].
df = create_recipe_df()
b = df['ingredients batch']  # one ingredient-batch value per recipe row
print(f"total number of recipes: {len(b)}")

total number of recipes: 5559


In [13]:
# Peek at the first rows of the recipe table.
df.head()

Unnamed: 0,category,title,serving size,ingredients batch
0,Alcoholic Beverages,Almond Liqueur,1,\r\n 3 cup sugar\r\n 2 1/4 ...
1,Alcoholic Beverages,Cafe Mexicano,8,\r\n 8 cups water; cold\r\n ...
2,Alcoholic Beverages,Coffee Liqueur,1,\r\n 3 cup sugar\r\n 3 ...
3,Alcoholic Beverages,Chartreuse Cocktail,1,\r\n 1 1/2 ounces tequila; 3 T\r\n ...
4,Alcoholic Beverages,Whiskey Sour,1,\r\n 1 lemon; juiced\r\n ...


In [14]:
# Peek at the first few ingredient batches.
b.head()

0    \r\n       3           cup  sugar\r\n   2 1/4 ...
1    \r\n       8          cups  water; cold\r\n   ...
2    \r\n       3           cup  sugar\r\n       3 ...
3    \r\n   1 1/2        ounces  tequila; 3 T\r\n  ...
4    \r\n       1                lemon; juiced\r\n ...
Name: ingredients batch, dtype: object

In [15]:
def get_ingredients(batch):
    '''
    Takes in a 'batch' of ingredient information and separates every line
    into its quantity, unit, ingredient, and comment portions.

    Parameters
    ----------
    batch : str
        Ingredient block of one recipe.  The first line is discarded (the
        block is expected to start with a line break, so that line is
        empty); each remaining line is parsed on its own.  The ingredient
        text is expected to start at column 24 of the line.

    Returns
    -------
    list of tuple
        One (quantity, unit, ingredient, comment) tuple per line.  Each
        element is a string, or None when that portion was not found.
    '''
    # quantity: "1 1/2", "1/2" or "3"; unit: a run of letters.  Both have to
    # be followed by two spaces, which is how the columns are separated.
    # (The unit class used to be "[A-Aa-z]", which rejected every upper-case
    # letter except "A".)
    measure_pattern = re.compile(r"(\d+ \d/\d|\d+/\d+|\d+)?(?=  ) +?([A-Za-z]+)?(?=  )", re.S)
    ingredient_info_pattern = re.compile(r".{24}(.*)")
    info_and_comments_pattern = re.compile(r".{24}(.*) +?-- +?(.*)")

    def first_non_empty(values):
        # first value that is not the empty string, or None if there is none
        return next((value for value in values if value != ''), None)

    ans = []  # this will contain the 'answer': the list of tuples
    for line in batch.splitlines()[1:]:
        measures = measure_pattern.findall(line)
        numb = first_non_empty(quantity for quantity, _ in measures)
        unit = first_non_empty(measure for _, measure in measures)

        # a ' -- ' separator means the line carries a preparation comment
        commented = info_and_comments_pattern.findall(line) if '--' in line else []
        if commented:
            ing_string = first_non_empty(text for text, _ in commented)
            comment_string = first_non_empty(comment for _, comment in commented)
        else:
            # no comment on this line -- or a '--' that is not a separator
            # (e.g. inside a word), in which case the whole ingredient text
            # is kept instead of being lost
            plain = ingredient_info_pattern.findall(line)
            ing_string = plain[0] if plain else None
            comment_string = None

        ans.append((numb, unit, ing_string, comment_string))
    return ans

In [16]:
get_ingredients(b[13])  # just to check that the get_ingredients function works

[('2', 'tablespoons', 'brown sugar', None),
 ('4', 'teaspoons', 'butter', 'softened'),
 ('1', 'dash', 'ground cinnamon', None),
 ('1', 'dash', 'ground nutmeg', None),
 ('1 1/2', 'cups', 'warm water', None),
 ('1/2', 'cup', 'rum', None)]

In [17]:
# Gather the parsed ingredient tuples of every recipe into one flat list.
all_ing_data = []
for position in range(len(b)):
    all_ing_data.extend(get_ingredients(b[position]))
print(all_ing_data[:5])

[('3', 'cup', 'sugar', None), ('2 1/4', 'cup', 'water', None), ('3', None, 'lemons; the rind', 'finely'), (None, None, 'grated', None), ('1', 'quart', 'vodka', None)]


Next, we use the functions defined above to find the 150 or so most common ingredients across all of the recipes.  This is done by building a counter over every ingredient and then sorting it by count.

Note that I make all strings lowercase from the beginning to help to standardize/normalize the names of the ingredients.

In [18]:
# we need to create a 'not_foods' list to make sure that things that aren't
# foods, but that show up in the top foods list don't get 'accepted' as foods
# (a comma was missing after 'freshly', which silently merged it with 'to'
# into the single entry 'freshlyto')
not_foods = {'or', 'the', 'chopped', 'chop', 'diced', 'ground', 'fresh', 'freshly',
            'to', 'taste', 'crushed', 'minced', 'sliced', 'optional',
            'garnish', 'garnished', 'oz', '', 'boiling', 'boiled',
            'cold', 'and', 'spalding'}

# normalize all_ingredients by first making everything lowercase
all_ingredients = [entry[2].lower() for entry in all_ing_data if entry[2] is not None]

# next singularize all the words in all_ingredients using the inflect package
p = inflect.engine() # this will be used to singularize plural words
alph = re.compile(r'[\W_]+') # matches everything that is not a letter or digit

stripped_ingredients = []
for ingredient in all_ingredients:
    # for each string, strip the non alphanumeric chars and
    # singularize the word if it is plural, then rejoin the string
    kept_words = []
    for raw_word in ingredient.split():
        cleaned = alph.sub('', raw_word)
        # drop tokens that are only punctuation, and non-food words whether
        # or not punctuation is attached (e.g. both 'chopped' and 'chopped,')
        if not cleaned or raw_word in not_foods or cleaned in not_foods:
            continue
        # singular_noun() returns False when the word is not a plural
        kept_words.append(p.singular_noun(cleaned) or cleaned)
    word = " ".join(kept_words)
    if word not in not_foods:
        stripped_ingredients.append(word)
stripped_ingredients[:10]

['sugar',
 'water',
 'lemon rind',
 'grated',
 'vodka',
 'almond extract',
 'vanilla extract',
 'water',
 'dark brown sugar packed',
 'baking chocolate fine']

I will now take all the stripped ingredients and sort and count them.  I will then use the most common ingredients to create a list of 'head names', where each 'head name' is the most common name for that ingredient.  For example, I expect 'popcorn' to be the head name for things like 'buttered popcorn', 'white popcorn' and 'popped corn'.

In [19]:
ingredient_counter = Counter(stripped_ingredients)
# (ingredient, number of occurrences) pairs, most frequent first
sorted_ings = ingredient_counter.most_common()

I will now use the FuzzyWuzzy package to put the ingredients with their correct 'header' ingredient name.

In [20]:
def get_ingredient_classes(sorted_ings, n_groups=200, n_candidates=2000,
                           threshold=80,
                           cache_path='ingredient_name_groups.pickle'):
    ''' Groups ingredient names under their 'head' ingredient and caches the result.

    Parameters
    ----------
    sorted_ings : list of (name, count) tuples
        Ingredients sorted from most to least common.
    n_groups : int
        Maximum number of groups to build (fewer if the input runs out).
    n_candidates : int
        Only the first n_candidates ingredients are considered as group members.
    threshold : int
        An ingredient joins a group when fuzz.partial_ratio(head, name) is
        strictly greater than this value.
    cache_path : str
        Pickle file used as cache.  If it exists its contents are returned
        as-is and sorted_ings is ignored.

    Returns
    -------
    list of lists of str
        The first element of each sublist is the 'head' name, the remaining
        elements are the names that fall under its 'ingredient type'.
    '''
    if os.path.exists(cache_path): # checks if the pickle already exists
        print("pickle already here: returning contents")
        # NOTE: pickle.load executes code embedded in the file; only use a
        # cache file this notebook wrote itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f) # load the saved contents

    # otherwise build the groups, pickle them, and return them
    print("pickle not here yet: creating contents")
    # compare the names only: passing the (name, count) tuples to fuzzywuzzy
    # makes it score their string representation, counts included
    names = [item[0] for item in sorted_ings]
    last = min(n_candidates, len(names)) # never index past the end of the input

    name_groups = []
    grouped = set() # every name that already belongs to some group
    for j, head in enumerate(names):
        if len(name_groups) >= n_groups:
            break
        if head in grouped:
            continue # already filed under a more common head
        # the head always comes first; less common look-alikes follow it
        group = [head] + [names[i] for i in range(j + 1, last)
                          if fuzz.partial_ratio(head, names[i]) > threshold]
        grouped.update(group)
        name_groups.append(group)

    with open(cache_path, 'wb') as f:
        pickle.dump(name_groups, f) # save the contents

    return name_groups

In [21]:
name_groups = get_ingredient_classes(sorted_ings)
# the head name is the first entry of every group
headers = [group[0] for group in name_groups]
print(len(headers))
# equal to the count above only when no head name is repeated
print(len(set(headers)))

pickle already here: returning contents
200
200


It can be seen above that I have successfully created a list of unique header values (since the list and set lengths are equal, there are no repeat values).

In [23]:
# Show only a preview of the recipe table instead of dumping the whole frame.
df.head(10)

Unnamed: 0,category,title,serving size,ingredients batch
0,Alcoholic Beverages,Almond Liqueur,1,\r\n 3 cup sugar\r\n 2 1/4 ...
1,Alcoholic Beverages,Cafe Mexicano,8,\r\n 8 cups water; cold\r\n ...
2,Alcoholic Beverages,Coffee Liqueur,1,\r\n 3 cup sugar\r\n 3 ...
3,Alcoholic Beverages,Chartreuse Cocktail,1,\r\n 1 1/2 ounces tequila; 3 T\r\n ...
4,Alcoholic Beverages,Whiskey Sour,1,\r\n 1 lemon; juiced\r\n ...
5,Alcoholic Beverages,Sangria Blanco,8,\r\n 1/4 cup sugar\r\n 1/2 ...
6,Alcoholic Beverages,Raspberry Liqueur,1,\r\n 2 1/2 cups to 3 cups raspberr...
7,Alcoholic Beverages,Pina Colada,1,\r\n 1/4 cup crushed ice\r\n ...
8,Alcoholic Beverages,Orange Liqueur,1,\r\n 3 md oranges\r\n ...
9,Alcoholic Beverages,Mexican Tea Punch,8,\r\n 2 cups tequila\r\n ...


PROBLEM CHILD

Nutr. Assoc. : 0
* Exported from MasterCook *

                     Hors d' Oeuveres

Recipe By     : Janina Cobb, Calif.
Serving Size  : 1     Preparation Time : 0:00
Categories    : 
  Amount  Measure       Ingredient -- Preparation Method
--------  ------------  --------------------------------

Roll 1/2 strip bacon around large stuffed olives, broil 5 minutes.  Serve hot.

Balls of softened cream cheese on toothpicks, rooled in minced olives, parsley, dried beer or grated carrots.

Roll cocktail size shrimp in half slices of bacon, broil until crisp.  Serve hot.

Spread 4 squares ham slices with softened cream cheese seasoned onion, olives, mustard. Place slices together like a layer cake.  Spread cheese over top and sides. Decorate with diced olives.  Chill.  cut into wedges.

Cur crusts from sliced bread. Mix mayonnaise and minced onions.  Spread on bread.  roll around canned asparagus spears.  chill and slice into rounds before serving.

In [41]:
# Quick check of how the inflect engine singularizes a plural word.
p.singular_noun('eggs')

'egg'