In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import time
import datetime

I'm going to need a handful of helper functions before I can start trying to fit models. My plan is to have two part models which consider the words associated with the ingredients and recipe instructions, and then a second layer that considers a handful of 'meta'-data, such as the amount of time the recipe takes or the number of steps. I'll make more as needed, but in advance I know I'll need at least the following functions:

1. Simple time translater: convert the string time description into a number of minutes. I say simple, because some recipes says things like "or more, for chilling" and I'm not going to try to classify that
2. Instruction-ingredient stripper. Remove the ingredient names from the instructions word lists, to avoid the cross-contamination of unique ingredient names becoming correlated with preparation ratings
3. Strip quantities out of ingredient list

Going to read in one of my smaller batches so I don't need to work with a dataframe with all 19,000 recipes.

In [2]:
df = pd.read_excel('batch_1.xlsx')

In [3]:
df.head()

Unnamed: 0,recipe,cooking_time,item_list,item_rating,instructions,instructions_rating,url
0,Mushroom-Farro Soup With Parmesan Broth,1 1/2 hours,"['1 cup pearled farro', '1 ounce dried mixed m...",,['Heat oven to 300 degrees and bring a small s...,,https://cooking.nytimes.com/recipes/1020933-mu...
1,Easiest Lentil Soup,1 hour,"['6 tablespoons extra-virgin olive oil, plus m...",,['Heat 1/4 cup oil in a medium pot over medium...,,https://cooking.nytimes.com/recipes/1019943-ea...
2,Beans and Garlic Toast in Broth,"2 1/4 hours, plus optional soaking","['1 cup dried beans, such as cannellini or cra...",,"['If you remember, soak the beans in cold wate...",,https://cooking.nytimes.com/recipes/1019241-be...
3,Parmesan Broth,2 3/4 hours,"['3 tablespoons extra-virgin olive oil', '1 la...",,"['In a large Dutch oven or heavy pot, heat the...",,https://cooking.nytimes.com/recipes/1020934-pa...
4,Potato Gratin With Swiss Chard and Sumac Onions,2 1/2 hours,"['¼ cup/60 milliliters olive oil', '1 ½ pounds...",,['Heat oven to 375 degrees Fahrenheit/180 degr...,,https://cooking.nytimes.com/recipes/1020928-po...


In [15]:
#The existence of the fractions is tricky, they don't convert to ints or floats well, need to split them
#and then convert to ints and then do math on them
df.iloc[0]['cooking_time'].split()

['1', '1/2', 'hours']

In [22]:
df.iloc[0]['cooking_time'].split()[2].split('/')

['hours']

In [26]:
#First step, split and then split again to seperate all numbers
split_up_time = []

for x in df.iloc[0]['cooking_time'].split():
    if len(x.split('/')) == 2:
        split_up_time.append(int(x.split('/')[0])/int(x.split('/')[1]))
    else:
        try:
            split_up_time.append(int(x))
        except:
            split_up_time.append(x)
            
split_up_time

[1, 0.5, 'hours']

In [36]:
#Now we check if these numbers associate with hours or minutes:
minutes = 0
running_total = 0
for n in range(0,len(split_up_time)):
    if (type(split_up_time[n]) == float) or (type(split_up_time[n]) == int):
        running_total += split_up_time[n]
    elif (split_up_time[n] == 'hours') or (split_up_time[n] == 'hour'):
        minutes += running_total*60
        running_total = 0
    elif (split_up_time[n] == 'minutes') or (split_up_time[n] == 'minute'):
        minutes += running_total
        running_total = 0
#This ended up not being robust enough, because sometimes punction marks get caught up
#The function has a similar logic, but checking the first however many characters

In [38]:
minutes

90.0

In [107]:
def simple_time_translater(text):
    if type(text) != str:
        return np.nan
    split_up_time = []

    for x in text.split():
        if len(x.split('/')) == 2:
            split_up_time.append(int(x.split('/')[0])/int(x.split('/')[1]))
        else:
            try:
                split_up_time.append(int(x))
            except:
                split_up_time.append(x)
                
    minutes = 0
    running_total = 0
    for n in range(0,len(split_up_time)):
        if (type(split_up_time[n]) == float) or (type(split_up_time[n]) == int):
            running_total += split_up_time[n]
        elif len(split_up_time[n]) >= 7:
            if (split_up_time[n][:6] == 'minute'):
                minutes += running_total
                running_total = 0
        elif len(split_up_time[n]) >= 4:
            if (split_up_time[n][:4] == 'hour'):
                minutes += running_total*60
                running_total = 0
        
    return minutes

In [108]:
test_times = []
for n in range(0,10):
    test_times.append(simple_time_translater(df.iloc[n]['cooking_time']))
test_times

[90.0, 60, 135.0, 165.0, 150.0, 50, 5, nan, 30, 75.0]

In [66]:
df.head(10)

Unnamed: 0,recipe,cooking_time,item_list,item_rating,instructions,instructions_rating,url
0,Mushroom-Farro Soup With Parmesan Broth,1 1/2 hours,"['1 cup pearled farro', '1 ounce dried mixed m...",,['Heat oven to 300 degrees and bring a small s...,,https://cooking.nytimes.com/recipes/1020933-mu...
1,Easiest Lentil Soup,1 hour,"['6 tablespoons extra-virgin olive oil, plus m...",,['Heat 1/4 cup oil in a medium pot over medium...,,https://cooking.nytimes.com/recipes/1019943-ea...
2,Beans and Garlic Toast in Broth,"2 1/4 hours, plus optional soaking","['1 cup dried beans, such as cannellini or cra...",,"['If you remember, soak the beans in cold wate...",,https://cooking.nytimes.com/recipes/1019241-be...
3,Parmesan Broth,2 3/4 hours,"['3 tablespoons extra-virgin olive oil', '1 la...",,"['In a large Dutch oven or heavy pot, heat the...",,https://cooking.nytimes.com/recipes/1020934-pa...
4,Potato Gratin With Swiss Chard and Sumac Onions,2 1/2 hours,"['¼ cup/60 milliliters olive oil', '1 ½ pounds...",,['Heat oven to 375 degrees Fahrenheit/180 degr...,,https://cooking.nytimes.com/recipes/1020928-po...
5,Braised Fennel With White Bean Purée,50 minutes,"['2 (15-ounce) cans white beans, rinsed', '2 ½...",,"['Prepare the bean purée: Add the beans, 2 1/2...",,https://cooking.nytimes.com/recipes/1020935-br...
6,Cold-Fashioned,5 minutes,"[' Ice, as needed', '1 ¼ ounces Irish whiskey,...",,['Fill a mixing glass halfway with ice. Add wh...,,https://cooking.nytimes.com/recipes/1020905-co...
7,NoMad Espresso Martini,,"['1 ounce Mr. Black Cold Brew coffee liqueur',...",,['Combine ingredients in a cocktail shaker hal...,,https://cooking.nytimes.com/recipes/1020936-no...
8,Japanese-Style Tuna Noodle Salad,30 minutes,"['¼ cup cut dried wakame seaweed', '8 ounces d...",,['Bring a large pot of water to a boil over hi...,,https://cooking.nytimes.com/recipes/1020939-ja...
9,Toor Dal (Split Yellow Pigeon Peas),"1 1/4 hours, plus soaking","['1 cup toor dal (split yellow pigeon peas)', ...",,['Prepare the dal: Soak the pigeon peas in a l...,,https://cooking.nytimes.com/recipes/1020907-to...


Now to some text processing. In particular, to turn each listed set (items, instructions) into bags and then remove the items from the instructions bag.

One quirk of the scraping process is that when I scraped the recipes, I created lists of ingredients and recipe steps, and then simply stored those lists nested in a dataframe, which I saved down as an excel. Reading that excel up here, it doesn't know to interpret the contents of those cells as python lists, it just has them as strings.

This only matters because I was planning on having certain 'meta' features of this data as their own features in my final classication models. For instance, the number of distinct ingredients or the number of steps in the recipe. Some of these features want a list and some want a collection of words, but it's worth turning everything into the list first and reading up the indivual components to get rid of some of the extranious punctuation marks.

In [120]:
eval(df.iloc[0]['item_list'])

['1 cup pearled farro',
 '1 ounce dried mixed mushrooms or dried porcini mushrooms, roughly torn or chopped',
 '1 tablespoon unsalted butter',
 '4 tablespoons extra-virgin olive oil',
 '1 cup minced shallots (from about 5 medium shallots)',
 ' Kosher salt and freshly ground black pepper',
 '3 garlic cloves, minced',
 '5 cups Parmesan broth',
 '⅓ cup dry white wine',
 '5 fresh thyme sprigs',
 '⅔ cup finely grated Parmesan (about 1 ounce), plus more for garnish',
 '1 pound mixed fresh mushrooms (such as beech mushrooms, hen of the woods or oyster mushrooms), roughly torn',
 '2 teaspoons sherry vinegar']

In [123]:
listified = eval(df.iloc[0]['item_list'])
listified[0]

'1 cup pearled farro'

In [131]:
#Is there a better way to test if a string could be either an integer or a float?
de_numbered = []
for x in listified[0].split():
    try:
        int(x)
    except:
        try:
            float(x)
        except:
            de_numbered.append(x)
        

In [132]:
de_numbered

['cup', 'pearled', 'farro']

In [133]:
#Will this work on the special fraction symbols?
de_numbered = []
for x in listified[8].split():
    try:
        int(x)
    except:
        try:
            float(x)
        except:
            de_numbered.append(x)

In [134]:
#Nope
de_numbered

['⅓', 'cup', 'dry', 'white', 'wine']

In [138]:
#Some searching online suggests that there is a way to test for these special unicode characters
import unicodedata
unicodedata.name(de_numbered[0])

'VULGAR FRACTION ONE THIRD'

In [148]:
#Oof, is this an ugly function! Well, it works... famous last words
de_numbered = []
for x in listified[8].split():
    if len(x) == 1:
        if unicodedata.name(x).startswith('VULGAR FRACTION'):
            pass
        else:
            try:
                int(x)
            except:
                try:
                    float(x)
                except:
                    de_numbered.append(x)
    else:
        try:
            int(x)
        except:
            try:
                float(x)
            except:
                de_numbered.append(x)

In [145]:
de_numbered

['cup', 'dry', 'white', 'wine']

In [160]:
import re

def de_numberfy(item_list):
    de_numbered = []
    for item in item_list:
        for x in item.split():
            if len(x) == 1:
                if unicodedata.name(x).startswith('VULGAR FRACTION'):
                    pass
                else:
                    try:
                        int(x)
                    except:
                        try:
                            float(x)
                        except:
                            de_numbered.append(x)
            else:
                try:
                    int(x)
                except:
                    try:
                        float(x)
                    except:
                        de_numbered.append(x)
    #A bit of regex to remove errant punctuation. Got that online, I need to learn regex...
    de_numbered = [re.sub(r'[^\w\s]','',x) for x in de_numbered]
    return de_numbered

In [161]:
de_numberfy(listified)

['cup',
 'pearled',
 'farro',
 'ounce',
 'dried',
 'mixed',
 'mushrooms',
 'or',
 'dried',
 'porcini',
 'mushrooms',
 'roughly',
 'torn',
 'or',
 'chopped',
 'tablespoon',
 'unsalted',
 'butter',
 'tablespoons',
 'extravirgin',
 'olive',
 'oil',
 'cup',
 'minced',
 'shallots',
 'from',
 'about',
 'medium',
 'shallots',
 'Kosher',
 'salt',
 'and',
 'freshly',
 'ground',
 'black',
 'pepper',
 'garlic',
 'cloves',
 'minced',
 'cups',
 'Parmesan',
 'broth',
 'cup',
 'dry',
 'white',
 'wine',
 'fresh',
 'thyme',
 'sprigs',
 'cup',
 'finely',
 'grated',
 'Parmesan',
 'about',
 'ounce',
 'plus',
 'more',
 'for',
 'garnish',
 'pound',
 'mixed',
 'fresh',
 'mushrooms',
 'such',
 'as',
 'beech',
 'mushrooms',
 'hen',
 'of',
 'the',
 'woods',
 'or',
 'oyster',
 'mushrooms',
 'roughly',
 'torn',
 'teaspoons',
 'sherry',
 'vinegar']

In [151]:
def instructions_bag(items_list,recipe_list):
    de_numbered_items = de_numberfy(items_list)
    de_numbered_instructions = de_numberfy(recipe_list)
    
    instructions_bag = [word for word in de_numbered_instructions if word not in de_numbered_items]
    
    return instructions_bag

In [162]:
items_list = eval(df.iloc[0]['item_list'])
recipe_list = eval(df.iloc[0]['instructions'])
instructions_bag(items_list,recipe_list)

['Heat',
 'oven',
 'to',
 'degrees',
 'bring',
 'a',
 'small',
 'saucepan',
 'water',
 'to',
 'a',
 'boil',
 'Spread',
 'out',
 'on',
 'a',
 'baking',
 'sheet',
 'in',
 'a',
 'single',
 'layer',
 'toast',
 'minutes',
 'stirring',
 'once',
 'twice',
 'to',
 'toast',
 'grains',
 'evenly',
 'While',
 'toasts',
 'place',
 'in',
 'a',
 'small',
 'bowl',
 'cover',
 'completely',
 'with',
 '12',
 'hot',
 'water',
 'saucepan',
 'Cover',
 'bowl',
 'set',
 'aside',
 'to',
 'hydrate',
 'Place',
 'a',
 'large',
 'Dutch',
 'oven',
 'over',
 'mediumhigh',
 'heat',
 'add',
 'Once',
 'has',
 'melted',
 'add',
 '12',
 'teaspoon',
 'sauté',
 'until',
 'translucent',
 'to',
 'minutes',
 'Add',
 'sauté',
 'minutes',
 'adding',
 'a',
 'splash',
 'water',
 'if',
 'necessary',
 'to',
 'prevent',
 'it',
 'scorching',
 'Stir',
 'in',
 'toasted',
 'Carefully',
 'scoop',
 'their',
 'soaking',
 'liquid',
 'add',
 'them',
 'to',
 'pot',
 'Strain',
 'their',
 'soaking',
 'liquid',
 'through',
 'a',
 'finemesh',
 's