In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import time
import datetime

I'm going to need a handful of helper functions before I can start fitting models. My plan is to build two-part models: a first layer that considers the words associated with the ingredients and the recipe instructions, and a second layer that considers a handful of metadata, such as the amount of time the recipe takes or the number of steps. I'll write more helpers as needed, but I already know I'll need at least the following functions:

1. Simple time translator: convert the string time description into a number of minutes. I say simple because some recipes say things like "or more, for chilling", and I'm not going to try to classify that.
2. Instruction-ingredient stripper: remove the ingredient names from the instruction word lists, to avoid the cross-contamination of unique ingredient names becoming correlated with preparation ratings.
3. Strip quantities out of ingredient list

I'm going to read in one of my smaller batches so that I don't need to work with a DataFrame containing all 19,000 recipes.

In [2]:
# Load one batch of recipes from 'batch_1.xlsx' (relative path) into a DataFrame
df = pd.read_excel('batch_1.xlsx')

In [3]:
# Preview the first rows to see which columns are available
df.head()

Unnamed: 0,recipe,cooking_time,item_list,item_rating,instructions,instructions_rating,url
0,Mushroom-Farro Soup With Parmesan Broth,1 1/2 hours,"['1 cup pearled farro', '1 ounce dried mixed m...",,['Heat oven to 300 degrees and bring a small s...,,https://cooking.nytimes.com/recipes/1020933-mu...
1,Easiest Lentil Soup,1 hour,"['6 tablespoons extra-virgin olive oil, plus m...",,['Heat 1/4 cup oil in a medium pot over medium...,,https://cooking.nytimes.com/recipes/1019943-ea...
2,Beans and Garlic Toast in Broth,"2 1/4 hours, plus optional soaking","['1 cup dried beans, such as cannellini or cra...",,"['If you remember, soak the beans in cold wate...",,https://cooking.nytimes.com/recipes/1019241-be...
3,Parmesan Broth,2 3/4 hours,"['3 tablespoons extra-virgin olive oil', '1 la...",,"['In a large Dutch oven or heavy pot, heat the...",,https://cooking.nytimes.com/recipes/1020934-pa...
4,Potato Gratin With Swiss Chard and Sumac Onions,2 1/2 hours,"['¼ cup/60 milliliters olive oil', '1 ½ pounds...",,['Heat oven to 375 degrees Fahrenheit/180 degr...,,https://cooking.nytimes.com/recipes/1020928-po...


In [15]:
# Fractions are the tricky part: a piece like '1/2' can't be handed straight to
# int() or float(). Start by breaking the string into whitespace-separated pieces;
# the fraction pieces then need splitting again, converting to ints, and dividing.
df['cooking_time'].iloc[0].split()

['1', '1/2', 'hours']

In [22]:
# Splitting a piece that has no '/' in it simply gives back a one-element list
df['cooking_time'].iloc[0].split()[2].split('/')

['hours']

In [26]:
# First step: split on whitespace, then split each piece on '/' so that
# fractions such as '1/2' are turned into numbers
split_up_time = []

for x in df.iloc[0]['cooking_time'].split():
    parts = x.split('/')
    if len(parts) == 2:
        # Numerator / denominator -> float
        split_up_time.append(int(parts[0])/int(parts[1]))
    else:
        try:
            split_up_time.append(int(x))
        except ValueError:
            # Not a whole number (e.g. 'hours'): keep the word unchanged
            split_up_time.append(x)

split_up_time

[1, 0.5, 'hours']

In [36]:
# Walk through the parsed pieces: numbers accumulate in running_total until a
# unit word tells us whether they were hours or minutes
minutes = 0
running_total = 0
for piece in split_up_time:
    if type(piece) in (float, int):
        running_total += piece
    elif piece in ('hours', 'hour'):
        minutes += running_total*60
        running_total = 0
    elif piece in ('minutes', 'minute'):
        minutes += running_total
        running_total = 0
    

In [38]:
# Display the accumulated total, in minutes
minutes

90.0

In [107]:
def simple_time_translater(text):
    """Convert a recipe time description into a number of minutes.

    The text is split on whitespace. Whole numbers ("2") and simple fractions
    ("1/2") are summed until a unit word is reached; the running sum is then
    converted to minutes according to that unit and the sum starts over.
    A unit word is any word starting with "hour" or "minute" (case-insensitive),
    so "hour", "hours," and "minute" are all recognised. Every other word
    ("plus", "optional", "soaking", ...) is ignored.

    Parameters
    ----------
    text : str
        Time description such as "1 1/2 hours" or "50 minutes".

    Returns
    -------
    int or float
        Total number of minutes. ``np.nan`` if ``text`` is not a string
        (e.g. a missing value); 0 if no number/unit pair could be found.
    """
    if not isinstance(text, str):
        return np.nan

    # Pass 1: turn every whitespace-separated piece into a number where possible
    tokens = []
    for word in text.split():
        parts = word.split('/')
        try:
            if len(parts) == 2:
                tokens.append(int(parts[0]) / int(parts[1]))
            else:
                tokens.append(int(word))
        except (ValueError, ZeroDivisionError):
            # Not numeric ("hours", "and/or", "1/0", ...): keep it as a word
            tokens.append(word)

    # Pass 2: numbers accumulate until a unit word says how to count them
    minutes = 0
    running_total = 0
    for token in tokens:
        if isinstance(token, (int, float)):
            running_total += token
            continue
        unit = token.lower()
        if unit.startswith('minute'):
            minutes += running_total
            running_total = 0
        elif unit.startswith('hour'):
            minutes += running_total * 60
            running_total = 0

    return minutes

In [108]:
# Sanity check: translate the cooking times of the first ten recipes
test_times = [
    simple_time_translater(df.iloc[n]['cooking_time'])
    for n in range(10)
]
test_times

[90.0, 60, 135.0, 165.0, 150.0, 50, 5, nan, 30, 75.0]

In [66]:
# Show the first ten rows so the raw cooking_time strings can be compared with test_times
df.head(10)

Unnamed: 0,recipe,cooking_time,item_list,item_rating,instructions,instructions_rating,url
0,Mushroom-Farro Soup With Parmesan Broth,1 1/2 hours,"['1 cup pearled farro', '1 ounce dried mixed m...",,['Heat oven to 300 degrees and bring a small s...,,https://cooking.nytimes.com/recipes/1020933-mu...
1,Easiest Lentil Soup,1 hour,"['6 tablespoons extra-virgin olive oil, plus m...",,['Heat 1/4 cup oil in a medium pot over medium...,,https://cooking.nytimes.com/recipes/1019943-ea...
2,Beans and Garlic Toast in Broth,"2 1/4 hours, plus optional soaking","['1 cup dried beans, such as cannellini or cra...",,"['If you remember, soak the beans in cold wate...",,https://cooking.nytimes.com/recipes/1019241-be...
3,Parmesan Broth,2 3/4 hours,"['3 tablespoons extra-virgin olive oil', '1 la...",,"['In a large Dutch oven or heavy pot, heat the...",,https://cooking.nytimes.com/recipes/1020934-pa...
4,Potato Gratin With Swiss Chard and Sumac Onions,2 1/2 hours,"['¼ cup/60 milliliters olive oil', '1 ½ pounds...",,['Heat oven to 375 degrees Fahrenheit/180 degr...,,https://cooking.nytimes.com/recipes/1020928-po...
5,Braised Fennel With White Bean Purée,50 minutes,"['2 (15-ounce) cans white beans, rinsed', '2 ½...",,"['Prepare the bean purée: Add the beans, 2 1/2...",,https://cooking.nytimes.com/recipes/1020935-br...
6,Cold-Fashioned,5 minutes,"[' Ice, as needed', '1 ¼ ounces Irish whiskey,...",,['Fill a mixing glass halfway with ice. Add wh...,,https://cooking.nytimes.com/recipes/1020905-co...
7,NoMad Espresso Martini,,"['1 ounce Mr. Black Cold Brew coffee liqueur',...",,['Combine ingredients in a cocktail shaker hal...,,https://cooking.nytimes.com/recipes/1020936-no...
8,Japanese-Style Tuna Noodle Salad,30 minutes,"['¼ cup cut dried wakame seaweed', '8 ounces d...",,['Bring a large pot of water to a boil over hi...,,https://cooking.nytimes.com/recipes/1020939-ja...
9,Toor Dal (Split Yellow Pigeon Peas),"1 1/4 hours, plus soaking","['1 cup toor dal (split yellow pigeon peas)', ...",,['Prepare the dal: Soak the pigeon peas in a l...,,https://cooking.nytimes.com/recipes/1020907-to...
