In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk
import itertools
from nltk.corpus import stopwords
%matplotlib inline

### Helper functions

In [3]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data(folder='recipes/', files_list='all_files.txt', report_every=10000):
    """Load every recipe JSON file named in an index file.

    Parameters
    ----------
    folder : str
        Directory that holds the recipe JSON files.
    files_list : str
        Header-less CSV whose first column lists one file name per row.
    report_every : int
        Print the running count each time this many files have been loaded.

    Returns
    -------
    list
        The parsed JSON documents, in the order given by `files_list`.
    """
    filenames = pd.read_csv(files_list, header=None)[0].values
    recipes = []
    for count, filename in enumerate(filenames, 1):
        # Close each handle right away: with tens of thousands of files,
        # leaving them to the garbage collector can exhaust file descriptors.
        with open(os.path.join(folder, filename), 'r') as f:
            recipes.append(json.load(f))
        if count % report_every == 0:
            print(count)
    return recipes

#Removing and replacing some noisy symbols
def clean_string(s):
    """Normalise punctuation in a block of direction text.

    Semicolons become full stops, runs of full stops are collapsed,
    brackets / '!' / '-' become spaces, commas are padded with spaces,
    double spaces are squeezed once, and the space after a full stop is dropped.
    """
    # Treat every separator symbol as a sentence end.
    for separator in (';',):
        s = s.replace(separator, '.')
    # Ten halving passes collapse any realistic run of dots into one.
    for _ in range(10):
        s = s.replace('..', '.')
    # Noise characters are blanked out rather than removed.
    for noise in (')', '(', '!', '-'):
        s = s.replace(noise, ' ')
    padded = s.replace(',', ' , ')
    squeezed = padded.replace('  ', ' ')
    return squeezed.replace('. ', '.')

#Raw direction text -> List of single directions
def get_clean_directions(recipe):
    """Return the recipe's directions as a list of lower-cased single steps.

    Every entry of recipe['directions'] is terminated with '.', the result is
    passed through clean_string, lower-cased and split on '.'. The returned
    list can contain empty strings.
    """
    joined = ''.join(step + '.' for step in recipe['directions'])
    return clean_string(joined).lower().split('.')

### Highlighting recipes

In [79]:
def bright(l,idx1,idx2=None,idx3=None):
    """Join words into one string, colouring the flagged ones.

    Parameters
    ----------
    l : sequence of str
        The words of one direction.
    idx1, idx2, idx3 : sequences of truthy/falsy flags, one per word
        Words flagged in idx1 are wrapped with yellow(), else idx2 with
        blue(), else idx3 with purple(). idx2 and idx3 may be omitted,
        in which case no word falls into that category.

    Returns
    -------
    str
        The words joined by single spaces.
    """
    n = len(l)
    # Optional masks let two-argument callers (highlight_recipe) keep working.
    if idx2 is None:
        idx2 = [False] * n
    if idx3 is None:
        idx3 = [False] * n
    painted = []
    for i, word in enumerate(l):
        # Priority order: first mask wins when a word is flagged in several.
        if idx1[i]:
            painted.append(yellow(word))
        elif idx2[i]:
            painted.append(blue(word))
        elif idx3[i]:
            painted.append(purple(word))
        else:
            painted.append(word)
    return ' '.join(painted)

def purple(string):
    """Wrap `string` in the ANSI sequence 1;45 (bold, magenta background) followed by a reset."""
    start, reset = '\x1b[1;45m', '\x1b[0m'
    return start + string + reset

def yellow(string):
    """Wrap `string` in the ANSI sequence 1;43 (bold, yellow background) followed by a reset."""
    start, reset = '\x1b[1;43m', '\x1b[0m'
    return start + string + reset

def blue(string):
    """Wrap `string` in the ANSI sequence 1;46 (bold, cyan background) followed by a reset."""
    start, reset = '\x1b[1;46m', '\x1b[0m'
    return start + string + reset

def highlight_recipe(recipes,recipe_id):
    """Print each direction of one recipe with its action words highlighted.

    Parameters
    ----------
    recipes : sequence of dict
        Recipes in the form accepted by get_clean_directions.
    recipe_id : int
        Position of the recipe inside `recipes`.

    Relies on the notebook-level global `actions_set` (the set of action words).
    """
    for d in get_clean_directions(recipes[recipe_id]):
        if len(d)>0:
            d_words=np.array(d.split(' '))
            action_idx=np.array([(word in actions_set) for word in d_words])
            # bright() expects three masks; only actions are highlighted here,
            # so the other two categories are passed as all-False.
            no_match=np.zeros(len(d_words),dtype=bool)
            print(bright(d_words,action_idx,no_match,no_match))

### Main

##### Loading data

In [12]:
# Load all recipes into memory; the %time magic reports how long the load took.
%time recipes=read_data()
# Space-separated table of actions; a later cell reads its `word` column.
actions=pd.read_csv('actions_dict_sorted.txt',sep=' ')

10000
20000
30000
40000
50000
60000
70000
Wall time: 7.05 s


#### Printing results

In [13]:
# Words from the first 100 rows of `actions`; highlight_recipe treats these as action words.
# NOTE(review): the file name suggests the table is sorted (presumably by frequency) - confirm.
actions_set=set(actions[:100].word.values)

In [14]:
# Print the directions of the first recipe with its action words highlighted.
highlight_recipe(recipes,0)

[43mcombine[0m parmesan cheese , pepper and garlic powder
unfold pastry sheets onto cutting board
[43mbrush[0m lightly with egg white
[43msprinkle[0m each sheet with 1/4 of the cheese mixture
lightly [43mpress[0m into pastry , [43mturn[0m over
[43mrepeat[0m
[43mcut[0m each sheet into 12 1 inch strips
twist
[43mplace[0m on ungreased cookie sheet and [43mbake[0m in 350 degrees f 175 degrees c oven for 15 minutes or until golden [43mbrown[0m


In [89]:
#Cleaning and reading ingredients and measures

def read_measure_list(path):
    """Read a header-less CSV and return the values of its first column as an array."""
    return pd.read_csv(path, header=None)[0].values

def remove_stopwords(text_list):
    """Return the words of `text_list` that are not English stopwords.

    The comparison is case-insensitive (each word is lower-cased before the
    lookup); the surviving words keep their original casing and order.
    """
    # Build the lookup once as a set: the original tested every word against
    # a list, which rescans the whole stopword list per token.
    stop = set(stopwords.words('english'))
    return [w for w in text_list if w.lower() not in stop]

def remove_digits(text_list):
    """Return the tokens of `text_list` that are not made up purely of digits."""
    kept = []
    for token in text_list:
        if token.isdigit():
            # Pure numbers (quantities such as '2' or '350') are dropped.
            continue
        kept.append(token)
    return kept

def get_clean_text(text):
    """Strip brackets and commas, turn '-', '/' and ';' into spaces, then squeeze double spaces once."""
    # Characters that are removed outright.
    for dropped in ('(', ')', ','):
        text = text.replace(dropped, '')
    # Characters that act as word separators.
    for spaced in ('-', '/', ';'):
        text = text.replace(spaced, ' ')
    return text.replace('  ', ' ')

def ingr_words_func(ingr_list):
    """Flatten a list of ingredient lines into their content words.

    Each line is cleaned with get_clean_text, lower-cased and split on
    whitespace; pure-digit tokens and English stopwords are then removed.
    Duplicates are kept, in order of appearance.
    """
    tokens = [word
              for line in ingr_list
              for word in get_clean_text(line).lower().split()]
    without_numbers = remove_digits(tokens)
    return remove_stopwords(without_numbers)

In [70]:
#Defining ingredients and measures
def define_ingr_measure(dirs_words, ingr_words, measures=None):
    """Flag which words of a direction are ingredients and which are measures.

    Parameters
    ----------
    dirs_words : sequence of str
        The words of one direction.
    ingr_words : iterable of str
        Words taken from the recipe's ingredient list.
    measures : iterable of str, optional
        Known measure words. When omitted, the notebook-level global
        `measure_list` is used, as in the original implementation.

    Returns
    -------
    (if_ingr, if_measure) : tuple of two lists of 0/1
        Both have the length of `dirs_words`. A word that occurs in
        `ingr_words` is flagged as a measure if it is a known measure word,
        and as an ingredient otherwise.
    """
    if measures is None:
        # Fallback keeps existing two-argument callers working.
        measures = measure_list
    # Set lookups replace the nested scan over ingr_words for every word.
    ingr_lookup = set(ingr_words)
    measure_lookup = set(measures)
    if_ingr=[0]*len(dirs_words)
    if_measure=[0]*len(dirs_words)
    for i,dirs_word in enumerate(dirs_words):
        if dirs_word in ingr_lookup:
            if dirs_word in measure_lookup:
                if_measure[i]=1
            else:
                if_ingr[i]=1
    return if_ingr,if_measure

In [94]:
# Colour one recipe's directions: actions in yellow, ingredients in blue, measures in purple.
recipe_id=5
dirs=get_clean_directions(recipes[recipe_id])
# De-duplicated content words from this recipe's ingredient list.
ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
for d in dirs:
    # Splitting on '.' leaves empty strings behind; skip them.
    if len(d)>0:
        d_words=np.array(d.split(' '))
        # define_ingr_measure returns two equal-length 0/1 lists; np.array stacks
        # them into two rows, which are unpacked here.
        # NOTE(review): define_ingr_measure reads a global `measure_list` that no
        # visible cell defines - confirm it is created before this cell runs.
        ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
        action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
        # bright() gives priority to the first mask, so a word that is both an
        # action and an ingredient is shown as an action.
        colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)

        print colored_string

[1;43mstir[0m [1;46mbutter[0m and 1 [1;45mteaspoon[0m [1;46msugar[0m into the [1;46mhot[0m [1;46mmilk[0m until [1;46mbutter[0m is melted
when mixture is lukewarm , [1;43mstir[0m in [1;46myeast[0m and [1;43mset[0m aside for 5 minutes
when mixture is creamy , [1;43mtransfer[0m to a large mixing bowl
[1;43mmix[0m in 2 [1;45mcups[0m of [1;46mbread[0m [1;43mflour[0m
[1;43madd[0m 1/2 [1;45mcup[0m [1;46msugar[0m , [1;46meggs[0m , [1;46morange[0m [1;46mjuice[0m , [1;46morange[0m [1;46mzest[0m , and [1;43msalt[0m and [1;43mbeat[0m until combined
[1;43madd[0m remaining [1;43mflour[0m , mixing well after each addition , until it pulls away from the sides of the bowl
knead for about 10 minutes
[1;43mtransfer[0m dough to a greased bowl , [1;43mcover[0m with plastic [1;43mwrap[0m , and [1;43mlet[0m rise until doubled , about 1 hour
if time permits , you can punch dough down , [1;43mcover[0m it , and [1;43mlet[0m it rise again
[1;43mt