In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline

### Helper functions

In [2]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    """Load every recipe listed in 'all_files.txt' from the 'recipes/' folder.

    'all_files.txt' is read as a header-less CSV; its first column holds the
    file names, which are resolved relative to 'recipes/'. Each file is parsed
    as JSON and the parsed objects are returned in listing order.

    A running count is printed after every 10000 files as a progress indicator.

    Returns:
        list: one parsed JSON object per listed file.
    """
    folder = 'recipes/'
    files = pd.read_csv('all_files.txt', header=None)[0].values
    recipes = []
    for k, filename in enumerate(files, 1):
        # `with` closes each handle as soon as it is parsed; the previous
        # version left one open file object per recipe for the GC to reclaim.
        with open(folder + filename, 'r') as f:
            recipes.append(json.load(f))
        if k % 10000 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(k)
    return recipes

#Remove or replace some noisy symbols
def clean_string(s):
    """Normalise punctuation in a direction string.

    Steps, applied in this order:
      1. ';' becomes '.' so that it acts as a sentence separator.
      2. '..' is collapsed to '.' in ten successive passes.
      3. ')', '(', '!' and '-' are each replaced by a space.
      4. ',' is padded to ' , ' so it becomes its own token.
      5. One pass replaces double spaces by a single space (longer runs of
         spaces are therefore shortened but not fully collapsed).
      6. '. ' becomes '.'.

    Returns the transformed string.
    """
    s = s.replace(';', '.')

    remaining_passes = 10
    while remaining_passes > 0:
        s = s.replace('..', '.')
        remaining_passes -= 1

    for symbol in [')', '(', '!', '-']:
        s = s.replace(symbol, ' ')

    # Order matters: the comma padding may create double spaces that the
    # following substitution then shortens.
    for old, new in ((',', ' , '), ('  ', ' '), ('. ', '.')):
        s = s.replace(old, new)
    return s

#Raw direction text -> List of single directions
def get_clean_directions(recipe):
    """Split a recipe's directions into a list of lower-cased single steps.

    Every entry of recipe['directions'] gets a '.' appended, the pieces are
    concatenated, passed through clean_string(), lower-cased and split on '.'.
    Because of the appended '.', the result contains empty strings (at least
    the last element); callers are expected to skip them.
    """
    joined = ''.join(step + '.' for step in recipe['directions'])
    return clean_string(joined).lower().split('.')

### Highlighting recipes

In [49]:
def bright(l,idx1,idx2,idx3):
    """Join the words of `l` with spaces, colouring the flagged ones.

    For the word at position i the first truthy flag wins:
    idx1[i] -> yellow(), else idx2[i] -> blue(), else idx3[i] -> purple().
    Words with no flag set are left unchanged.

    Returns the space-joined string.
    """
    painted = []
    for pos, word in enumerate(l):
        if idx1[pos]:
            word = yellow(word)
        elif idx2[pos]:
            word = blue(word)
        elif idx3[pos]:
            word = purple(word)
        painted.append(word)
    return ' '.join(painted)

def purple(string):
    """Wrap `string` in the ANSI sequence '1;45' and a trailing reset."""
    start = '\x1b[1;45m'
    reset = '\x1b[0m'
    return start + string + reset

def yellow(string):
    """Wrap `string` in the ANSI sequence '1;43' and a trailing reset."""
    start = '\x1b[1;43m'
    reset = '\x1b[0m'
    return start + string + reset

def blue(string):
    """Wrap `string` in the ANSI sequence '1;46' and a trailing reset."""
    start = '\x1b[1;46m'
    reset = '\x1b[0m'
    return start + string + reset

def highlight_recipe(recipes,recipe_id):
    """Print one recipe's directions with colour-coded words and parsed steps.

    For every non-empty direction of recipes[recipe_id] this prints:
      * the direction with action words in yellow, ingredient words in blue
        and measure words in purple (see bright()),
      * the output of create_instructions() for that direction,
      * a separator line.

    Relies on the module-level `actions_set` for the action vocabulary.
    """
    recipe = recipes[recipe_id]
    dirs = get_clean_directions(recipe)
    ingr_words = list(set(ingr_words_func(recipe['ingr'])))
    for d in dirs:
        if not d:
            # Splitting on '.' leaves empty fragments; nothing to show for them.
            continue
        d_words = np.array(d.split(' '))
        # define_ingr_measure returns two flag lists; np.array stacks them so
        # they unpack as one row each.
        ingr_idx, measure_idx = np.array(define_ingr_measure(d_words, ingr_words))
        action_idx = np.array([(word in actions_set) for word in d_words]).astype(np.int32)
        print(bright(d_words, action_idx, ingr_idx, measure_idx))
        print(create_instructions(d))
        print('_____________________________________')
            

            
import re


### Ingredients stuff

In [41]:
#Cleaning and reading ingredients and measures

def read_measure_list(path):
    """Read a header-less CSV at `path` and return its first column as an array."""
    return pd.read_csv(path, header=None)[0].values

def remove_stopwords(text_list):
    """Return the words of `text_list` that are not English stop words.

    The comparison is case-insensitive (each word is lower-cased before the
    lookup); the returned words keep their original casing and order.
    """
    # A frozenset gives constant-time membership tests; the corpus reader
    # returns a list, which would be scanned linearly for every word.
    stop = frozenset(stopwords.words('english'))
    return [w for w in text_list if w.lower() not in stop]

def remove_digits(text_list):
    """Split tokens on '.' and '/' and drop the purely numeric pieces.

    Each token has '.' and '/' replaced by spaces and is split on whitespace,
    so '1/2' yields '1' and '2'. Pieces consisting only of digits are removed;
    everything else is returned as one flat list in the original order.
    """
    split_tokens = (re.sub('[./]', ' ', token).split() for token in text_list)
    flattened = itertools.chain.from_iterable(split_tokens)
    return [piece for piece in flattened if not piece.isdigit()]

def get_clean_text(text):
    """Strip or blank out punctuation in an ingredient line.

    '(', ')' and ',' are deleted; '-', '/' and ';' become spaces; finally one
    pass replaces double spaces by a single space.
    """
    substitutions = (
        ('(', ''),
        (')', ''),
        (',', ''),
        ('-', ' '),
        ('/', ' '),
        (';', ' '),
        ('  ', ' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def ingr_words_func(ingr_list):
    """Turn a list of ingredient lines into a flat list of content words.

    Each line is cleaned with get_clean_text(), lower-cased and split on
    whitespace. The combined tokens then go through remove_digits() and
    remove_stopwords(). Duplicates are not removed here.
    """
    tokens = []
    for line in ingr_list:
        tokens.extend(get_clean_text(line).lower().split())
    return remove_stopwords(remove_digits(tokens))

#Defining ingredients and measures
def define_ingr_measure(dirs_words, ingr_words):
    """Flag which direction words are ingredients and which are measures.

    A word of `dirs_words` is considered only if it equals some entry of
    `ingr_words`. Such a word is flagged as a measure when it is found in the
    module-level `measure_list`, otherwise as an ingredient.

    Returns:
        (if_ingr, if_measure): two lists of 0/1 flags, each as long as
        `dirs_words`.
    """
    n_words = len(dirs_words)
    if_ingr = [0] * n_words
    if_measure = [0] * n_words
    for pos, word in enumerate(dirs_words):
        if not any(word == candidate for candidate in ingr_words):
            continue
        if word in measure_list:
            if_measure[pos] = 1
        else:
            if_ingr[pos] = 1
    return if_ingr, if_measure

### Semantic role labeling part

In [42]:
from practnlptools.tools import Annotator
# Single module-level Annotator instance; create_instructions() reads this global.
annotator=Annotator()
def create_instructions(phrase,flag=1):
    """Extract action/object/target steps from a phrase via semantic role labelling.

    Every SRL frame returned by the module-level `annotator` becomes a dict:
      'action' -> the frame's 'V' entry (always present in the result),
      'object' -> the frame's 'A1' entry, when the frame has one,
      'target' -> the frame's 'A2' entry, when the frame has one.

    Args:
        phrase: the text of a single direction.
        flag: when truthy and no frame was found, the call is retried once
            with 'they ' prepended to the phrase. The retry passes flag=0, so
            there is at most one retry.
            NOTE(review): presumably the dummy subject helps the parser with
            imperative sentences - confirm.

    Returns:
        list of dict: one dict per SRL frame; empty if none was found even
        after the retry.
    """
    annotated_steps = []
    for frame in annotator.getAnnotations(phrase)['srl']:
        annotated_step = {'action': frame['V']}
        if 'A1' in frame:
            annotated_step['object'] = frame['A1']
        if 'A2' in frame:
            annotated_step['target'] = frame['A2']
        annotated_steps.append(annotated_step)
    # Logical `and`, not bitwise `&`: with `&` an even truthy flag such as 2
    # evaluated to 0 and silently disabled the retry.
    if not annotated_steps and flag:
        return create_instructions('they ' + phrase, 0)
    return annotated_steps

## Main

##### Loading data

In [5]:
# Load all recipes (timed with %time), the action-word table and the measure-word list.
%time recipes=read_data()
# Space-separated table; its `word` column is used further down to build actions_set.
actions=pd.read_csv('actions_dict_sorted.txt',sep=' ')
# Module-level global: define_ingr_measure() looks this name up directly.
measure_list=read_measure_list('measure_list.txt')

10000
20000
30000
40000
50000
60000
70000
Wall time: 6.54 s


#### Tagging results

In [10]:
# Words treated as actions by highlight_recipe(): the `word` column of the first 100 rows of `actions`.
# NOTE(review): 100 is an unexplained cutoff - presumably the top entries of a sorted list
# (the file is named 'actions_dict_sorted.txt'); confirm and consider a named constant.
actions_set=set(actions[:100].word.values)


In [50]:
# Demo: print colour-coded directions and the parsed steps for the recipe at index 3.
highlight_recipe(recipes,3)

[1;43mcombine[0m [1;46msugar[0m and [1;46moil[0m
[{'action': 'combine', 'object': 'sugar and oil'}]
_____________________________________
[1;43mbeat[0m well
[{'action': 'beat', 'object': 'well'}]
_____________________________________
[1;43madd[0m [1;46meggs[0m and [1;43mbeat[0m
[{'action': 'add', 'object': 'eggs'}]
_____________________________________
[1;43mcombine[0m [1;43mflour[0m , [1;46mbaking[0m [1;46msoda[0m , [1;43msalt[0m , [1;46mcinnamon[0m and [1;46mnutmeg[0m
[{'action': 'combine', 'object': 'flour'}, {'action': 'baking', 'object': 'soda , salt , cinnamon and nutmeg'}]
_____________________________________
[1;43mstir[0m [1;43mflour[0m mixture into egg mixture alternately with [1;46mwater[0m
[{'action': 'stir', 'object': 'flour mixture', 'target': 'into egg mixture alternately with water'}]
_____________________________________
[1;43mstir[0m in [1;46msweet[0m [1;46mpotatoes[0m and [1;46mchopped[0m nuts
[{'action': 'stir', 'target': 'i

### First attempt to build flow of commands

In [40]:
# Print every non-empty direction of one recipe followed by its parsed steps.
recipe_id=2
dirs=get_clean_directions(recipes[recipe_id])
# Not used by the loop below; kept because it is assigned at module level.
ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
for d in dirs:
    if not d:
        # Splitting on '.' leaves empty fragments; skip them.
        continue
    print(d)
    print(create_instructions(d))
    print('____________________________________________')

melt margarine in hot water
[{'action': 'melt', 'object': 'margarine'}]
____________________________________________
add sugar and salt and stir
[{'action': 'add', 'object': 'sugar and salt and stir'}]
____________________________________________
add cold water and yeast
[{'action': 'add', 'object': 'cold water and yeast'}]
____________________________________________
stir to dissolve yeast
[{'action': 'dissolve', 'object': 'yeast'}]
____________________________________________
add 3 cups flour and mix
[{'action': 'add', 'object': '3 cups flour and mix'}]
____________________________________________
add eggs and 2 1/2  3 cups more flour
[{'action': 'add', 'object': 'eggs and 2 1/2 3 cups'}]
____________________________________________
mix , cover and let rise until dough doubles in size
[{'action': 'let', 'object': 'rise'}]
____________________________________________
punch down and let rise 30 more minutes or until doubles
[{'action': 'punch down'}, {'action': 'let', 'object': 'rise 3