In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline

### Helper functions

In [2]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data(folder='recipes/', file_list='all_files.txt'):
    """Load every recipe JSON file named in ``file_list``.

    Parameters
    ----------
    folder : str
        Prefix that is concatenated directly with each file name, so it
        must end with a path separator. Defaults to ``'recipes/'``.
    file_list : str
        Path of a header-less CSV whose first column holds the recipe
        file names. Defaults to ``'all_files.txt'``.

    Returns
    -------
    list
        One parsed JSON object per listed file, in listing order.
    """
    files=pd.read_csv(file_list,header=None)[0].values
    recipes=[]
    for k,filename in enumerate(files,1):
        # `with` closes the handle right after parsing; the previous version
        # left one open file behind for every recipe it read.
        with open(folder+filename,'r') as f:
            recipes.append(json.load(f))
        # progress report every 10000 files
        if k%10000==0:
            print(k)
    return recipes

#Removing and replacing some noisy symbols
def clean_string(s):
    """Normalise punctuation and spacing in a block of direction text.

    Steps, in order:
      1. ``;`` becomes ``.``.
      2. Every run of dots is collapsed to a single dot.
      3. ``)``, ``(``, ``!`` and ``-`` are replaced by a space.
      4. Commas are padded with spaces so they split off as their own token.
      5. Every run of spaces is collapsed to a single space.
      6. The space following a dot is removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    s=s.replace(';','.')
    # Collapse dot runs of any length (previously a fixed number of passes).
    s=re.sub(r'\.{2,}','.',s)
    for bs in (')','(','!','-'):
        s=s.replace(bs,' ')
    s=s.replace(',',' , ')
    # A single replace('  ',' ') only halves a run of spaces, leaving doubles
    # behind (and therefore empty tokens after split(' ')); collapse fully.
    s=re.sub(' {2,}',' ',s)
    s=s.replace('. ','.')
    return s

#Raw direction text -> List of single directions
def get_clean_directions(recipe):
    """Turn a recipe's ``'directions'`` entries into a list of lower-cased sentences.

    Each entry gets a trailing ``.``, the pieces are concatenated, passed
    through ``clean_string`` and lower-cased, and the result is split on ``.``.
    The returned list can contain empty strings.
    """
    joined=''.join(step+'.' for step in recipe['directions'])
    return clean_string(joined).lower().split('.')

### Highlighting recipes

In [145]:
def bright(l,idx1,idx2,idx3):
    """Colour the words of ``l`` according to three per-word flag sequences.

    For each position the first truthy flag wins, checked in this order:
    ``idx2`` -> ``blue``, ``idx1`` -> ``yellow``, ``idx3`` -> ``purple``.
    Words with no flag set are left as they are.

    Returns the words joined by single spaces.
    """
    painted=[]
    for pos,word in enumerate(l):
        if idx2[pos]:
            word=blue(word)
        elif idx1[pos]:
            word=yellow(word)
        elif idx3[pos]:
            word=purple(word)
        painted.append(word)
    return ' '.join(painted)

def purple(string):
    """Return ``string`` wrapped in the ANSI style code ``1;45`` followed by a reset."""
    start,reset='\x1b[1;45m','\x1b[0m'
    return start+string+reset

def yellow(string):
    """Return ``string`` wrapped in the ANSI style code ``1;43`` followed by a reset."""
    start,reset='\x1b[1;43m','\x1b[0m'
    return start+string+reset

def blue(string):
    """Return ``string`` wrapped in the ANSI style code ``1;46`` followed by a reset."""
    start,reset='\x1b[1;46m','\x1b[0m'
    return start+string+reset


            

            
import re


### Ingredients stuff

In [4]:
#cleaning and reading ingredients and measures

def read_measure_list(path):
    """Read a header-less CSV at ``path`` and return the values of its first column as an array."""
    return pd.read_csv(path,header=None)[0].values

def remove_stopwords(text_list):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    text_list : iterable of str
        Tokens to filter; the comparison is done on the lower-cased token.

    Returns
    -------
    list of str
        The tokens (original casing) that are not stop words, in input order.
    """
    # A set gives constant-time membership tests; the word list was
    # previously scanned from the start for every token.
    stop = set(stopwords.words('english'))
    return [w for w in text_list if w.lower() not in stop]

def remove_digits(text_list):
    """Split tokens on ``.`` and ``/`` and drop the pieces made only of digits.

    Every token is broken at dots and slashes (so ``'1/2'`` becomes ``'1'``
    and ``'2'``); the resulting pieces are flattened into one list and the
    purely numeric ones are discarded.
    """
    pieces=(part for token in text_list for part in re.sub('[./]', ' ', token).split())
    return [part for part in pieces if not part.isdigit()]

def get_clean_text(text):
    """Strip punctuation from an ingredient line.

    Parentheses and commas are deleted; ``-``, ``/`` and ``;`` become a
    space; finally double spaces are replaced by single ones (one pass).
    """
    substitutions=[('(',''),(')',''),(',',''),('-',' '),('/',' '),(';',' '),('  ',' ')]
    for old,new in substitutions:
        text=text.replace(old,new)
    return text

def ingr_words_func(ingr_list):
    """Collect the content words of a recipe's ingredient lines.

    Each line is cleaned with ``get_clean_text``, lower-cased and split on
    whitespace; the tokens of all lines are pooled, then filtered through
    ``remove_digits`` and ``remove_stopwords`` (in that order).
    """
    tokens=[]
    for line in ingr_list:
        tokens.extend(get_clean_text(line).lower().split())
    return remove_stopwords(remove_digits(tokens))

#defining ingredients and measures
def define_ingr_measure(dirs_words, ingr_words, measures=None):
    """Flag which direction words are ingredients and which are measures.

    A direction word is considered only if it also occurs in ``ingr_words``.
    Such a word is flagged as a measure when it is in the measure
    vocabulary, and as an ingredient otherwise.

    Parameters
    ----------
    dirs_words : sequence of str
        Words of one direction sentence.
    ingr_words : iterable of str
        Words taken from the recipe's ingredient list.
    measures : iterable of str, optional
        Measure vocabulary. When omitted, the notebook-level global
        ``measure_list`` is used, which keeps two-argument calls working.

    Returns
    -------
    (list of int, list of int)
        ``if_ingr`` and ``if_measure``, each with one 0/1 flag per
        direction word.
    """
    measure_set=set(measure_list if measures is None else measures)
    ingr_set=set(ingr_words)
    if_ingr=[0]*len(dirs_words)
    if_measure=[0]*len(dirs_words)
    # Set membership replaces the nested scan over ingr_words (and the
    # repeated scan of the measure list on every match).
    for i,dirs_word in enumerate(dirs_words):
        if dirs_word in ingr_set:
            if dirs_word in measure_set:
                if_measure[i]=1
            else:
                if_ingr[i]=1
    return if_ingr,if_measure

### Semantic role labeling part

In [5]:
from practnlptools.tools import Annotator
# Shared Annotator instance; create_instructions() reads it as a global.
annotator=Annotator()
def create_instructions(phrase,flag=1):
    """Turn the semantic-role frames of ``phrase`` into action steps.

    Uses the global ``annotator``: each frame in
    ``annotator.getAnnotations(phrase)['srl']`` becomes a dict with key
    ``'action'`` (role ``V``) plus ``'object'`` (role ``A1``) and/or
    ``'target'`` (role ``A2``) when the frame has them.

    Parameters
    ----------
    phrase : str
        Sentence to annotate.
    flag : truthy/falsy, optional
        When truthy and no frame was found, the phrase is annotated once
        more with ``'they '`` prepended (that retry is made with flag 0,
        so it never recurses further).

    Returns
    -------
    list of dict
        One step per frame; empty if nothing was found.
    """
    annotated = annotator.getAnnotations(phrase)['srl']
    annotated_steps = []
    for frame in annotated:
        annotated_step = {'action': frame['V']}
        if 'A1' in frame:
            annotated_step['object'] = frame['A1']
        if 'A2' in frame:
            annotated_step['target'] = frame['A2']
        annotated_steps.append(annotated_step)
    # Logical `and`, not bitwise `&`: `True & 2` evaluates to 0, so a truthy
    # flag other than an odd integer silently disabled the retry.
    if not annotated_steps and flag:
        return create_instructions('they '+phrase,0)
    return annotated_steps

## Main

##### Loading data

In [134]:
#%time recipes=read_data()
# NOTE(review): with the line above commented out this cell does not define
# `recipes`, yet later cells index recipes[recipe_id]; on a fresh kernel
# read_data() has to be run first.
# Space-separated action table; a later cell reads its `word` column.
actions=pd.read_csv('action_dict_wordnet.txt',sep=' ')
# Measure vocabulary; define_ingr_measure() reads this name as a global.
measure_list=read_measure_list('measure_list.txt')

In [7]:
def highlight_recipe(recipes,recipe_id):
    """Print every non-empty direction of one recipe, colour-coded, with its parsed steps.

    For each direction sentence the words found in the notebook-level
    ``actions_set``, the ingredient words and the measure words are
    highlighted via ``bright``; the numbered coloured sentence, the output
    of ``create_instructions`` and a separator line are printed.
    Nothing is returned.
    """
    recipe=recipes[recipe_id]
    dirs=get_clean_directions(recipe)
    ingr_words=list(set(ingr_words_func(recipe['ingr'])))
    # Numbering counts only the non-empty sentences.
    non_empty=(d for d in dirs if len(d)>0)
    for r,d in enumerate(non_empty):
        d_words=np.array(d.split(' '))
        ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
        action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
        colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)
        print('%s %s' % (r,colored_string))
        print(create_instructions(d))
        print('_____________________________________')

#### Tagging results

In [142]:
# Action vocabulary: the `word` column of the first 200 rows of `actions`.
actions_set=set(actions[:200].word.values)

### First attempt to build flow of commands

# 2. Simple predictor

## Helper functions

In [201]:
def obj_elements_list(strr):
    """Split an object phrase into its individual items.

    The phrase is split on ``' , '`` and then on ``' and '``. Within each
    resulting piece everything from the first ``or`` onwards is dropped
    (only the first alternative is kept), stop words are removed with
    ``remove_stopwords`` and the remaining words are re-joined.

    Parameters
    ----------
    strr : str
        Object phrase, e.g. ``'salt , pepper and sugar'``.

    Returns
    -------
    list of str
        One entry per item (an entry may be the empty string), or
        ``['0']`` when ``strr`` is not a string-like object.
    """
    try:
        ingr_in_one_dir_list=strr.split(' , ')
    except AttributeError:
        # Non-string input: keep returning the '0' placeholder, but no longer
        # hide unrelated errors behind a bare except.
        return ['0']
    output_list=[]
    for ingr in ingr_in_one_dir_list: #for all objects if it's list of ingrs
        for ingr_without_and in ingr.split(' and '): #dealing with 'and'
            ingr_temp=ingr_without_and.split(' ')
            #dealing with 'or': cut at the first one. Looking up 'or' again
            #after truncating raised ValueError for a second 'or', which
            #turned the whole result into ['0'].
            if 'or' in ingr_temp:
                ingr_temp=ingr_temp[:ingr_temp.index('or')]
            ingr_temp=remove_stopwords(ingr_temp)
            output_list.append(' '.join(ingr_temp))
    return output_list

def add_action_line(df,ls):
    """Return a copy of ``df`` with one action row appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend; it is not modified.
    ls : sequence
        Seven values, in the order ``line_id, action, object, target, no,
        nt, description``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the new row, re-indexed from 0.
    """
    cur_flow=pd.Series(ls,index=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description'])
    # DataFrame.append was removed in pandas 2.0; concat with a one-row
    # frame does the same job.
    return pd.concat([df,cur_flow.to_frame().T],ignore_index=True)


def create_instructions(phrase,flag=1):
    """Build one step dict per semantic-role frame of ``'they ' + phrase``.

    The global ``annotator`` is always called on the phrase prefixed with
    ``'they '``. Every returned dict has the keys ``'object'``,
    ``'target'`` and ``'action'``:

    * ``'action'`` is the frame's ``V`` role;
    * ``'object'`` is the ``A1`` role, or ``''`` if absent;
    * ``'target'`` is the ``AM-LOC`` role if present, else the ``A2``
      role, else ``''``.

    ``flag`` is accepted but not used.
    """
    annotated = annotator.getAnnotations('they '+phrase)['srl']
    annotated_steps = []
    for frame in annotated:
        verb = frame['V']
        roles = frame.keys()
        target = ''
        if 'A2' in roles:
            target = frame['A2']
        # AM-LOC is checked after A2, so it wins when both are present
        if 'AM-LOC' in roles:
            target = frame['AM-LOC']
        obj = frame['A1'] if 'A1' in roles else ''
        annotated_steps.append({'object': obj, 'target': target, 'action': verb})
    return annotated_steps


### Dev

In [106]:
# Scratch check of obj_elements_list on one annotated sentence.
dd='place the chicken in a medium bowl , and coat with the marinade'
aaa = annotator.getAnnotations(dd)['srl']
# NOTE(review): `aaa` is the whole 'srl' value, while obj_elements_list calls
# .split on its argument; confirm whether a single role string was meant to be passed.
obj_elements_list(aaa)
##prediction=add_action_line(prediction,['0','combine','cheese','body','0','0','0'])

['chicken']

In [99]:
# Sample direction containing an 'or' alternative and a long ' , ' / ' and ' separated ingredient list.
dd='in a food processor or blender , combine the green onions , onion , jalapeno pepper , soy sauce , vinegar , vegetable oil , brown sugar , thyme , cloves , nutmeg and allspice'

In [212]:
# Build the predicted action table for one recipe: one row per
# (direction line, semantic-role frame, object item).
cols=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description']

recipe_id=6666
# NOTE(review): `recipes` is not assigned by any uncommented statement in the
# visible cells; it has to be loaded (e.g. recipes=read_data()) before this runs.
dirs=get_clean_directions(recipes[recipe_id])
ingr_words=ingr_words_func(recipes[recipe_id]['ingr'])

# Rows are collected in a list and converted to a DataFrame once at the end;
# appending to the DataFrame inside the loop copied the whole table per row.
rows=[]
cur_line_id=0
for d in dirs:
    if len(d)==0:
        continue
    d_words=np.array(d.split(' '))
    ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
    action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
    colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)
    print('%s %s' % (cur_line_id,colored_string))

    for cur_srl in create_instructions(d):
        # an empty target falls back to the 'body' placeholder
        target=cur_srl['target'] if len(cur_srl['target'])>0 else 'body'
        # loop variable renamed from `object`, which shadowed the builtin
        for obj in obj_elements_list(cur_srl['object']):
            rows.append([str(cur_line_id),cur_srl['action'],obj if len(obj) else '0',target,'0','0','0'])
    print('_____________________________________')
    cur_line_id+=1

# `r` always mirrored cur_line_id; keep the name defined in case other cells read it
r=cur_line_id
prediction=pd.DataFrame(rows,columns=cols)

0 [1;43mplace[0m the [1;46molive[0m [1;46moil[0m in a [1;46mlarge[0m saucepan and [1;43mheat[0m over medium high [1;43mheat[0m until hot
_____________________________________
1 [1;43madd[0m [1;46mchicken[0m
_____________________________________
2 [1;43mcook[0m and [1;43mstir[0m about 5 minutes or until [1;46mchicken[0m is browned
_____________________________________
3 [1;43madd[0m [1;46monion[0m
_____________________________________
4 [1;43mcook[0m and [1;43mstir[0m 2 minutes
_____________________________________
5 [1;43madd[0m [1;46mchicken[0m [1;46mbroth[0m , undrained [1;46mtomatoes[0m , [1;46mnorthern[0m [1;46mbeans[0m , [1;46mcarrots[0m , [1;46mpotato[0m , [1;46msalt[0m and [1;46mpepper[0m
_____________________________________
6 [1;43mbring[0m to a [1;43mboil[0m , stirring to [1;43mbreak[0m up [1;46mtomatoes[0m
_____________________________________
7 [1;43mreduce[0m [1;43mheat[0m to [1;43mlow[0m
_____________________

In [213]:
# Display the predicted action table.
prediction

Unnamed: 0,line_id,action,object,target,no,nt,description
0,0,place,olive oil,in a large saucepan and heat over medium high ...,0,0,0
1,1,add,chicken,body,0,0,0
2,2,cook,0,body,0,0,0
3,2,stir,5 minutes,body,0,0,0
4,3,add,onion,body,0,0,0
5,4,cook,0,body,0,0,0
6,4,stir,2 minutes,body,0,0,0
7,5,add,chicken broth,body,0,0,0
8,5,add,undrained tomatoes,body,0,0,0
9,5,add,northern beans,body,0,0,0


In [211]:
# Load one recipe table from the 'labeled recipes' folder for comparison with `prediction`.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
fold='C:/Users/User/Dropbox (MIT)/NLP Final project/labeled recipes/'
#fold='C:/Users/User/Desktop/labeled recipes/'
# Every path directly under `fold`; `files` is not used again in the visible cells.
files=glob.glob(fold+'*')
#print files[i]
df=pd.read_csv(fold+'l13344.txt')
df

Unnamed: 0,line_id,action,object,target,no,nt,description
0,0,place,olive oil,sausepan,1 tablespoon,0,0
1,0,heat,olive oil,0,1 tablespoon,0,over medium high heat until hot
2,1,add,chicken,body,1/2 pound,0,0
3,1,cook,chicken,0,0,0,5 minutes or until chicken is browned
4,1,stir,chicken,0,0,0,5 minutes or until chicken is browned
5,2,add,onion,body,1,0,0
6,3,cook,body,body,0,0,2 minutes
7,3,stir,body,body,0,0,2 minutes
8,4,add,chicken broth,body,3 (14.5 ounce) can,0,0
9,4,add,undrained tomatoes,body,1 (14.5 ounce) can,0,0


In [216]:
# Inspect the raw semantic-role frames for a sentence that already carries the 'they ' prefix.
dd='they place the olive oil in a large saucepan and heat over medium high heat until hot'
aaa = annotator.getAnnotations(dd)
aaa['srl']

[{'A0': 'they',
  'A1': 'the olive oil',
  'A2': 'in a large saucepan and heat over medium high heat until hot',
  'V': 'place'}]