In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline

### Helper functions

In [2]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data(folder='recipes/', index_file='all_files.txt'):
    """Load every recipe listed in the index file into a list of dictionaries.

    Parameters
    ----------
    folder : str
        Directory that holds the recipe JSON files (default 'recipes/').
    index_file : str
        Headerless CSV whose first column lists the recipe file names
        (default 'all_files.txt').

    Returns
    -------
    list
        One parsed JSON object per listed file, in the order of the index.
    """
    files = pd.read_csv(index_file, header=None)[0].values
    recipes = []
    for k, filename in enumerate(files, 1):
        # `with` closes each handle right away; the previous version left
        # one open file object behind per recipe.
        with open(os.path.join(folder, filename), 'r') as f:
            recipes.append(json.load(f))
        if k % 10000 == 0:
            # progress marker every 10k files
            print(k)
    return recipes

#Removing and replacing some noisy symbols
def clean_string(s):
    """Normalise punctuation in a raw direction string.

    Semicolons become periods, runs of periods are collapsed, brackets,
    '!' and '-' become spaces, commas are padded with spaces, and then a
    single pass each removes double spaces and the space after a period.
    """
    # Semicolons end a sentence just like periods do.
    for separator in (';',):
        s = s.replace(separator, '.')
    # Ten halving passes over '..' collapse long runs of periods.
    passes_left = 10
    while passes_left:
        s = s.replace('..', '.')
        passes_left -= 1
    # Brackets, exclamation marks and dashes carry no meaning here.
    for noisy in ')(!-':
        s = s.replace(noisy, ' ')
    # Order matters: pad commas first, then tidy the spacing once.
    for old, new in ((',', ' , '), ('  ', ' '), ('. ', '.')):
        s = s.replace(old, new)
    return s

#Raw direction text -> List of single directions
def get_clean_directions(recipe):
    """Turn a recipe's raw direction entries into a list of single directions.

    Every entry of recipe['directions'] is terminated with a period, the
    joined text is cleaned with clean_string and lower-cased, and the
    result is split on periods (so empty strings may appear in the list).
    """
    joined = ''.join(step + '.' for step in recipe['directions'])
    return clean_string(joined).lower().split('.')

### Highlighting recipes

In [145]:
def bright(l,idx1,idx2,idx3):
    """Join the words of `l` with spaces, colouring the flagged ones.

    For each position the first truthy flag wins, checked in this order:
    idx2 -> blue, idx1 -> yellow, idx3 -> purple. Unflagged words are
    left untouched. highlight_recipe passes the action, ingredient and
    measure flags as idx1, idx2 and idx3 respectively.
    """
    pieces = []
    for pos, token in enumerate(l):
        if idx2[pos]:
            painter = blue
        elif idx1[pos]:
            painter = yellow
        elif idx3[pos]:
            painter = purple
        else:
            painter = None
        pieces.append(token if painter is None else painter(token))
    return ' '.join(pieces)

def purple(string):
    """Wrap `string` in the ANSI sequence '1;45' (bold, magenta background) followed by a reset."""
    return ''.join(('\x1b[1;45m', string, '\x1b[0m'))

def yellow(string):
    """Wrap `string` in the ANSI sequence '1;43' (bold, yellow background) followed by a reset."""
    return ''.join(('\x1b[1;43m', string, '\x1b[0m'))

def blue(string):
    """Wrap `string` in the ANSI sequence '1;46' (bold, cyan background) followed by a reset."""
    return ''.join(('\x1b[1;46m', string, '\x1b[0m'))


            

            
import re


### Ingredients stuff

In [4]:
#cleaning and reading ingredients and measures

def read_measure_list(path):
    """Return the first column of the headerless CSV at `path` as an array of values."""
    return pd.read_csv(path, header=None)[0].values

def remove_stopwords(text_list):
    """Return the words of `text_list` that are not English stop words.

    The comparison is case-insensitive (each word is lower-cased before
    the lookup); the surviving words keep their original casing and order.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() was previously scanned once per input word.
    stop = set(stopwords.words('english'))
    return [w for w in text_list if w.lower() not in stop]

def remove_digits(text_list):
    """Split every token on '.' and '/' and drop the purely numeric pieces.

    Tokens such as '1/2' or '2.5' vanish entirely, while 'all.purpose'
    becomes the two words 'all' and 'purpose'.
    """
    pieces = (
        part
        for token in text_list
        for part in re.sub('[./]', ' ', token).split()
    )
    return [part for part in pieces if not part.isdigit()]

def get_clean_text(text):
    """Strip brackets and commas, turn '-', '/' and ';' into spaces, then squeeze double spaces once."""
    # Applied strictly in this order; the final entry is a single pass,
    # so runs of three or more spaces are only shortened, not removed.
    substitutions = (
        ('(', ''),
        (')', ''),
        (',', ''),
        ('-', ' '),
        ('/', ' '),
        (';', ' '),
        ('  ', ' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def ingr_words_func(ingr_list):
    """Collect the meaningful lower-case words from a list of ingredient lines.

    Each line is cleaned with get_clean_text, lower-cased and split on
    whitespace; numeric pieces and stop words are then filtered out of
    the combined word list.
    """
    tokens = []
    for line in ingr_list:
        tokens.extend(get_clean_text(line).lower().split())
    without_numbers = remove_digits(tokens)
    return remove_stopwords(without_numbers)

#defining ingredients and measures
def define_ingr_measure(dirs_words, ingr_words, measures=None):
    """Flag which direction words are ingredients and which are measures.

    Parameters
    ----------
    dirs_words : sequence of str
        Words of one direction.
    ingr_words : iterable of str
        Words taken from the recipe's ingredient list.
    measures : iterable of str, optional
        Known measure words. Defaults to the notebook-level `measure_list`,
        which keeps existing two-argument calls working.

    Returns
    -------
    (if_ingr, if_measure) : tuple of two lists of 0/1
        Both lists have one entry per direction word. A word that appears
        in `ingr_words` is flagged as a measure when it is also a measure
        word, and as an ingredient otherwise.
    """
    if measures is None:
        measures = measure_list
    # Set lookups replace the former scan over every ingredient word (and
    # over the measure list) for each direction word.
    ingr_lookup = set(ingr_words)
    measure_lookup = set(measures)
    if_ingr = [0] * len(dirs_words)
    if_measure = [0] * len(dirs_words)
    for i, dirs_word in enumerate(dirs_words):
        if dirs_word not in ingr_lookup:
            continue
        if dirs_word in measure_lookup:
            if_measure[i] = 1
        else:
            if_ingr[i] = 1
    return if_ingr, if_measure

### Semantic role labeling part

In [5]:
# Shared Annotator instance for the whole notebook: create_instructions
# and the dev cells read the 'srl' entry of its getAnnotations() result.
from practnlptools.tools import Annotator
annotator=Annotator()

## Main

##### Loading data

In [134]:
#%time recipes=read_data()
# NOTE(review): the load above is commented out, yet later cells index
# `recipes`; on a fresh kernel it has to be re-enabled (or `recipes`
# provided some other way) before those cells can run.
# Space-separated action dictionary; its `word` column is used to build actions_set.
actions=pd.read_csv('action_dict_wordnet.txt',sep=' ')
# Module-level measure words, read by define_ingr_measure.
measure_list=read_measure_list('measure_list.txt')

In [7]:
def highlight_recipe(recipes,recipe_id):
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    r=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)
            print r,colored_string
            print create_instructions(d)
            print '_____________________________________'
            #print create_instructions(d)
            r+=1

#### Tagging results

In [142]:
# Only the `word` values of the first 200 rows of the action dictionary count as actions.
# NOTE(review): 200 is an unexplained cutoff; presumably the file is ordered by relevance. Confirm.
actions_set=set(actions[:200].word.values)

### First attempt to build flow of commands

# 2. Simple predictor

## Helper functions

In [528]:
def obj_elements_list(strr):
    """Split an SRL object phrase into a list of individual object names.

    The phrase is split on ' , ' and then on ' and '. Within each segment
    everything from the first 'or' onwards is dropped, stop words are
    removed and the remaining words are re-joined with spaces. An empty
    phrase yields [''].

    Input without a `split` method (anything that is not a string) yields
    the sentinel ['0'], as before. Segments containing more than one 'or'
    used to turn the whole result into ['0'], because the second lookup
    of 'or' failed on the already truncated list and a bare `except`
    swallowed the error; they are now truncated at the first 'or'.
    """
    try:
        segments = strr.split(' , ')
    except AttributeError:
        return ['0']
    output_list = []
    for segment in segments:
        for part in segment.split(' and '):
            words = part.split(' ')
            if 'or' in words:
                # keep only the first alternative
                words = words[:words.index('or')]
            output_list.append(' '.join(remove_stopwords(words)))
    return output_list

def add_action_line(df,ls):
    """Return a new DataFrame equal to `df` plus one row built from `ls`.

    `ls` must hold the seven values for the columns line_id, action,
    object, target, no, nt and description, in that order. `df` itself is
    not modified and the result is re-indexed from 0.
    """
    cur_flow=pd.Series(ls,index=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description'])
    # DataFrame.append was removed in pandas 2.0; concatenating the
    # one-row frame is the equivalent that works on old and new versions.
    return pd.concat([df, cur_flow.to_frame().T], ignore_index=True)


def create_instructions(phrase,flag=1):
    """Convert one direction into a list of action/object/target dictionaries.

    The phrase is prefixed with 'they ' before being sent to the shared
    `annotator` (presumably to give the imperative sentence a subject;
    confirm). Every SRL frame returned becomes one dictionary:

    - 'action': the frame's 'V' entry,
    - 'object': the 'A1' entry, or '' when absent,
    - 'target': the 'AM-LOC' entry if present, otherwise 'A2', otherwise ''.

    `flag` is unused and kept only so existing calls keep working.
    Returns an empty list when the annotator finds no frames.
    """
    annotated = annotator.getAnnotations('they '+phrase)['srl']
    annotated_steps = []
    for frame in annotated:
        annotated_step = {'object': '', 'target': '', 'action': frame['V']}
        if 'A2' in frame:
            annotated_step['target'] = frame['A2']
        # checked after A2 on purpose: a location overrides the A2 argument
        if 'AM-LOC' in frame:
            annotated_step['target'] = frame['AM-LOC']
        if 'A1' in frame:
            annotated_step['object'] = frame['A1']
        annotated_steps.append(annotated_step)
    return annotated_steps


def line_score(lt,lp):
    """Compare a labelled row with a predicted row on positions 1, 2 and 3.

    When position 1 of both rows differs the score is [0, 0, 0]. Otherwise
    the result holds a 1 for every one of positions 1..3 on which the rows
    agree and a 0 elsewhere.

    Always returns a plain list of three ints. The earlier `sc += a == b`
    only worked because adding a NumPy array to a list silently broadcast
    into an array; it failed for ordinary sequences and returned a list
    in one branch and an array in the other.
    """
    sc=[0,0,0]
    if lt[1]==lp[1]:
        for k, (t_val, p_val) in enumerate(zip(lt[1:4], lp[1:4])):
            sc[k] = int(t_val == p_val)
    return sc

def calc_score(y_t,y_p):
    """Score a predicted flow table against a labelled one.

    Every labelled row keeps the best line_score it reaches against any
    predicted row. Returns (score_1, score_2): score_1 holds, for each of
    the three compared positions, the share of labelled rows that matched
    (rounded to 2 decimals); score_2 is the share of labelled rows that
    matched on all three positions.
    """
    truth_rows = y_t.values
    pred_rows = y_p.values
    n_truth = len(truth_rows)
    score = np.zeros((n_truth, 3)).astype(np.uint8)
    for row_no, truth in enumerate(truth_rows):
        best = [0, 0, 0]
        for pred in pred_rows:
            candidate = line_score(truth, pred)
            # strictly greater: the first prediction reaching a total wins
            if sum(candidate) > sum(best):
                best = candidate
        score[row_no] = best
    denominator = n_truth + 0.
    fully_matched = np.sum(np.sum(score, axis=1) == 3)
    score_2 = round(fully_matched / denominator, 2)
    score_1 = np.round(np.sum(score, axis=0) / denominator, 2)
    return score_1, score_2

### Dev

In [106]:
# Scratch check: raw SRL output for a single sample direction.
dd='place the chicken in a medium bowl , and coat with the marinade'
aaa = annotator.getAnnotations(dd)['srl']
# NOTE(review): `aaa` is the SRL result rather than a string, so with the
# obj_elements_list defined above this call falls into its except branch;
# the ['chicken'] output shown below presumably comes from an earlier
# version of this cell. Confirm.
obj_elements_list(aaa)
##prediction=add_action_line(prediction,['0','combine','cheese','body','0','0','0'])

['chicken']

In [99]:
# Longer sample direction: an 'or' alternative followed by a long comma/'and'-separated ingredient list.
dd='in a food processor or blender , combine the green onions , onion , jalapeno pepper , soy sauce , vinegar , vegetable oil , brown sugar , thyme , cloves , nutmeg and allspice'

In [529]:
def get_prediction(recipes,recipe_id):
    """Predict the flow of commands for one recipe as a DataFrame.

    Parameters
    ----------
    recipes : sequence of dict
        Loaded recipes; only the entry at `recipe_id` is read.
    recipe_id : int
        Position of the recipe inside `recipes`.

    Returns
    -------
    pandas.DataFrame
        Columns line_id, action, object, target, no, nt, description, all
        holding strings. Each SRL frame of each non-empty direction gives:

        - one row with object 'body' and target '0' when the frame has
          neither an object nor a target;
        - otherwise one row per object name, with '0' standing in for a
          missing object and 'body' for a missing target.

        line_id counts the non-empty directions from 0; the last three
        columns are always '0'.

    Changes from the previous version: the ingredient/measure/action flags
    were computed for every direction but never used, so they are gone
    (and with them the dependency on the ingredient list and the global
    word lists); rows are collected in a list and the frame is built once
    instead of being re-created for every row; the loop variable no longer
    shadows the builtin `object`.
    """
    cols=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description']
    rows = []
    cur_line_id = 0
    for d in get_clean_directions(recipes[recipe_id]):
        if len(d) == 0:
            # empty fragments left by the split on '.'
            continue
        for cur_srl in create_instructions(d):
            action = cur_srl['action']
            target = cur_srl['target']
            objects = obj_elements_list(cur_srl['object'])
            if len(objects[0]) == 0 and len(target) == 0:
                # nothing named at all: the action applies to the 'body'
                rows.append([str(cur_line_id), action, 'body', '0', '0', '0', '0'])
                continue
            for obj in objects:
                rows.append([
                    str(cur_line_id),
                    action,
                    obj if len(obj) else '0',
                    target if len(target) > 0 else 'body',
                    '0', '0', '0',
                ])
        cur_line_id += 1
    return pd.DataFrame(rows, columns=cols)

In [408]:
# Lookup table from the number parsed out of a name in `lb` to that row's position;
# the scoring cells pass pairs[v] to get_prediction as the recipe index.
# NOTE(review): `lb` is not defined in any visible cell, so this cell fails on a fresh kernel.
# NOTE(review): 270000 is a hard-coded table size and must exceed the largest parsed number.
pairs=np.zeros(270000).astype(np.int32)
for i in range(len(lb)):
    # first column minus its last 4 characters (presumably a file extension; confirm)
    rev=int(lb.values[i][0][:-4])
    pairs[rev]=i

In [558]:
fold='C:/Users/User/Dropbox (MIT)/NLP Final project/labeled recipes/'
files=glob.glob(fold+'*')
for file in files:
    y_t=pd.read_csv(file)
    v=int(file[63:-4])
    y_p=get_prediction(recipes,pairs[v])
    print v,calc_score(y_t,y_p)

13344 (array([ 0.96,  0.76,  0.6 ]), 0.52)
13345 (array([ 0.83,  0.44,  0.39]), 0.28)
13346 (array([ 0.79,  0.58,  0.54]), 0.46)
13347 (array([ 0.7,  0.4,  0.1]), 0.1)
13348 (array([ 0.81,  0.73,  0.19]), 0.19)
13349 (array([ 0.96,  0.2 ,  0.12]), 0.12)
13351 (array([ 0.77,  0.27,  0.09]), 0.09)
13352 (array([ 0.64,  0.56,  0.52]), 0.52)
13353 (array([ 0.68,  0.53,  0.32]), 0.32)
13354 (array([ 0.83,  0.58,  0.75]), 0.5)
13355 (array([ 0.7,  0.7,  0.6]), 0.6)
13356 (array([ 1. ,  0.9,  0.1]), 0.1)
13357 (array([ 0.86,  0.43,  0.  ]), 0.0)
13358 (array([ 0.66,  0.59,  0.48]), 0.45)
13359 (array([ 0.88,  0.88,  0.44]), 0.44)
13360 (array([ 0.7,  0.6,  0.5]), 0.5)
13361 (array([ 0.69,  0.69,  0.15]), 0.15)
13362 (array([ 0.8,  0.4,  0. ]), 0.0)
13363 (array([ 0.88,  0.5 ,  0.38]), 0.38)
13364 (array([ 0.92,  0.69,  0.15]), 0.08)
13366 (array([ 0.64,  0.5 ,  0.36]), 0.36)
13367 (array([ 0.86,  0.71,  0.57]), 0.52)
13368 (array([ 0.6 ,  0.52,  0.2 ]), 0.16)
13369 (array([ 0.8,  0.6,  0.4]),

In [523]:
# Re-score a single labelled recipe (13362 scored 0.0 on the all-fields measure in the run above).
v=13362
# `fold` already ends with '/', so this path contains a doubled separator.
y_t=pd.read_csv(fold+'/l'+str(v)+'.txt')
y_p=get_prediction(recipes,pairs[v])
print v,calc_score(y_t,y_p)

13362 (array([ 0.8,  0.4,  0. ]), 0.0)


In [557]:
# Show the highlighted directions and the SRL frames for recipe 1992.
highlight_recipe(recipes,1992)

0 in a large bowl [1;43mtoss[0m together the [1;46mchicken[0m , [1;46mraisins[0m , [1;46mcelery[0m and [1;46mgrapes[0m
[]
_____________________________________
1 [1;43mfold[0m in the [1;46mavocado[0m and the [1;46mapple[0m
[{'action': 'fold', 'object': '', 'target': 'in the avocado and the apple'}]
_____________________________________
2 [1;43mmix[0m in the [1;46mmayonnaise[0m and [1;46mlemon[0m [1;46mjuice[0m
[{'action': 'mix', 'object': '', 'target': 'in the mayonnaise and lemon juice'}]
_____________________________________
3 [1;43mchill[0m
[]
_____________________________________


In [556]:
# Scan recipes 1000..1999 for those whose every non-empty direction
# contains exactly one word from actions_set.
for recipe_id in range(1000,2000):
    dirs=get_clean_directions(recipes[recipe_id])
    # NOTE(review): ingr_words and r are computed but never used in this cell.
    ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    r=0
    # f accumulates |1 - number of action words| over the directions
    f=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            v=sum(action_idx)
            f+=abs(1-v)
    # f == 0 means no direction deviated from exactly one action word
    if not(f):
        print recipe_id,f

1367 0
1856 0
1890 0
1992 0
