# Allrecipes.com "core" ingredient project by Paige McKenzie

Implements methods discussed in related [blog post](https://p-mckenzie.github.io/python/2018/10/01/ingredient-analysis/).

Data can be acquired using the associated `scraper.py` file.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk

In [2]:
# import data
import ast

df = pd.read_csv("scraped_data.csv", index_col=0)
# keep only recipes that have an ingredient entry; copy so the column
# assignments below write to an independent frame, not a slice of the original
df = df[df['ingredients'].notna()].copy()

# clean up columns and add key
# Each 'ingredients' cell is text that is parsed back into a Python object
# (presumably the repr of a list of strings -- confirm against scraper.py).
# literal_eval only accepts Python literals, so scraped text cannot execute
# arbitrary code the way eval() would.
df['ingredients'] = df['ingredients'].apply(ast.literal_eval)
# the first run of digits found in the recipe link becomes the integer key
df['recipe_key'] = df['link'].apply(lambda x:int(re.findall(r"\d+", x)[0]))

In [3]:
# Flatten the per-recipe ingredient lists into one long series holding a
# single ingredient string per entry.
df.set_index('recipe_key', inplace=True)

ingredients = df['ingredients'].apply(pd.Series).stack()
# After stacking, the index is a (recipe_key, position) pair; collapse each
# pair into a flat "<recipe_key>_<position>" label.
ingredients.index = ["{}_{}".format(key, position) for key, position in ingredients.index]

In [4]:
# drop the recipe frame; the code below only works with `ingredients`
del df

## Parse text
### Theory:
Most ingredients are formatted as follows:

`# unit adjective noun, adjective (misc. extra information) - adjectives conjunctions adjectives`

In [5]:
def parse_text(s):
    '''
    Accepts a series of strings, and applies multiple cleaning steps. Returns a series of strings.

    Steps, in order: drop parenthesised text, drop any whitespace-delimited
    chunk containing a digit, lowercase, singularize plural-looking words,
    keep only word characters plus , - and ', collapse whitespace, and
    finally drop entries that end up empty (so the result may be shorter
    than the input; surviving entries keep their original index labels).

    Raises TypeError if s is not a pandas Series.
    '''
    if not isinstance(s, pd.Series):
        raise TypeError("parse_text expects a pandas Series, got {}".format(type(s).__name__))

    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()

    def singularize(match):
        # only words with an 's' among their last three letters are sent to
        # the lemmatizer (as nouns); everything else is left untouched
        word = match.group(0)
        return wnl.lemmatize(word, 'n') if 's' in word[-3:] else word

    def lemmatize(string):
        # Substitute token by token. A plain str.replace(word, lemma) would
        # also rewrite that letter sequence wherever it occurs inside other,
        # longer words of the same string.
        return re.sub(r"[a-z]+", singularize, string)

    # remove anything inside parentheses
    s = s.apply(lambda x:re.sub(r"\([^\)]+\)", '', x))

    # remove anything containing a digit
    s = s.apply(lambda x:re.sub(r"\S*\d\S*", '', x))

    # make everything lowercase
    s = s.str.lower()

    # remove plurals where possible
    s = s.apply(lemmatize)

    # remove non-word characters except for , and - and '
    s = s.apply(lambda x:' '.join(re.findall(r"[-,''\w]+", x)))

    # clean excess whitespace
    s = s.apply(lambda x:re.sub(r"\s+", ' ', x).strip())

    # only keep entries with data after cleaning
    s = s[s!='']

    return s

In [6]:
# Side-by-side preview of five sampled ingredient strings before and after
# cleaning; the same random_state is used for both samples so the two columns
# draw the same rows.
pd.concat([ingredients.sample(n=5, random_state=100).rename('before'),
           parse_text(ingredients.sample(n=5, random_state=100)).rename('after')], axis=1)

Unnamed: 0,before,after
236942_9,1 cup panko bread crumbs,cup panko bread crumb
66404_2,"1 large onion, diced","large onion, diced"
231776_0,4 cups all-purpose flour,cup all-purpose flour
57002_2,1 tablespoon oil,tablespoon oil
14527_5,"2 lemons, juiced","lemon, juiced"


## Get penalties (position and part of speech)

In [7]:
def penalties(processed, pos_dict, split_dict, desc_penalty, first_penalty):
    '''
    Returns penalties for each token in processed, penalizing those that appear after strings in split_dict,
    those that appear first in a string with first_penalty, those that appear earlier in strings with desc_penalty,
    and those with parts of speech in pos_dict.

    processed     -- series of cleaned ingredient strings
    pos_dict      -- {two-letter POS tag prefix: penalty multiplier}
    split_dict    -- {separator string: penalty multiplier}
    desc_penalty  -- multiplier applied to the averaged word-order penalty
    first_penalty -- raw order penalty recorded for the first word of a string

    Returns a float series indexed by the alphabetic tokens (sorted), holding
    the sum of the three penalty components for each token.
    '''
    from collections import defaultdict

    def get_split_penalties(processed, split_string):
        '''Penalizes tokens that appear after split_string.

        A string is cut into sections at split_string; tokens of the first
        section record 0, tokens of the last record 1, sections in between
        are spaced evenly. Each token's values are averaged over all strings.
        '''
        positions = defaultdict(list)
        # literal match: the separator is plain text, not a regex, and a
        # literal hit guarantees at least two sections (no division by zero)
        has_split = processed.str.contains(split_string, regex=False)

        for text in processed[has_split]:
            sections = text.split(split_string)
            last = len(sections) - 1
            for index, section in enumerate(sections):
                for token in nltk.word_tokenize(section):
                    positions[token].append(index/last)

        return pd.Series({token:np.mean(vals) for token, vals in positions.items()},
                         dtype=float).sort_index().rename(split_string)

    def get_pos_penalties(processed, pos_dict):
        '''Penalizes tokens with {part_of_speech:penalty} given by pos_dict'''
        vocabulary = set(nltk.word_tokenize(' '.join(processed)))
        # each token is tagged on its own; only the first two letters of the
        # tag are kept, so e.g. NN and NNS share a column
        tags = pd.Series({x:nltk.pos_tag([x])[0][1][:2] for x in vocabulary if x.isalpha()},
                         dtype=object)
        pos_probs = pd.get_dummies(tags, dtype=float)

        for key, prob in pos_dict.items():
            # a tag that no token received has no column to scale
            if key in pos_probs.columns:
                pos_probs[key] = pos_probs[key]*prob
        # tokens whose tag is not listed in pos_dict keep the indicator value 1
        return pos_probs.max(axis=1).rename('POS')

    def order_penalties(processed, first_penalty, desc_penalty):
        '''Penalizes tokens that appear earlier in strings'''
        indices_dict = defaultdict(list)

        for string in processed:
            words = nltk.word_tokenize(string)
            for ind, word in enumerate(words):
                if ind==0:
                    indices_dict[word].append(first_penalty)
                else:
                    indices_dict[word].append(1-ind/len(words))

        return pd.Series({key:np.mean(val) for key, val in indices_dict.items()},
                         dtype=float).rename('order')*desc_penalty

    # one column per penalty component; the split components are added one by
    # one so an empty split_dict simply contributes nothing
    components = [get_pos_penalties(processed, pos_dict),
                  order_penalties(processed, first_penalty, desc_penalty)]
    components.extend(get_split_penalties(processed, key)*value for key, value in split_dict.items())

    # return sum of penalties (a component that lacks a token counts as 0)
    c_T = pd.concat(components, axis=1, join='outer', sort=True).sum(axis=1).sort_index()

    return c_T[[x.isalpha() for x in c_T.index]]

# Get possible core ingredients from each ingredient

In [8]:
def get_A_matrix(s):
    '''Apply CountVectorizer and only retain tokens that are alphabetic.

    Returns a DataFrame with one row per entry of s (same index) and one
    column per alphabetic token; with binary=True a cell is 1 when the token
    occurs in that entry and 0 otherwise.
    '''
    from sklearn.feature_extraction.text import CountVectorizer
    model = CountVectorizer(binary=True, stop_words=None, ngram_range=(1, 1),
                            tokenizer=nltk.word_tokenize)
    counts = model.fit_transform(s)

    # get_feature_names() was removed in newer scikit-learn releases in favour
    # of get_feature_names_out(); use whichever this installation provides
    if hasattr(model, 'get_feature_names_out'):
        tokens = model.get_feature_names_out()
    else:
        tokens = model.get_feature_names()

    # toarray() yields a plain ndarray rather than the legacy numpy matrix
    data = pd.DataFrame(counts.toarray(), index=s.index, columns=tokens)
    return data.loc[:, [x.isalpha() for x in data.columns]]

### Build integer program

In [9]:
def run_IP(A, c_T):
    '''Solve integer min. program with constraints A, penalties c_T, where
    1 <= Ax <= 2 for every row of A (one or two tokens selected per row).

    A   -- 0/1 DataFrame, one row per ingredient and one column per token
    c_T -- series of penalties, positionally aligned with A.columns

    Prints the solver status and returns an integer 0/1 series named
    'is_ingredient', indexed by token.

    Raises ValueError if c_T and A.columns differ in length.
    '''
    import pulp

    tokens = A.columns.tolist()
    if len(c_T) != len(tokens):
        # lpDot pairs values positionally; a length mismatch would silently
        # drop penalties or variables from the objective
        raise ValueError("c_T has {} entries but A has {} columns".format(len(c_T), len(tokens)))

    # create problem
    pulp.LpSolverDefault.msg = 1
    problem = pulp.LpProblem('ingredient', pulp.LpMinimize)

    # add variables with associated penalty (sum)
    variables = [pulp.LpVariable(token, 0, 1, pulp.LpBinary) for token in tokens]
    # token -> variable lookup, built once instead of searching the column list per token
    variable_for = dict(zip(tokens, variables))

    problem += pulp.lpDot(c_T.values, variables)

    # add constraint that either one or two tokens per ingredient can be selected
    for index, row in A.iterrows():
        c = pulp.LpAffineExpression([(variable_for[token], 1) for token in row[row>0].index])
        problem += (c>=1)
        problem += (c<=2)

    # solve problem
    status = problem.solve()
    print(pulp.LpStatus[status])

    # round before casting: a solver value such as 0.9999999 must become 1,
    # whereas a bare astype(int) would truncate it to 0
    return pd.Series([var.value() for var in variables],
                     index=tokens, name='is_ingredient').round().astype(int)

### Call functions

In [10]:
# work on a reproducible sample of 500 raw ingredient strings
s = ingredients.sample(n=500, random_state=101)
s_clean = parse_text(s)

# ingredient-by-token matrix (constraint side of the integer program)
A = get_A_matrix(s_clean)

# per-token penalties (objective side of the integer program)
c_T = penalties(
    s_clean,
    pos_dict={'NN':.1, 'VB':10, 'JJ':10},
    split_dict={' - ':3, ', ':2},
    desc_penalty=5,
    first_penalty=500,
)

# the penalty vector has to line up with the matrix columns, token for token
assert not (c_T.index!=A.columns).any()

sols = run_IP(A, c_T)

Optimal


In [11]:
# For each ingredient, list the tokens that are both present in its row of A
# and selected by the integer program (row*sols > 0); show the first 20 rows
# next to the original strings.
pd.concat([s.rename('original'), A.apply(lambda row:row[(row*sols)>0].index.tolist(), axis=1)], axis=1).head(20)

Unnamed: 0,original,0
24831_2,"1 red bell pepper, julienned",[pepper]
218031_15,"8 ounces shredded Monterey Jack cheese, divided",[cheese]
77351_9,1/3 cup firmly packed brown sugar,[sugar]
223171_7,1/4 cup mango chutney,[chutney]
49444_3,32 fluid ounces Kentucky bourbon,[bourbon]
229270_10,1/2 cup chopped peanuts,[chopped]
7883_7,2 tablespoons milk,[milk]
18093_10,1/2 teaspoon garlic salt,"[garlic, salt]"
14297_8,2 cups thinly sliced red cabbage,"[cabbage, sliced]"
14898_3,2 teaspoons curry powder,[powder]


In [12]:
# remove the integer-program experiment's variables; the next section rebuilds its own
del s, s_clean, A, c_T, sols

# Iterative "learning"

**1) Approximately rank each token**

In [13]:
def scores(processed, pos_rewards, split_rewards, order_reward, first_prob):
    '''
    Returns a "score" for each token in processed with much the same logic as penalties(**args).

    processed     -- series of cleaned ingredient strings
    pos_rewards   -- {two-letter POS tag prefix: reward multiplier}
    split_rewards -- {separator string: reward multiplier}
    order_reward  -- multiplier applied to the averaged word-order score
    first_prob    -- raw order score recorded for the first word of a string

    Returns a float series indexed by the alphabetic tokens (sorted), holding
    the sum of the three score components for each token.
    '''
    from collections import defaultdict

    def get_split_probs(processed, split_string):
        '''Scores tokens by the section they fall in when a string is cut at
        split_string: the first section records 1, the last records 0, and
        each token's values are averaged over all strings.'''
        positions = defaultdict(list)
        # literal match: the separator is plain text, not a regex, and a
        # literal hit guarantees at least two sections (no division by zero)
        has_split = processed.str.contains(split_string, regex=False)

        for text in processed[has_split]:
            sections = text.split(split_string)
            last = len(sections) - 1
            for index, section in enumerate(reversed(sections)):
                for token in nltk.word_tokenize(section):
                    positions[token].append(index/last)

        return pd.Series({token:np.mean(vals) for token, vals in positions.items()},
                         dtype=float).sort_index().rename(split_string)

    def get_pos_probs(processed, pos_rewards):
        '''Scores tokens with {part_of_speech:reward} given by pos_rewards'''
        vocabulary = set(nltk.word_tokenize(' '.join(processed)))
        # each token is tagged on its own; only the first two letters of the
        # tag are kept, so e.g. NN and NNS share a column
        tags = pd.Series({x:nltk.pos_tag([x])[0][1][:2] for x in vocabulary if x.isalpha()},
                         dtype=object)
        pos_probs = pd.get_dummies(tags, dtype=float)

        for key, prob in pos_rewards.items():
            # a tag that no token received has no column to scale
            if key in pos_probs.columns:
                pos_probs[key] = pos_probs[key]*prob
        # tokens whose tag is not listed in pos_rewards keep the indicator value 1
        return pos_probs.max(axis=1).rename('POS')

    def order_probs(processed, first_prob, order_reward):
        '''Scores tokens higher the later they appear in strings; the first
        word of a string records first_prob instead.'''
        indices_dict = defaultdict(list)

        for string in processed:
            words = nltk.word_tokenize(string)
            for ind, word in enumerate(words):
                if ind==0:
                    indices_dict[word].append(first_prob)
                else:
                    indices_dict[word].append(ind/len(words))

        return pd.Series({key:np.mean(val) for key, val in indices_dict.items()},
                         dtype=float).rename('order')*order_reward

    # one column per score component; the split components are added one by
    # one so an empty split_rewards simply contributes nothing
    components = [get_pos_probs(processed, pos_rewards),
                  order_probs(processed, first_prob, order_reward)]
    components.extend(get_split_probs(processed, key)*value for key, value in split_rewards.items())

    # return sum of scores (a component that lacks a token counts as 0)
    c_T = pd.concat(components, axis=1, join='outer', sort=True).sum(axis=1).sort_index()

    return c_T[[x.isalpha() for x in c_T.index]]

In [14]:
# work on a reproducible sample of 5000 raw ingredient strings
s = ingredients.sample(n=5000, random_state=101)
s_clean = parse_text(s)

# ingredient-by-token matrix
A = get_A_matrix(s_clean)

# initial per-token ranking scores
c_T = scores(
    s_clean,
    pos_rewards={'NN':10, 'VB':.1, 'JJ':.1},
    split_rewards={' - ':3, ',':1},
    order_reward=5,
    first_prob=-1,
)

# the score vector has to line up with the matrix columns, token for token
assert not (c_T.index!=A.columns).any()

**2) Force some seeds for first iteration**

**2a)** A token gets a ranking score of 10 if it is the only token in its ingredient string (provided it is also tagged as a noun).

**2b)** Any token that appears after a hyphen will have ranking score of 0.

**2c)** Any past-tense verbs that end in 'ed' (i.e. crushed, chopped, browned) will have ranking score of 0.

In [15]:
def force_seeds(s_clean, c_T):
    '''
    Overrides the ranking score of selected "seed" tokens and returns the
    adjusted scores. The input c_T is left untouched; a copy is returned.

    s_clean -- series of cleaned ingredient strings
    c_T     -- series of ranking scores indexed by token

    Rules, applied in this order (a later rule overrides an earlier one):
      a) a token that is the only token of its string and is tagged 'NN' gets 10.
      b) every token after the first '-' token of a string gets 0.
      c) every token tagged 'VBD' that ends in 'ed' gets 0.

    Tokens that are not yet in c_T are added to the returned series.
    '''
    c_T = c_T.copy()

    # word tokenize and add part of speech tags (tagged per string, in context)
    tokenized = [nltk.word_tokenize(text) for text in s_clean]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    # a) single-token strings whose token is a singular noun
    for pairs in tagged:
        if len(pairs)==1 and pairs[0][1]=='NN':
            c_T.loc[pairs[0][0]] = 10.

    # b) everything that follows the first hyphen token
    after_hyphen = set()
    for tokens in tokenized:
        if '-' in tokens:
            after_hyphen.update(tokens[tokens.index('-')+1:])
    # sorted so that any newly added tokens are appended in a reproducible order
    for token in sorted(after_hyphen):
        c_T.loc[token] = 0.

    # c) past-tense verbs ending in 'ed'
    past_tense = {word for pairs in tagged for word, pos in pairs
                  if pos=='VBD' and word[-2:]=='ed'}
    for token in sorted(past_tense):
        c_T.loc[token] = 0.

    return c_T

In [16]:
# re-index (skip non-alpha tokens or anything else that may have leaked in)
# After seeding, keep exactly the tokens that are columns of A, in that order;
# a column of A with no score after seeding is given 5.
c_T = force_seeds(s_clean, c_T).reindex(A.columns, fill_value=5.)

**3) Define function to tag rows based on probabilities and some randomness**

In [17]:
def softmax(s):
    ''' Applies softmax to input series (maps values to (0,1) such that the entries sum to 1.0).

    The maximum is subtracted before exponentiating. This leaves the result
    unchanged mathematically but keeps exp() from overflowing to inf (and the
    output from turning into NaN) when the scores are large.
    '''
    shifted = (s - s.max()).astype(float)
    e_s = np.exp(shifted)
    return e_s / e_s.sum()

def pick_max(row, ranks, seed):
    ''' Selects an entry in row (where row>0), drawn from distribution achieved
    via applying softmax to the ranks. If a seed is included, it will be randomly sampled.
    Otherwise, the entry with maximum probability will be chosen.

    row   -- series of token indicators for one ingredient
    ranks -- series of ranking scores, indexed like row
    seed  -- random_state for the sampling, or None for the deterministic choice

    Returns an integer series indexed like row, 1 at the selected entry and 0 elsewhere.
    '''
    final = pd.Series(0, index=row.index, dtype=int)
    mask = (row>0)
    # compare against None explicitly so that seed=0 still counts as a seed
    if seed is not None:
        index = (row*ranks)[mask].sample(1, random_state=seed, weights=softmax(ranks[mask])).index
    else:
        index = softmax(ranks[mask]).idxmax()

    final.loc[index] = 1
    return final

def norm(s):
    ''' Standardizes series s: centers it on its mean, then scales by its standard deviation.
    '''
    centered = s - s.mean()
    spread = s.std()
    return centered / spread

In [18]:
# step size applied to each score update
learning_rate = 1
# the loop stops once the spread (standard deviation) of the scores reaches this value
threshold = 7.5

while c_T.std()<threshold:
    # sample one token per ingredient, weighted by the current scores
    choices = A.apply(pick_max, ranks=c_T, seed=1, axis=1)
    # per token: times chosen divided by number of ingredients containing it,
    # standardized across tokens and added to the scores
    c_T += norm(choices.sum()/A.sum())*learning_rate

In [19]:
# make final choices
# seed=None makes pick_max take the highest-probability token of each row instead of sampling
choices = A.apply(pick_max, ranks=c_T, seed=None, axis=1)

In [20]:
# show each original string next to its chosen token (the column holding the 1 in its row of choices)
pd.concat([s.rename('original'), choices.idxmax(axis=1).rename('core')], axis=1, sort=False)

Unnamed: 0,original,core
24831_2,"1 red bell pepper, julienned",pepper
218031_15,"8 ounces shredded Monterey Jack cheese, divided",cheese
77351_9,1/3 cup firmly packed brown sugar,sugar
223171_7,1/4 cup mango chutney,chutney
49444_3,32 fluid ounces Kentucky bourbon,bourbon
229270_10,1/2 cup chopped peanuts,peanut
7883_7,2 tablespoons milk,milk
18093_10,1/2 teaspoon garlic salt,garlic
14297_8,2 cups thinly sliced red cabbage,cabbage
14898_3,2 teaspoons curry powder,powder


In [21]:
# remove the variables created by the iterative-learning section
del s, s_clean, A, c_T, choices, threshold, learning_rate