In [1]:
import requests
from sklearn.manifold import TSNE
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from fuzzywuzzy import fuzz, process
from functools import lru_cache
%load_ext line_profiler

In [299]:
# User input: URL of the recipe page to analyse. `theurl` is passed to
# request_comparison() in the function-call cell further down.

theurl = 'https://tasty.co/recipe/chicken-veggie-stir-fry'
# Alternative example recipes -- uncomment one of these (and comment out the line above) to try it:
#theurl = 'https://damndelicious.net/2019/08/06/easy-chicken-tacos/'
#theurl =  'https://www.thewholesomedish.com/the-best-classic-burger'

In [399]:
#function definition block
@lru_cache()
def load_data():
    ''' Load previously generated data, all of this can be created via the other notebooks in this codebase
    except the farmers market data, which can be found here:
    https://docs.google.com/spreadsheets/d/1MOWl8Cg4xyCvAmR06cFhJ9obYR5ToZD_XhSEcgekjzY/edit#gid=1829695724

    All paths are relative to the current working directory.

    Returns an 11-tuple, in this order:
        w2vm         -- unpickled model_w2v.pkl
        aisledict    -- unpickled ingredient_aisle.pkl
        noise        -- unpickled noiselist.pkl
        atFM         -- unpickled FMproducts.pkl
        FMinfo       -- unpickled FMfull.pkl
        ingvect      -- unpickled tfidfvect_ingredients.pkl
        ingfeatures  -- unpickled features_ingredients.pkl
        fulling      -- list of strings, one per entry of cleaned_ingredients.pkl,
                        each entry joined with ', '
        recvect      -- unpickled tfidfvect_recipes.pkl
        recfeatures  -- unpickled features_recipes.pkl
        recdoc       -- unpickled full_recipedoc.pkl

    Because of lru_cache the files are read once per kernel; later calls return
    the very same objects, so mutating any of them affects every caller.

    Raises whatever open() / pickle.load() raise (e.g. FileNotFoundError) when a
    file is missing or unreadable.
    '''

    def _unpickle(path):
        # NOTE: pickle.load can execute arbitrary code -- only load files that
        # were generated by this project's own notebooks.
        # The context manager closes the handle as soon as the object is read.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    w2vm = _unpickle("generated_data/model_w2v.pkl")
    aisledict = _unpickle("generated_data/ingredient_aisle.pkl")
    noise = _unpickle("generated_data/noiselist.pkl")
    atFM = _unpickle("generated_data/FMproducts.pkl")
    FMinfo = _unpickle("generated_data//FMfull.pkl")

    ingvect = _unpickle("generated_data/tfidfvect_ingredients.pkl")
    ingfeatures = _unpickle("generated_data/features_ingredients.pkl")
    fullnningredients = _unpickle("generated_data/cleaned_ingredients.pkl")
    # Each cleaned recipe is an iterable of ingredient strings; flatten it to one string per recipe.
    fulling = [', '.join(n) for n in fullnningredients]

    recvect = _unpickle("generated_data/tfidfvect_recipes.pkl")
    recfeatures = _unpickle("generated_data/features_recipes.pkl")
    recdoc = _unpickle("generated_data/full_recipedoc.pkl")

    return w2vm, aisledict, noise, atFM, FMinfo, ingvect, ingfeatures, fulling, recvect, recfeatures, recdoc

def request_comparison(userinput):
    '''Takes the user input recipe URL and extracts the recipe information
    Note you must have a spoonacular API key for this to work

    The key is read from generated_data/spoonapikey.txt (relative to the
    current working directory).

    userinput -- URL of the recipe page, sent as the `url` query parameter.

    Returns (ingredients, cur_rec):
        ingredients -- lower-cased ingredient names joined with ','
        cur_rec     -- one-element list holding title, ingredients and
                       instructions joined by single spaces

    Raises requests.HTTPError for a non-2xx response, requests.Timeout if the
    service does not answer in time, and KeyError if the response has no
    'extendedIngredients' field.
    '''

    # Context manager so the key file handle is closed right away.
    with open('generated_data/spoonapikey.txt') as keyfile:
        mykey = keyfile.read().strip()

    params = {'url': userinput, 'forceExtraction': 'true', 'apiKey': mykey, 'analyze': 'true'}
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get('https://api.spoonacular.com/recipes/extract', params=params, timeout=30)
    # Fail here with the HTTP status rather than with a confusing KeyError on the error payload below.
    response.raise_for_status()
    rec = response.json()

    ingcomp = rec['extendedIngredients']
    # Skip entries whose name is missing or empty.
    names = [ing['name'].lower() for ing in ingcomp if ing.get('name')]
    ingredients = ','.join(names)

    # Guard against absent or null text fields so the concatenation cannot raise TypeError.
    title = rec.get('title') or ''
    instructions = rec.get('instructions') or ''
    cur_rec = [title + ' ' + ingredients + ' ' + instructions]

    return ingredients, cur_rec

def removenoise(ingredients, noise):
    '''Strip noise words from a comma-separated ingredient string.

    ingredients -- ingredient names separated by ','
    noise       -- container of words to drop (membership-tested with `in`)

    Returns one string per comma-separated entry, with the surviving words
    re-joined by single spaces (an entry made only of noise becomes '').
    '''
    return [
        ' '.join(token for token in phrase.split() if token not in noise)
        for phrase in ingredients.split(',')
    ]

def _add_to_shopping_list(wheretoshop, vendors, item):
    '''File item under the first vendor already on the shopping list, else start a new entry for vendors[0].'''
    if not vendors:
        # No vendor known for this item -> nothing to add (previously an IndexError).
        return
    for vendor in vendors:
        if vendor in wheretoshop:
            wheretoshop[vendor].append(item)
            return
    wheretoshop[vendors[0]] = [item]


def _vendors_selling(FMinfo, product):
    '''Return the BUSINESS NAME of every row whose product listing contains product.'''
    # regex=False: product names are plain text, so characters such as '(' or '+' must not be
    # interpreted as a pattern. na=False: rows without a product listing simply do not match.
    mask = FMinfo['TYPES OF PRODUCTS AVAILABLE'].str.contains(product, regex=False, na=False)
    return FMinfo.loc[mask, 'BUSINESS NAME'].tolist()


def rulesofsimilarity(noise_free_ing, w2vm, aisledict, atFM, FMinfo):
    '''Goes through ingredients, assesses the similarity of them

    For each ingredient, decide whether it is sold at the farmers market
    (fuzzy match score >= 90 against atFM). If not, look for the closest
    word2vec neighbour that is sold there and record it as a replacement.

    noise_free_ing -- iterable of ingredient strings
    w2vm           -- model exposing w2vm.wv.most_similar(word, topn=...)
    aisledict      -- mapping ingredient -> sequence whose first item is the aisle name
    atFM           -- choices for the fuzzy match (products sold at the market)
    FMinfo         -- DataFrame with 'TYPES OF PRODUCTS AVAILABLE', 'BUSINESS NAME'
                      and 'aisles' columns

    Returns (allout, wheretoshop):
        allout      -- list of per-ingredient dicts (see `output` below for the keys)
        wheretoshop -- dict vendor -> list of items to buy there

    Ingredients that are not sold at the market AND are unknown to word2vec,
    belong to the 'Baking' or 'Spices and Seasonings' aisle, or have no
    replacement on sale are left out of allout. The code that consumes allout
    (validationstep and the output cell) indexes match[0] for every record
    whose where_available is None, so only records with a replacement may be
    returned.
    '''

    #Create starting dict to be filled out
    output = {
        'ingredient': None,
        'where_available': None,
        'unknown': None, #unknown to word2vec
        'baking': False, #true if is baking
        'spices': False, #true if spice/seasoning
        'spice_businesses': None,
        'match': None,
        'similar_vendor': None,
        'try_fresh': None,
        'store_hasreplacement': None,
        'cos_sim': None #this is defined in the validation call
    }

    #Shopping list dict
    wheretoshop = {}
    allout = []

    for i in noise_free_ing:
        thisout = output.copy()
        thisout['ingredient'] = i

        #Figure out if it is available at the farmers market
        highest = process.extractOne(i, atFM)  # None when atFM is empty
        if highest is not None and highest[1] >= 90:
            thebiz = _vendors_selling(FMinfo, highest[0])
            thisout['where_available'] = thebiz
            # place in shopping list
            _add_to_shopping_list(wheretoshop, thebiz, i)

        #If it is NOT available
        else:
            curaisle = aisledict.get(i)
            # Ingredients missing from aisledict have no aisle (previously a TypeError on curaisle[0]).
            aisle = curaisle[0] if curaisle else None

            #use w2v to find similar ingredients
            try:
                similar = w2vm.wv.most_similar(i, topn=100)
                opposite = w2vm.wv.most_similar(similar[0][0], topn=1000)
            except KeyError: # Just in case we find an entirely new ingredient!
                thisout['unknown'] = i
                continue

            #Deal with the baking outliers
            if aisle == 'Baking':
                thisout['baking'] = True
                continue

            #Deal with the seasoning outliers
            if aisle == 'Spices and Seasonings':
                matchaisle = FMinfo.loc[FMinfo['aisles'] == aisle.lower()]
                thisout['spices'] = True
                thisout['spice_businesses'] = matchaisle['BUSINESS NAME'].tolist()
                continue

            # Find the ingredient our algorithm thinks is similar and IS available.
            # Keep the matched market product alongside the candidate so the vendor
            # lookup below always uses the product that actually matched.
            replacement = None
            matched_product = None
            for opp in opposite:
                ophighest = process.extractOne(opp[0], atFM)
                if ophighest is not None and ophighest[1] >= 90:
                    replacement = opp[0]
                    matched_product = ophighest[0]
                    break
            if replacement is None:
                # Nothing comparable is on sale; skip instead of failing on match[0].
                continue

            thisout['match'] = [replacement]
            thebiz = _vendors_selling(FMinfo, matched_product)
            thisout['store_hasreplacement'] = thebiz
            _add_to_shopping_list(wheretoshop, thebiz, replacement)

            # If it is something usually prepackaged, suggest making it fresh
            if aisle == 'Pasta and Rice' or aisle == 'Canned and Jarred':
                for sim in similar:
                    a = aisledict.get(sim[0])  # similar words may be absent from aisledict
                    if a and a[0] == 'Produce':
                        thisout['try_fresh'] = sim[0]
                        # `similar` is ordered most-similar first: stop at the best produce match.
                        break

            #if something is not available, find a vendor that might be able to help
            if aisle is not None:
                matchaisle = FMinfo.loc[FMinfo['aisles'] == aisle.lower()]
                thebiz = matchaisle['BUSINESS NAME'].tolist()
                if len(thebiz) > 0:
                    thisout['similar_vendor'] = thebiz

        allout.append(thisout)
    return allout, wheretoshop

def validationstep(allout, fulling, ingvect, ingfeatures, recvect, recfeatures, recdoc, cur_rec):
    '''Validate how well our suggestions fit based on cosine similarity of entire recipe

    For every swapped ingredient (where_available is None and a replacement is
    recorded in 'match'), build the ingredient list with that single swap
    applied, find the most similar known recipe by ingredients, then score that
    recipe against the submitted recipe text.

    allout      -- list of per-ingredient dicts with 'ingredient',
                   'where_available', 'match' and 'cos_sim' keys; updated in place
    ingvect     -- fitted vectorizer for ingredient strings (needs .transform)
    ingfeatures -- ingredient feature matrix of the known recipes
    recvect     -- fitted vectorizer for full recipe text (needs .transform)
    recfeatures -- recipe feature matrix, row-indexable by recipe position
    cur_rec     -- one-element list with the submitted recipe text
    fulling, recdoc -- accepted for interface compatibility; not used here

    Returns None. The result is written to out['cos_sim'] as a flattened array
    (callers read element [0]).
    '''
    initinglist = [out['ingredient'] for out in allout]

    # Only records that actually carry a replacement can be validated; a record
    # without one is skipped rather than failing on match[0].
    swapped = [out for out in allout if out['where_available'] is None and out['match']]
    if not swapped:
        return

    # The submitted recipe does not change between swaps, so vectorize it once.
    currecfeat = recvect.transform(cur_rec)

    for entry in swapped:
        rem = entry['ingredient']
        add = entry['match'][0]

        # create a new ingredient list with the swap in place
        newlist = initinglist.copy()
        newlist.remove(rem)
        newlist.append(add)

        # based on ingredients alone find the most similar recipe we know
        newlistj = [', '.join(newlist)]
        nsf = ingvect.transform(newlistj)
        cosine_similarities = linear_kernel(nsf, ingfeatures).flatten()
        related_rec_index = cosine_similarities.argsort()[-1]

        #now find out, based on more features, how similar these two recipes are
        recipe_similarity = linear_kernel(currecfeat, recfeatures[related_rec_index]).flatten()

        # Every record for this ingredient name receives the score.
        for out in allout:
            if out['ingredient'] == rem:
                out['cos_sim'] = recipe_similarity
       
            
        
        

In [409]:
#function call
# Fetch the recipe behind `theurl`: returns the comma-joined ingredient names and a
# one-element list with the combined recipe text.
# total time 0.9 seconds, could get down by not forcing extraction but that may break sometimes
ingredients, cur_rec = request_comparison(theurl)
# Load the pickled models and lookup tables (load_data is wrapped in lru_cache, so repeated calls reuse the first result).
w2vm, aisledict, noise, atFM, FMinfo, ingvect, ingfeatures, fulling, recvect, recfeatures, recdoc = load_data()
# Drop noise words from each comma-separated ingredient.
noise_free_ing = removenoise(ingredients, noise)
# Build the per-ingredient records and the vendor -> items shopping list.
# takes 1.2 seconds
allout, wheretoshop = rulesofsimilarity(noise_free_ing, w2vm, aisledict, atFM, FMinfo)
# Fills in 'cos_sim' on the swapped ingredients in `allout` (updates in place, returns nothing).
validationstep(allout, fulling, ingvect, ingfeatures, recvect, recfeatures, recdoc, cur_rec)


In [382]:
#Output -- equivalent is written in HTML in production

# Recipe-level cosine similarity at or above this value counts as a solid replacement
# (chosen as mean - 1 std of the cosine similarity).
SOLID_MATCH_THRESHOLD = 0.37

for ing in allout:
    # Only ingredients that had to be swapped get a message.
    if ing['where_available'] is not None:
        continue
    # A swap without a replacement or a validation score cannot be reported.
    if ing['cos_sim'] is None or not ing['match']:
        continue

    name = ing['ingredient']
    swap = ing['match'][0]
    vendors = ing['similar_vendor']
    score = ing['cos_sim'][0]

    if score >= SOLID_MATCH_THRESHOLD:
        print(f"The farmers market doesn't have {name}, but our algorithm thinks {swap} is a solid replacement!\n")
    elif score < SOLID_MATCH_THRESHOLD:
        print(f"The farmers market doesn't have {name}, but our algorithm thinks {swap} is the closest ingredient available")
        if vendors is not None:
            print("However, we are smart enough to tell that is not a great replacement, so you can also try asking at " +
                  f"these stores, who often have items similar to {name}: {vendors}\n")
        else:
            print(f"However, we are smart enough to tell that is not a great replacement... I hate to say it, but {name} might require a trip to a traditional grocery store\n")

print('Based on these suggestions, inputting all the similar ingredients (use your discretion) for this grocery list\n')
for place, items in wheretoshop.items():
    print(place + ": " + ', '.join(items))

The farmers market doesnt have bell pepper, but our algorithm thinks zucchini is a solid replacement!

The farmers market doesnt have sesame oil, but our algorithm thinks rice wine vinegar is the closest ingredient available
However, we are smart enough to tell that is not a great replacement, so you can also try asking at these these stores, who often have items similar to sesame oil: ['Kimchee Harvest / East Branch Farms']

The farmers market doesnt have reduced sodium soy sauce, but our algorithm thinks shao hsing wine is the closest ingredient available
However, we are smart enough to tell that is not a great replacement... I hate to say it, but reduced sodium soy sauce might require a trip to a traditional grocery store

Based on these suggestions, inputting all the similar ingredients (use your descretion) for this grocery list

Abundance Acres Farm: chicken breast
Alewife Farm: zucchini, broccoli florets, mushroom, ginger
Grazin Angus Acres: oil
keith's farm: garlic
Anthony Road