# Loading Things

In [5]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np

In [6]:
%%time
final_sparse = sparse.load_npz(r'/Users/giancarlotissot/code/mariliabreis/ingredient_matching/raw_data/sparse_final_df.npz')

# final_df = pd.DataFrame(from_sparse_df)

CPU times: user 72.1 ms, sys: 21.1 ms, total: 93.1 ms
Wall time: 112 ms


In [7]:
%%time
from_sparse_df = final_sparse.todense()

CPU times: user 1.28 s, sys: 2.87 s, total: 4.16 s
Wall time: 7.42 s


# Actual Code

In [8]:
df_pickle = pd.read_pickle('/Users/giancarlotissot/code/mariliabreis/ingredient_matching/raw_data/recipe_list/ingr_map.pkl')
ingredients_clean = df_pickle[['id','replaced','count','raw_ingr']]

In [9]:
def get_id(ingredient_str):
    # Transforms string input to pre-processed ID
    ingredient_id = ingredients_clean[ingredients_clean['raw_ingr'] ==ingredient_str]
    ingredient_id.reset_index(inplace=True)
    return ingredient_id.loc[0,'id']
    
def get_name(ingredient_id):
    # Transforms ID back to pre-processed string
    ingredient_name = ingredients_clean[ingredients_clean['id'] == ingredient_id]
    ingredient_name.reset_index(inplace=True)
    return ingredient_name.loc[0,'replaced']

In [10]:
### Old Version ###
# def output_func(input_ingredient,num_matches=10):
#     # Combines other functions into a workflow
#     num_matches += 1
    
#     if type(input_ingredient) == list:
#         id_input = []
#         for ingredient in input_ingredient:
#             id_input.append(get_id(ingredient))
#     else:
#         id_input = get_id(input_ingredient)
        
#     id_list = find_match(id_input,num_matches)
#     names = list_to_names(id_list)
    
#     names.pop(0)
#     return names


In [11]:
ingredients_clean

Unnamed: 0,id,replaced,count,raw_ingr
0,4308,lettuce,4507,"medium heads bibb or red leaf lettuce, washed,..."
1,4308,lettuce,4507,mixed baby lettuces and spring greens
2,4308,lettuce,4507,romaine lettuce leaf
3,4308,lettuce,4507,iceberg lettuce leaf
4,4308,lettuce,4507,red romaine lettuce
...,...,...,...,...
11654,6702,soybean,31,soybeans
11655,3318,goose,8,goose
11656,47,ajwain,13,ajwain
11657,750,brinjal,2,brinjals


In [47]:
def find_match(id_input,num_matches,min_ingredients):
    # Returns list of ingredient IDs and count of occurances
    numpy_arr = np.nonzero(from_sparse_df[:,id_input])
    indices = (-final_sparse[numpy_arr[0],:].sum(axis=0).A1).argsort()[min_ingredients:num_matches+min_ingredients]
    return indices
    
def list_to_names(ingredient_id_list):
    # Takes in list of ingredient IDs and returns list of names
    list_ = []
    for id in ingredient_id_list:
        list_.append(get_name(id))
    return list_

def be_adventurous(id_input,adventure_criteria):
    list_ = []
    for id in id_input:
        counts = ingredients_clean.loc[ingredients_clean['id']==id]['count'].to_list()
#         print('###',type(counts))
        list_.append(counts[0])
    max_ingredients = round(max(list_) * 0.05)
    min_criteria = min([max([max_ingredients, 10]),adventure_criteria])
    return min_criteria

def output_func(input_ingredient,num_matches=10,adventure=False,adventure_criteria=20):
    # Combines other functions into a workflow
    num_matches += 1
    
    if type(input_ingredient) != list:
        input_ingredient = [input_ingredient]
    id_input = []
    for ingredient in input_ingredient:
        id_input.append(get_id(ingredient))
    min_ingredients = 0
    if adventure:
        min_ingredients = be_adventurous(id_input,adventure_criteria)
    id_list = find_match(id_input,num_matches,min_ingredients)
    names = list_to_names(id_list)

    return names[len(input_ingredient):]

In [55]:
%%time
output_func('lettuce',num_matches=20,adventure=True,adventure_criteria=30)

CPU times: user 90.2 ms, sys: 5.86 ms, total: 96.1 ms
Wall time: 117 ms


['parmesan cheese',
 'red wine vinegar',
 'soy sauce',
 'black olife',
 'chili powder',
 'hamburger bun',
 'fresh lemon juice',
 'fresh ground black pepper',
 'garlic powder',
 'butter',
 'honey',
 'fresh cilantro',
 'feta cheese',
 'cherry tomato',
 'worcestershire sauce',
 'cheese',
 'red bell pepper',
 'pickle',
 'ground cumin',
 'balsamic vinegar']

In [14]:
%%time
output_func(['lettuce', 'cucumber'])

CPU times: user 389 ms, sys: 1.01 s, total: 1.4 s
Wall time: 2.09 s


['salt',
 'tomato',
 'olive oil',
 'mayonnaise',
 'scallion',
 'onion',
 'garlic clove',
 'red onion',
 'sugar']

In [13]:
%%time
output_func('salt',10)

CPU times: user 55.1 ms, sys: 4.38 ms, total: 59.5 ms
Wall time: 66.8 ms


['egg',
 'butter',
 'sugar',
 'onion',
 'pepper',
 'flmy',
 'olive oil',
 'water',
 'garlic clove',
 'milk']

In [14]:
print(get_name(5308))
# print(get_id('tomatoes'))

pecorino pepato cheese


In [15]:
num_matches = 3
id_input = [7213,5010]
# for ingredient in 
numpy_arr = np.nonzero(from_sparse_df[:,id_input])
indices = (-final_sparse[numpy_arr[0],:].sum(axis=0).A1).argsort()[:num_matches]
indices

array([5010, 6270, 7213])

In [16]:
get_name(8021)

'zucchini'