# Loading Things

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import scipy.sparse as sparse
import numpy as np

In [2]:
%%time
final_sparse = sparse.load_npz(r'/Users/giancarlotissot/code/mariliabreis/ingredient_matching/raw_data/sparse_final_df.npz')

# final_df = pd.DataFrame(from_sparse_df)

CPU times: user 75.4 ms, sys: 21.8 ms, total: 97.2 ms
Wall time: 165 ms


In [3]:
%%time
from_sparse_df = final_sparse.todense()

CPU times: user 1.24 s, sys: 2.58 s, total: 3.82 s
Wall time: 9.27 s


# Actual Code

In [7]:
df_pickle = pd.read_pickle('/Users/giancarlotissot/code/mariliabreis/ingredient_matching/raw_data/recipe_list/ingr_map.pkl')
ingredients_clean = df_pickle[['id','replaced','count','raw_ingr']]

In [8]:
def get_id(ingredient_str):
    # Transforms string input to pre-processed ID
    ingredient_id = ingredients_clean[ingredients_clean['raw_ingr'] ==ingredient_str]
    ingredient_id.reset_index(inplace=True)
    return ingredient_id.loc[0,'id']
    
def get_name(ingredient_id):
    # Transforms ID back to pre-processed string
    ingredient_name = ingredients_clean[ingredients_clean['id'] == ingredient_id]
    ingredient_name.reset_index(inplace=True)
    return ingredient_name.loc[0,'replaced']

In [9]:
### Old Version ###
# def output_func(input_ingredient,num_matches=10):
#     # Combines other functions into a workflow
#     num_matches += 1
    
#     if type(input_ingredient) == list:
#         id_input = []
#         for ingredient in input_ingredient:
#             id_input.append(get_id(ingredient))
#     else:
#         id_input = get_id(input_ingredient)
        
#     id_list = find_match(id_input,num_matches)
#     names = list_to_names(id_list)
    
#     names.pop(0)
#     return names


In [18]:
ingredients_clean

Unnamed: 0,id,replaced,count,raw_ingr
0,4308,lettuce,4507,"medium heads bibb or red leaf lettuce, washed,..."
1,4308,lettuce,4507,mixed baby lettuces and spring greens
2,4308,lettuce,4507,romaine lettuce leaf
3,4308,lettuce,4507,iceberg lettuce leaf
4,4308,lettuce,4507,red romaine lettuce
...,...,...,...,...
11654,6702,soybean,31,soybeans
11655,3318,goose,8,goose
11656,47,ajwain,13,ajwain
11657,750,brinjal,2,brinjals


In [35]:
def find_match(id_input,num_matches):
    # Returns list of ingredient IDs and count of occurances
    numpy_arr = np.nonzero(from_sparse_df[:,id_input])
    indices = (-final_sparse[numpy_arr[0],:].sum(axis=0).A1).argsort()[:num_matches]
    return indices
    
def list_to_names(ingredient_id_list):
    # Takes in list of ingredient IDs and returns list of names
    list_ = []
    for id in ingredient_id_list:
        list_.append(get_name(id))
    return list_

def be_adventurous(id_input):
    list_ = []
    for id in id_input:
        counts = ingredients_clean.loc[ingredients_clean['id']==id]['count'].to_list()
#         print('###',type(counts))
        list_.append(counts[0])
    return list_

def output_func(input_ingredient,num_matches=10,adventure=False):
    # Combines other functions into a workflow
    num_matches += 1
    
    if type(input_ingredient) != list:
        input_ingredient = [input_ingredient]
    id_input = []
    for ingredient in input_ingredient:
        id_input.append(get_id(ingredient))
    id_list = find_match(id_input,num_matches)
    names = list_to_names(id_list)

    return names[len(input_ingredient):]

In [36]:
be_adventurous([4308,6702,4061])

[4507, 31, 6]

In [11]:
%%time
output_func('salt')

CPU times: user 56.1 ms, sys: 10.1 ms, total: 66.3 ms
Wall time: 93.9 ms


['egg',
 'butter',
 'sugar',
 'onion',
 'pepper',
 'flmy',
 'olive oil',
 'water',
 'garlic clove',
 'milk']

In [12]:
%%time
output_func(['salt','pecorino pepato cheese'])

CPU times: user 222 ms, sys: 423 ms, total: 645 ms
Wall time: 974 ms


['butter',
 'sugar',
 'onion',
 'pepper',
 'flmy',
 'olive oil',
 'water',
 'garlic clove',
 'milk']

In [13]:
%%time
output_func('salt',10)

CPU times: user 55.1 ms, sys: 4.38 ms, total: 59.5 ms
Wall time: 66.8 ms


['egg',
 'butter',
 'sugar',
 'onion',
 'pepper',
 'flmy',
 'olive oil',
 'water',
 'garlic clove',
 'milk']

In [14]:
print(get_name(5308))
# print(get_id('tomatoes'))

pecorino pepato cheese


In [15]:
num_matches = 3
id_input = [7213,5010]
# for ingredient in 
numpy_arr = np.nonzero(from_sparse_df[:,id_input])
indices = (-final_sparse[numpy_arr[0],:].sum(axis=0).A1).argsort()[:num_matches]
indices

array([5010, 6270, 7213])

In [16]:
get_name(8021)

'zucchini'