In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# INGR SIM

In [2]:
# load spacy english-long model
nlp = spacy.load('en_core_web_lg')

In [3]:
# read the ingr map
ingr_map = pd.read_pickle('ingr_map.pkl')

In [4]:
# create 2 lists
## ingr id list and ingr_name list

ingrs_df = ingr_map[['replaced','id']].drop_duplicates()

ingrs_id = ingrs_df['id'].to_list()
ingrs_name = ingrs_df['replaced'].to_list()

In [7]:
%%time
# convert each ingredient into an embedding (a 1x300 vector) using scipy word2vec
# stack all vectors to create a matrix (Nx300) where N=num ingredients

out = nlp(ingrs_name[0]).vector

for ing in ingrs_name[1:]:
    ing_vector = nlp(ing).vector
    out = np.vstack((out,ing_vector))

Wall time: 50.1 s


In [8]:
# print shape of resulting matrix
out.shape

(8023, 300)

In [9]:
# compute pairewise cos similarity using sklearn
sim = cosine_similarity(out)
np.fill_diagonal(sim,0) # fill diagonal with zeros

In [10]:
# sim matrix
sim

array([[0.        , 0.45709878, 0.41759226, ..., 0.        , 0.315817  ,
        0.        ],
       [0.45709878, 0.        , 0.77123934, ..., 0.        , 0.13233519,
        0.        ],
       [0.41759226, 0.77123934, 0.        , ..., 0.        , 0.12659758,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.315817  , 0.13233519, 0.12659758, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [11]:
# print shape of resulting pairewise similarity matrix
sim.shape

(8023, 8023)

In [12]:
# create a dataframe with 4 cols
#- ingr_id
#- ingr_name
#- sim_ingr_id
#- sim_ingr_name
#- similarity

ing_id = []
sim_ing_id = []
ing_name = []
sim_ing_name = []
sim_score = []

for i in range(sim.shape[0]):
     
    top3_sim_ids   = [ingrs_id[i]   for i in np.argsort(-sim[i])[:3]]
    top3_sim_names = [ingrs_name[i] for i in np.argsort(-sim[i])[:3]]
    sim_scores     = [i for i in -np.sort(-sim[i])[:3]]
    
    if sum(sim_scores) > 0:
        for j in range(3):
            ing_id.append(ingrs_id[i])
            ing_name.append(ingrs_name[i])
            
            sim_ing_id.append(top3_sim_ids[j])
            sim_ing_name.append(top3_sim_names[j])
            sim_score.append(sim_scores[j])
    
data_tuples2 = list(zip(ing_id,ing_name,sim_ing_id,sim_ing_name,sim_score))
ingr_sim_df = pd.DataFrame(data_tuples2, columns=['ing_id','ing_name','sim_ing_id','sim_ing_name','similarity'])

In [13]:
ingr_sim_df

Unnamed: 0,ing_id,ing_name,sim_ing_id,sim_ing_name,similarity
0,4308,lettuce,6161,romaine leaf,0.826433
1,4308,lettuce,6754,spinach,0.801966
2,4308,lettuce,6763,spinach pastum,0.801966
3,2744,french vanilla pudding and pie filling mix,7478,vanilla instant pudding and pie filling mix,0.965304
4,2744,french vanilla pudding and pie filling mix,348,banana cream pudding and pie filling mix,0.958920
...,...,...,...,...,...
23608,3318,goose,2468,duck,0.700905
23609,3318,goose,7920,wild duck,0.639045
23610,750,brinjal,254,aubergine,0.578521
23611,750,brinjal,4994,okra,0.569084


In [195]:
#ingr_sim_df.to_csv('ingr_sim_v2.csv')

# END