## Method1
#### Use Word2Vec embedding * Idf weights (ingredients)

In [1]:
import pandas as pd
import numpy as np

### 1. Process data

In [2]:
class recipe_ingredient_data(object):
    def __init__(self, ingredient_path = "dataset/ingredient.csv", reciepe_path = "dataset/recipe.csv"):
        self.ingredient_path = ingredient_path
        self.reciepe_path = reciepe_path
        
    def read_data(self):
        self.ingredient_df = pd.read_csv(self.ingredient_path)
        self.recipe_df = pd.read_csv(self.reciepe_path)
        print("Finsh reading raw ingredient and recipe data")
    
    def generate_recipe_ing_map(self,col_Name):
        # generate map {'recipe_id': {[ingredient1,ingredient2... ]}}
        recipe_ing_map = {}
        for ind, row in self.ingredient_df.iterrows(): 
            if row['recipe_id'] in recipe_ing_map.keys():
                recipe_ing_map.get(row['recipe_id']).add(row[col_Name])
            else:
                recipe_ing_map[row['recipe_id']] = set([row[col_Name]])
        print("Finish generating map for column: "+col_Name)
        return recipe_ing_map
    
    def prepare_data(self):
        # prepare all the data and maps
        self.read_data()
        recipe_map = self.generate_recipe_ing_map(col_Name='nutrition_key')
        recipe_map_raw =  self.generate_recipe_ing_map(col_Name='ingredient')
        # Read as pandas df
        self.recipe_ing_df = pd.DataFrame(data = recipe_map.items(),columns=['recipe_id','ingredients'])
        self.recipe_ing_raw_df = pd.DataFrame(data = recipe_map_raw.items(),columns=['recipe_id','ingredients_raw'])
        # convert from set to list for ingredients
        self.recipe_ing_df.ingredients = self.recipe_ing_df.ingredients.apply(list)
        self.recipe_ing_raw_df.ingredients_raw = self.recipe_ing_raw_df.ingredients_raw.apply(list)
    
    def prepare_corpus(self):
        # prepare corpus for Word2Vec Traning, iterrate, sort and aggregate
        # input: recipe_ing_df (['apple','jam','banana'])  output: [['apple','banana','jam']] -- sorted
        corpus_sorted = []
        for doc in self.recipe_ing_df.ingredients.values:
            doc.sort()
            corpus_sorted.append(doc)
        self.corpus = corpus_sorted
        print("Corpus is prepaed")

    def calculate_corpus_window_size(self):
        # For more precise/customized trainning on Word2Vec traning
        lengths = [len(doc) for doc in self.corpus]
        avg_len = float(sum(lengths)) / len(lengths)
        print("window size: "+ str(round(avg_len)))
        self.window_size = round(avg_len)



In [3]:
data = recipe_ingredient_data()
data.prepare_data()
data.prepare_corpus()
data.calculate_corpus_window_size()

Finsh reading raw ingredient and recipe data
Finish generating map for column: nutrition_key
Finish generating map for column: ingredient
Corpus is prepaed
window size: 9


### 2. Train Word2Vec models and save the model

In [4]:
from gensim.models import Word2Vec

In [5]:
def train_Word2Vec(corpus,window_size,vector_size=50,model="cbow",save=True):
    if model.lower() == 'cbow':
        model = Word2Vec(corpus, sg=0, workers=8, window=window_size, min_count=1, vector_size=vector_size)
        model_name = "model_cbow"
    else:  # skip-gram
        model = Word2Vec(corpus, sg=1, workers=8, window=window_size, min_count=1, vector_size=vector_size)
        model_name = "model_sg"
    print("Word2Vec model finish traning")
    if save:
        model.save("models/"+model_name+".bin")
        print("Word2Vec model weight has been saved in path:models/"+model_name+".bin")
    return model

In [6]:
model = train_Word2Vec(corpus=data.corpus, window_size=data.window_size,model="cbow",save=True)

Word2Vec model finish traning
Word2Vec model weight has been saved in path:models/model_cbow.bin


In [7]:
model.wv.most_similar("cheese")

[('mayonnaise', 0.998694658279419),
 ('mustard', 0.9986177682876587),
 ('salsa', 0.9984608888626099),
 ('mushroom', 0.9984300136566162),
 ('bread', 0.9984230995178223),
 ('green beans', 0.9984179735183716),
 ('almonds', 0.9984093308448792),
 ('olives', 0.9983571767807007),
 ('garden salad', 0.9983487725257874),
 ('cottage cheese', 0.9983158111572266)]

In [8]:
def load_model(path = "models/model_cbow.bin"):
    model = Word2Vec.load(path)
#     model.init_sims(replace=True)
    if model:
        print("Successfully loaded model from "+ path)
    return model
model_loaded = load_model()

Successfully loaded model from models/model_cbow.bin


### 3. Ingredient Embedding

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [10]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word_model):
        self.word_model = word_model
        self.word_idf_weight = None
        self.vector_size = word_model.wv.vector_size
        self.recipe_vecs = None

    def generate_idf_weight(self, corpus):
        # generate a map for the idf weights for each ingredients
        # traning input: [['apple','bananas','jam']]
        text_docs = []
        for doc in corpus:
            text_docs.append(" ".join(doc))

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs) 

        # for unknown word - assign the max idf 
        max_idf = max(tfidf.idf_)  
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()],
        )

    def transform(self, docs):
        # get idf-weighted mean word vectors 
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector

    def word_average(self, sent):
        # use the mean of each ingredients' word embedding as the ingredient embedding
        mean = []
        for word in sent:
            if word in self.word_model.wv.index_to_key:
                mean.append(
                    self.word_model.wv.get_vector(word) * self.word_idf_weight[word]
                )  # idf weighted

        if not mean:  # empty words
            print("Empty words! "+sent)
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean

    def word_average_list(self, docs):
        return np.vstack([self.word_average(sent) for sent in docs])
    
    def generate_all_recipe_embeddings(self, corpus):
        self.recipe_vecs = self.transform(corpus)
        self.recipe_vecs = [recipe_vec.reshape(1, -1) for recipe_vec in self.recipe_vecs]
        assert len(self.recipe_vecs) == len(corpus)
        print("TfidfEmbeddingVectorizer model finish training")

In [11]:
tfidf_vec_tr = TfidfEmbeddingVectorizer(model)
tfidf_vec_tr.generate_idf_weight(data.corpus)
tfidf_vec_tr.generate_all_recipe_embeddings(data.corpus)
len(tfidf_vec_tr.recipe_vecs)

TfidfEmbeddingVectorizer model finish training


2256

In [17]:
tfidf_vec_tr.word_idf_weight

defaultdict(<function __main__.TfidfEmbeddingVectorizer.generate_idf_weight.<locals>.<lambda>()>,
            {'barbecue': 3.551307781779384,
             'sauce': 3.097774270630197,
             'blue': 5.28780457233239,
             'cheese': 2.193833859194985,
             'butter': 2.21750360328089,
             'carrot': 3.5177850897407406,
             'celery': 3.901510211212499,
             'chicken': 3.1086636704294657,
             'cream': 2.3674217360201855,
             'flour': 2.5605844551224592,
             'garlic': 2.2295519417970646,
             'onions': 2.075401261969806,
             'pepper': 1.9238513638426054,
             'potatoes': 3.30125677754525,
             'bread': 3.112319981632576,
             'cheddar': 4.012261575505201,
             'mayonnaise': 4.2109322703006855,
             'almonds': 4.404303663281225,
             'avocado': 3.9855933284230405,
             'berries': 5.677269339094113,
             'fennel': 5.32059439515538,
         

### 4. Recommendation

In [12]:
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [13]:
def get_topN_recommendations(N, scores,data,exclude=False):
    # Get the highest N scores, if RR; then exclude itself
    if exclude:
        top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[1:N+1]
    else:
        top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["score","recipe_id","recipe", "ingredients", "sum_cal","sum_fat","sum_carb","sum_protein", "url"])
    count = 0
    print(top)
    for i in top: # i is row index for score (recipe_ing_df)
        recipe_id = data.recipe_ing_df['recipe_id'][i]
        recommendation.at[count, "recipe_id"] = recipe_id
        recommendation.at[count, "score"] = round(scores[i],5)
        recommendation.at[count, "ingredients"] =  data.recipe_ing_df['ingredients'][i]       
        
        recipe_row_ind =  data.recipe_df.loc[ data.recipe_df['recipe_id'] == recipe_id].index[0]
        recommendation.at[count, "recipe"] = data.recipe_df["title"][recipe_row_ind]
        recommendation.at[count, "url"] = data.recipe_df["source_url"][recipe_row_ind]
        recommendation.at[count, "sum_cal"] = data.recipe_df["sum_cal"][recipe_row_ind]
        recommendation.at[count, "sum_carb"] = data.recipe_df["sum_carb"][recipe_row_ind]
        recommendation.at[count, "sum_protein"] = data.recipe_df["sum_protein"][recipe_row_ind]
        
        count += 1
    return recommendation

In [14]:
def get_recommendation_RR(recipe_id,tfidf_vec_tr,data, N=5,sim = "cosine_similarity",generate_json=False):
    
    print("Current Recipe: "+list(data.recipe_df.loc[data.recipe_df['recipe_id'] == recipe_id]['title'])[0])
    input_ings = list(data.recipe_ing_df.loc[data.recipe_ing_df['recipe_id'] == recipe_id].ingredients)[0]
    print(input_ings)
    # get embeddings for ingredients
    input_embedding = tfidf_vec_tr.transform([input_ings])[0].reshape(1, -1)
#     print(input_embedding.shape)

    # get cosine similarity between input embedding and all the document embeddings
    if sim == "cosine_similarity":
        print("similarity function used: cosine_similarity")
        scores = list(map(lambda x: cosine_similarity(input_embedding, x)[0][0], tfidf_vec_tr.recipe_vecs))
    else:
        print("similarity function used: euclidean_distances")
        scores = list(map(lambda x: 1/(1 + euclidean_distances(input_embedding, x)[0][0]), tfidf_vec_tr.recipe_vecs))
    # Filter top N recommendations
    recommendations = get_topN_recommendations(N, scores,data, exclude=True)
    
    if generate_json:
        ts = datetime.now().strftime("%Y_%m_%d_%H_%M")
        json_file_nm = "RR_results_"+ts+".json"
        recommendations.to_json("results/"+json_file_nm,orient='records',indent=2)
        print("Generated results/"+ json_file_nm)
        print("Please check the result here!")
    return recommendations

In [15]:
get_recommendation_RR(recipe_id='35171',tfidf_vec_tr=tfidf_vec_tr,data=data, N=5, sim = "cosine_similarity",generate_json=True)

Current Recipe: Buffalo Chicken Grilled Cheese Sandwich
['barbecue sauce', 'blue cheese', 'bread', 'butter', 'carrot', 'celery', 'cheddar cheese', 'chicken', 'mayonnaise', 'onions']
similarity function used: cosine_similarity
[2182, 0, 2123, 2183, 514]
Generated results/RR_results_2022_04_30_07_43.json
Please check the result here!


Unnamed: 0,score,recipe_id,recipe,ingredients,sum_cal,sum_fat,sum_carb,sum_protein,url
0,0.99993,35118,Bacon Wrapped Buffalo Chicken Jalapeno Poppers,"[bacon, barbecue sauce, blue cheese, butter, c...",322.876,,2.9183,22.9434,http://www.closetcooking.com/2012/03/bacon-wra...
1,0.99993,35169,Buffalo Chicken Chowder,"[barbecue sauce, blue cheese, butter, carrot, ...",1716.50441,,169.4496,172.055435,http://www.closetcooking.com/2011/11/buffalo-c...
2,0.99992,d22e47,Buffalo Roasted Turkey with Blue Cheese Sauce ...,"[barbecue sauce, blue cheese, butter, celery, ...",12646.744063,,58.004713,1764.629934,http://www.chow.com/recipes/30531-buffalo-roas...
3,0.99991,47119,Make-Ahead Muffin Melts,"[bacon, barbecue sauce, cheddar cheese, eggs, ...",1085.157258,,109.565971,37.366332,http://thepioneerwoman.com/cooking/2010/07/mak...
4,0.9999,35069,Ale and Cheddar Soup,"[bacon, barbecue sauce, bell peppers, butter, ...",1913.284979,,63.446049,105.137771,http://www.closetcooking.com/2012/03/ale-and-c...


In [16]:
get_recommendation_RR(recipe_id='35171',tfidf_vec_tr=tfidf_vec_tr,data=data, N=5, sim = "euc",generate_json=True)

Current Recipe: Buffalo Chicken Grilled Cheese Sandwich
['barbecue sauce', 'blue cheese', 'bread', 'butter', 'carrot', 'celery', 'cheddar cheese', 'chicken', 'mayonnaise', 'onions']
similarity function used: euclidean_distances
[1443, 1438, 598, 1213, 160]
Generated results/RR_results_2022_04_30_07_43.json
Please check the result here!


Unnamed: 0,score,recipe_id,recipe,ingredients,sum_cal,sum_fat,sum_carb,sum_protein,url
0,0.82899,54982,Strawberry BBQ Chicken Spinach and Quinoa Sala...,"[avocado, bacon, barbecue sauce, chicken, cila...",189.038,,23.921,11.724,http://feedproxy.google.com/~r/ClosetCooking/~...
1,0.8179,47155,CPK’s BBQ Chicken Pizza,"[barbecue sauce, cilantro, corn, corn on the c...",2748.666751,,131.613956,149.763663,http://thepioneerwoman.com/cooking/2010/03/cpk...
2,0.8148,36223,Dad&#8217;s Stuffed Bell Peppers,"[barbecue sauce, bell peppers, garlic, ground ...",2805.700261,,261.603,77.123117,http://www.simplyrecipes.com/recipes/dads_stuf...
3,0.81222,50724,The Best Bloody Mary,"[barbecue sauce, beets, carrot, celery, chili,...",1218.267509,,226.321922,29.55242,http://www.bonappetit.com/recipes/2009/09/the_...
4,0.8102,46892,Supreme Pizza Burgers,"[barbecue sauce, ground beef, italian dressing...",2939.958914,,69.45234,146.12338,http://thepioneerwoman.com/cooking/2012/10/sup...
