In [1]:

import string
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import preprocessing


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\L\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Define the tokenizer shared by Word2Vec training and TF-IDF vectorization

# NLTK's English stop-word list; recipe_tokenizer drops any token found here.
ENGLISH_STOP_WORDS = stopwords.words('english')
# Single Porter stemmer instance reused by every recipe_tokenizer call.
stemmer = nltk.stem.PorterStemmer()

def recipe_tokenizer(sentence):
    """Turn a free-text string into a list of stemmed, stop-word-free tokens.

    Every character in ``string.punctuation`` is replaced by a space, the
    text is lower-cased and split on whitespace, tokens present in
    ``ENGLISH_STOP_WORDS`` are discarded, and the survivors are stemmed with
    the module-level ``stemmer``.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if nothing survives
        the filtering.
    """
    # A single translate() pass swaps all punctuation for spaces, and lower()
    # runs once instead of once per punctuation mark.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    normalized = sentence.translate(punctuation_to_space).lower()

    # split() without a separator splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings, so "a\tb" becomes two
    # tokens instead of one token with an embedded tab.
    return [
        stemmer.stem(word)
        for word in normalized.split()
        if word not in ENGLISH_STOP_WORDS
    ]

In [3]:
# Function for word embedding using Word2Vec
def word_embedding(sampled_data, column, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a Word2Vec model on one text column and return its word vectors.

    Each value of ``sampled_data[column]`` is tokenized with
    ``recipe_tokenizer`` and the resulting token lists are used as the
    training sentences.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Frame containing the text column.
    column : str
        Name of the column whose text is used for training.
    vector_size, window, min_count, workers, seed :
        Passed straight through to ``gensim.models.Word2Vec``. The defaults
        reproduce the values that were previously hard-coded (``seed=1`` is
        gensim's own default).

    Returns
    -------
    dict
        Maps every word in the trained vocabulary to its embedding vector.

    Notes
    -----
    Per the gensim documentation, training with more than one worker thread
    is not guaranteed to be bit-for-bit reproducible even with a fixed seed;
    pass ``workers=1`` if that matters.
    """
    tokenized_data = sampled_data[column].apply(recipe_tokenizer)

    model = Word2Vec(
        tokenized_data.tolist(),
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

    word_vectors = model.wv
    return {word: word_vectors[word] for word in word_vectors.index_to_key}

In [4]:
def precompute_embeddings(sampled_data):
    """Build and cache the feature matrix used by the similarity search.

    For every row of ``sampled_data`` two feature groups are computed and
    concatenated column-wise, in this order:

    1. TF-IDF weights of the lower-cased concatenation of
       'WWEIA Category description', 'name_of_eating_occasion' and
       'source_of_food' (tokenized with ``recipe_tokenizer``, ``min_df=5``).
    2. The mean Word2Vec vector of the tokens in 'Main food description'
       (a zero vector when no token has an embedding).

    The dense matrix is pickled to 'combined_embeddings.pkl' and the fitted
    vectorizer to 'tfidf_vectorizer.pkl', both relative to the current
    working directory. Rows of the matrix are in the same order as the rows
    of ``sampled_data``.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Must contain the four columns named above. It is read only; no
        helper column is added to the caller's frame.

    Returns
    -------
    None
    """
    description_column = 'Main food description'
    embeddings = word_embedding(sampled_data, description_column)

    # Build the TF-IDF input as a local Series so the caller's DataFrame is
    # left untouched (no side-effect column, no chained-assignment warning
    # when the frame is a slice of another frame).
    text_data = (
        sampled_data[['WWEIA Category description', 'name_of_eating_occasion', 'source_of_food']]
        .astype(str)
        .agg(' '.join, axis=1)
        .str.lower()
    )

    vectorizer = TfidfVectorizer(min_df=5,
                                 tokenizer=recipe_tokenizer)
    vectorized_data = vectorizer.fit_transform(text_data)

    # Take the embedding width from the trained vectors instead of repeating
    # the Word2Vec vector size as a literal; 100 is only the fallback for an
    # empty vocabulary.
    embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 100
    zero_vector = np.zeros(embedding_dim)

    ingredient_embeddings = []
    for ingredients in sampled_data[description_column]:
        word_vectors = [embeddings[word] for word in recipe_tokenizer(ingredients) if word in embeddings]
        # Descriptions with no known token fall back to the zero vector.
        ingredient_embeddings.append(np.mean(word_vectors, axis=0) if word_vectors else zero_vector)

    combined_embeddings = np.concatenate([vectorized_data.toarray(), np.array(ingredient_embeddings)], axis=1)

    with open('combined_embeddings.pkl', 'wb') as f:
        pickle.dump(combined_embeddings, f)

    with open('tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

In [5]:
# Load the food-preference dataset and keep only the columns the similarity search reads.
df = preprocessing.load_food_pref_dataset()[
    [
        'usda_food_code',
        'WWEIA Category description',
        'name_of_eating_occasion',
        'source_of_food',
        'Main food description',
    ]
]


In [6]:
# Keep one row per USDA food code. Reassigning the result (rather than
# inplace=True) avoids mutating a frame that was derived by column selection
# and follows the pandas recommendation to prefer the returned copy.
df = df.drop_duplicates(subset=['usda_food_code'])

In [7]:
# Preview the first five rows of the de-duplicated food table.
df.head(n=5)

Unnamed: 0,usda_food_code,WWEIA Category description,name_of_eating_occasion,source_of_food,Main food description
0,28320300,Soups,Dinner,Store - grocery/supermarket,"Pork with vegetable excluding carrots, broccol..."
11,91746110,Candy containing chocolate,Snack,Child/Adult care center,"Chocolate candy, candy shell with nuts"
55,58106210,Pizza,Lunch,Child/Adult care center,"Pizza, cheese, from restaurant or fast food, N..."
59,64104010,Apple juice,Snack,Store - grocery/supermarket,"Apple juice, 100%"
892,11710801,"Formula, ready-to-feed",Dinner,Store - grocery/supermarket,"Toddler formula, PediaSure"


In [8]:
def load_embeddings_and_vectorizer():
    """Read the cached feature matrix and fitted TF-IDF vectorizer from disk.

    Both objects are unpickled from the current working directory:
    'combined_embeddings.pkl' first, then 'tfidf_vectorizer.pkl'.

    Returns
    -------
    tuple
        ``(combined_embeddings, vectorizer)`` in that order.

    Raises
    ------
    FileNotFoundError
        If either pickle file does not exist.

    Notes
    -----
    Unpickling executes code stored in the file, so only load pickle files
    you created yourself.
    """
    loaded_objects = []
    for pickle_name in ('combined_embeddings.pkl', 'tfidf_vectorizer.pkl'):
        with open(pickle_name, 'rb') as handle:
            loaded_objects.append(pickle.load(handle))

    cached_matrix, fitted_vectorizer = loaded_objects
    return cached_matrix, fitted_vectorizer


def find_similar_recipes(sampled_data, user_input, num_similar=10):
    """Return the food descriptions most similar to a free-text query.

    The cached feature matrix and TF-IDF vectorizer are loaded from disk. If
    the cache files are missing, or the cached matrix has a different number
    of rows than ``sampled_data``, they are rebuilt with
    ``precompute_embeddings(sampled_data)`` (which overwrites the files).

    The query is lower-cased, transformed with the cached vectorizer,
    right-padded with zeros to the width of the cached matrix, and compared
    to every cached row by cosine similarity.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Frame whose rows correspond positionally to the cached matrix rows;
        must contain 'Main food description'.
    user_input : str
        Free-text query.
    num_similar : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of str
        'Main food description' values ordered from most to least similar.

    Notes
    -----
    NOTE(review): only the TF-IDF part of the query is populated; the columns
    that hold the Word2Vec description embeddings are zero for the query, so
    those embeddings influence the score only through each cached row's norm.
    Matching on the description itself would require persisting the word
    vectors and embedding the query with them.
    """
    try:
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()
        # Rows are looked up positionally below, so a cache built from a
        # different-sized frame would give wrong or out-of-range results.
        # (Row count is a cheap check; it cannot detect same-sized but
        # different data.)
        cache_is_stale = combined_embeddings.shape[0] != len(sampled_data)
    except FileNotFoundError:
        cache_is_stale = True

    if cache_is_stale:
        precompute_embeddings(sampled_data)
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()

    user_vectorized_data = vectorizer.transform([user_input.lower()]).toarray()

    # The cached matrix has extra trailing columns (description embeddings)
    # that the vectorizer does not produce; pad so the widths agree.
    num_missing_features = combined_embeddings.shape[1] - user_vectorized_data.shape[1]
    if num_missing_features > 0:
        user_vectorized_data = np.pad(user_vectorized_data, ((0, 0), (0, num_missing_features)))

    similarity_scores = cosine_similarity(user_vectorized_data, combined_embeddings)[0]

    # Slice the ranked indices first so only the requested rows are looked up.
    top_indices = similarity_scores.argsort()[::-1][:num_similar]

    return sampled_data.iloc[top_indices]['Main food description'].tolist()

In [9]:
# Example query: the ten foods most similar to "Oatmeal".
find_similar_recipes(df, "Oatmeal", num_similar=10)



['Oatmeal, multigrain',
 'Oatmeal, instant, maple flavored, fat added',
 'Oatmeal, instant, maple flavored, no added fat',
 'Oatmeal, NFS',
 'Oatmeal, regular or quick, made with non-dairy milk, no added fat',
 'Oatmeal, regular or quick, made with non-dairy milk, fat added',
 'Oatmeal, regular or quick, made with milk, fat added',
 'Oatmeal, regular or quick, made with water, fat added',
 'Oatmeal, regular or quick, made with water, no added fat',
 'Oatmeal, instant, plain, made with non-dairy milk, no added fat']