Objective:
- group recipes into topics
- input: prompt topic query
- output: list of recommeded recipes

## Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Set Options

In [16]:
import warnings
warnings.filterwarnings('ignore') # ignore warnings

In [17]:
pd.set_option('display.max_columns', None) # displays all pd columns

## Explore Full Dataset

In [3]:
dataset_full = pd.read_csv('dataset/full_dataset.csv')

In [6]:
dataset_full.shape
# over 2mil obs

(2231142, 7)

In [18]:
dataset_full.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [7]:
dataset_full.isnull().sum().sort_values(axis=0, ascending=False)
# no missing values - very good for 2mil

Unnamed: 0     0
title          0
ingredients    0
directions     0
link           0
source         0
NER            0
dtype: int64

## Explore Subset

In [8]:
# subset with sample - get 10k obs
recipe_10k = dataset_full.sample(n=10000, random_state=10)

In [9]:
recipe_10k.shape

(10000, 7)

In [19]:
recipe_10k.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
869976,869976,Oatmeal Pie,"[""3 eggs"", ""2/3 c. granulated sugar"", ""1 c. li...","[""With electric mixer, combine eggs, sugars, v...",www.cookbooks.com/Recipe-Details.aspx?id=180165,Gathered,"[""eggs"", ""granulated sugar"", ""light brown suga..."
798114,798114,Old-Fashioned Lemonade,"[""4 lemons"", ""3/4 c. sugar"", ""4 c. cold water""]","[""Cut lemons into very thin slices. Remove see...",www.cookbooks.com/Recipe-Details.aspx?id=967704,Gathered,"[""lemons"", ""sugar"", ""cold water""]"
35937,35937,Instant Cocoa Mix,"[""2 3/4 c. instant nonfat dry milk"", ""1 1/2 c....","[""Sift together all ingredients."", ""To serve, ...",www.cookbooks.com/Recipe-Details.aspx?id=341684,Gathered,"[""milk"", ""instant cocoa"", ""nondairy coffee"", ""..."
1737298,1737298,Blueberry & Lemon Zest Biscuits-Grain/Diary Free,"[""2- 1/2 cups Almond Flour"", ""1/2 teaspoons Sa...","[""First off, Honeville Almond Flour has the be...",tastykitchen.com/recipes/special-dietary-needs...,Recipes1M,"[""Flour"", ""Salt"", ""Baking Soda"", ""Coconut Oil""..."
959447,959447,Chicken Masala Curry,"[""2 cups onions, thinly sliced"", ""4 tablespoon...","[""In a wok, dutch oven or deep saute pan, fry ...",www.food.com/recipe/chicken-masala-curry-493916,Gathered,"[""onions"", ""vegetable oil"", ""garlic"", ""ginger""..."


In [21]:
recipe_10k.iloc[0]

Unnamed: 0                                                869976
title                                                Oatmeal Pie
ingredients    ["3 eggs", "2/3 c. granulated sugar", "1 c. li...
directions     ["With electric mixer, combine eggs, sugars, v...
link             www.cookbooks.com/Recipe-Details.aspx?id=180165
source                                                  Gathered
NER            ["eggs", "granulated sugar", "light brown suga...
Name: 869976, dtype: object

In [22]:
recipe_10k.iloc[0][2]

'["3 eggs", "2/3 c. granulated sugar", "1 c. light brown sugar (not packed)", "1 tsp. vanilla", "1/2 c. milk", "2 Tbsp. butter, softened", "2/3 c. quick-cooking rolled oats (uncooked)", "2/3 c. flaked coconut", "1/2 c. chopped pecans", "1 (9-inch) deep dish or 1 (10-inch) pie shell (unbaked)", "vanilla ice cream (optional)", "caramel sauce (optional)"]'

In [23]:
recipe_10k.iloc[0][3]

'["With electric mixer, combine eggs, sugars, vanilla, milk and butter; mix until well blended.", "Stir in oats, coconut and pecans; stir to mix well.", "Pour into unbaked pie shell.", "Bake in preheated 350\\u00b0 oven for 45 minutes or until pie is set and lightly browned.", "Serve with vanilla ice cream topped with caramel sauce, if desired.", "Makes 1 pie, 6 to 8 servings."]'

In [24]:
recipe_10k.iloc[0][4]

'www.cookbooks.com/Recipe-Details.aspx?id=180165'

In [26]:
recipe_10k.iloc[0][6]

'["eggs", "granulated sugar", "light brown sugar", "vanilla", "milk", "butter", "quick-cooking", "flaked coconut", "pecans", "pie shell", "vanilla ice cream", "caramel sauce"]'

In [27]:
# could use the 'NER' col instead of extracting from 'ingredients' col

## Try Bertopic

In [49]:
from bertopic import BERTopic
# cannot install bertopic -> try on collab or sparks later

## Try LDA

In [32]:
import gensim

In [39]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tamtrinh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [42]:
stemmer = SnowballStemmer

In [43]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [45]:
# processed_docs = recipe_10k['NER'].map(preprocess)
# processed_docs[:10]

In [46]:
processed_docs = recipe_10k['NER']

In [47]:
dictionary = gensim.corpora.Dictionary(processed_docs)

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)