# Loading and cleaning text data
## Import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import sklearn
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords 
from nltk.stem import LancasterStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer
#import spacy #for faster tokenization and lemmatization

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import re
import string

import functions

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Loading the RecipeBox dataset


### allrecipes

In [6]:
#the three files are scraped from foodnetwork, epicurious, and allrecipes.com
#start with allrecipes.com - took 18.4s to load, can consider putting it into database?
rbdata_ar = pd.read_csv('Food Ingredients and Recipe Dataset with Image Name Mapping.csv')
#rbdata_ar = pd.read_json(r'/Users/xinrucheng/Documents/GitHub/metisproject04/data/raw/recipes_raw/recipes_raw_nosource_ar.json')

In [7]:
rbdata_ar.head() #need to swap rows and columns
#features are title, ingredients, instructions, and picture_link(not needed?)

Unnamed: 0,Id,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [8]:
#rb_ar=rbdata_ar.T #can just do pandas transpose, don't need to set orient parameter in read_json!
#rb_ar.head()

In [9]:
#rb_ar.info()
rbdata_ar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13496 entries, 0 to 13495
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Id                   13496 non-null  int64 
 1   Title                13491 non-null  object
 2   Ingredients          13496 non-null  object
 3   Instructions         13488 non-null  object
 4   Image_Name           13496 non-null  object
 5   Cleaned_Ingredients  13496 non-null  object
dtypes: int64(1), object(5)
memory usage: 632.8+ KB


In [10]:
#reset index because don't want the scraped link as index
rbdata_ar.index = range(len(rbdata_ar))
rbdata_ar.head() 

Unnamed: 0,Id,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


## Data cleaning

### drop nulls from recipebox 

1. Clean allrecipes data

In [11]:
ardf=rbdata_ar.loc[:,['Title', 'Ingredients', 'Instructions','Image_Name']] 
ardf.head()

Unnamed: 0,Title,Ingredients,Instructions,Image_Name
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail


In [12]:
ardf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13496 entries, 0 to 13495
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         13491 non-null  object
 1   Ingredients   13496 non-null  object
 2   Instructions  13488 non-null  object
 3   Image_Name    13496 non-null  object
dtypes: object(4)
memory usage: 421.9+ KB


In [13]:
#still has ~300 null rows? drop those also
ardf=ardf.dropna(how='any')
ardf

#Now have 39522 observations (recipes) with no null values

#*still need to get rid of link as well as punctuations before further processing

#also the word ADVERTISEMENT shows up a lot - filter it out

Unnamed: 0,Title,Ingredients,Instructions,Image_Name
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail
...,...,...,...,...
13491,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408
13492,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...
13493,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...
13494,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344


In [14]:
data =ardf
data.tail()

Unnamed: 0,Title,Ingredients,Instructions,Image_Name
13491,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408
13492,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...
13493,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...
13494,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344
13495,"Mexican Poblano, Spinach, and Black Bean ""Lasa...",['12 medium to large fresh poblano chiles (2 1...,Lay 4 chiles on their sides on racks of gas bu...,mexican-poblano-spinach-and-black-bean-lasagne...


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13488 entries, 0 to 13495
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         13488 non-null  object
 1   Ingredients   13488 non-null  object
 2   Instructions  13488 non-null  object
 3   Image_Name    13488 non-null  object
dtypes: object(4)
memory usage: 526.9+ KB


### Reset index

In [16]:
data.index = range(len(data))
data.head() 

Unnamed: 0,Title,Ingredients,Instructions,Image_Name
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail


### Remove digits

In [18]:
data_ingr=data['Ingredients'].apply(functions.regex_nodigits_new)
data_ingr

0        [' (½–-lb.) whole chicken', '¾ tsp. kosher sal...
1        [' large egg whites', ' pound new potatoes (ab...
2        [' cup evaporated milk', ' cup whole milk', ' ...
3        [' (¾- to -pound) round Italian loaf, cut into...
4        [' teaspoon dark brown sugar', ' teaspoon hot ...
                               ...                        
13483    [' cup all-purpose flour', '/ cup unsweetened ...
13484    [' preserved lemon', ' / pound butternut squas...
13485    ['Leftover katsuo bushi (dried bonito flakes) ...
13486    [' stick (/ cup) plus  tablespoon unsalted but...
13487    [' medium to large fresh poblano chiles ( / lb...
Name: Ingredients, Length: 13488, dtype: object

## Vectorization

### Custom stopwords

In [19]:
set(stopwords.words('english')) #look at the nltk English stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [20]:
stopwords_nltk = stopwords.words('english')

In [21]:
len(stopwords_nltk)

179

In [22]:
stopwords_nltk.extend(['ADVERTISEMENT', 'advertisement']) #when run cell again, keeps getting extended
print(len(stopwords_nltk))

181


In [23]:
stopwords_nltk

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [24]:
#added advertisement to stopwords, next step is to remove numbers from data before Vectorization
#CountVectorizer removes punctuation and only keeps alphanumeric
#vectorizers and nltk tokenizer have options to set tokens with regex, or to remove stopwords

### Vectorize - Count Vectorizer

In [25]:
#countVectorizer 
#Converts a collection of text documents to a matrix of token counts
#input is expected to be the sequence strings or bytes items are expected to be analyzed directly.

In [26]:
#only work with merged ingredients dataset (from all 3 sources) for now
corpus = data_ingr
ct_vectorizer = CountVectorizer(stop_words=stopwords_nltk)

ingr_ct = ct_vectorizer.fit_transform(corpus)
print(ct_vectorizer.get_feature_names())

['_gravié_ra', 'aarons', 'aattachment', 'abita', 'abruzzese', 'abruzzo', 'absente', 'absinthe', 'absolut', 'abuelita', 'aburaage', 'abv', 'acacia', 'accent', 'acceptable', 'accommodate', 'accompaniment', 'accompaniments', 'accordian', 'according', 'achiote', 'acid', 'acidity', 'acini', 'ackee', 'acorn', 'across', 'acrylic', 'acting', 'activate', 'activated', 'active', 'acto', 'actually', 'adams', 'add', 'added', 'adding', 'addition', 'additional', 'additions', 'additive', 'additives', 'adds', 'adjoining', 'adjust', 'adjustable', 'adjusted', 'adjusting', 'adjustments', 'adobo', 'adrianascaravan', 'adult', 'adults', 'advance', 'advise', 'aejado', 'aerosol', 'afford', 'affumicata', 'african', 'africantradingco', 'afterwards', 'agar', 'agave', 'age', 'aged', 'agricole', 'agrodolce', 'aguardiente', 'agur', 'ahead', 'ahi', 'aid', 'ail', 'aioli', 'air', 'airtight', 'airy', 'aisle', 'aitchbone', 'ajat', 'aji', 'ajika', 'ajowan', 'ajvar', 'ajwain', 'ají', 'ajíes', 'ajíí', 'aka', 'akkawi', 'al',

### Vectorize - TF-IDF
might work better, because want to choose words "unique" to this document

TF-IDF = (Term Frequency) * (Inverse Document Frequency)

(how often word occurs in this doc) * inverse of (how often this word occurs in all documents)

In [27]:
tf_vectorizer = TfidfVectorizer(stop_words=stopwords_nltk)
ingr_tfidf = tf_vectorizer.fit_transform(corpus)

print(tf_vectorizer.get_feature_names())

['_gravié_ra', 'aarons', 'aattachment', 'abita', 'abruzzese', 'abruzzo', 'absente', 'absinthe', 'absolut', 'abuelita', 'aburaage', 'abv', 'acacia', 'accent', 'acceptable', 'accommodate', 'accompaniment', 'accompaniments', 'accordian', 'according', 'achiote', 'acid', 'acidity', 'acini', 'ackee', 'acorn', 'across', 'acrylic', 'acting', 'activate', 'activated', 'active', 'acto', 'actually', 'adams', 'add', 'added', 'adding', 'addition', 'additional', 'additions', 'additive', 'additives', 'adds', 'adjoining', 'adjust', 'adjustable', 'adjusted', 'adjusting', 'adjustments', 'adobo', 'adrianascaravan', 'adult', 'adults', 'advance', 'advise', 'aejado', 'aerosol', 'afford', 'affumicata', 'african', 'africantradingco', 'afterwards', 'agar', 'agave', 'age', 'aged', 'agricole', 'agrodolce', 'aguardiente', 'agur', 'ahead', 'ahi', 'aid', 'ail', 'aioli', 'air', 'airtight', 'airy', 'aisle', 'aitchbone', 'ajat', 'aji', 'ajika', 'ajowan', 'ajvar', 'ajwain', 'ají', 'ajíes', 'ajíí', 'aka', 'akkawi', 'al',

# Dimensionality Reduction

## Recipe Box (120k dataset)

* CV+LSA
* CV+NMF
* TFIDF+LSA
* TFIDF+NMF
* LDA?

### Count Vectorized LSA, RB data

In [28]:
#dim reduction (LSA) on rb data, ingredients column only:
#corpus = data_ingr  in 1.4.2
#ct_vectorizer = CountVectorizer(stop_words=stopwords_nltk)
#ingr_ct = ct_vectorizer.fit_transform(corpus)

lsa_10 = TruncatedSVD(10)  
rb_lsa_10 = lsa_10.fit_transform(ingr_ct)
lsa_10.explained_variance_ratio_

array([0.08084526, 0.0625493 , 0.03859407, 0.02991516, 0.02633464,
       0.02384253, 0.02024997, 0.01821922, 0.01601517, 0.01420611])

In [29]:
functions.display_topics(lsa_10, ct_vectorizer.get_feature_names(), 10) 


Topic  0
cup, teaspoon, tablespoons, chopped, cups, salt, fresh, oil, large, tablespoon

Topic  1
cup, sugar, teaspoon, butter, flour, unsalted, vanilla, cream, purpose, extract

Topic  2
teaspoon, tablespoons, ground, teaspoons, tablespoon, salt, pepper, black, cups, powder

Topic  3
ground, teaspoon, tsp, tbsp, pepper, salt, kosher, freshly, black, oil

Topic  4
chopped, teaspoon, cup, finely, fresh, parsley, coarsely, onion, tablespoon, cilantro

Topic  5
cups, chopped, large, finely, tsp, tbsp, inch, cut, ounces, butter

Topic  6
inch, cut, fresh, teaspoon, peeled, sliced, cups, pieces, cup, pound

Topic  7
fresh, lemon, juice, cups, tsp, tbsp, grated, zest, finely, lime

Topic  8
cups, ounces, sliced, thinly, teaspoons, divided, oil, ounce, cheese, cup

Topic  9
sliced, thinly, teaspoon, tbsp, tsp, sugar, sauce, finely, ounces, chopped


CtVectorizer purely based on counts, quantities (teaspoons) show up the most! TF-IDF might make more sense. Also could add more stopwords

### TF-IDF, LSA

In [30]:
#TF-IDF from 1.4.3 above:
#vectorizer = TfidfVectorizer(stop_words=stopwords_nltk)
#ingr_tfidf = vectorizer.fit_transform(corpus)

#lsa_10 = TruncatedSVD(10)  
tf_lsa_10 = lsa_10.fit_transform(ingr_tfidf)
lsa_10.explained_variance_ratio_

array([0.00985606, 0.02626309, 0.0146443 , 0.00995802, 0.00939645,
       0.00872242, 0.00821925, 0.00748508, 0.00661511, 0.00637331])

In [31]:
functions.display_topics(lsa_10, tf_vectorizer.get_feature_names(), 10)


Topic  0
cup, teaspoon, tablespoons, chopped, cups, fresh, salt, tablespoon, oil, ground

Topic  1
sugar, vanilla, teaspoon, flour, cup, extract, butter, unsalted, baking, purpose

Topic  2
tsp, tbsp, oz, lb, plus, kosher, serving, virgin, extra, sliced

Topic  3
teaspoon, ground, pepper, freshly, black, salt, kosher, powder, baking, teaspoons

Topic  4
lemon, finely, fresh, juice, grated, teaspoon, zest, chopped, cup, ground

Topic  5
juice, lime, sauce, sliced, seeds, thinly, tablespoon, sugar, ginger, teaspoon

Topic  6
chopped, sauce, finely, cilantro, cup, onion, lime, rice, diced, oz

Topic  7
sliced, thinly, ounces, cup, chocolate, divided, vinegar, red, ounce, cheese

Topic  8
teaspoon, oz, extra, virgin, red, inch, cup, olive, lb, vinegar

Topic  9
ounces, ounce, cups, ground, cheese, divided, teaspoon, grated, lime, milk


In [32]:
#still get lots of tablespoons and cups, need more cleaning
#put them into stopwords? table/teaspoons, cups, ounces
#OR: do stemming before tokenization? phrase tagging/lemmatization? only keep nouns of ingr names

### Count Vectorized NMF, RB data

In [33]:
nmf_10 = NMF(10,init='nndsvda') 
rb_nmf_10 = nmf_10.fit_transform(ingr_ct) 

In [35]:
functions.display_topics(nmf_10, ct_vectorizer.get_feature_names(), 10)


Topic  0
cup, sugar, cream, water, packed, salt, milk, plus, whole, flour

Topic  1
cups, sugar, butter, unsalted, large, flour, salt, teaspoons, purpose, vanilla

Topic  2
teaspoon, ground, salt, powder, cinnamon, grated, finely, dried, ginger, seeds

Topic  3
pepper, salt, ground, freshly, oil, black, kosher, olive, tablespoon, teaspoons

Topic  4
tablespoons, divided, butter, plus, oil, sauce, unsalted, vinegar, vegetable, pounds

Topic  5
chopped, finely, cups, coarsely, garlic, onion, large, cloves, fresh, pound

Topic  6
inch, cut, peeled, pieces, large, pound, thick, medium, cubes, slices

Topic  7
fresh, juice, lemon, tablespoon, lime, leaves, grated, minced, teaspoons, olive

Topic  8
tsp, tbsp, oz, plus, lb, salt, kosher, finely, ground, grated

Topic  9
sliced, thinly, ounces, cups, oil, red, divided, leaves, sauce, white


In [36]:
#these nmf topics seem to make more sense

### TF-IDF, NMF, RB data


In [37]:
nmf_10 = NMF(10 ,init='nndsvda', max_iter=500, tol=1e-4) 
rbtf_nmf_10 = nmf_10.fit_transform(ingr_tfidf)  #using tfidf results from 2.3.4

In [39]:
functions.display_topics(nmf_10, tf_vectorizer.get_feature_names(), 10)


Topic  0
lemon, juice, fresh, orange, cup, zest, lime, grated, tablespoons, peel

Topic  1
cup, sugar, teaspoon, flour, butter, vanilla, unsalted, purpose, cups, baking

Topic  2
tsp, tbsp, oz, lb, plus, kosher, ground, serving, salt, divided

Topic  3
teaspoon, ground, pepper, black, freshly, salt, kosher, teaspoons, cumin, tablespoon

Topic  4
cut, inch, pieces, peeled, thick, slices, cubes, pounds, pound, medium

Topic  5
sauce, tablespoons, tablespoon, rice, soy, sesame, ginger, minced, teaspoons, seeds

Topic  6
chopped, finely, cup, fresh, coarsely, onion, parsley, garlic, cups, chicken

Topic  7
sliced, thinly, leaves, small, red, halved, cup, kosher, stems, serving

Topic  8
olive, extra, virgin, oil, cup, red, vinegar, tablespoons, garlic, wine

Topic  9
ounces, divided, ounce, cheese, cups, grated, parmesan, cream, milk, tablespoons


In [40]:

#**tune TF-IDF parameters? look into docs
#**also: GridSearchCV to find how many topics/ks to use?? sklearn doc example


## Further preprocessing to get more meaningful topics

### Add more stopwords before dim reduction
-- lemmatize first? before vectorizer (still in word space)

In [67]:
stopwords_ingr = stopwords_nltk #build upon previous custom stopwords (2.1) which included ADVERTISEMENT
stopwords_ingr.extend(['tablespoon', 'teaspoon', 'cup', 'ounce', 'pound', 'tablespoons', 'teaspoons', 'cups', 'ounces', 'pounds'
                       ,'inch','inches','tsp','tbsp','lb','pieces','chopped','fresh','finely','cut','sliced','freshly','removed'
                       ,'divided','temperature','room','peeled','thinly','large','medium','halved','serving','cut','plus','packed']) 
print(len(stopwords_ingr))#when run cell again, keeps getting extended

265


### Count Vectorized LSA, new stopwords

In [68]:
corpus = data_ingr
ct_vec = CountVectorizer(stop_words=stopwords_ingr)

ingr_ct = ct_vec.fit_transform(corpus)
print(ct_vec.get_feature_names())

['_gravié_ra', 'aarons', 'aattachment', 'abita', 'abruzzese', 'abruzzo', 'absente', 'absinthe', 'absolut', 'abuelita', 'aburaage', 'abv', 'acacia', 'accent', 'acceptable', 'accommodate', 'accompaniment', 'accompaniments', 'accordian', 'according', 'achiote', 'acid', 'acidity', 'acini', 'ackee', 'acorn', 'across', 'acrylic', 'acting', 'activate', 'activated', 'active', 'acto', 'actually', 'adams', 'add', 'added', 'adding', 'addition', 'additional', 'additions', 'additive', 'additives', 'adds', 'adjoining', 'adjust', 'adjustable', 'adjusted', 'adjusting', 'adjustments', 'adobo', 'adrianascaravan', 'adult', 'adults', 'advance', 'advise', 'aejado', 'aerosol', 'afford', 'affumicata', 'african', 'africantradingco', 'afterwards', 'agar', 'agave', 'age', 'aged', 'agricole', 'agrodolce', 'aguardiente', 'agur', 'ahead', 'ahi', 'aid', 'ail', 'aioli', 'air', 'airtight', 'airy', 'aisle', 'aitchbone', 'ajat', 'aji', 'ajika', 'ajowan', 'ajvar', 'ajwain', 'ají', 'ajíes', 'ajíí', 'aka', 'akkawi', 'al',

In [69]:
#previously, dim reduction, 10 topics:
#lsa_10 = TruncatedSVD(10)  
#ct_lsa_10 = lsa_10.fit_transform(ingr_ct)
ct_lsa_10 = lsa_10.fit_transform(ingr_ct)
lsa_10.explained_variance_ratio_

array([0.04038195, 0.06047083, 0.02206433, 0.01845918, 0.017692  ,
       0.01480575, 0.01344522, 0.01247263, 0.01165535, 0.01093004])

In [70]:
functions.display_topics(lsa_10, ct_vec.get_feature_names(), 20) 


Topic  0
salt, oil, ground, pepper, kosher, sugar, olive, garlic, black, butter, red, grated, juice, unsalted, lemon, leaves, cloves, white, flour, extra

Topic  1
sugar, butter, unsalted, flour, vanilla, cream, purpose, extract, baking, egg, powder, stick, chocolate, eggs, milk, brown, heavy, salt, sticks, granulated

Topic  2
ground, pepper, black, salt, kosher, cinnamon, cumin, powder, nutmeg, chicken, paprika, coriander, cayenne, brown, pork, allspice, onion, beef, cardamom, low

Topic  3
sauce, vegetable, red, rice, sugar, white, dried, garlic, cloves, seeds, soy, onion, chicken, vinegar, green, ginger, minced, water, cilantro, chiles

Topic  4
juice, lemon, ground, lime, sugar, sauce, ginger, grated, cilantro, orange, zest, seeds, cinnamon, chiles, rice, cumin, peel, leaves, soy, chile

Topic  5
butter, unsalted, chicken, grated, white, stick, broth, lemon, garlic, whole, dry, onion, thyme, cloves, parsley, minced, dried, black, pepper, celery

Topic  6
ground, grated, cheese, o

### TF-IDF, LSA, with new stopwords

In [71]:
tf_vec = TfidfVectorizer(stop_words=stopwords_ingr)
ingr_tfidf = tf_vec.fit_transform(corpus)

print(tf_vec.get_feature_names())

['_gravié_ra', 'aarons', 'aattachment', 'abita', 'abruzzese', 'abruzzo', 'absente', 'absinthe', 'absolut', 'abuelita', 'aburaage', 'abv', 'acacia', 'accent', 'acceptable', 'accommodate', 'accompaniment', 'accompaniments', 'accordian', 'according', 'achiote', 'acid', 'acidity', 'acini', 'ackee', 'acorn', 'across', 'acrylic', 'acting', 'activate', 'activated', 'active', 'acto', 'actually', 'adams', 'add', 'added', 'adding', 'addition', 'additional', 'additions', 'additive', 'additives', 'adds', 'adjoining', 'adjust', 'adjustable', 'adjusted', 'adjusting', 'adjustments', 'adobo', 'adrianascaravan', 'adult', 'adults', 'advance', 'advise', 'aejado', 'aerosol', 'afford', 'affumicata', 'african', 'africantradingco', 'afterwards', 'agar', 'agave', 'age', 'aged', 'agricole', 'agrodolce', 'aguardiente', 'agur', 'ahead', 'ahi', 'aid', 'ail', 'aioli', 'air', 'airtight', 'airy', 'aisle', 'aitchbone', 'ajat', 'aji', 'ajika', 'ajowan', 'ajvar', 'ajwain', 'ají', 'ajíes', 'ajíí', 'aka', 'akkawi', 'al',

In [72]:
#lsa_10 = TruncatedSVD(10)  
tf_lsa_10 = lsa_10.fit_transform(ingr_tfidf)

In [73]:
functions.display_topics(lsa_10, tf_vec.get_feature_names(), 20) 


Topic  0
salt, ground, oil, pepper, sugar, kosher, olive, butter, lemon, garlic, unsalted, black, grated, juice, red, flour, leaves, cloves, white, extra

Topic  1
sugar, vanilla, flour, butter, unsalted, extract, purpose, baking, cream, egg, chocolate, powder, stick, milk, eggs, sticks, granulated, heavy, brown, soda

Topic  2
lemon, juice, grated, zest, extra, virgin, olive, orange, peel, parmesan, cheese, parsley, oz, cream, ice, leaves, chilled, mint, basil, italian

Topic  3
juice, lime, lemon, sauce, sugar, orange, ginger, cilantro, seeds, rice, soy, sesame, zest, coconut, water, ice, mint, chiles, chile, syrup

Topic  4
ground, kosher, lemon, pepper, black, salt, seeds, cinnamon, juice, zest, baking, cumin, flour, purpose, yogurt, coriander, plain, grated, paprika, soda

Topic  5
chicken, broth, butter, low, grated, stick, unsalted, lemon, sodium, thyme, juice, dry, cubes, water, celery, minced, sauce, stock, white, purpose

Topic  6
vinegar, wine, white, sugar, red, dry, cider

### Count Vectorized NMF, new stopwords

In [74]:
nmf_10 = NMF(n_components=10, init='nndsvda', max_iter=1000, tol=1e-4) 
rb_nmf_10 = nmf_10.fit_transform(ingr_ct) 

In [75]:
functions.display_topics(nmf_10, ct_vectorizer.get_feature_names(), 20)


Topic  0
oblaten, oil, expressed, vermont, ganache, layers, paprika, tip, clover, clotted, tostada, basil, tends, dpdspirits, scharfen, todd, mildflavored, baby, legs, irish

Topic  1
strong, butter, uncovered, floral, untoasted, cre, pulsing, baking, extact, safe, edges, portuguese, starter, eggcups, chocolate, brown, microwavable, heaping, vitamin, weelicious

Topic  2
grit, pelargonium, black, cinder, cultures, clover, giblet, cores, notice, portuguese, allspice, panfried, safe, cayenne, oblaten, cardamom, brown, olive, true, ganache

Topic  3
oblaten, sandbakkel, usually, reserve, lightlife, season, sometimes, giblet, cider, strong, semiboneless, ganache, mildflavored, gravy, layers, tienda, chiles, sauterne, asian, jivara

Topic  4
lee, jivara, yellowfin, onetik, lightlife, strong, layers, pealed, grapes, milliliter, gardein, vitamin, slather, walla, husked, paprika, spreadable, chilled, xadvirgin, cider

Topic  5
chicken, webstaurantstore, ganache, clover, broth, draught, olive,

In [76]:
#nmf has some unusal words when using count-vectorized data (brand names?)

In [77]:
nmf_5 = NMF(n_components=5, init='nndsvda', max_iter=1000, tol=1e-4) 
rb_nmf_5 = nmf_5.fit_transform(ingr_ct) 

In [78]:
functions.display_topics(nmf_5, ct_vectorizer.get_feature_names(), 20)


Topic  0
oblaten, oil, expressed, vermont, safe, ganache, read, pelargonium, kolsch, layers, venison, wheels, paprika, slather, clover, cheese, tip, black, tends, ovalpie

Topic  1
strong, butter, uncovered, safe, floral, cre, pulsing, untoasted, baking, extact, edges, portuguese, starter, eggcups, microwavable, weelicious, chocolate, heaping, brown, vitamin

Topic  2
grit, pelargonium, safe, black, kolsch, cultures, cinder, grapes, butter, olive, uncovered, clover, chicken, panfried, cayenne, ganache, weelicious, portuguese, cores, coarse

Topic  3
ganache, oblaten, sandbakkel, webstaurantstore, read, clover, chicken, usually, draught, olive, mildflavored, season, gravy, layers, reserve, cider, lightlife, slather, venison, giblet

Topic  4
lee, jivara, grapes, yellowfin, onetik, lightlife, pealed, strong, layers, milliliter, gardein, vitamin, walla, frisée, slather, husked, spreadable, paprika, cider, clotted


### TF-IDF, NMF, new stopwords


In [79]:
nmf_10 = NMF(n_components=10, init='nndsvda', max_iter=500) 
rbtf_nmf_10 = nmf_10.fit_transform(ingr_tfidf) 

In [80]:
functions.display_topics(nmf_10, tf_vectorizer.get_feature_names(), 20)


Topic  0
grit, pelargonium, black, kolsch, safe, season, cultures, panfried, oblaten, cinder, cayenne, cores, xadvirgin, pistachios, murray, crunchy, scharfen, usually, clover, ganache

Topic  1
floral, pulsing, butter, uncovered, baking, strong, portuguese, starter, eggcups, safe, smoker, steaks, edges, extact, brown, untoasted, medieval, grams, smoothly, cinder

Topic  2
lee, jivara, yellowfin, onetik, grapes, pealed, strong, vitamin, gardein, milliliter, husked, chilled, hollandaise, frisée, giardiniera, stirred, spreadable, walla, xadvirgin, meringues

Topic  3
lightlife, cider, jivara, chiles, diamond, olive, sealing, chile, read, mildflavored, corkscrew, layers, israeli, walla, cocoa, gravy, draught, cultures, ganache, selection

Topic  4
oil, expressed, vermont, oblaten, read, venison, ganache, layers, wheels, slather, clover, tip, pelargonium, paprika, safe, todd, kolsch, crunchy, tostada, fixed

Topic  5
chicken, broth, looza, thicker, webstaurantstore, spirulina, drumettes, 

NMF has some unusal words, including words from other languages. Overall the NMF topics are nonsensical regardless of the number of topics. Choosing LSA for recommendations going forward.

# Find similar recipes within dataset
## Cosine Similarity

> cosine sim: from cos sim docs: input X, Y

> X : ndarray or sparse array, shape: (n_samples_X, n_features) Input data.

> Y : ndarray or sparse array, shape: (n_samples_Y, n_features) Input data. If None, the output will be the pairwise similarities between all samples in X. 

(previously running slowly because doing pairwise calculation between all documents in X; instead, want one vs. all)

**Find the recipes with the highest cosine similarity to a few recipes, compare recommendations between ct-v data and tf-idf vectorized data.**

### Count-Vectorized LSA

#### Recipes similar to cookies

In [81]:
data.iloc[8] #first we choose this recipe for recommendations

Title                                     Instant Pot Lamb Haleem
Ingredients     ['¾ cup assorted dals (such as chana dal, moon...
Instructions    Combine dals, rice, and barley in a medium bow...
Image_Name                                instant-pot-lamb-haleem
Name: 8, dtype: object

In [82]:
data_ingr[8] #ingredient list, before vectorizing and filtering out stopwords

'[\'¾ cup assorted dals (such as chana dal, moong dal, masoor dal, and/or urad dal)\', \'¼ cup white jasmine rice or other long-grain rice\', \'¼ cup pearl barley\', \'½ lb. bone-in lamb stew meat\', \' tsp. kosher salt, divided, plus more\', \' " piece fresh ginger\', \' medium shallots, thinly sliced\', \'⅓ cup ghee or vegetable oil\', \' garlic cloves, finely grated\', \' Tbsp. plus ½ tsp. garam masala\', \' tsp. (or more) cayenne pepper\', \' tsp. ground turmeric\', \' green Thai chiles, stems removed (optional)\', \'½ cup (lightly packed) chopped cilantro, plus more for serving\', \'½ white onion, finely chopped\', \' limes, cut into wedges\']'

Find the cosine similarity between this recipe and the rest of the dimensionally-reduced (via LSA) dataset:

In [83]:
#lsa_10 = TruncatedSVD(10)  
#ct_lsa_10 = lsa_10.fit_transform(ingr_ct)
lsa_s_choc_ctv=cosine_similarity(ct_lsa_10[8].reshape(1, -1), ct_lsa_10).round(3) 
lsa_s_choc_ctv

array([[0.604, 0.557, 0.564, ..., 0.565, 0.129, 0.611]])

In [84]:
#argsort returns sorted array pf indices with their values from low to high 
lsa_s_choc_ctv.argsort()

array([[5726, 1554, 1893, ..., 6330, 5011,    8]], dtype=int64)

In [85]:
a = lsa_s_choc_ctv.argsort()
np.fliplr(a) #flip array sorted by argsort, slower though

#expect the first result to be the recipe itself [3], with cosine similarity=1

array([[   8, 5011, 6330, ..., 1893, 1554, 5726]], dtype=int64)

In [86]:
#reverse array (-lsa_s) - sometimes order changes when flipped this way?
#(-(lsa_s_choc)).argsort() 

In [87]:
#look at the other recipes, see if the recommendation makes sense
data.iloc[5011]

Title                     Soba and Maitake Mushrooms in Soy Broth
Ingredients     ['2 garlic cloves, peeled, crushed', '1 1" pie...
Instructions    Bring garlic, ginger, and 4 cups water to a bo...
Image_Name       soba-and-maitake-mushrooms-in-soy-broth-51205280
Name: 5011, dtype: object

In [88]:
data_ingr[1539]

"[' / tablespoons vegetable oil, such as grapeseed, divided', ' tablespoons rice wine vinegar', ' / tablespoons soy sauce', ' garlic cloves, finely grated, divided', ' teaspoons finely grated ginger', ' / tablespoon hot chile paste, such as sambal oelek', '/ teaspoon kosher salt', ' pork tenderloins (each about  pound)', ' tablespoons fish sauce', ' tablespoons lime juice', '/ teaspoon honey', '/ to  red Thai chile pepper, very thinly sliced', ' / tablespoons finely chopped roasted unsalted peanuts', ' / pounds Brussels sprouts, trimmed and halved', '/ cup low-sodium chicken broth or water', ' tablespoon butter', ' tablespoon roughly chopped mint', 'Large oven-safe skillet']"

In [89]:
data.iloc[6330]

Title            Spicy Chicken Thighs with Rhubarb-Cucumber Salsa
Ingredients     ['1 habanero, Scotch bonnet, or Thai chile, wi...
Instructions    Preheat oven to 500°F. Line a baking sheet wit...
Image_Name      spicy-chicken-thighs-with-rhubarb-cucumber-sal...
Name: 6330, dtype: object

In [90]:
data_ingr[6330]

'[\' habanero, Scotch bonnet, or Thai chile, with seeds, stemmed\', \' garlic cloves\', \' scallions, thinly sliced, white and green parts divided\', \' tablespoon soy sauce\', \'/ cup olive oil\', \' large skin-on, bone-in chicken thighs\', \'Kosher salt\', \' / cups /" cubes rhubarb\', \' cup /" cubes unpeeled seeded English hothouse cucumber\', \'/ cup coarsely chopped fresh cilantro\', \' tablespoon honey\', \' tablespoon vegetable oil\', \' teaspoon fresh lime juice\', \'Freshly ground black pepper\']'

Yay! It worked! The two recipes most similar to Best Chocolate Chip Cookies are also cookies with chocolate

In [60]:
#quickly look at the most and least similar recipes
rev_a = np.fliplr(a)

In [61]:
data.iloc[rev_a[0]]

Unnamed: 0,title,ingredients,instructions
8,Classic Peanut Butter Cookies,1 cup unsalted butter ; 1 cup crunchy peanut b...,"Cream butter, peanut butter, and sugars togeth..."
3552,Chef John's Peanut Butter Cookies,½ cup unsalted butter ; ½ cup white sugar ; ½ ...,"Beat butter, white sugar, brown sugar, peanut ..."
28309,Heart-Shaped Cookies,1 cup white sugar ; 1 cup unsalted butter ; 1 ...,Preheat the oven to 350 degrees F (175 degrees...
17686,Jeanne's Chocolate Kiss Cookies,1 ¼ cups butter ; 2 cups white sugar ; 2 egg ...,Preheat oven to 350 degrees F (175 degrees C)....
1030,Brookies (Brownie Cookies),"½ cup butter, softened ; ½ cup light brown sug...",Preheat oven to 350 degrees F (175 degrees C)....
...,...,...,...
22089,Pizzadillas,4 each Mission® Sundried Tomato Basil Wraps ; ...,Evenly spread 3 tablespoons of marinara on eac...
4533,Pepperoncini Beef,1 (3 pound) beef chuck roast ; 4 cloves garlic...,"Make small cuts in roast, and insert garlic sl..."
20395,A Potato Salad Sandwich,1 hamburger bun ; 1 ½ tablespoons mayonnaise ...,Open bun and spread each side with mayonnaise....
32776,Summer Salad on a Stick,8 honeydew melon balls ; 16 salami ; 24 bab...,"Thread 1 melon ball, 2 slices salami, and 3 ar..."


I'm a bit surprised by the least-similar recipe (as I like both cookies and pretzels, and they are not that different to make). This is likely due to it having missing ingredients.

#### Fried Rice

In [91]:
data.iloc[4]

Title                                                Newton's Law
Ingredients     ['1 teaspoon dark brown sugar', '1 teaspoon ho...
Instructions    Stir together brown sugar and hot water in a c...
Image_Name                     newtons-law-apple-bourbon-cocktail
Name: 4, dtype: object

In [92]:
lsa_s_rst_ct=cosine_similarity(ct_lsa_10[4].reshape(1, -1), ct_lsa_10).round(3) 
lsa_s_rst_ct

array([[0.245, 0.222, 0.589, ..., 0.338, 0.427, 0.418]])

In [93]:
rst_ct = lsa_s_rst_ct.argsort()
rst_ct

array([[10189,   459,  2174, ..., 11383,     6,     4]], dtype=int64)

In [94]:
sorted_roast_ct= np.fliplr(rst_ct) #expect the first result to be the recipe itself [3], with cosine similarity=1
sorted_roast_ct

array([[    4,     6, 11383, ...,  2174,   459, 10189]], dtype=int64)

In [95]:
data.iloc[sorted_roast_ct[0]]

Unnamed: 0,Title,Ingredients,Instructions,Image_Name
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail
6,Apples and Oranges,"['3 oz. Grand Marnier', '1 oz. Amaro Averna', ...","Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...",apples-and-oranges-spiked-cider
11383,Pear-Cranberry Mincemeat Lattice Pie,"['2 firm-ripe Anjou pears, peeled, cored, and ...",Stir together all ingredients except pastry do...,pear-cranberry-mincemeat-lattice-pie-236429
12644,Kiwi Tart,['1 (9-inch) round of refrigerated pie dough (...,Put oven rack in middle position and preheat o...,kiwi-tart-231819
11379,Mascarpone and Prune Tartlets,['10 pitted prunes (2 oz; sometimes called dri...,Bring prunes and Armagnac just to a simmer in ...,mascarpone-and-prune-tartlets-236435
...,...,...,...,...
922,Shingled Sweet Potatoes with Harissa,['2/3 cup plus 2 Tbsp. extra-virgin olive oil'...,"Preheat oven to 400°F. Whisk 2/3 cup oil, 2/3 ...",shingled-sweet-potatoes-with-harissa
10756,Warm Potato Salad with Bacon,"['3 pound medium boiling potatoes, scrubbed', ...",Cover potatoes generously with cold salted wat...,warm-potato-salad-with-bacon-238933
2174,Gluten-Free Sticky Rice Buns,"['2 cups white sushi rice', '2 teaspoons koshe...",Bring rice (do not rinse rice before cooking t...,gluten-free-sticky-rice-buns
459,Oysters with Balsamic,"['24 raw oysters, freshly shucked', 'Balsamic ...",Spoon a few drops of vinegar over each oyster ...,oysters-with-balsamicandnbsp


So it seems for this slowcooker pot roast, the count-vectorized LSA model is recommending other European meat dishes with condensed cream of mushroom soup and some kind of noodles.

The 4 least similar recipes are all consistently drinks.

So does it recommend drink recipes for drinks?

#### Fizzy drink

In [98]:
data.iloc[143]

title                               The REAL Long Island Iced Tea
ingredients     ½ fluid ounce vodka ; ½ fluid ounce rum ; ½ fl...
instructions    Fill a cocktail shaker with ice. Pour vodka, r...
Name: 143, dtype: object

In [100]:
lsa_s_fizz_ct=cosine_similarity(ct_lsa_10[143].reshape(1, -1), ct_lsa_10).round(3) 
lsa_s_fizz_ct

array([[ 0.432,  0.538,  0.185, ..., -0.064,  0.453,  0.396]])

In [101]:
fizz_ct = lsa_s_fizz_ct.argsort()
fizz_ct

array([[14321, 33443,  1684, ..., 35448, 25189,   143]], dtype=int64)

In [102]:
sorted_fizz_ct= np.fliplr(fizz_ct) #expect the first result to be the recipe itself [3], with cosine similarity=1
sorted_fizz_ct


array([[  143, 25189, 35448, ...,  1684, 33443, 14321]], dtype=int64)

In [103]:
data_ingr[143]

'½ fluid ounce vodka ; ½ fluid ounce rum ; ½ fluid ounce gin ; ½ fluid ounce tequila ; ½ fluid ounce triple sec (orange-flavored liqueur) ;  fluid ounce sweet and sour mix ;  fluid ounce cola, or to taste ;   lemon slice'

In [104]:
data_ingr[25189]

' fluid ounces lemon-lime soda (such as Sprite®), or more to taste ;  (. fluid ounce) jigger grenadine syrup ;  scoop vanilla ice cream, or more to taste'

In [105]:
data.iloc[sorted_fizz_ct[0]]

Unnamed: 0,title,ingredients,instructions
143,The REAL Long Island Iced Tea,½ fluid ounce vodka ; ½ fluid ounce rum ; ½ fl...,"Fill a cocktail shaker with ice. Pour vodka, r..."
25189,Shirley Temple Ice Cream Float,8 fluid ounces lemon-lime soda (such as Sprite...,Pour soda into a mug and mix in grenadine. Car...
35448,Chipotle Guacamole,"2 avocados, peeled, seeded and cubed ; 1 tabl...","Mash together cubed avocado, lime juice, sour ..."
8702,The Real Rum Runner,"2 cups ice, divided, or as needed ; 1 fluid ou...","Combine 1 cup ice, pineapple juice, orange jui..."
17219,Strawberry Milkshake,13 fresh strawberries ; 4 scoops strawberry i...,"Blend strawberries, ice cream, milk, honey, an..."
...,...,...,...
16448,Spinach Salad with Peaches and Pecans,¾ cup pecans ; 2 ripe peaches ; 4 cups baby s...,Preheat oven to 350 degrees F (175 degrees C)....
33912,Runners Trail Mix,1 cup dried cherries ; 1 cup walnut halves ; ...,"Mix cherries, walnuts, pumpkin seeds, soy nuts..."
1684,Herbs de Provence,2 tablespoons dried rosemary ; 1 tablespoon fe...,Grind rosemary and fennel seed in a spice grin...
33443,Festive Nut Bowl,1 cup macadamia nuts ; 1 cup cashews ; 1 cup s...,"In a large bowl, mix together macadamia nuts, ..."


Yes! The top 4 most similar recipes to Tangerine-Gin Fizz are all also cocktails, and even all involve some kind of citrus fruit (tangerine, grapefruit, lime). The top recommendation, Grapefruit Sparkle, is also a fizzy drink (it probably caught the word "sparkling" in a topic).

The least similar ones are less consistent at first glance, from ribs to wakame (Japanese seaweed salad) and chocolate. But the recommendation is topic-based, and the topics are not exclusively ingredient items (also involve some description of the items which hints at the preparation method).

### TF-IDF LSA

#### Choco cookies
check recommendations for the same cookie recipe

In [131]:
data.iloc[8] 

title                               Classic Peanut Butter Cookies
ingredients     1 cup unsalted butter ; 1 cup crunchy peanut b...
instructions    Cream butter, peanut butter, and sugars togeth...
Name: 8, dtype: object

In [87]:
#lsa_10 = TruncatedSVD(10)  
#tf_lsa_10 = lsa_10.fit_transform(ingr_tfidf)

In [132]:
lsa_s_choc_tf=cosine_similarity(tf_lsa_10[8].reshape(1, -1), tf_lsa_10).round(3) 
lsa_s_choc_tf

array([[0.402, 0.148, 0.853, ..., 0.626, 0.083, 0.132]])

In [133]:
tf = lsa_s_choc_tf.argsort()
tf

array([[24394, 23221, 30534, ..., 26703,  3552,     8]], dtype=int64)

In [134]:
sorted_choc_rec_tf= np.fliplr(tf) #expect the first result to be the recipe itself [3], with cosine similarity=1
sorted_choc_rec_tf

array([[    8,  3552, 26703, ..., 30534, 23221, 24394]], dtype=int64)

In [135]:
data.iloc[sorted_choc_rec_tf[0]]

Unnamed: 0,title,ingredients,instructions
8,Classic Peanut Butter Cookies,1 cup unsalted butter ; 1 cup crunchy peanut b...,"Cream butter, peanut butter, and sugars togeth..."
3552,Chef John's Peanut Butter Cookies,½ cup unsalted butter ; ½ cup white sugar ; ½ ...,"Beat butter, white sugar, brown sugar, peanut ..."
26703,Dark Chocolate Marbled Banana Bread with Greek...,1 ½ cups all-purpose flour ; 1 teaspoon baking...,Preheat oven to 350 degrees F (175 degrees C)....
7150,Keto Brownies,¾ cup cocoa powder ; ½ teaspoon baking soda ;...,Preheat oven to 350 degrees F (175 degrees C)....
6889,Fast and Easy Pancakes,2 cups milk ; ¾ cup white sugar ; 2 eggs ; 1 ...,"Place milk, sugar, eggs, oil and vanilla in th..."
...,...,...,...
20715,Pantry Pasta Salad,"1 (15 ounce) can garbanzo beans, drained ; 1 (...","Combine garbanzo beans, artichoke hearts, froz..."
31802,Leek and Onion Vegetable Dip,½ (1 ounce) package dry onion soup mix ; 1 (1....,"In a medium bowl, mix together dry onion soup ..."
30534,Tortellini Skewers,1 (20 ounce) package refrigerated tortellini ;...,Fill a large pot with lightly salted water and...
23221,Instant Pot® Black Beans,1 ¼ cups dry black beans,Pour beans into a multi-functional pressure co...


The TF-IDF vectorized data with LSA seems to work better for this type of recipe. The top 4 most similar (recommended) recipes are all cookies. And other than the last two with missing ingredients, the least similar recipes are indeed quite different from chocolate chip cookies (chicken dishes and mashed cauliflower).

#### Pot roast 

In [136]:
data.iloc[3]

title                                                 Pork Steaks
ingredients     ¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...
instructions    Melt butter in a skillet, and mix in the soy s...
Name: 3, dtype: object

In [137]:
lsa_s_rst_tf=cosine_similarity(tf_lsa_10[3].reshape(1, -1), tf_lsa_10).round(3) 
lsa_s_rst_tf

array([[-0.088,  0.348, -0.084, ...,  0.35 ,  0.561,  0.358]])

In [138]:
rst_tf = lsa_s_rst_tf.argsort()
rst_tf

array([[20028,  8760, 14399, ..., 25416, 16082,     3]], dtype=int64)

In [139]:
sorted_roast_tf= np.fliplr(rst_tf) #expect the first result to be the recipe itself [3], with cosine similarity=1
sorted_roast_tf

array([[    3, 16082, 25416, ..., 14399,  8760, 20028]], dtype=int64)

In [142]:
data.iloc[16082]

title                                Korean Marinated Flank Steak
ingredients     4 cloves garlic ; 1 teaspoon minced fresh ging...
instructions    Place garlic, ginger, and onion in the bowl of...
Name: 16082, dtype: object

In [143]:
data_ingr[16082]

' cloves garlic ;  teaspoon minced fresh ginger ;   onion, roughly chopped ; \u2009½ cups low sodium soy sauce ; ¼ cup toasted sesame oil ;  tablespoons Worcestershire sauce ;  tablespoons unseasoned meat tenderizer ;  cup white sugar ;  pounds beef flank steak, trimmed of excess fat'

In [144]:
data.iloc[25416]

title                                 Spicy Asian Noodles for One
ingredients     1 tablespoon creamy peanut butter ; 1 tablespo...
instructions    Combine peanut butter, ginger paste, chili oil...
Name: 25416, dtype: object

In [145]:
data_ingr[25416]

' tablespoon creamy peanut butter ;  tablespoon ginger paste ;  tablespoon chili oil ;  tablespoon apricot preserves ;  tablespoon balsamic vinegar ;  clove garlic, minced ; ½ tablespoon soy sauce ; ½ tablespoon sesame oil ; ½ teaspoon chili powder ; ¼ teaspoon crushed red pepper ;  ( ounce) package ramen noodles ; ½ tablespoon chopped unsalted peanuts'

In [146]:
data.iloc[sorted_roast_tf[0]]

Unnamed: 0,title,ingredients,instructions
3,Pork Steaks,¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...,"Melt butter in a skillet, and mix in the soy s..."
16082,Korean Marinated Flank Steak,4 cloves garlic ; 1 teaspoon minced fresh ging...,"Place garlic, ginger, and onion in the bowl of..."
25416,Spicy Asian Noodles for One,1 tablespoon creamy peanut butter ; 1 tablespo...,"Combine peanut butter, ginger paste, chili oil..."
5247,My Favorite Sesame Noodles,½ (8 ounce) package spaghetti ; 2 tablespoons ...,Fill a large pot with lightly salted water and...
7582,Asian Beef Skewers,3 tablespoons hoisin sauce ; 3 tablespoons she...,"In a small bowl, mix together hoisin sauce, sh..."
...,...,...,...
5920,Hard-Steamed Eggs,"12 eggs, at room temperature",Place a steamer insert into a pot and fill wit...
26130,Simple Yorkshire Pudding,2 eggs ; 1 pinch salt ; 1 cup all-purpose flo...,Preheat oven to 350 degrees F (175 degrees C)....
14399,Quark (Homemade Cheese),2 cups milk ; ½ cup buttermilk,Bring milk to a simmer in a saucepan; remove f...
8760,Syracuse Salt Potatoes,4 pounds new potatoes ; 1 ½ cups fine salt ; 8...,Wash the potatoes and set aside. Fill a large ...


In [147]:
data_ingr[5247] #Lemon Granola, makes sense it's very different from beef roast!

'½ ( ounce) package spaghetti ;  tablespoons peanut butter ;  tablespoon honey ;  tablespoons tamari ;  teaspoon Thai chili sauce ;  teaspoon sesame oil ;  teaspoon ground ginger ;  clove garlic, minced ;   green onion, chopped ;  teaspoons sesame seeds'

In [148]:
data_ingr[7582]

' tablespoons hoisin sauce ;  tablespoons sherry ; ¼ cup soy sauce ;  teaspoon barbeque sauce ;   green onions, chopped ;  cloves garlic, minced ;  tablespoon minced fresh ginger root ; \u2009½ pounds flank steak ;   skewers'

The top 3 recipes similar to Awesome Slow Cooker Pot Roast (including itself) are all meat recipes using some pre-made ingredients such as canned condensed soup. The second-most similar (Slow Cooker London Broil) is also a slow cooker recipe.

Also the model seems to recognize beef, pot roast, and steak as similar (good!), likely due to them being in the same topics. This also shows we probably don't need to use soft cosine similarity (might try it in the future to see how "blurry" it makes things).

The least similar recipes have fewer ingredients, and there are quite a few recipes with lemons. Although it's interesting that Herbed Tuna Steaks was considered least similar to roasts using beef steaks.

#### waffles
Try another, more complicated recipe:

In [149]:
data.iloc[14]

title                                             Classic Waffles
ingredients     2 cups all-purpose flour ; 1 teaspoon salt ; 4...
instructions    In a large bowl, mix together flour, salt, bak...
Name: 14, dtype: object

In [153]:
lsa_s_tc_tf=cosine_similarity(tf_lsa_10[14].reshape(1, -1), tf_lsa_10).round(3) 
lsa_s_tc_tf

array([[0.473, 0.192, 0.961, ..., 0.518, 0.02 , 0.062]])

In [154]:
tc_tf = lsa_s_tc_tf.argsort()
tc_tf

array([[11723, 34897,  8163, ..., 13084, 18913,    14]], dtype=int64)

In [155]:
sorted_tc_tf= np.fliplr(tc_tf) #expect the first result to be the recipe itself [3], with cosine similarity=1
sorted_tc_tf


array([[   14, 18913, 13084, ...,  8163, 34897, 11723]], dtype=int64)

In [156]:
data.iloc[18913]

title                                               Berry Cobbler
ingredients     2 cups all-purpose flour ; 1 cup white sugar ;...
instructions    Preheat oven to 350 degrees F (175 degrees C)....
Name: 18913, dtype: object

In [157]:
data_ingr[18913] #did not expect this asian fish recipe as the top rec

' cups all-purpose flour ;  cup white sugar ; \u2009½ teaspoons baking powder ; ½ teaspoon salt ;  tablespoons butter, melted ; . cup milk ;  teaspoon vanilla extract ;   egg, beaten ;  cups raspberries'

In [158]:
data.iloc[13084]

title                                Mini Chocolate Chip Pancakes
ingredients     1 cup all-purpose flour ; 2 tablespoons white ...
instructions    Sift flour, sugar, baking powder, and salt tog...
Name: 13084, dtype: object

In [159]:
data_ingr[13084] #this nachos recipe is more similar to tacos

' cup all-purpose flour ;  tablespoons white sugar ;  tablespoon baking powder ; ¼ teaspoon salt ;   egg ;  cup milk ;  tablespoons butter, melted and cooled ; ½ cup miniature chocolate chips'

In [160]:
data.iloc[sorted_tc_tf[0]]

Unnamed: 0,title,ingredients,instructions
14,Classic Waffles,2 cups all-purpose flour ; 1 teaspoon salt ; 4...,"In a large bowl, mix together flour, salt, bak..."
18913,Berry Cobbler,2 cups all-purpose flour ; 1 cup white sugar ;...,Preheat oven to 350 degrees F (175 degrees C)....
13084,Mini Chocolate Chip Pancakes,1 cup all-purpose flour ; 2 tablespoons white ...,"Sift flour, sugar, baking powder, and salt tog..."
18360,Huckleberry Muffins,¾ cup butter ; 1 cup white sugar ; 1 egg ; ¾ ...,Preheat the oven to 400 degrees F (200 degrees...
27352,Treacle Scones,3 ⅔ cups all-purpose flour ; 1 teaspoon baking...,Preheat the oven to 400 degrees F (200 degrees...
...,...,...,...
4371,Chipotle Mayo,½ cup mayonnaise ; 2 chipotle chilies in adob...,"In a food processor, combine the mayonnaise, c..."
2526,Sriracha Aioli,1 cup mayonnaise ; 2 tablespoons sriracha hot ...,Stir mayonnaise and sriracha hot sauce togethe...
8163,Mustard Mayonnaise Sauce,1 ½ cups mayonnaise ; 3 tablespoons Dijon must...,"Stir mayonnaise, mustard, horseradish, and Wor..."
34897,Avo Spoon Snack,2 Avocados from Mexico ; Salsa ; Balsamic...,Cut avocados lengthwise and remove pits. Drizz...


The top recommendations for Jalapeño and Lime–Marinated Skirt Steak Tacos	 are quite intersting, as there are 2 Asian recipes (Fish and curry noodles) and 2 Mexican recipes (Nachos and Fish Tacos). In a way this confirmed the hunch I had that Mexican dishes popular in North America are quite similar to the popular Asian dishes.

Once again the least similar recipes are mainly the ones that have very few ingredients. This makes sense as the 10 topics from LSA are mostly ingredients. If there's no overlap in ingredients, the two recipes cannot have a high cosine similarity.

## Recommendation on new data

### Small test data 

In [161]:
testdata=pd.DataFrame(columns = ["title","ingredients"])

In [162]:
inglist1=['''
2 tablespoons reduced sodium soy sauce
2 tablespoons freshly squeezed lime juice
2 tablespoons canola oil, divided
3 cloves garlic, minced
2 teaspoons chili powder
1 teaspoon ground cumin
1 teaspoon dried oregano
1 1/2 pounds skirt steak, cut into 1/2-inch pieces
12 mini flour tortillas
3/4 cup diced red onion

1/2 cup chopped fresh cilantro leaves
1 lime, cut into wedges
''']
title1='Mexican Street Tacos by DamnDelicious'

In [163]:
inglist2=['''375g/13oz butter
375g/13oz dark chocolate
6 free-range eggs
350g/12oz caster sugar
1 tbsp vanilla extract
225g/8oz plain flour
1 tsp salt
250g/9oz white chocolate, chopped
2 tsp icing sugar, to decorate (optional)''']

title2='Double chocolate brownies'

In [164]:
testdatalst = []
testdatalst.append([title1, inglist1]) 
#cheaper computationally to append list, then create df once have all the data! https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it?rq=1

In [165]:
testdatalst.append([title2, inglist2]) #need to vectorize and do lsa for one document at a time (write in function?)

In [166]:
testdf = pd.DataFrame(testdatalst, columns=["title","ingredients"])
testdf

Unnamed: 0,title,ingredients
0,Mexican Street Tacos by DamnDelicious,[\n2 tablespoons reduced sodium soy sauce\n2 t...
1,Double chocolate brownies,[375g/13oz butter\n375g/13oz dark chocolate\n6...


Now have test data, can clean, vectorize, lemmatize, and calc cos similarity!

Remove digits

In [168]:
testdf_ingr=testdf['ingredients'].apply(project4_functions_n.regex_nodigits_new)

In [169]:
##! This new document is the test set, only .transform for vectorizer and tfidf (fit on train set),
#don't use .fit_transform on test set!

#### Vectorize with TF-IDF


In [170]:
#previously, already fit in 2.2.3:
tf_vec = TfidfVectorizer(stop_words=stopwords_ingr) 
ingr_tfidf = tf_vec.fit_transform(data_ingr)

In [171]:
testvec = tf_vec.transform(testdf_ingr)

In [172]:
tf_lsa_10 = lsa_10.fit_transform(ingr_tfidf)

In [173]:
test_tf = lsa_10.transform(testvec)

test_tf

array([[ 0.30055879, -0.13564671, -0.12985653,  0.02092   ,  0.00211679,
        -0.01196131,  0.07340965,  0.12421433, -0.07587063,  0.03980802],
       [ 0.06849832,  0.08989513,  0.00431821,  0.04786151,  0.00872394,
        -0.01616644,  0.01440772, -0.01174029, -0.00444904,  0.01569404]])

#### Cosine similarity with RB dataset
##### Tacos

In [174]:
test_csim=cosine_similarity(test_tf[0].reshape(1,-1), tf_lsa_10).round(3) 
test_csim

array([[0.131, 0.559, 0.065, ..., 0.608, 0.824, 0.517]])

In [175]:
csim_tacos = test_csim.argsort()
csim_tacos

array([[13901, 24303, 24228, ...,  4201, 20042, 29251]], dtype=int64)

In [176]:
sorted_test_tacos= np.fliplr(csim_tacos) #expect the first result to be the recipe itself, with cosine similarity=1
sorted_test_tacos

array([[29251, 20042,  4201, ..., 24228, 24303, 13901]], dtype=int64)

In [177]:
data.iloc[sorted_test_tacos[0]]

Unnamed: 0,title,ingredients,instructions
29251,Spicy Cucumber Soup,"2 tablespoons olive oil ; ½ onion, chopped ; ...",Heat the olive oil in a saucepan over medium h...
20042,Authentic Mexican Torta - Tortas Ahogadas,"16 cloves garlic, minced ; 2 tablespoons mince...",Preheat an oven to 475 degrees F (245 degrees ...
4201,Arroz Rojo (Mexican Red Rice),"2 Roma (plum tomatoes), cored ; 2 tablespoons...",Grate tomatoes into a bowl using a box grater;...
31498,Japanese Salted Chicken Wings,14 ounces chicken wings ; 1 tablespoon sake (J...,"Combine chicken wings, sake, garlic, sesame oi..."
26140,Bison Stew,2 tablespoons canola oil ; 2 pounds bison meat...,Heat 2 tablespoons canola oil in a Dutch oven ...
...,...,...,...
17120,Easy Creamy Mac 'N Cheese,8 ounces elbow macaroni ; 1 ½ cups milk ; ¼ cu...,Cook pasta in large pot according to package d...
5920,Hard-Steamed Eggs,"12 eggs, at room temperature",Place a steamer insert into a pot and fill wit...
24228,Individual Baked Eggs,1 slice bacon ; 1 teaspoon melted butter ; 1 ...,Preheat oven to 350 degrees F (175 degrees C)....
24303,Toad In a Hole,"6 slices bread ; 2 tablespoons butter, softene...",Preheat a large skillet to a high heat. With a...


##### Brownies

In [178]:
test_csim=cosine_similarity(test_tf[1].reshape(1,-1), tf_lsa_10).round(3) 
test_csim

array([[0.225, 0.178, 0.744, ..., 0.244, 0.197, 0.018]])

In [179]:
csim_b = test_csim.argsort()
csim_b

array([[32656,  4149, 30842, ...,  7502, 17860,  1825]], dtype=int64)

In [180]:
sorted_test_b= np.fliplr(csim_b) 
#expect the first result to be the recipe itself, with cosine similarity=1
sorted_test_b

array([[ 1825, 17860,  7502, ..., 30842,  4149, 32656]], dtype=int64)

In [181]:
data.iloc[sorted_test_b[0]]

Unnamed: 0,title,ingredients,instructions
1825,Heide's Kentucky Derby Dessert,1 ¼ cups chopped pecans ; 4 large eggs ; ¾ cup...,Preheat oven to 300 degrees F (150 degrees C)....
17860,Tar Heel Pie,"½ cup butter, melted ; ¾ cup chocolate chips ;...",Preheat oven to 350 degrees F (175 degrees C.)...
7502,German Strawberry Roll,"4 eggs, divided ; ½ cup white sugar ; 2 teasp...",Preheat the oven to 400 degrees F (200 degrees...
15530,Coconut Date Balls,"2 eggs, beaten ; 1 cup white sugar ; 1 cup ch...","In a medium saucepan over medium heat, combine..."
13364,White Chocolate Blondie Brownies,1 cup butter ; 3 (1 ounce) squares white choco...,Preheat oven to 350 degrees F (175 degrees C)....
...,...,...,...
10321,Kelly's Pan Fried Tilapia,1 ½ cups Italian-seasoned bread crumbs ; 5 ti...,Put bread crumbs into a shallow dish. Gently p...
32004,Sausage and Cheese Balls,"1 pound pork sausage, casings removed ; 1 poun...",Preheat the oven to 375 degrees F (190 degrees...
30842,Kale Krisps,"2 bunches kale, washed and dried ; 2 cups shre...",Preheat oven to 425 degrees F (220 degrees C)...
4149,Easy and Elegant Pork Tenderloin,2 cups Italian seasoned bread crumbs ; ½ cup o...,Preheat oven to 425 degrees F (220 degrees C)....


In [182]:
data_ingr[1825]

'\u2009¼ cups chopped pecans ;  large eggs ; ¾ cup brown sugar ; ¾ cup light corn syrup ; ½ cup all-purpose flour ; ½ cup butter, melted and cooled ; ¼ cup white sugar ;  tablespoons bourbon ; \u2009½ teaspoons vanilla extract ; ¾ cup miniature semisweet chocolate chips ;  ( inch) unbaked deep dish pie crust'

In [183]:
data_ingr[17860]

'½ cup butter, melted ; ¾ cup chocolate chips ; ½ cup all-purpose flour ; ½ cup brown sugar ; ½ cup white sugar ;   eggs, beaten ;  teaspoon vanilla extract ;  cup chopped pecans ; ½ cup flaked coconut ; ¼ cup chocolate chips ;  ( inch) deep dish pie crust'

Top recommendations for Double chocolate brownie mostly chocolate-based desserts, though the chocolate-espresso martini also got in there (only using ingredient column of data, if recommendations are based on instructions, they would likely be considered quite different).

In [None]:
pickel.dump

#### ^Next steps

In [184]:
#lemmatization fix; https://discuss.analyticsvidhya.com/t/lemmatizing-dataframe-using-nltk/67092
#soft cosine similarity?

In [185]:
#also tried Kmeans clustering after normalization in my original code,
#but it wasn't immediately helpful for getting to recommendations, so it was omitted here.

##### Aside: Visualize Topics with Word Clouds?

In [187]:
from matplotlib import pyplot as plt
import wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

In [188]:
topics_lsa_tf = ['chopped, salt, pepper, fresh, ground, oil, black, garlic, olive, freshly, sliced, large, red, finely, sugar, cut, butter, onion, leaves, minced',
'sugar, butter, flour, vanilla, purpose, extract, baking, cream, unsalted, powder, salt, eggs, egg, milk, chocolate, large, white, brown, temperature, room',
'ground, pepper, black, salt, freshly, oil, kosher, sliced, olive, minced, diced, vinegar, red, garlic, sauce, thinly, cut, taste, extra, virgin',
'fresh, sliced, cut, juice, lemon, oil, thinly, leaves, olive, large, peeled, minced, extra, virgin, pieces, plus, lime, sugar, halved, orange'
'fresh, ground, freshly, lemon, juice, kosher, black, leaves, finely, grated, salt, parsley, extra, zest, plus, virgin, cinnamon, olive, orange, unsalted',
'fresh, ground, sauce, juice, sugar, minced, diced, sliced, lime, ginger, taste, green, onion, white, cinnamon, cilantro, water, brown, soy, powder',
'cut, large, peeled, pieces, ground, butter, unsalted, freshly, chicken, medium, slices, small, cubes, potatoes, stick, black, celery, carrots, thick, broth',
'sliced, cheese, thinly, cream, grated, freshly, butter, shredded, package, parmesan, ground, cheddar, black, fresh, slices, kosher, bread, unsalted, mushrooms, heavy',
'diced, cheese, pepper, minced, cream, taste, fresh, butter, salt, shredded, grated, chicken, package, onion, parmesan, garlic, heavy, milk, cheddar, black',
'pepper, red, finely, juice, diced, lemon, salt, freshly, sugar, kosher, bell, lime, black, vinegar, orange, small, seeded, recipe, follows, zest']

In [189]:
topics=topics_lsa_tf[0].split()
topics

['chopped,',
 'salt,',
 'pepper,',
 'fresh,',
 'ground,',
 'oil,',
 'black,',
 'garlic,',
 'olive,',
 'freshly,',
 'sliced,',
 'large,',
 'red,',
 'finely,',
 'sugar,',
 'cut,',
 'butter,',
 'onion,',
 'leaves,',
 'minced']

In [190]:
#want to split by topic? write function to repeat this?
#need to clean data better for word cloud to be informative

In [199]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import pandas as pd 
  
comment_words = ' ' 
stopwords = stopwords_ingr.append('salt') #not working to remove salt, add to original stopwords?
# iterate through the csv file 
for topic in topics_lsa_tf: 
  
    # split the value 
    tokens = topic.split()  
      
    for words in tokens: 
        comment_words = comment_words + words + ' '
  
  
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white',  
                stopwords = stopwords, 
                min_font_size = 10).generate(comment_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

AttributeError: 'TransposedFont' object has no attribute 'getbbox'

In [204]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
import nltk

# Ensure stopwords are downloaded
nltk.download('stopwords')

# Define the stopwords
stopwords_ingr = set(nltk_stopwords.words('english'))
stopwords_ingr.add('salt')  # Add 'salt' to the stopwords

# Example topics list
topics_lsa_tf = ["salt sugar butter", "pepper onion garlic", "milk cream cheese"]

# Combine topics into a single string
comment_words = ' '.join(topics_lsa_tf)

# Generate the word cloud
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords_ingr,
                      min_font_size=10).generate(comment_words)

# Plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'TransposedFont' object has no attribute 'getbbox'

In [203]:
#topic_words = dict(topics[0].split())

In [142]:
# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(stopwords=stopwords_ingr,
#                   background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# #topics = topics_lsa_tf
# topics=topics_lsa_tf[0].split()
# #project4_functions.display_topics(nmf_model, vectorizer.get_feature_names(), 20)
# ##** need the show_topics option in gensim for lda? Example:
# #topics = lda_model.show_topics(formatted=False)
# #alt: just use a list
# #topic_words = dict(topics[i][1])


# fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(key=topics[i].split())##
    
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Get user input for the number of components for TruncatedSVD
n_components = int(input("Enter the number of components for TruncatedSVD: "))

# Assuming 'ingr_ct' is already defined
lsa_10 = TruncatedSVD(n_components)
ct_lsa_10 = lsa_10.fit_transform(ingr_ct)

# Get user input for the ingredient
ingredient = input("Enter the ingredients separated by commas: ")

# Split the input into individual ingredients
ingredient_list = ingredient.split(',')

# Find the indices of the ingredients in the list of ingredients
ingredient_indices = []
for ingr in ingredient_list:
    try:
        ingredient_indices.append(list_of_ingredients.index(ingr.strip()))
    except ValueError:
        print("Ingredient", ingr.strip(), "not found in the list.")
        continue

# Calculate cosine similarity for the specified ingredients
cosine_similarities = cosine_similarity(ct_lsa_10[ingredient_indices], ct_lsa_10)

# Print the result
print("Cosine similarity for", ingredient, "with all ingredients:")
print(cosine_similarities)


In [96]:
conda install openssl
conda install pyopenssl


SyntaxError: invalid syntax (<ipython-input-96-7963dc56b699>, line 1)