# Recommendation System TF-IDF

In [252]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [253]:
# import libraries
import pandas as pd
import numpy as np
import re
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
from sklearn.utils import shuffle

from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Data Cleaning and Feature Eningeering

#### Dont have to run this section again

In [322]:
# Import data
recipe_data = pd.read_csv(r"data\RecipeNLG_dataset.csv")

In [323]:
temp_recipe_data = recipe_data

In [324]:
temp_recipe_data = temp_recipe_data.drop(['ingredients','directions','Unnamed: 0','source'], axis=1)
display(temp_recipe_data.head())
display(temp_recipe_data.shape)

Unnamed: 0,title,link,NER
0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,www.cookbooks.com/Recipe-Details.aspx?id=10570,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),www.cookbooks.com/Recipe-Details.aspx?id=659239,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


(2231142, 3)

In [326]:
# randomly remove 96% of the dataset so I can assign a manageable amount of memory to it
remove_n = 2100000
drop_indices = np.random.choice(temp_recipe_data.index, remove_n, replace=False)
recipe_data_reduced = temp_recipe_data.drop(drop_indices)

In [327]:
display(recipe_data_reduced.shape)
display(recipe_data_reduced.head())

(131142, 3)

Unnamed: 0,title,link,NER
0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
3,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
15,Fresh Strawberry Pie,www.cookbooks.com/Recipe-Details.aspx?id=161321,"[""shell"", ""cleaned strawberries"", ""water"", ""co..."
16,Easy German Chocolate Cake,www.cookbooks.com/Recipe-Details.aspx?id=983179,"[""chocolate fudge cake"", ""white cake"", ""Wesson..."


In [328]:
# save new dataset
recipe_data_reduced.to_csv(r'data\recipe_data_lite_reduced.csv', index=False)

In [329]:
# add calories and health scale (1-10) column
recipe_data_reduced["calories"] = np.random.randint(300, 800, len(recipe_data_reduced))
recipe_data_reduced["health scale"] = np.random.randint(1, 11, len(recipe_data_reduced))
display(recipe_data_reduced.head())

Unnamed: 0,title,link,NER,calories,health scale
0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",615,10
1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,"[""beef"", ""chicken breasts"", ""cream of mushroom...",601,6
3,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",609,6
15,Fresh Strawberry Pie,www.cookbooks.com/Recipe-Details.aspx?id=161321,"[""shell"", ""cleaned strawberries"", ""water"", ""co...",546,10
16,Easy German Chocolate Cake,www.cookbooks.com/Recipe-Details.aspx?id=983179,"[""chocolate fudge cake"", ""white cake"", ""Wesson...",771,9


In [330]:
# save new dataset (including calories)
recipe_data_reduced.to_csv(r'data\recipe_data_lite_reduced_calories.csv', index=False)

## Import NEW recipe data

In [283]:
# Import data
data = pd.read_csv(r"data\recipe_data_reduced_calories.csv")

## FEATURE EXTRACTION via tfidf vectorizer
### Create a data-term matrix (vocabulary)

In [284]:
# Instantiate vecorizer object - call tokenize
tf_model = TfidfVectorizer(max_features=1000,max_df=0.25,min_df=0.01,stop_words='english')

In [285]:
# create corpus (categories) used for the TF-IDF
corpus = data['NER']

In [286]:
tf_matrix = tf_model.fit_transform(corpus).todense()
print('The shape of the matrix is:', tf_matrix.shape)
tf_df = pd.DataFrame(tf_matrix)
tf_df.columns = sorted(tf_model.vocabulary_)
display(tf_df.head())

The shape of the matrix is: (21142, 188)


Unnamed: 0,allspice,almond,almonds,apple,apples,bacon,baking,balsamic,bananas,basil,...,white,whites,wine,worcestershire,yeast,yellow,yogurt,yolks,zest,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.381713,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61466,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.34208,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.347801,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [339]:
# iterating the columns
for col in tf_df.columns:
    print(col)

allspice
almond
almonds
apple
apples
bacon
baking
balsamic
bananas
basil
bay
beans
beef
bell
black
boiling
bread
breadcrumbs
breasts
broccoli
broth
brown
buttermilk
cabbage
cake
canola
carrot
carrots
cayenne
celery
cheddar
cheese
cherries
cherry
chicken
chili
chips
chives
chocolate
cider
cilantro
cinnamon
clove
cloves
cocoa
coconut
cold
condensed
cooking
coriander
corn
cornstarch
cracker
crackers
cream
crumbs
crust
cucumber
cumin
curry
dill
dressing
dry
egg
eggs
extra
extract
filling
flakes
flour
fresh
freshly
frozen
garlic
ginger
graham
grated
green
ground
ham
hamburger
heavy
honey
hot
instant
italian
juice
ketchup
kidney
kosher
leaf
lean
leaves
lemon
lettuce
light
lime
maple
margarine
marshmallows
mayonnaise
milk
mint
mix
mozzarella
mushroom
mushrooms
mustard
noodles
nutmeg
nuts
oats
oleo
olive
olives
onion
onions
orange
oregano
paprika
parmesan
parsley
pasta
paste
peanut
peas
pecans
pepper
peppers
pie
pineapple
pork
potato
potatoes
powder
powdered
pudding
pumpkin
purpose
raisins
red

In [287]:
# Get cosine similarity of the matrix
cosine_sim = cosine_similarity(tf_matrix, tf_matrix)

# need to run again for a very small portion of the matrix

In [288]:
# Fit on TF-IDF Vectors
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
nn.fit(tf_matrix)

NearestNeighbors(algorithm='ball_tree', n_neighbors=10)

In [289]:
# Construct a reverse mapping of indices and post titles, and drop duplicate titles if any
indices = pd.Series(data.index, index=data['title']).drop_duplicates()
print(type(indices))
print(type(cosine_sim))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


In [290]:
# Pickle the dtm and tf for use in the prediction
pickle.dump(tf_matrix, open('tf_matrix.pkl', 'wb'))
pickle.dump(tf_model, open('tf_model.pkl', 'wb'))
indices.to_pickle('indices.pkl')
np.save("cosine_sim.npy", cosine_sim)

### Test

In [291]:
# Create the test case
ideal_category = ['brown sugar','milk','chicken','bacon','butter','honey','salmon','cod','flour','salt','avocado','creamer','lemons']

new = tf_model.transform(ideal_category)
results = nn.kneighbors(new.todense(), return_distance=False)

In [296]:
print(data.loc[6237,'NER'])

["yellow butter", "butter", "egg", "Philadelphia cream cheese", "eggs", "powdered sugar"]


In [295]:
# Print recommendations
for each in results[1]:
    print(each)
    print(data.loc[each, 'title'], '\n----')

1202
Sorghum Milkshake 
----
9119
Avocado Shake 
----
8511
Instant Hot Chocolate 
----
8347
Sweetened Condensed Milk 
----
5869
Caramel Clusters 
----
2711
Twinkie Dessert 
----
3158
Rich And Moist Coconut Cake 
----
1006
Frozen Frappuccino 
----
4293
Quicky "Pudding Surprise" For Kids 
----
3889
Cherry Yum-Yum 
----


## Pipeline time <3

In [337]:
def recommendation(ingredient_list, calorie, health):
    # processed dataset
    df = pd.read_csv(r"data\recipe_data_lite_reduced_calories.csv")
    # check if values are within limits
    if not 800 >= calorie >= 300:
        raise Exception("Enter a calorie value between 300 and 800")
    if not 10 >= health >= 0:
        raise Exception("Enter a health scale value between 0 and 10")
    # remove values less than input values (calorie & health)
    df.drop(df[df['calories'] <= calorie].index, inplace = True)
    df.drop(df[df['health scale'] <= health].index, inplace = True)
    df = df.reset_index(drop = True)
    display(df.head())
    display(df.shape)
    # vectorize
    tf_model = TfidfVectorizer(max_features=1000,max_df=0.25,min_df=0.01,stop_words='english')
    corpus = df['NER']
    display(corpus.head())
    # fit + transform
    tf_matrix = tf_model.fit_transform(corpus).todense()
    tf_df = pd.DataFrame(tf_matrix)
    tf_df.columns = sorted(tf_model.vocabulary_)
    # predict and fit
    nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
    nn.fit(tf_matrix)
    new = tf_model.transform(ingredient_list)
    results = nn.kneighbors(new.todense(), return_distance=False)
    # print results
    for i in results[1]:
        print(i)
        print(df.loc[i, ('title')])
        print(df.loc[i, ('link')])
        print(df.loc[i, ('NER')], '\n----')

In [338]:
recommendation(['brown sugar','milk','chicken','bacon','butter','honey','salmon','cod','flour','salt','avocado','creamer','lemons'],300,4)

Unnamed: 0,title,link,NER,calories,health scale
0,No-Bake Nut Cookies,www.cookbooks.com/Recipe-Details.aspx?id=44874,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",615,10
1,Jewell Ball'S Chicken,www.cookbooks.com/Recipe-Details.aspx?id=699419,"[""beef"", ""chicken breasts"", ""cream of mushroom...",601,6
2,Chicken Funny,www.cookbooks.com/Recipe-Details.aspx?id=897570,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",609,6
3,Fresh Strawberry Pie,www.cookbooks.com/Recipe-Details.aspx?id=161321,"[""shell"", ""cleaned strawberries"", ""water"", ""co...",546,10
4,Easy German Chocolate Cake,www.cookbooks.com/Recipe-Details.aspx?id=983179,"[""chocolate fudge cake"", ""white cake"", ""Wesson...",771,9


(78581, 5)

0    ["brown sugar", "milk", "vanilla", "nuts", "bu...
1    ["beef", "chicken breasts", "cream of mushroom...
2    ["chicken", "chicken gravy", "cream of mushroo...
3    ["shell", "cleaned strawberries", "water", "co...
4    ["chocolate fudge cake", "white cake", "Wesson...
Name: NER, dtype: object

236
Macaroni Cheez
www.cookbooks.com/Recipe-Details.aspx?id=677622
["elbow macaroni", "milk"] 
----
14583
Baked Fish
www.cookbooks.com/Recipe-Details.aspx?id=507996
["fish", "Toasties", "milk"] 
----
13999
Pink Lemonade Pie
www.cookbooks.com/Recipe-Details.aspx?id=716010
["concentrate", "milk"] 
----
11442
Frozen Lemon Mousse
www.cookbooks.com/Recipe-Details.aspx?id=520461
["milk", "lemons", "sugar"] 
----
11440
Frozen Passion
www.cookbooks.com/Recipe-Details.aspx?id=568390
["milk", "beverage"] 
----
12487
Kahlua Dip
www.cookbooks.com/Recipe-Details.aspx?id=791506
["milk", "Kahlua"] 
----
6111
Butter Pecan Cake
www.cookbooks.com/Recipe-Details.aspx?id=135665
["butter", "milk", "butterscotch topping"] 
----
3857
Hot Chocolate Mix
www.cookbooks.com/Recipe-Details.aspx?id=78747
["milk", "confectioners sugar", "Nestles"] 
----
5614
Lemon Sherbet
www.cookbooks.com/Recipe-Details.aspx?id=1050186
["lemons", "sugar", "lemons", "milk"] 
----
6079
Frosty'S Ice Cream
www.cookbooks.com/Recipe-Deta