## Content based recommender system base on recipes

- vectorize the ingredients (feature extraction)

- Calculate the similarity score for every recipe

- Sort the recipe based on the score and output the top results

In [71]:
import pandas as pd
import numpy as np

In [72]:
df = pd.read_csv('recipes.csv')
df.shape

(231637, 12)

In [73]:
df[['calories','total fat (PDV)','sugar (PDV)','sodium (PDV)','protein (PDV)','saturated fat (PDV)','carbohydrates (PDV)']] = df.nutrition.str.split(",",expand=True) 

In [74]:
df['calories'] = df['calories'].apply(lambda x: x.replace('[', ''))
df['carbohydrates (PDV)'] = df['carbohydrates (PDV)'].apply(lambda x: x.replace(']', ''))
df = df.drop(columns=['id', 'submitted', 'description', 'nutrition', 'contributor_id', 'n_steps', 'n_ingredients'])
df.head()

Unnamed: 0,name,minutes,tags,steps,ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
0,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['make a choice and proceed with recipe', 'dep...","['winter squash', 'mexican seasoning', 'mixed ...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['preheat oven to 425 degrees f', 'press dough...","['prepared pizza crust', 'sausage patty', 'egg...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,130,"['time-to-make', 'course', 'preparation', 'mai...","['brown ground beef in large pot', 'add choppe...","['ground beef', 'yellow onions', 'diced tomato...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,45,"['60-minutes-or-less', 'time-to-make', 'course...",['place potatoes in a large pot of lightly sal...,"['spreadable cheese with garlic and herbs', 'n...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,190,"['weeknight', 'time-to-make', 'course', 'main-...",['mix all ingredients& boil for 2 1 / 2 hours ...,"['tomato juice', 'apple cider vinegar', 'sugar...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [75]:
# nutritions = df[['name', 'calories','total fat (PDV)','sugar (PDV)','sodium (PDV)','protein (PDV)','saturated fat (PDV)','carbohydrates (PDV)']]
# nutritions.index = df['id']
# nutritions.head()

In [76]:
df.isna().sum()

name                   1
minutes                0
tags                   0
steps                  0
ingredients            0
calories               0
total fat (PDV)        0
sugar (PDV)            0
sodium (PDV)           0
protein (PDV)          0
saturated fat (PDV)    0
carbohydrates (PDV)    0
dtype: int64

In [77]:
df = df.dropna()
df = df[:10000]
df.shape

(10000, 12)

In [78]:
# df.index = df['id']
# df = df.drop(columns = 'id')

In [79]:
# import ast
# df['ingredients'] = df['ingredients'].apply(lambda x: ast.literal_eval(x))

In [80]:
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
def treat_ingredients(input):
    output = []
    for ingredient in input:
        ingredient_list = ingredient.split(' ')
        output.append(" ".join(ingredient_list))
    return "".join(output)

df['ingredients'] = df['ingredients'].apply(lambda x: [ls.stem(w) for w in x])
df['ingredients'] = df['ingredients'].apply(lambda x : treat_ingredients(x))

print(df['ingredients'][0])

df.head()

['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil', 'salt']


Unnamed: 0,name,minutes,tags,steps,ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
0,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['make a choice and proceed with recipe', 'dep...","['winter squash', 'mexican seasoning', 'mixed ...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['preheat oven to 425 degrees f', 'press dough...","['prepared pizza crust', 'sausage patty', 'egg...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,130,"['time-to-make', 'course', 'preparation', 'mai...","['brown ground beef in large pot', 'add choppe...","['ground beef', 'yellow onions', 'diced tomato...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,45,"['60-minutes-or-less', 'time-to-make', 'course...",['place potatoes in a large pot of lightly sal...,"['spreadable cheese with garlic and herbs', 'n...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,190,"['weeknight', 'time-to-make', 'course', 'main-...",['mix all ingredients& boil for 2 1 / 2 hours ...,"['tomato juice', 'apple cider vinegar', 'sugar...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [81]:
df['name'] = df['name'].apply(lambda x : x.replace('  ', ' '))
# df['name'] = df['name'].apply(lambda x : x.replace(r'^\sS$', '\'s'))
df['name'] = df['name'].str.title()
df['name'].head()

0    Arriba  Baked Winter Squash Mexican Style
1              A Bit Different Breakfast Pizza
2                     All In The Kitchen Chili
3                            Alouette Potatoes
4             Amish Tomato Ketchup For Canning
Name: name, dtype: object

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_df = 0.7,min_df = 2)
ing_tfidf = tfidf.fit_transform(df['ingredients'])

In [83]:
print(ing_tfidf.shape)
tfidf.get_feature_names()[10:20]

(10000, 1552)




['acid',
 'acini',
 'acorn',
 'acting',
 'active',
 'added',
 'adobo',
 'adzuki',
 'agave',
 'ahi']

In [84]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(ing_tfidf, ing_tfidf)
cosine_sim.shape

(10000, 10000)

In [85]:
cosine_sim[1]

array([0.01460993, 1.        , 0.03490268, ..., 0.04130069, 0.        ,
       0.        ])

In [86]:
# Get the pairwsie similarity scores of all movies with that movie
sim_scores = list(enumerate(cosine_sim[1013]))

# Sort the movies based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

# Get the scores of the 10 most similar recipes
sim_scores = sim_scores[1:6]

# Get the ingredient indices
ing_indices = [i[0] for i in sim_scores]

result = df.iloc[ing_indices]

print(f'The choosen recipe is {df.name.iloc[1013]}')
print(f'Recommended recipes are: {result.name.values.tolist()}')



The choosen recipe is 1937 Strawberry Shortcake
Recommended recipes are: ['Apple Crunch Cakelets Muffins', 'Apple Grunt', 'Apple Doughnuts Baked', 'Make It Your Way Shortcakes', 'Apple Puff Bake']


In [87]:
# testing
input = ['eggs', 'sausage', 'bacon', 'salt']
results1 = df.loc[df.ingredients.str.contains(r'(?=.*{})(?=.*{})(?=.*{})(?=.*{})'.format(input[0], input[1], input[2], input[3])) == True]
results1.head(2)

Unnamed: 0,name,minutes,tags,steps,ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
1940,A Fully Monty Healthier Version,35,"['60-minutes-or-less', 'time-to-make', 'course...","['prep your full monty:', 'preheat the broiler...","['tomatoes', 'portabella mushrooms', 'canadian...",770.4,62.0,70.0,90.0,97.0,62.0,18.0
4576,Amazing Breakfast Bake,115,"['time-to-make', 'course', 'main-ingredient', ...","['preheat the oven to 350 degree f', 'lightly ...","['unsalted butter', 'yellow onion', 'green bel...",850.0,61.0,31.0,86.0,80.0,98.0,26.0


In [88]:
def get_recommendations(input):
    if len(input) == 1:
        results = df.loc[df.ingredients.str.contains(r'(?=.*{})'.format(input[0])) == True]
    elif len(input) == 2:
        results = df.loc[df.ingredients.str.contains(r'(?=.*{})(?=.*{})'.format(input[0], input[1])) == True]
    elif len(input) == 3:
        results = df.loc[df.ingredients.str.contains(r'(?=.*{})(?=.*{})(?=.*{})'.format(input[0], input[1], input[2])) == True]
    else:
        results = df.loc[df.ingredients.str.contains(r'(?=.*{})(?=.*{})(?=.*{})(?=.*{})'.format(input[0], input[1], input[2], input[3])) == True]
    return results.drop(columns=['tags']).head()

In [114]:
# testing
input = ['cake']
get_recommendations(input).name.values[0]


'Beat This Banana Bread'

In [115]:
def more_recipes_with_similar_ingredients(input):
    results = get_recommendations(input)
    recipe_name = results.name.values[0]
    # get the initial recipe's index
    initial_recipe = results.loc[results['name'] == recipe_name]
    idx = initial_recipe.index[0] - 1

    # Get the pairwsie similarity scores of all ing with ing
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the ing based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of top 5 most similar ing
    sim_scores = sim_scores[1:6]

    # Get the ingredient indices for recipes
    ing_indices = [i[0] for i in sim_scores]

    # get dataframe 
    result = df.iloc[ing_indices].drop(columns='tags')
    # result = result.sort_values('calories', ascending=True)

    # print(f'The choosen recipe is {recipe_name}\n')
    # print('Recommended recipes are: ', *result.name.values, sep='\n')
    return result


more_recipes_with_similar_ingredients(input)

The choosen recipe is Beat This Banana Bread

Recommended recipes are: 
Almost Starbucks Frappuccino
3 Ingredient Ice Cream Sandwich Cake
A Homemade  Sundae
4 Inch Strawberry Chocolate Cheesecake
Absolutely Anything Ice Cream Layer Log


Unnamed: 0,name,minutes,steps,ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
311,Almost Starbucks Frappuccino,5,['in a blender combine coffee with ice cream a...,"['strong coffee', 'chocolate ice cream', 'choc...",313.0,19.0,136.0,8.0,10.0,35.0,15.0
1254,3 Ingredient Ice Cream Sandwich Cake,365,['arrange 4 ice cream sandwiches long sides to...,"['ice cream sandwiches', 'chocolate whipped cr...",75.5,4.0,26.0,3.0,1.0,5.0,3.0
1960,A Homemade Sundae,6,['in tall sundae glass start with a drizzle of...,"['vanilla ice cream', 'hersheys chocolate syru...",72.4,6.0,30.0,1.0,2.0,12.0,2.0
1431,4 Inch Strawberry Chocolate Cheesecake,495,['crust: mix cookies and butter until crumbs a...,"['chocolate sandwich style cookies', 'butter',...",249.3,28.0,59.0,8.0,7.0,50.0,6.0
2301,Absolutely Anything Ice Cream Layer Log,30,"['take ice cream out of freezer to soften', 'c...","['vanilla ice cream', 'ice cream topping']",2185.9,178.0,896.0,35.0,73.0,358.0,83.0


In [None]:
# randomized options if none of the recipes are interesting
def surprise_me(category):
    random = df.loc[df.tags.str.contains(category) == True].sample(3)
    return random.drop(columns=['tags'])

surprise_me('dietary')

Unnamed: 0,name,minutes,steps,ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
5439,Angel Hair Primavera,25,['heat a large pot of water over a medium heat...,"[angel hair pasta, broccoli, carrot, chicken b...",842.9,64.0,31.0,13.0,44.0,37.0,31.0
4136,Aloha Carrots,30,"['combine carrots , water and salt in a saucep...","[baby carrots, brown sugar, butter, crushed pi...",95.3,3.0,63.0,16.0,1.0,6.0,6.0
7142,Apple Pie With Walnut Streusel,105,['to make topping: in a small bowl with your f...,"[all-purpose flour, brown sugar, cinnamon, fre...",331.4,16.0,171.0,4.0,4.0,19.0,19.0


In [122]:
df.to_csv('final_df1.csv')

In [120]:
df.calories = df.calories.astype('float')
df.minutes = df.minutes.astype('float')
df['total fat (PDV)'] = df['total fat (PDV)'].astype('float')
df['sugar (PDV)'] = df['sugar (PDV)'].astype('float')
df['protein (PDV)'] = df['protein (PDV)'].astype('float')
df['saturated fat (PDV)'] = df['saturated fat (PDV)'].astype('float')
df['carbohydrates (PDV)'] = df['carbohydrates (PDV)'].astype('float')

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 10000
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 10000 non-null  object 
 1   minutes              10000 non-null  float64
 2   tags                 10000 non-null  object 
 3   steps                10000 non-null  object 
 4   ingredients          10000 non-null  object 
 5   calories             10000 non-null  float64
 6   total fat (PDV)      10000 non-null  float64
 7   sugar (PDV)          10000 non-null  float64
 8   sodium (PDV)         10000 non-null  object 
 9   protein (PDV)        10000 non-null  float64
 10  saturated fat (PDV)  10000 non-null  float64
 11  carbohydrates (PDV)  10000 non-null  float64
dtypes: float64(7), object(5)
memory usage: 1.2+ MB


In [99]:
import pickle              # import module first

f = open('consine_sim', 'wb')   # Pickle file is newly created where foo1.py is
pickle.dump(cosine_sim, f)          # dump data to f
f.close()   

# f = open('get_rec.pkl', 'wb')   # Pickle file is newly created where foo1.py is
# pickle.dump(get_recommendations(), f)          # dump data to f
# f.close() 

# f = open('more_rec.pkl', 'wb')   # Pickle file is newly created where foo1.py is
# pickle.dump(more_recipes_with_similar_ingredients(), f)          # dump data to f
# f.close() 

# f = open('surprise.pkl', 'wb')   # Pickle file is newly created where foo1.py is
# pickle.dump(surprise_me(), f)          # dump data to f
# f.close() 



In [None]:
df.tags[530]

"['time-to-make', 'course', 'preparation', 'low-protein', 'healthy', 'side-dishes', 'easy', 'beginner-cook', 'low-fat', 'dietary', 'low-cholesterol', 'low-saturated-fat', 'low-calorie', 'healthy-2', 'low-in-something', 'number-of-servings', '4-hours-or-less']"

### Word2vec

In [None]:
# def get_and_sort_corpus(df):
#     corpus_sorted = []
#     for doc in df.ingredients.values:
#         doc.sort()
#         corpus_sorted.append(doc)
#     return corpus_sorted

# corpus = get_and_sort_corpus(df)

In [None]:
# from gensim.models.word2vec import Word2Vec
# # train word2vec model 
# sg = 0 # CBOW: build a language model that correctly predicts the center word given the context words in which the center word appears
# workers = 8 # number of CPUs
# window = 6 # window size: average length of each document 
# min_count = 1 # unique ingredients are important to decide recipes 

# model_cbow = Word2Vec(corpus, sg=sg, workers=workers, window=window, min_count=min_count, vector_size=100)

In [None]:
# #Summarize the loaded model
# print(model_cbow)

# #Summarize vocabulary
# words = list(model_cbow.wv.index_to_key)
# words.sort()
# # print(words)

# #Acess vector for one word
# model_cbow.wv.similarity('cream', 'butter')

Word2Vec(vocab=5332, vector_size=100, alpha=0.025)


0.92756176