Capstone Project - Recipe/Meal Recommendation System

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np


In [2]:
user_data = pd.read_csv('data/RAW_interactions.csv')
user_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [3]:
recipe_data = pd.read_csv('data/RAW_recipes.csv')
recipe_data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


## Data Exploration

In [4]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [5]:
print(len(user_data['user_id'].unique()))

226570


There are 226,570 unique users in this dataset

In [6]:
recipe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [7]:
print(len(recipe_data['id'].unique()))

231637


There are 231,637 unique recipes in this dataset.

With 226,570 users and 231,637 recipes, there are less users than there are recipes. Therefore, it is probably best for our recommender system to be user-user based.

## Preprocessing

The text data in description needs to be cleaned to ensure all punctuation is removed and words are all lowercase. The text data in the other columns looks pretty well cleaned already, but we still need to remove stopwords.

In [8]:
# Creating a function to perform cleaning steps at once
stopwords_list = stopwords.words('english')

no_bad_chars = re.compile('[!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n - ]')
no_nums = re.compile('[\d-]')

def clean_text(text):
    #text = no_nums.sub('', text) 
    text = no_bad_chars.sub(' ', text) 
    text = text.lower() 
    text = ' '.join(word for word in text.split() if word not in stopwords_list)
    return text


In [9]:
recipe_data['description'] = recipe_data['description'].astype(str)
descr_cleaned = recipe_data['description'].apply(clean_text)
descr_cleaned

0         autumn favorite time year cook recipe prepared...
1         recipe calls crust prebaked bit adding ingredi...
2         modified version 'mom's' chili hit 2004 christ...
3         super easy great tasting make ahead side dish ...
4         dh's amish mother raised recipe much prefers s...
                                ...                        
231632    delicious soup originally found better homes g...
231633                      spice mix make taste buds dance
231634                             deviled eggs cajun style
231635    i've heard 'cookies design' company never trie...
231636    i've heard 'cookies design' company never trie...
Name: description, Length: 231637, dtype: object

In [10]:
recipe_data['name'] = recipe_data['name'].astype(str)
names_cleaned = recipe_data['name'].apply(clean_text)
names_cleaned

0         arriba baked winter squash mexican style
1                    bit different breakfast pizza
2                                    kitchen chili
3                                alouette potatoes
4                     amish tomato ketchup canning
                            ...                   
231632                                 zydeco soup
231633                            zydeco spice mix
231634                   zydeco ya ya deviled eggs
231635                cookies design cookies stick
231636     cookies design sugar shortbread cookies
Name: name, Length: 231637, dtype: object

In [11]:
recipe_data['tags'] = recipe_data['tags'].astype(str)
tags_cleaned = recipe_data['tags'].apply(clean_text)
tags_cleaned

0         '60 minutes less' 'time make' 'course' 'main i...
1         '30 minutes less' 'time make' 'course' 'main i...
2         'time make' 'course' 'preparation' 'main dish'...
3         '60 minutes less' 'time make' 'course' 'main i...
4         'weeknight' 'time make' 'course' 'main ingredi...
                                ...                        
231632    'ham' '60 minutes less' 'time make' 'course' '...
231633    '15 minutes less' 'time make' 'course' 'prepar...
231634    '60 minutes less' 'time make' 'course' 'main i...
231635    '30 minutes less' 'time make' 'course' 'prepar...
231636    '30 minutes less' 'time make' 'course' 'prepar...
Name: tags, Length: 231637, dtype: object

In [12]:
# how many recipe names include the word 'vegetarian'?
len(names_cleaned[names_cleaned.str.contains('vegetarian')])


928

In [13]:
# how many recipe descriptions include the word 'vegetarian'?
len(descr_cleaned[descr_cleaned.str.contains('vegetarian')])

3784

In [14]:
# how many recipe names include the word 'vegan'?
len(names_cleaned[names_cleaned.str.contains('vegan')])

1362

In [15]:
# how many recipe descriptions include the word 'vegan'?
len(descr_cleaned[descr_cleaned.str.contains('vegan')])

1909

In [16]:
# how many recipe names include the word 'gluten-free'?
#We already removed dashes, so 'gluten-free' won't exist as one word anymore... best to just search for 'gluten'
print(len(names_cleaned[names_cleaned.str.contains('gluten')]))
names_cleaned[names_cleaned.str.contains('gluten')]

1003


586                 miracle honey oatmeal bread gluten free
755                             1 pound gluten free lasagna
1267      3 layer chocolate cake chocolate mousse fillin...
1303      3 variations gluten free bread recipe bread ma...
1833           gluten free springfield style cashew chicken
                                ...                        
229936             yummy foolproof gluten free french bread
229958                              yummy gluten free pasta
229959                    yummy gluten free pumpkin muffins
231030                           zucchini bread gluten free
231039                           zucchini bread gluten free
Name: name, Length: 1003, dtype: object

In [17]:
recipe_data['tags'] = recipe_data['tags'].astype(str)
tags_cleaned = recipe_data['tags'].apply(clean_text)
tags_cleaned

0         '60 minutes less' 'time make' 'course' 'main i...
1         '30 minutes less' 'time make' 'course' 'main i...
2         'time make' 'course' 'preparation' 'main dish'...
3         '60 minutes less' 'time make' 'course' 'main i...
4         'weeknight' 'time make' 'course' 'main ingredi...
                                ...                        
231632    'ham' '60 minutes less' 'time make' 'course' '...
231633    '15 minutes less' 'time make' 'course' 'prepar...
231634    '60 minutes less' 'time make' 'course' 'main i...
231635    '30 minutes less' 'time make' 'course' 'prepar...
231636    '30 minutes less' 'time make' 'course' 'prepar...
Name: tags, Length: 231637, dtype: object

The text data in the recipe_data columns 'name', 'tags', and 'ingredients' looks pretty well cleaned already. The next thing to do is to tokenize it.

In [18]:
from nltk.tokenize import RegexpTokenizer, word_tokenize

In [19]:
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

I imagine that certain recipes that fall under different diet categories, such as vegan/vegetarian, gluten-free, etc., would have that listed as a tag...
- similarity metric might do this on its own?

In [20]:
# tokenize tags:
tags_tokenized = tags_cleaned.apply(tokenizer.tokenize)
tags_tokenized

0         [60, minutes, less, time, make, course, main, ...
1         [30, minutes, less, time, make, course, main, ...
2         [time, make, course, preparation, main, dish, ...
3         [60, minutes, less, time, make, course, main, ...
4         [weeknight, time, make, course, main, ingredie...
                                ...                        
231632    [ham, 60, minutes, less, time, make, course, m...
231633    [15, minutes, less, time, make, course, prepar...
231634    [60, minutes, less, time, make, course, main, ...
231635    [30, minutes, less, time, make, course, prepar...
231636    [30, minutes, less, time, make, course, prepar...
Name: tags, Length: 231637, dtype: object

In [21]:
# tokenize recipe names:
name_tokenized = names_cleaned.apply(tokenizer.tokenize)
name_tokenized.head(10)

0    [arriba, baked, winter, squash, mexican, style]
1                 [bit, different, breakfast, pizza]
2                                   [kitchen, chili]
3                               [alouette, potatoes]
4                  [amish, tomato, ketchup, canning]
5                          [apple, day, milk, shake]
6                           [aww, marinated, olives]
7                 [backyard, style, barbecued, ribs]
8                         [bananas, ice, cream, pie]
9                              [beat, banana, bread]
Name: name, dtype: object

In [22]:
# tokenize recipe descriptions:
descr_tokenized = descr_cleaned.apply(tokenizer.tokenize)
descr_tokenized.head(10)

0    [autumn, favorite, time, year, cook, recipe, p...
1    [recipe, calls, crust, prebaked, bit, adding, ...
2    [modified, version, mom, chili, hit, 2004, chr...
3    [super, easy, great, tasting, make, ahead, sid...
4    [dh, amish, mother, raised, recipe, much, pref...
5                                                [nan]
6    [italian, mil, thoroughly, impressed, non, ita...
7    [recipe, posted, request, originaly, chef, sam...
8                                                [nan]
9                                       [ann, hodgman]
Name: description, dtype: object

Aside from tags, to categorize recipes into different diets I will likely need to manually filter through the ingredients data.

In [23]:
# tokenize ingredients:
ingr_tokenized = recipe_data['ingredients'].apply(tokenizer.tokenize)
ingr_tokenized.head(10)

0    [winter, squash, mexican, seasoning, mixed, sp...
1    [prepared, pizza, crust, sausage, patty, eggs,...
2    [ground, beef, yellow, onions, diced, tomatoes...
3    [spreadable, cheese, with, garlic, and, herbs,...
4    [tomato, juice, apple, cider, vinegar, sugar, ...
5    [milk, vanilla, ice, cream, frozen, apple, jui...
6    [fennel, seeds, green, olives, ripe, olives, g...
7    [pork, spareribs, soy, sauce, fresh, garlic, f...
8    [chocolate, sandwich, style, cookies, chocolat...
9    [sugar, unsalted, butter, bananas, eggs, fresh...
Name: ingredients, dtype: object

In [None]:
len(recipe_data[recipe_data['ingredients'].str.contains('vegetarian')])



In [None]:
# how many recipe ingredients lists include the word 'vegetarian'?
len(ingr_tokenized[ingr_tokenized.str.contains('vegetarian')])

In [None]:
# how many recipe ingredients lists include the word 'vegan'?
len(ingr_tokenized[ingr_tokenized.str.contains('vegan')])

In [None]:
#most common tags?
top_tags = tags_tokenized.value_counts()[:10].sort_values(ascending=False)
top_tags

In [12]:
#create 'diet' column for recipes to categorize them as different diet types
recipe_data["diet"] = np.nan

In [41]:
user_sorted = user_data.groupby('user_id')['recipe_id']

In [44]:
user_sorted.head()

0           40893
1           40893
2           44394
3           85009
4           85009
            ...  
1132359     82303
1132360     82303
1132362     72730
1132363    386618
1132366     78003
Name: recipe_id, Length: 375776, dtype: int64

In [None]:
#grouping all recipes each user has rated together in a new column

#user_data['sorted']=user_data.recipe_id.str.split(',')
user_data['sorted']=user_data.explode('recipe_id').groupby('user_id')['recipe_id'].agg(list).reindex(user_data['user_id']).tolist()
user_data['List']=(user_data.sorted.apply(set)-user_data['recipe_id'].apply(set)).apply(list)
user_data.loc[~user_data.List.astype(bool),'List']=df.recipe_id
user_data

____

In [None]:
#function from rec. systems lesson, removes previously rated items and orders items for recommendations based on rating (high to low)
def recommend_recipes(user, user_similarities, user_ratings, df, n_users=20, n_items=10):
    """n is the number of similar users who you wish to use to generate recommendations."""
    # User_Similarities Offset By 1 and Must Remove Current User
    top_n_similar_users = user_similarities[user-1].drop(user-1).sort_values().index[:n_users] 
    # Again, fixing the offset of user_ids
    top_n_similar_users = [i+1 for i in top_n_similar_users] 
    already_rated = set(df[df.user_id == 0].item_id.unique())
    unrated = set(df.item_id.unique()) - already_rated
    projected_user_ratings = user_ratings[user_ratings.index.isin(top_n_similar_users)].mean()[list(unrated)].sort_values(ascending=False)
    return projected_user_ratings[:n_items]

In [None]:
recommend_movies(1, user_similarities, user_ratings, df)