Capstone Project - Recipe/Meal Recommendation System

In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import numpy as np
import pickle
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import WordNetLemmatizer
from surprise.model_selection import train_test_split
import io
from surprise import SVD, BaselineOnly, SVDpp, NMF
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicolemichaud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
user_data = pd.read_csv('data/RAW_interactions.csv')
user_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [3]:
recipe_data = pd.read_csv('data/RAW_recipes.csv')
recipe_data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
# Keep only the recipe name, id and prep time; all other RAW_recipes columns are left out.
recipe_df = recipe_data.loc[:, ['name', 'id', 'minutes']]
recipe_df.head()

Unnamed: 0,name,id,minutes
0,arriba baked winter squash mexican style,137739,55
1,a bit different breakfast pizza,31490,30
2,all in the kitchen chili,112140,130
3,alouette potatoes,59389,45
4,amish tomato ketchup for canning,44061,190


In [5]:
# read in values as Surprise dataset

# Reader() defaults to rating_scale=(1, 5). Derive the scale from the observed
# ratings instead, so predictions are clipped to the range the data actually uses.
reader = Reader(rating_scale=(int(user_data['rating'].min()), int(user_data['rating'].max())))
# NOTE(review): recipe_df holds (name, id, minutes), not (user, item, rating), so the
# resulting Dataset treats `minutes` as a rating. It is only referenced by commented-out
# code further down — confirm whether this line is still needed.
recipe_df = Dataset.load_from_df(recipe_df, reader)


In [6]:
# Surprise expects exactly three columns, in (user, item, rating) order.
user_df = user_data[['user_id', 'recipe_id', 'rating']]
user_df.head()

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,126440,85009,5
4,57222,85009,5


In [7]:
# Wrap the (user_id, recipe_id, rating) frame as a Surprise Dataset.
# NOTE(review): this rebinds user_df from a DataFrame to a Dataset, so re-running this
# cell on its own would pass a Dataset (not a DataFrame) to load_from_df.
user_df = Dataset.load_from_df(user_df, reader)
user_df

<surprise.dataset.DatasetAutoFolds at 0x7fc725abae80>

In [8]:
#recipes_PP = pd.read_csv('data/PP_recipes.csv')
#recipes_PP.head()

In [9]:
#users_PP = pd.read_csv('data/PP_users.csv')
#users_PP.head()

In [10]:
#users_PP.info()

In [11]:
#pickled_map = pd.read_pickle('data/ingr_map.pkl')
#pickled_map

## Data Exploration

In [12]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [13]:
# Number of distinct reviewers.
print(user_data['user_id'].nunique())

226570


In [17]:
user_data['date'] = pd.to_datetime(user_data['date'])

In [22]:
# Most recent review date.
user_data['date'].max()

Timestamp('2018-12-20 00:00:00')

In [23]:
# Earliest review date.
user_data['date'].min()

Timestamp('2000-01-25 00:00:00')

There are 226,570 unique users in this dataset.

In [14]:
recipe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [15]:
# Number of distinct recipes.
print(recipe_data['id'].nunique())

231637


In [16]:
# Drop columns that are not used for diet tagging. errors='ignore' keeps the cell
# re-runnable: recipe_data is rebound here, so a second run would otherwise raise
# KeyError because the columns are already gone.
recipe_data = recipe_data.drop(columns=['contributor_id', 'submitted', 'nutrition', 'steps'], errors='ignore')

There are 231,637 unique recipes in this dataset.

With 226,570 users and 231,637 recipes, there are fewer users than recipes. Therefore, it is probably best for our recommender system to be user-user based.

## Preprocessing

The text data in description needs to be cleaned to ensure all punctuation is removed and words are all lowercase. The text data in the other columns looks pretty well cleaned already, but we still need to remove stopwords.

In [17]:
# Creating a function to perform cleaning steps at once
try:
    stopwords_list = stopwords.words('english')
except LookupError:
    # Only 'punkt' is downloaded at the top of the notebook; fetch the stopwords
    # corpus on machines that do not have it yet.
    nltk.download('stopwords')
    stopwords_list = stopwords.words('english')
# Set copy for fast membership tests; the list is kept under its original name.
stopwords_set = set(stopwords_list)

# Raw strings so that "\]" and "\d" are regex escapes rather than invalid Python string
# escapes. The character class matches exactly what the original did, but spells out
# ',' and '-' instead of relying on the accidental "+-." range.
no_bad_chars = re.compile(r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~\n ]')
no_nums = re.compile(r'[\d-]')

def clean_text(text):
    """Lowercase `text`, replace punctuation with spaces and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN from a column with missing entries)
        are treated as empty.

    Returns
    -------
    str
        The remaining words joined by single spaces. Apostrophes are not in the
        punctuation class and are therefore kept.
    """
    if not isinstance(text, str):
        return ''
    text = no_bad_chars.sub(' ', text).lower()
    return ' '.join(word for word in text.split() if word not in stopwords_set)


In [18]:
#recipe_data['tags'] = recipe_data['tags'].astype(str)
# Clean every tag string element-wise.
tags_cleaned = recipe_data['tags'].map(clean_text)


In [19]:
# 1 when the raw tag string mentions gluten-free (hyphenated or spaced), otherwise 0.
GF = [
    int("gluten-free" in row or "gluten free" in row)
    for row in recipe_data['tags']
]

In [20]:
recipe_data['GF'] = GF

In [21]:
recipe_data['GF'].value_counts()

0    225894
1      5743
Name: GF, dtype: int64

In [22]:
#Ingredient lists for diet filtering:
# Each list holds keywords that EXCLUDE a recipe from the diet it is named after.

# Meat, poultry, fish and seafood keywords: not allowed in a vegetarian recipe.
vegetarian = [
    'ham', 'beef', 'meat', 'chicken', 'pork', 'bacon', 'sausage', 'lamb', 'veal', 'turkey',
    'steak', 'rib', 'frankfurter', 'duck', 'poultry', 'goat', 'liver', 'hen', 'quail', 'brisket',
    'goose', 'fish', 'shrimp', 'seafood', 'crab', 'lobster', 'clam', 'oyster', 'scallop', 'mussel',
    'cod', 'salmon', 'halibut', 'shellfish', 'roe', 'tuna', 'caviar', 'pollock', 'yellowtail', 'squid',
    'calamari', 'octopus', 'crawfish', 'crayfish', 'sardine', 'trout', 'flounder', 'anchovy', 'bass', 'haddock',
    'sole',
]

# A vegan recipe additionally excludes egg, honey and dairy keywords.
vegan = vegetarian + [
    'egg', 'honey', 'milk', 'cheese', 'yogurt', 'mayonnaise', 'butter', 'margarine', 'cream',
]

In [23]:
# Token pattern: runs of two or more word characters between word boundaries.
# NOTE(review): `tokenizer` is only referenced by commented-out code in the next cell.
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

In [24]:
#tags_tokenized = tokenizer.tokenize(tags_cleaned)

In [25]:
# Pre-create the diet flag columns; they are filled in by the str.contains step in the next cell.
recipe_data['vegetarian'] = None
recipe_data['vegan'] = None



In [26]:
# One regex alternation per diet. str.contains matches substrings, so a keyword such
# as 'ham' also hits any ingredient that merely contains those letters.
vege_pattern, vegan_pattern = ('|'.join(keywords) for keywords in (vegetarian, vegan))

# True here means "contains an excluded ingredient"; a later cell inverts this into 1/0 diet flags.
recipe_data['vegetarian'] = recipe_data['ingredients'].str.contains(vege_pattern, regex=True)
recipe_data['vegan'] = recipe_data['ingredients'].str.contains(vegan_pattern, regex=True)

In [27]:
recipe_data.head()

Unnamed: 0,name,id,minutes,tags,n_steps,description,ingredients,n_ingredients,GF,vegetarian,vegan
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...",11,autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,0,False,True
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,0,True,True
2,all in the kitchen chili,112140,130,"['time-to-make', 'course', 'preparation', 'mai...",6,this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,0,True,True
3,alouette potatoes,59389,45,"['60-minutes-or-less', 'time-to-make', 'course...",11,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,0,False,True
4,amish tomato ketchup for canning,44061,190,"['weeknight', 'time-to-make', 'course', 'main-...",5,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,0,False,False


In [28]:
# Mapping from the stringified "contains excluded ingredient" flag to a 1/0 diet flag.
# NOTE(review): not referenced by the visible cells below, which inline the same dict.
new_vals = {'False': 1, 'True': 0}

In [29]:
recipe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   name           231636 non-null  object
 1   id             231637 non-null  int64 
 2   minutes        231637 non-null  int64 
 3   tags           231637 non-null  object
 4   n_steps        231637 non-null  int64 
 5   description    226658 non-null  object
 6   ingredients    231637 non-null  object
 7   n_ingredients  231637 non-null  int64 
 8   GF             231637 non-null  int64 
 9   vegetarian     231637 non-null  bool  
 10  vegan          231637 non-null  bool  
dtypes: bool(2), int64(5), object(4)
memory usage: 16.3+ MB


In [30]:
# Turn the "contains a non-vegetarian ingredient" boolean into a 1/0 vegetarian flag
# (False -> 1, True -> 0). The dtype guard makes the cell safe to re-run: the previous
# astype(str).replace(...) version rewrote an already-converted column as the strings
# '1'/'0' on a second run.
if pd.api.types.is_bool_dtype(recipe_data['vegetarian']):
    recipe_data['vegetarian'] = (~recipe_data['vegetarian']).astype('int64')

In [31]:
# Turn the "contains a non-vegan ingredient" boolean into a 1/0 vegan flag
# (False -> 1, True -> 0). The dtype guard makes the cell safe to re-run: the previous
# astype(str).replace(...) version rewrote an already-converted column as the strings
# '1'/'0' on a second run.
if pd.api.types.is_bool_dtype(recipe_data['vegan']):
    recipe_data['vegan'] = (~recipe_data['vegan']).astype('int64')

In [32]:
recipe_data.head()

Unnamed: 0,name,id,minutes,tags,n_steps,description,ingredients,n_ingredients,GF,vegetarian,vegan
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...",11,autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,0,1,0
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,0,0,0
2,all in the kitchen chili,112140,130,"['time-to-make', 'course', 'preparation', 'mai...",6,this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,0,0,0
3,alouette potatoes,59389,45,"['60-minutes-or-less', 'time-to-make', 'course...",11,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,0,1,0
4,amish tomato ketchup for canning,44061,190,"['weeknight', 'time-to-make', 'course', 'main-...",5,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,0,1,1


In [33]:
recipe_data['vegetarian'].value_counts()

1    134596
0     97041
Name: vegetarian, dtype: int64

In [34]:
recipe_data['vegan'].value_counts()

0    197961
1     33676
Name: vegan, dtype: int64

In [35]:
#Making column names match and merging dfs to classify user diets based on the recipes they've used
recipe_data = recipe_data.rename(columns={'id': 'recipe_id'})

# Left join: every review row is kept and gains the diet flags of its recipe.
user_diets = user_data.merge(recipe_data, how='left', on='recipe_id')



In [36]:
# Per-review table that keeps the rating next to the diet flags (drop returns a new frame).
user_ratings = user_diets.drop(
    ['date', 'review', 'name', 'minutes', 'tags', 'n_steps', 'description', 'ingredients', 'n_ingredients'],
    axis=1,
)
user_ratings.head()

Unnamed: 0,user_id,recipe_id,rating,GF,vegetarian,vegan
0,38094,40893,4,0,0,0
1,1293707,40893,5,0,0,0
2,8937,44394,4,0,1,0
3,126440,85009,5,0,0,0
4,57222,85009,5,0,0,0


In [37]:
# Reduce the merged frame to ids plus the three diet flags.
user_diets = user_diets.drop(
    ['date', 'review', 'rating', 'name', 'minutes', 'tags', 'n_steps', 'description', 'ingredients', 'n_ingredients'],
    axis=1,
)
user_diets.head()

Unnamed: 0,user_id,recipe_id,GF,vegetarian,vegan
0,38094,40893,0,0,0
1,1293707,40893,0,0,0
2,8937,44394,0,1,0
3,126440,85009,0,0,0
4,57222,85009,0,0,0


In [38]:
# Number of review rows per user.
# NOTE(review): user_diets_count is not referenced again in the visible cells; the
# per-user totals used later are computed from the recipe_id lists instead.
user_diets_count = user_diets.groupby(['user_id'])['recipe_id'].count()
user_diets_count.head()

user_id
1533    128
1535    794
1581      1
1634     60
1676     31
Name: recipe_id, dtype: int64

In [39]:
# Per-user totals of the diet flags. Aggregations are named by string because passing
# the builtin `sum` to agg is deprecated in recent pandas releases.
user_diets_full = (user_diets.groupby(['user_id']).agg({'GF': 'sum', 'vegetarian': 'sum', 'vegan': 'sum'}).reset_index())
# NOTE(review): this displays the un-aggregated user_diets, not user_diets_full — confirm which was intended.
user_diets.head()

Unnamed: 0,user_id,recipe_id,GF,vegetarian,vegan
0,38094,40893,0,0,0
1,1293707,40893,0,0,0
2,8937,44394,0,1,0
3,126440,85009,0,0,0
4,57222,85009,0,0,0


In [40]:
# Collapse to one row per user: the list of reviewed recipe ids plus the count of
# GF / vegetarian / vegan recipes. String aggregation names replace the builtin `sum`,
# which is deprecated as an agg argument in recent pandas releases.
# NOTE(review): user_diets is rebound from review-level to user-level here, so this
# cell must not be re-run without rebuilding the review-level frame first.
user_diets = (
    user_diets.groupby(['user_id'])
    .agg({'recipe_id': lambda x: x.tolist(), 'GF': 'sum', 'vegetarian': 'sum', 'vegan': 'sum'})
    .reset_index()
)
user_diets.head()

Unnamed: 0,user_id,recipe_id,GF,vegetarian,vegan
0,1533,"[116345, 32907, 14750, 24136, 63598, 83375, 35...",11,57,8
1,1535,"[349022, 50022, 78834, 47474, 230720, 14111, 8...",20,630,114
2,1581,[341050],0,0,0
3,1634,"[16512, 158215, 34533, 44459, 26212, 168194, 3...",3,35,9
4,1676,"[34233, 99156, 66799, 166273, 250600, 179836, ...",3,4,2


In [41]:
# Placeholder columns for the per-user diet labels; each is overwritten with a
# boolean column in the threshold cells further down.
user_diets['is_vegetarian'] = None
user_diets['is_vegan'] = None
user_diets['is_GF'] = None
user_diets.head()

Unnamed: 0,user_id,recipe_id,GF,vegetarian,vegan,is_vegetarian,is_vegan,is_GF
0,1533,"[116345, 32907, 14750, 24136, 63598, 83375, 35...",11,57,8,,,
1,1535,"[349022, 50022, 78834, 47474, 230720, 14111, 8...",20,630,114,,,
2,1581,[341050],0,0,0,,,
3,1634,"[16512, 158215, 34533, 44459, 26212, 168194, 3...",3,35,9,,,
4,1676,"[34233, 99156, 66799, 166273, 250600, 179836, ...",3,4,2,,,


In [42]:
#count = 0
#for str_list in user_diets['recipe_id']:
#    for name in str_list:
#        if name == "arts":
#            count += 1

#print(count)

In [43]:
# Number of recipes each user reviewed (length of the per-user recipe_id list).
user_diets['recipe_totals'] = user_diets['recipe_id'].map(len)


In [44]:
# A user is labelled vegetarian when at least 75% of their reviewed recipes are flagged vegetarian.
vegetarian_share_met = user_diets['vegetarian'] >= 0.75 * user_diets['recipe_totals']
user_diets['is_vegetarian'] = vegetarian_share_met
user_diets.head()

Unnamed: 0,user_id,recipe_id,GF,vegetarian,vegan,is_vegetarian,is_vegan,is_GF,recipe_totals
0,1533,"[116345, 32907, 14750, 24136, 63598, 83375, 35...",11,57,8,False,,,128
1,1535,"[349022, 50022, 78834, 47474, 230720, 14111, 8...",20,630,114,True,,,794
2,1581,[341050],0,0,0,False,,,1
3,1634,"[16512, 158215, 34533, 44459, 26212, 168194, 3...",3,35,9,False,,,60
4,1676,"[34233, 99156, 66799, 166273, 250600, 179836, ...",3,4,2,False,,,31


In [45]:
user_diets['is_vegetarian'].value_counts()

True     114857
False    111713
Name: is_vegetarian, dtype: int64

In [46]:
# A user is labelled vegan when at least 75% of their reviewed recipes are flagged vegan.
vegan_share_met = user_diets['vegan'] >= 0.75 * user_diets['recipe_totals']
user_diets['is_vegan'] = vegan_share_met
user_diets['is_vegan'].value_counts()

False    206565
True      20005
Name: is_vegan, dtype: int64

In [47]:
# A user is labelled gluten-free when at least 75% of their reviewed recipes are tagged gluten-free.
gf_share_met = user_diets['GF'] >= 0.75 * user_diets['recipe_totals']
user_diets['is_GF'] = gf_share_met
user_diets['is_GF'].value_counts()

False    220978
True       5592
Name: is_GF, dtype: int64

In [48]:
#user_diets_full = user_diets.explode('recipe_id', 'rating')

In [49]:
#user_diets_full.head()

## Modeling

In [50]:
# Keep only (user_id, recipe_id, rating). errors='ignore' keeps the cell re-runnable:
# user_data is rebound here, so a second run would otherwise raise KeyError.
user_data = user_data.drop(columns=['date', 'review'], errors='ignore')

In [64]:
#train-test splitting the data
# Fixed seed so the held-out set, and therefore the reported RMSE, is reproducible
# across kernel restarts (the SVD model below is already seeded with 42).
trainset, testset = train_test_split(user_df, test_size=0.25, random_state=42)

In [65]:
# #from surprise documentation: https://surprise.readthedocs.io/en/stable/FAQ.html
# def get_top_n(predictions, n=3):
    

#     # First map the predictions to each user.
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))

#     # Then sort the predictions for each user and retrieve the k highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n


# # First train an SVD algorithm on the dataset.
# data = Dataset.load_from_df(user_data, reader)
# trainset = data.build_full_trainset()
# algo_ = SVD(n_factors=20, n_epochs=10, reg_all=0.05)
# algo_.fit(trainset)



# # Than predict ratings for all pairs (u, i) that are NOT in the training set.
# testset = trainset.build_anti_testset()
# predictions = algo.test(testset)



In [66]:
# filtered_preds = []
# def diet_prefs(predictions):
#   for prediction in predictions:
#     for user in user_diets:
#       if is_vegetarian == "True":
#         filtered_preds = predictions[prediction-1].drop(prediction-1)
#       elif is_vegan == "True":
#         filtered_preds = predictions[prediction-1].drop(prediction-1)
#       elif is_GF == "True":
#         filtered_preds = predictions[prediction-1].drop(prediction-1)
#       else:
#         filtered_preds.append(prediction)


In [67]:
# top_n = get_top_n(predictions, n=3)

# # Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [70]:
# #BaselineOnly model - (Algorithm predicting the baseline estimate for given user and item.)
# from surprise import BaselineOnly
# cross_validate(BaselineOnly(), user_df, verbose=False)

In [79]:
# benchmark = []
# # Iterate over all algorithms

# algorithms = [SVD(), SVDpp(), NMF(), BaselineOnly()]

# print ("Attempting: ", str(algorithms), '\n\n\n')

# for algorithm in algorithms:
#     print("Starting: " ,str(algorithm))
#     # Perform cross validation
#     results = cross_validate(algorithm, user_df, measures=['RMSE'], cv=3, verbose=False)
#     # results = cross_validate(algorithm, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
#     print("Done: " ,str(algorithm), "\n\n")

# print ('\n\tDONE\n')

In [71]:
#GridSearch to find best parameters for SVD, and how a model with those parameters is expected to perform
# ## Perform a gridsearch with SVD
# # ⏰ This cell may take several minutes to run
# params = {'n_factors': [10, 20,50,75],
#          'reg_all': [0.01, 0.02, 0.05]}
# g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
# g_s_svd.fit(user_df)


In [72]:
# # print out optimal parameters for SVD after GridSearch
# print(g_s_svd.best_score)
# print(g_s_svd.best_params)

In [74]:
# Basic SVD model, used as a baseline for comparisons and to build on.

#bsl_options = {
               #'user_based': False
               #}
# Small, regularised factor model; random_state fixes the SVD initialisation.
svd = SVD(n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.07, random_state=42)
# Alternative setting tried earlier: n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4
svd.fit(trainset)
preds_svd = svd.test(testset)
# accuracy.rmse prints "RMSE: ..." itself and also returns the value, so the
# score appears twice in the cell output.
print(accuracy.rmse(preds_svd))

RMSE: 1.2137
1.2136692012198171


In [None]:
# #use best alg. with ALS (we have a large and sparse matrix ?)

# print('Using ALS')
# bsl_options = {'method': 'als',
#                'n_epochs': 5,
#                'reg_u': 12,
#                'reg_i': 5
#                }
# algo = BaselineOnly(bsl_options=bsl_options)
# cross_validate(algo, recipe_df, measures=['RMSE'], cv=3, verbose=False)


In [30]:
svd.qi.shape


(205475, 10)

In [39]:
svd.pu.shape

(183388, 10)

In [76]:
# U, S, V = np.linalg.svd(a, full_matrices=True)

In [77]:
# #This type of SVD might work with a dataframe that has extra features - the stretching factor would have to account for these (k=3)
# A = ratings_matrix
# # Apply SVD
# u, s, vt = svds(A, k=2) # k is the number of stretching factors

# print ('A:\n', A.toarray())
# print ('=')
# print ('\nU:\n', u)
# print ('\nΣ:\n', s)
# print ('\nV.T:\n', vt)

In [78]:
##to see other param options for suprise's SVD
#svd.__dict__

________

In [80]:
# #for simplicity, renaming column 'recipe_id' to 'item_id'
# items_df = items_df.copy().rename(columns={'recipe_id': 'item_id'})
# item_features_df = item_features_df.copy().rename(columns={'recipe_id': 'item_id'})
# user_item_ratings = user_item_ratings.copy().rename(columns={'recipe_id': 'item_id'})
# items_df.head()

In [81]:
# #This is an alternative way to generate the necessary matrices - doing so by pivoting my df caused the new data to be too large for my computer

# # Create a mapping between user/item IDs and matrix indices
# user_id_to_index = {user_id: index for index, user_id in enumerate(users_df['user_id'])}
# item_id_to_index = {item_id: index for index, item_id in enumerate(items_df['item_id'])}

In [82]:

# # Initialize the ratings matrix
# ratings_matrix = np.zeros((len(users_df), len(items_df)))

# # Fill in the ratings matrix based on user_item_ratings DataFrame
# for _, row in user_item_ratings.iterrows():
#     user_index = user_id_to_index.get(row['user_id'])
#     item_index = item_id_to_index.get(row['item_id'])
#     if user_index is not None and item_index is not None:
#         ratings_matrix[user_index, item_index] = row['rating']

In [53]:
# from surprise.prediction_algorithms import BaselineOnly
# from surprise import accuracy 

# bsl_options = {'method': 'als',
#                'n_epochs': 5,
#                'reg_u': 12,
#                'reg_i': 5
#                }

# algo = BaselineOnly(bsl_options=bsl_options)
# predictions = algo.fit(trainset).test(testset)
# accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.2132


1.2131928381465635

____

In [83]:
# #Trying without surprise
# from scipy.sparse.linalg import svds
# U, sigma, Vt = svds(R_demeaned, k = 50)

In [None]:
#doesn't work, can't pivot df before this step which is necessary

# for col in user_ratings:
#     mean = user_ratings[col].mean()
#     user_ratings[col] = user_ratings[col].fillna(value=mean)
# user_ratings.head()

In [26]:
# import numpy as np
# import datetime

In [84]:
# # ⏰ Expect this cell to take several minutes to run
# start = datetime.datetime.now()
# user_matrix = []
# for i, row in enumerate(user_ratings.index):
#     u1 = user_ratings[row]
#     # Matrix is symetric, so fill in values for previously examined users
#     user_distances = [entry[i] for entry in user_matrix] 
#     for j, row2 in enumerate(user_ratings.index[i:]):
#         u2 = user_ratings[row2]
#         d = distance(u1,u2)
#         user_distances.append(d)
#     user_matrix.append(user_distances)
# user_similarities = pd.DataFrame(user_matrix)

# end = datetime.datetime.now()
# elapsed = end - start
# print(elapsed)

# user_similarities.head()

In [None]:
#function from rec. systems lesson, removes previously rated items and orders items for recommendations based on rating (high to low)
#based on avg of closest users ratings
def recommend_recipes(user, user_similarities, user_ratings, df, n_users=20, n_items=10):
    """Recommend unrated items for `user` from the average ratings of the closest users.

    Parameters
    ----------
    user : int
        Id of the user to recommend for.
    user_similarities : pandas.DataFrame
        Square user-to-user distance table (smaller = closer) with 0-based row and
        column labels.
        NOTE(review): the `user - 1` offset assumes user ids are consecutive integers
        starting at 1; the ids in this notebook (e.g. 38094) would need remapping first.
    user_ratings : pandas.DataFrame
        Ratings with user ids as the index and item ids as the columns.
    df : pandas.DataFrame
        Long-format interactions with `user_id` and `item_id` columns.
    n_users : int
        Number of nearest users whose ratings are averaged.
    n_items : int
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Mean neighbour rating per unrated item id, highest first, at most n_items long.
    """
    # User_Similarities Offset By 1 and Must Remove Current User
    user_pos = user - 1
    nearest = user_similarities[user_pos].drop(user_pos).sort_values().index[:n_users]
    # Again, fixing the offset of user_ids
    top_n_similar_users = [i + 1 for i in nearest]
    # Exclude what THIS user already rated (the id was previously hard-coded to 0).
    already_rated = set(df.loc[df['user_id'] == user, 'item_id'].unique())
    unrated = set(df['item_id'].unique()) - already_rated
    neighbour_means = user_ratings[user_ratings.index.isin(top_n_similar_users)].mean()
    projected_user_ratings = neighbour_means.loc[list(unrated)].sort_values(ascending=False)
    return projected_user_ratings.iloc[:n_items]

In [None]:
# The function defined above is recommend_recipes (recommend_movies does not exist in this notebook).
# NOTE(review): user_similarities and df are only built in commented-out cells, so
# this call still needs those inputs before it can run.
recommend_recipes(1, user_similarities, user_ratings, df)

In [None]:
# Print the titles of the top n recommendations from a ranked list.
def recommended_recipes(user_ratings, movie_title_df, n, id_col='movieId', title_col='title'):
    """Print the titles of the first `n` recommendations.

    Parameters
    ----------
    user_ratings : iterable
        Ranked recommendations; element [0] of each entry is the item id.
    movie_title_df : pandas.DataFrame
        Lookup table with an id column and a title column.
    n : int
        Maximum number of recommendations to print; nothing is printed when n <= 0.
    id_col, title_col : str
        Column names in `movie_title_df`. The defaults keep the original
        'movieId' / 'title' schema.
    """
    for idx, rec in enumerate(user_ratings):
        if idx >= n:
            break
        matches = movie_title_df.loc[movie_title_df[id_col] == int(rec[0]), title_col]
        # Print the title itself rather than the one-row Series repr; None when the id is unknown.
        title = matches.iloc[0] if not matches.empty else None
        print('Recommendation #', idx+1, ': ', title, '\n')
                
# The function defined above is recommended_recipes (recommended_movies does not exist in this notebook).
# NOTE(review): ranked_movies and df_movies are not defined in the visible cells — confirm their source.
recommended_recipes(ranked_movies, df_movies, 5)

Limitations:
- A person can be multiple 'diet types' at once
- Ingredients are matched by keyword, so words like "vegetarian beef" are misclassified

Next steps to try:
- get matrix using dataframe.to_numpy...
- need to first figure out user_diets


Once the goal is achieved, to do:
- gridsearch for more params
- train, test, and validate sets?
- ALS vs SGD