In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [107]:
reviews = pd.read_pickle('../data/revdense.pk')
keywords =  pd.read_pickle('../data/keywords.pk')
recipe_ids = list(set(keywords.index).intersection(set(reviews.RecipeId.values)))
keywords = keywords[keywords.index.isin(recipe_ids)]
reviews = reviews[reviews['RecipeId'].isin(recipe_ids)]
reviews['Keywords'] = [keywords['Keywords'][recipe] for recipe in reviews.RecipeId]
reviews['KeywordsTitle'] = [keywords['KeywordsTitle'][recipe] for recipe in reviews.RecipeId]


In [108]:
reviews

Unnamed: 0_level_0,UserId,RecipeId,Rating,Keywords,KeywordsTitle
ReviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
829,7802,8782,4,"[ham, whole chicken, poultry, roast, chicken, ...","[roast, chicken]"
1179,9869,8754,4,"[for large groups, peanut butter, cookie & bro...","[sweet, peanut butter]"
1399,6357,9841,5,"[dessert, cake, easy, oven]",[cake]
1642,11297,5478,4,"[casserole, potato, rice, healthy, poultry, ch...","[chicken, easy]"
1771,4470,8739,5,"[refrigerator, mexican, tex mex, southwestern ...",[]
...,...,...,...,...,...
2090011,395981,116266,5,"[for large groups, potluck, chicken, meat, spi...",[chicken]
2090051,137477,12591,5,"[for large groups, healthy, cake, dessert, oven]",[cake]
2090149,3288,261889,5,"[for large groups, kid friendly, christmas, re...",[]
2090199,128473,18458,5,"[beverages, russian, healthy, weeknight, easy]",[]


In [109]:
keywords_w_rep = []
for kws in keywords.Keywords:
    keywords_w_rep+=list(kws)

keyword_counts = {word:0 for word in set(keywords_w_rep)}
for word in keywords_w_rep:
    keyword_counts[word]+=1

keyword_list_master = list(keyword_counts.keys())
keyword_index = {keyword_list_master[i]:i for 
                 i in range(len(keyword_list_master))}

def kwlist_to_vec(keywordlist:list)->list:
    vec = np.zeros(len(keyword_list_master))
    for w in keywordlist:
        vec[keyword_index[w]]+=1
    return vec

recipe_kwvecs = {}
for recipe in keywords.index:
    recipe_kwvecs[recipe]=kwlist_to_vec(keywords['Keywords'][recipe])

recipe_mean_vec = sum(list(recipe_kwvecs.values()))/len(recipe_kwvecs)


In [110]:
recipe_kwvecs_norm = {recipe:recipe_kwvecs[recipe]-recipe_mean_vec for recipe in recipe_kwvecs}
for recipe in recipe_kwvecs_norm:
    recipe_kwvecs_norm[recipe] = recipe_kwvecs_norm[recipe]/np.linalg.norm(recipe_kwvecs_norm[recipe])

## User Embedding

We can obtain an embedding of the user space to the keyword vector space by taking the average of the recipes reviewed by that user.

However, since this will be the basis of our subsequent predictions, we will do a train test split and compute the average using only recipes in the training set.

In [5]:
from sklearn.model_selection import train_test_split

rev_train, rev_test = train_test_split(reviews,test_size=0.2,stratify=reviews['UserId'])

In [7]:
user_mean_vecs = {}
for user in rev_train.UserId.unique():
    uservecs = [kwlist_to_vec(kwl) for kwl in rev_train.loc[reviews.UserId==user].Keywords]
    user_mean_vecs[user] = sum(uservecs)/len(uservecs)

user_vecs_transl = {user:user_mean_vecs[user]-recipe_mean_vec for user in user_mean_vecs}
user_vt_lens ={}
for user in user_vecs_transl:
    uvec = user_vecs_transl[user]
    user_vt_lens[user] = np.sqrt(np.dot(uvec,uvec))
user_vecs_norm = {user: user_vecs_transl[user]/user_vt_lens[user] for user in user_vt_lens}


In [8]:
user_list = list(user_mean_vecs.keys())
recipe_list = list(recipe_kwvecs.keys())

In [31]:
user_dist_all = {user:{recipe:np.linalg.norm(user_mean_vecs[user]-recipe_kwvecs[recipe]) for recipe in recipe_list} for user in user_list}
user_dot_all = {user:{recipe:np.dot(user_vecs_norm[user],recipe_kwvecs_norm[recipe]) for recipe in recipe_list} for user in user_list}


mean_dot_all = {user:np.mean(list(user_dot_all[user].values())) for user in user_dist_all}
user_dot_train = {}
user_dot_test = {}
mean_dot_train = {}
mean_dot_test = {}
for user in user_list:
    user_recs_train = rev_train.loc[rev_train.UserId==user]['RecipeId'].values
    user_recs_test = rev_test.loc[rev_test.UserId==user]['RecipeId'].values
    user_dot_train[user] = [user_dot_all[user][recipe] for recipe in user_recs_train]
    user_dot_test[user] = [user_dot_all[user][recipe] for recipe in user_recs_test]
    mean_dot_train[user] = np.mean(user_dot_train[user])
    mean_dot_test[user] = np.mean(user_dot_test[user])

        

In [22]:
mean_dist_all = {user:np.mean(list(user_dist_all[user].values())) for user in user_dist_all}

user_dist_train = {}
user_dist_test = {}
mean_dist_train = {}
mean_dist_test = {}
for user in user_list:
    user_recs_train = rev_train.loc[rev_train.UserId==user]['RecipeId'].values
    user_recs_test = rev_test.loc[rev_test.UserId==user]['RecipeId'].values
    user_dist_train[user] = [user_dist_all[user][recipe] for recipe in user_recs_train]
    user_dist_test[user] = [user_dist_all[user][recipe] for recipe in user_recs_test]
    mean_dist_train[user] = np.mean(user_dist_train[user])
    mean_dist_test[user] = np.mean(user_dist_test[user])

        

## User similarity
We can use these embeddings to give a measure of similarity between users.

In [35]:
uservecs = np.array([user_vecs_norm[user] for user in user_list])


In [41]:
usersimmat = np.matmul(uservecs,np.transpose(uservecs))
usersimdic = {(user_list[i],user_list[j]):usersimmat[i][j] for i in range(len(user_list)) for j in range(len(user_list))}

In [43]:
rev_train

Unnamed: 0_level_0,UserId,RecipeId,Rating,Keywords,KeywordsTitle
ReviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1498511,70073,26210,5,"[potato, kosher, vegetable, roast, easy]","[roast, potato]"
552119,280271,132765,5,"[chinese, italian, microwave, small appliance,...","[microwave, lemon]"
148491,156653,60119,5,"[deep fried, potato, healthy, vegetable, easy]",[]
1052473,542159,102909,5,"[kid friendly, egg, lunch/snacks, fried, break...","[egg, simple, fried]"
669335,262312,82102,5,"[sauce, poultry, fried, chicken breast, oven, ...","[chicken breast, fried, chicken, oven]"
...,...,...,...,...,...
1257243,190230,74844,0,"[rice, sauce, japanese, sweet, chicken, summer...",[]
392437,424680,203261,5,"[breads, healthy, easy, simple, indian, asian]",[]
509967,546010,132052,4,"[lunch/snacks, breakfast, camping, easy, begin...",[]
22734,12118,16702,5,"[brunch, russian, vegetable, cake, grains, bre...","[pancake, cake]"


In [48]:
recipe_users_train = {recipe:[] for recipe in recipe_list}
user_recipes_train = {user:[] for user in user_list}
for review in rev_train.index:
    user = rev_train['UserId'][review]
    recipe = rev_train['RecipeId'][review]
    recipe_users_train[recipe].append(user)
    user_recipes_train[user].append(recipe)


In [54]:
recipe_list.sort(key = lambda x:len(recipe_users_train[x]),reverse=True)
popularity_ranking = {recipe_list[i]:i+1 for i in range(len(recipe_list))}

In [52]:
user_recipes_test = {user:[] for user in user_list}
for review in rev_test.index:
    user = rev_test['UserId'][review]
    recipe = rev_test['RecipeId'][review]
    user_recipes_test[user].append(recipe)

In [50]:
user_recipe_scores = {}
for user in user_list:
    user_scores = {}
    for recipe in recipe_list:
        recipe_userlist = recipe_users_train[recipe]
        user_scores[recipe] = sum([usersimdic[(user,user2)] for user2 in recipe_userlist])
    user_recipe_scores[user]=user_scores

In [51]:
def scores_to_rankings(rankings:dict)->dict:
    recs = list(rankings.keys())
    recs.sort(key = lambda x:rankings[x],reverse=True)
    return {recs[i]:i+1 for i in range(len(recs))}



In [102]:
user_ranking_data = {'UserId':[],'RankingsPopTest':[],'RankingsPopTrain':[],'RankingsRecTest':[],'RankingsRecTrain':[]}

for user in user_list:
    user_ranking_data['UserId'].append(user)
    user_ranking_data['RankingsPopTest'].append(1/np.mean([popularity_ranking[recipe] for 
                                                 recipe in user_recipes_test[user]]))
    user_ranking_data['RankingsPopTrain'].append(1/np.mean([popularity_ranking[recipe] for 
                                                 recipe in user_recipes_train[user]]))
    user_rec_ranking = scores_to_rankings(user_recipe_scores[user])
    user_ranking_data['RankingsRecTest'].append(1/np.mean([user_rec_ranking[recipe] for 
                                                 recipe in user_recipes_test[user]]))
    user_ranking_data['RankingsRecTrain'].append(1/np.mean([user_rec_ranking[recipe] for 
                                                 recipe in user_recipes_train[user]]))
user_ranking_df = pd.DataFrame(user_ranking_data)
user_ranking_df=user_ranking_df.set_index('UserId')

In [89]:
def train_sign(user):
    return (user_ranking_df.RankingsPopTrain-user_ranking_df.RankingsRecTrain)[user]

In [90]:
def hybrid0(user):
    if train_sign(user)>= 0:
        return 'Pop'
    else:
        return 'Rec'

def hybrid1(user):
    if train_sign(user)> 0:
        return 'Pop'
    else:
        return 'Rec'

In [104]:
user_ranking_df['RankingsPopTest'].mean()

0.0014052434016194578

In [105]:
user_ranking_df['RankingsRecTest'].mean()

0.0014179424965778597

In [None]:
user_ranking_df['RankingsPopTest'].mean()

In [95]:
user_ranking_df['Hybrid0']=[user_ranking_df['Rankings'+hybrid0(user)+'Test'][user] for user in user_ranking_df.index]
user_ranking_df['Hybrid1']=[user_ranking_df['Rankings'+hybrid1(user)+'Test'][user] for user in user_ranking_df.index]

In [101]:
user_ranking_df['RankingsPopTest'].mean()

0.1260214044856628

In [65]:
user_ranking_df['Dist_from_mean']=[user_vt_lens[user] for user in user_ranking_df.index]
user_ranking_df['No_revs_tr']=[len(user_recipes_train[user]) for user in user_ranking_df.index]

In [83]:
user_ranking_df.loc[user_ranking_df.RankingsRecTest>user_ranking_df.RankingsPopTrain]['Dist_from_mean'].mean()

0.567642923611505

In [84]:
user_ranking_df.loc[user_ranking_df.RankingsRecTest<user_ranking_df.RankingsPopTrain]['Dist_from_mean'].mean()

0.5481869551720925

In [111]:
len(keyword_list_master)

273

In [112]:
keyword_list_master

['african',
 'scottish',
 'chicken thigh & leg',
 'toddler friendly',
 'meatballs',
 'deep fried',
 'no shell fish',
 'south african',
 'sweet',
 'salad',
 'easy',
 'italian',
 'southwest asia (middle east)',
 'squid',
 'tuna',
 'hawaiian',
 'filipino',
 'medium grain rice',
 'steam',
 'canadian',
 'pork',
 'pressure cooker',
 'camping',
 'lobster',
 'danish',
 'manicotti',
 'kosher',
 'costa rican',
 'moose',
 'indian',
 'moroccan',
 'christmas',
 'winter',
 'savory pies',
 'lime',
 'russian',
 'curries',
 'szechuan',
 'citrus',
 'greens',
 'household cleaner',
 'belgian',
 '< 4 hours',
 'pasta',
 'south american',
 'vegetable',
 'macaroni and cheese',
 'homeopathy/remedies',
 'beans',
 'trout',
 'new zealand',
 'yam/sweet potato',
 'lentil',
 'lamb/sheep',
 'oven',
 'french toast',
 'stir fry',
 'strawberry',
 'mashed potatoes',
 'for large groups',
 'cranberry sauce',
 'inexpensive',
 'meatloaf',
 'australian',
 'wild game',
 'no cook',
 'pennsylvania dutch',
 'baked beans',
 'lemon