In [3]:
import pandas as pd
import numpy as np

# Collaborative filtering

We're going to construct a model (i.e. we're going to find embeddings of users and recipes into a common vector space) using only the reviews data.

In [4]:
reviews = pd.read_pickle('../data/reviewsred.pk')

We start by getting rid of unpopular recipes/users so that our "affinity matrix" is reasonably dense. This will help ensure a few things:
* We can do a train-test split stratified by users (if a user has only 1 recipe, we can't take a sample of their recipes).
* This ensures we have a good amount of data for each user/recipe.
* This will hopefully ensure that every recipe appears in the training set - we can't stratify by both user and recipe.

In [5]:
rev_df = (reviews.groupby('UserId').filter(lambda x: len(x) > 19))
rev_df = (rev_df.groupby('RecipeId').filter(lambda x: len(x) > 19))
rev_df = (rev_df.groupby('UserId').filter(lambda x: len(x) > 19)).copy()

We now do a train test split on the dense dataframe.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
rev_train, rev_test = train_test_split(rev_df,test_size=0.2,random_state=193,stratify = rev_df.UserId)

In [11]:
print(str(len(rev_df.RecipeId.unique()))+" recipes in original set")
print(str(len(rev_train.RecipeId.unique()))+" recipes in training set")

5610 recipes in original set
5610 recipes in training set


This is good: every recipe appears in our training set, so we will be able to make predictions

## Obtaining the Affinity Matrix

Now that we have our training data frame, we will construct a matrix that records whether a user has reviewed a recipe or not.
*The rows represent users.
*The columns represent recipes.
*The (u,r) entry is 1 if there is a review for recipe r by user u in the training set, and 0 otherwise.

In [12]:
rev_train['InTraining'] = 1
mat_df_train = pd.pivot_table(data=rev_train,index='UserId',columns='RecipeId',values='InTraining',fill_value = 0)
user_ids_ordered = list(mat_df_train.index)
recipe_ids_ordered = list(mat_df_train.columns)
user_id_to_index = {user_ids_ordered[i]:i for i in range(len(user_ids_ordered))}
recipe_id_to_index = {recipe_ids_ordered[i]:i for i in range(len(recipe_ids_ordered))}
mat_train = mat_df_train.values



We will subtract off the mean of each row before computing the matrix factorization, and add it back at the end.

In [13]:
user_mean_mat = mat_train-[r-np.mean(r) for r in mat_train]

We compute the singular value decomposition of the matrix with zero-mean rows.

In [14]:
U, D, Vh = np.linalg.svd([r-np.mean(r) for r in mat_train],full_matrices=False)


The singular value decomposition gives us a factorization of our matrix as $M = U_0 DV^{tr}$; we can instead think of this as a factorization $UR^{tr}$ where $U$ represents the user embedding and $R$ represents the recipe embedding, by setting $U = U_0\sqrt{D}$ and $R^{tr} = \sqrt{D}V^{tr}$.

In [15]:
rtdiag = np.diag([np.sqrt(s) for s in D])
U_mat = np.matmul(U,rtdiag)
R_mat = np.matmul(rtdiag,Vh)

The original factorization is "too closely" fitted to the data in the training set: any user, recipe pair which do not appear in the training set will be orthogonal. By dropping the least significant values, we can obtain a coarser embedding that will hopefully allow us to predict which recipes are in the test set.

In [16]:
def set_embedding_rank(umat,rmat,rank):
    newrmat = rmat[:rank]
    newumat = np.transpose(np.transpose(umat)[:rank])
    return newumat,newrmat


In [17]:
U_mat_256, R_mat_256 = set_embedding_rank(U_mat,R_mat,256)
U_mat_16, R_mat_16 = set_embedding_rank(U_mat,R_mat,16)
U_mat_6, R_mat_6 = set_embedding_rank(U_mat,R_mat,6)

In [19]:
np.transpose(R_mat_6)[0]

array([ 0.02955606, -0.10787489,  0.04506531, -0.06582352, -0.09728826,
       -0.06906386])

In [20]:
np.linalg.norm(mat_train-user_mean_mat-np.matmul(U_mat,R_mat))

3.758865785055931e-12

In [22]:
np.linalg.norm(mat_train-user_mean_mat-np.matmul(U_mat_256,R_mat_256))

313.94958116372817

In [21]:
np.linalg.norm(mat_train-user_mean_mat-np.matmul(U_mat_16,R_mat_16))

390.40277112120674

In [23]:
np.linalg.norm(mat_train-user_mean_mat-np.matmul(U_mat_6,R_mat_6))

396.63049888036676

In [24]:
def getdics(df,usercol,itemcol):
    user_to_recs = {user:[] for user in set(df[usercol].values)}
    rec_to_users= {rec:[] for rec in set(df[itemcol].values)}
    for rev in df.index:
        user = df[usercol][rev]
        rec = df[itemcol][rev]
        user_to_recs[user].append(rec)
        rec_to_users[rec].append(user)
    return user_to_recs,rec_to_users

user_to_recs_train,rec_to_users_train = getdics(rev_train,'UserId','RecipeId')
user_to_recs_test,rec_to_users_test = getdics(rev_test,'UserId','RecipeId')

In [46]:
def check_user_scores(user,reclist,mat):
    i = user_id_to_index[user]
    rec_indices = [recipe_id_to_index[rec] for rec in reclist]
    return [mat[i][j] for j in rec_indices]

def get_score_averages_user(user,mat):
    user_train_recs = np.mean(check_user_scores(user,user_to_recs_train[user],mat))
    user_test_recs = np.mean(check_user_scores(user,user_to_recs_test[user],mat))
    other_recs = [rec for rec in rec_to_users_train if rec not in user_to_recs_test[user]+user_to_recs_train[user]]
    user_other_recs = np.mean(check_user_scores(user,other_recs,mat))
    return {'train':user_train_recs,'test':user_test_recs,'other':user_other_recs}

def get_score_averages(mat):
    users = []
    trainmean = []
    testmean = []
    othermean = []
    for user in user_to_recs_train:
        users.append(user)
        scores = get_score_averages_user(user,mat)
        trainmean.append(scores['train'])
        testmean.append(scores['test'])
        othermean.append(scores['other'])
    df = pd.DataFrame({'UserId':users,'TrainMean':trainmean,'TestMean':testmean,'Other':othermean})
    return df.set_index('UserId')

def get_score_average_summary(mat):
    users = []
    trainmean = []
    testmean = []
    othermean = []
    for user in user_to_recs_train:
        users.append(user)
        scores = get_score_averages_user(user,mat)
        trainmean.append(scores['train'])
        testmean.append(scores['test'])
        othermean.append(scores['other'])
    return {'train':np.mean(trainmean),'test':np.mean(testmean),'other':np.mean(othermean)}
    

In [50]:
score_summary = {'rank':[],'train':[],'test':[],'other':[]}
for r in range(4,64):
    umat_r,rmat_r = set_embedding_rank(U_mat,R_mat,r)
    aff_r = np.matmul(umat_r,rmat_r)
    summ_r = get_score_average_summary(aff_r)
    score_summary['rank'].append(r)
    for cat in summ_r:
        score_summary[cat].append(summ_r[cat])
    if r % 10 == 0:
        print(str(r)+' finished')

score_summary_df = pd.DataFrame(score_summary)
score_summary_df.set_index('rank',inplace = True)


10 finished
20 finished


KeyboardInterrupt: 

It looks like $r = 6$ is a good choice.

In [51]:

score_summary_df = pd.DataFrame(score_summary)
score_summary_df.set_index('rank',inplace = True)


In [52]:
score_summary_df

Unnamed: 0_level_0,train,test,other
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.017405,0.013206,-0.000356
5,0.018681,0.013699,-0.000393
6,0.020103,0.014219,-0.000428
7,0.022659,0.014266,-0.00046
8,0.024032,0.014611,-0.000489
9,0.025652,0.014633,-0.000516
10,0.027056,0.014892,-0.000544
11,0.028018,0.014901,-0.00057
12,0.029242,0.015084,-0.000595
13,0.031128,0.015168,-0.00062


In [65]:
userscores_r6_df.UserCount.mean()

46.989901823281905

In [25]:
keywords_df = pd.read_pickle('../data/keywords.pk')


In [26]:
rec_names = {recipe:keywords_df['Name'][recipe] for recipe in recipe_ids_ordered}

def rank_recipes_dot(vec,mat):
    rl = list(rec_names.keys())
    rl.sort(key = lambda x:np.dot(vec,mat[recipe_id_to_index[x]]),reverse=True)
    return [rec_names[rec] for rec in rl]

def rank_recipes_dot_fromRecId(recid,mat):
    vec = mat[recipe_id_to_index[recid]]
    return rank_recipes_dot(vec,mat)

In [44]:
rank_recipes_dot_fromRecId(recipe_ids_ordered[20],np.transpose(R_mat))

['yorkshire pudding popovers',
 'corn fritters',
 'sausage rolls',
 'crispy potato bites',
 'chicken makhani (indian butter chicken)',
 'arabian pita bread',
 'pesto pastry stars',
 "minado's perfect sushi rice",
 'award-winning butter tarts',
 'prosciutto wrapped chicken',
 'broccoli salad',
 'remarkable rhubarb bites',
 'baby bok choy with garlic',
 'low carb lemon "cheesecake" bars',
 'the best marinade for kabobs! (beef, pork and lamb)',
 'giant soft pretzels',
 'pasta rustica',
 'chicken with mushrooms and mustard',
 'mexican flan (baked caramel custard)',
 'pork tenderloin',
 'steak marinade, quick and easy',
 'caramelised steak',
 'mocha hot chocolate',
 'garlic mashed potatoes',
 'easy weeknight sloppy joes',
 'perfect chocolate brownies',
 "tyler florence's ultimate fish and chips",
 'delicious roasted beef ribs',
 'make ahead mashed potato casserole',
 'savoury couscous',
 'mahogany beef stew',
 'family favorite sesame chicken',
 'chinese beef with broccoli',
 'best marinated

In [45]:
rank_recipes_dot_fromRecId(recipe_ids_ordered[20],np.transpose(R_mat_6))

['spinach salad with mustard-bacon dressing',
 'classic bistro style gratin dauphinois - french gratin potatoes',
 'mediterranean chicken breasts',
 'coconut red lentils with spinach, cashews &amp; lime (vegan)',
 'coconut chicken fingers with 30 minute mango chutney',
 'crusty sourdough rye bread',
 'chocolate chip or m &amp; m oatmeal cookies',
 "carrabba's chicken marsala",
 'butterscotch apple pecan cobbler',
 'peppered buffalo ranch shrimp pizza #rsc',
 "kelly's french silk chocolate pie",
 'aloo gobi - cauliflower and potatoes',
 'classic san francisco sourdough bread',
 'remoulade sauce',
 'grilled greek chicken breasts',
 'jamaican lentil stew with coconut',
 'authentic no refrigeration bakery frosting/icing',
 'rosemary and lemon pork stew',
 'chocolate banana chocolate chip bundt cake',
 'brisket',
 'crustless swiss chard quiche',
 'ground beef burritos',
 'fiesta chicken with jalapeno cilantro creme',
 'healthy fish tacos with chipotle cream',
 'homemade yogurt',
 'chile rel