In [197]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [182]:
revdf = pd.read_pickle('data/revdense.pk')
author_ids = list(revdf.AuthorId.unique())
recipe_ids = list(revdf.RecipeId.unique())

In [183]:
ratings_by_author = {author_id:{} for author_id in author_ids}
ratings_by_recipe = {recipe_id:{} for recipe_id in recipe_ids}

for review_index in revdf.index:
    author_id = revdf['AuthorId'][review_index]
    recipe_id = revdf['RecipeId'][review_index]
    rating = revdf['Rating'][review_index]
    ratings_by_author[author_id][recipe_id] = rating
    ratings_by_recipe[recipe_id][author_id] = rating



In [194]:
revdf['NRating'] = [revdf['Rating'][i]-np.mean(list(ratings_by_author[revdf['AuthorId'][i]].values())) for i in revdf.index]

In [201]:
revtraindf, revtestdf = train_test_split(revdf,test_size=0.2, random_state=1729,stratify = revdf.AuthorId.values)

We're going to use our training set to construct a model.

First, we organize the data in the training set by author/recipe to make it easy to retrieve things later.

In [216]:
ratings_by_author = {author_id:{} for author_id in author_ids}
reviewers_by_recipe = {recipe_id:[] for recipe_id in revtraindf.RecipeId.unique()}
for review in revtraindf.index:
    author_id = revtraindf['AuthorId'][review]
    recipe_id = revtraindf['RecipeId'][review]
    ratings_by_author[author_id][recipe_id] = revtraindf['Rating'][review]
    reviewers_by_recipe[recipe_id].append(author_id)


In [218]:
authors0std = [author_id for author_id in ratings_by_author if np.std(list(ratings_by_author[author_id].values())) == 0]

In [221]:
def rating_sim(ratings1,ratings2):
    common_recipes = set(ratings1.keys()).intersection(set(ratings2.keys()))
    if len(common_recipes) == 0:
        return 0
    std1 = np.std(list(ratings1.values()))
    std2 = np.std(list(ratings2.values()))
    if min(std1,std2)== 0:
        return 0
    mean1 = np.mean(list(ratings1.values()))
    mean2 = np.mean(list(ratings2.values()))
    len1 = 0
    len2 = 0
    dot = 0
    for recipe_id in common_recipes:
        norm_rating1 = ratings1[recipe_id] - mean1
        norm_rating2 = ratings2[recipe_id] - mean2
        dot+=norm_rating1*norm_rating2
        len1+=norm_rating1**2
        len2+=norm_rating2**2
    if min(len1,len2)==0:
        return 0
    else:
        return dot/np.sqrt(len1*len2)

In [222]:
similarity_matrix = {}
similarity_by_author = {author_id:{} for author_id in author_ids}
for i in range(len(author_ids)-1):
    author_id1 = author_ids[i]
    ratings1 = ratings_by_author[author_id1]
    recs1 = set(ratings1.keys())
    for j in range(i+1,len(author_ids)):
        author_id2 = author_ids[j]
        ratings2 = ratings_by_author[author_id2]
        recs2 = set(ratings2.keys())
        if len(recs1.intersection(recs2))>0:
            sim = rating_sim(ratings1,ratings2)
            similarity_matrix[(i,j)] = sim
            similarity_by_author[author_id1][author_id2] = sim
            similarity_by_author[author_id2][author_id1] = sim

In [225]:
mean_ratings = {author_id:np.mean(list(ratings_by_author[author_id].values()))
                for author_id in ratings_by_author}
std_ratings = {author_id:np.std(list(ratings_by_author[author_id].values()))
                for author_id in ratings_by_author}

In [232]:
def predict_rating(pair):
    author_id = pair[0]
    author_sims = similarity_by_author[author_id]
    author_ratings = ratings_by_author[author_id]
    author_recipes = set(author_ratings.keys())
    recipe_id = pair[1]
    if recipe_id not in reviewers_by_recipe:
        return 0
    other_reviewers = reviewers_by_recipe[recipe_id]
    other_reviewers_comparable = [author_id1 for author_id1 in other_reviewers if author_id1 in author_sims]
    cumulative_info = 0
    for author_id1 in other_reviewers_comparable:
        similarity = author_sims[author_id1]
        rating = ratings_by_author[author_id1][recipe_id]-mean_ratings[author_id1]
        no_common_recipes = len(set(ratings_by_author[author_id1].keys()).intersection(author_recipes))
        cumulative_info+=rating*similarity*no_common_recipes
    if cumulative_info > 0:
        return 1
    elif cumulative_info < 0:
        return -1

In [233]:
revtestdf['Pred'] = [predict_rating(pair) for pair in revtestdf.index]

In [242]:
revtestdf.loc[revtestdf.Pred==1]

Unnamed: 0_level_0,AuthorId,RecipeId,Rating,NRating,Pred
Pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(27678, 26068)",27678,13549,5,0.215909,1.0
"(100980, 100594)",100980,8782,4,-0.266667,1.0
"(129333, 101041)",129333,50737,0,-4.647059,1.0
"(32772, 29184)",32772,13948,0,-4.895105,1.0
"(5060, 9327)",5060,12591,5,0.224044,1.0
"(29014, 112725)",29014,88453,5,0.16129,1.0
"(55729, 69173)",55729,23959,4,0.51751,1.0
"(27678, 87879)",27678,13126,5,0.215909,1.0
"(84657, 56965)",84657,14911,0,-4.575758,1.0
"(6357, 15229)",6357,26276,5,0.16,1.0


In [133]:
revdf_dense['No_revs_auth'] = [list(ratings_by_recipe[recipe_id].keys()) for recipe_id in revdf_dense['RecipeId'].values]
revdf_dense['No_reviews_rec'] = [len(ratings_by_recipe[recipe_id].keys()) for recipe_id in revdf_dense['RecipeId'].values]

Unnamed: 0_level_0,ReviewId,RecipeId,AuthorId,Rating,All_reviewers,No_reviews_rec
Pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(1773, 13)",13,7435,1773,5,"[1773, 16346, 10649, 35414, 68535, 76715, 2665...",32
"(2046, 19)",19,13307,2046,5,"[2046, 11065, 35377, 43505, 54840, 61105, 3161...",95
"(2046, 23)",23,4684,2046,5,"[2046, 12882, 37636, 54697, 37779, 126440, 160...",31
"(3228, 102)",102,5297,3228,4,"[3228, 15820, 30854, 109983, 108954, 19051, 53...",51
"(3385, 125)",125,3470,3385,1,"[3385, 99979, 152855, 55205, 136147, 124614, 1...",251
...,...,...,...,...,...,...
"(2002480537, 2090279)",2090279,53605,2002480537,5,"[49304, 27783, 95743, 102602, 64322, 153067, 5...",32
"(843817, 2090280)",2090280,73866,843817,5,"[54678, 54697, 58300, 12481, 72254, 117565, 53...",126
"(2038463, 2090287)",2090287,26370,2038463,0,"[37036, 42503, 52381, 47559, 58836, 6357, 5864...",225
"(2002871210, 2090291)",2090291,80156,2002871210,0,"[58300, 64583, 63630, 115658, 104600, 50441, 5...",214


In [152]:
ratings_by_author_dense = {author_id:{} for author_id in revdf_dense.AuthorId.unique()}

for review_index in revdf_dense.index:
    author_id = revdf_dense['AuthorId'][review_index]
    recipe_id = revdf_dense['RecipeId'][review_index]
    rating = revdf_dense['Rating'][review_index]
    ratings_by_author_dense[author_id][recipe_id] = rating

auth_dense_nzstd = [author_id for author_id in ratings_by_author_dense
                    if np.std(list(ratings_by_author_dense[author_id].values())) > 0]

revdf_dense = (revdf_dense.T[[pair for pair in revdf_dense.index if pair[0] in auth_dense_nzstd]].T).copy()

In [165]:
auth_dense_test = [author_id for author_id in ratings_by_author_dense
                    if len(list(ratings_by_author_dense[author_id].values())) > 10]

In [166]:
revdf_dense_test = (revdf_dense.T[[pair for pair in revdf_dense.index if pair[0] in auth_dense_test]].T)

In [163]:
[len(revdf_dense_test.AuthorId.unique()),len(revdf_dense_test.RecipeId.unique()),len(revdf_dense_test)]

[17713, 4094, 256498]

In [167]:
[len(revdf_dense_test.AuthorId.unique()),len(revdf_dense_test.RecipeId.unique()),len(revdf_dense_test)]

[5472, 4093, 204045]

In [181]:
revdf_dense.to_pickle('data/revdense.pk')

In [45]:
authors30 = [author_id for author_id in authorcounts30 
                      if authorcounts30[author_id]>1]

4094

In [104]:
def get_dense(a_dic,r_dic,a_min,r_min,d_max):
    items = set([item for item in r_dic if len(r_dic[item]) >= r_min])
    users = set([user for user in a_dic if len(set(a_dic[user].keys()).intersection(items)) >= a_min])
    count = 1
    while True:
        new_items = set([item for item in r_dic if len(set(r_dic[item].keys()).intersection(users)) >= r_min])
        new_users = set([user for user in a_dic if len(set(a_dic[user].keys()).intersection(new_items)) >= a_min])
        count+=1
        if (len(new_items) == len(items) and len(new_users) == len(users)) or count >= d_max:
            return [new_users,new_items]
        items = new_items
        users = new_users

In [113]:
test = get_dense(ratings_by_author,ratings_by_recipe,10,30,2)

In [92]:
revdf['Pair'] = [tuple(pair) for pair in np.transpose([revdf['AuthorId'].values,revdf['ReviewId'].values])]
revdf = revdf.set_index('Pair')

In [93]:
revdf

Unnamed: 0_level_0,ReviewId,RecipeId,AuthorId,Rating
Pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(1634, 7)",7,4384,1634,4
"(2046, 9)",9,4523,2046,2
"(1773, 13)",13,7435,1773,5
"(2046, 17)",17,5221,2046,4
"(2046, 19)",19,13307,2046,5
...,...,...,...,...
"(2002871210, 2090292)",2090292,188578,2002871210,5
"(35251, 2090301)",2090301,339905,35251,5
"(2002323049, 2090302)",2090302,537541,2002323049,3
"(2002553884, 2090329)",2090329,103772,2002553884,2


In [107]:
test[0]

{32772,
 1900559,
 720912,
 360471,
 65573,
 163877,
 294953,
 131126,
 262200,
 917568,
 229448,
 262216,
 65634,
 295018,
 1179773,
 688254,
 262272,
 65670,
 1376400,
 163986,
 229524,
 163995,
 458914,
 98467,
 917666,
 262312,
 426156,
 458927,
 1801126064,
 65720,
 98511,
 393426,
 262357,
 229605,
 655596,
 327920,
 229619,
 655603,
 327938,
 983307,
 262423,
 590105,
 360730,
 98590,
 950577,
 327989,
 328000,
 164182,
 360813,
 33135,
 328048,
 1048945,
 1048952,
 229753,
 622972,
 33159,
 2000519559,
 328077,
 65936,
 229787,
 491933,
 197023,
 98722,
 65955,
 33188,
 328099,
 229800,
 295341,
 229808,
 164276,
 688566,
 1769917,
 491970,
 98761,
 491979,
 66008,
 229850,
 1049057,
 1507817,
 98812,
 98828,
 1442318,
 164368,
 1671718,
 328235,
 98861,
 852529,
 1278514,
 98867,
 1049139,
 1311294,
 2002256447,
 852554,
 131659,
 361041,
 197201,
 131674,
 33376,
 98919,
 131698,
 197244,
 655996,
 230020,
 295565,
 459415,
 1376928,
 459426,
 33443,
 295591,
 885416,
 983723

In [95]:
len(test[1])

5519

In [115]:
pairs = [(a,r) for a in test[0] for r in test[1]]

In [117]:
len(pairs)

15629028

In [116]:
len(set(pairs).intersection(set(revdf.index)))

1611