In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [120]:
def get_dictionaries(ratingsdf,usercol,itemcol,ratingcol):
    """Index a ratings table by user and by item.

    Parameters
    ----------
    ratingsdf : table with one row per rating; indexed by column label
        (``ratingsdf[col].values`` must yield the column's values).
    usercol, itemcol, ratingcol : labels of the user, item and rating columns.

    Returns
    -------
    ratings_by_user : dict
        ``user -> {item: rating}``. If a user has several rows for the same
        item, the rating of the last such row is kept.
    users_by_item : dict
        ``item -> [user, ...]`` in row order; a user appears once per row.
    """
    ratings_by_user = {}
    users_by_item = {}
    # Walk the three columns positionally instead of doing a label lookup per
    # row: this avoids one Series lookup per cell and still works when the
    # frame's index contains duplicate labels (where a label lookup would
    # return a Series rather than a scalar).
    rows = zip(ratingsdf[usercol].values,
               ratingsdf[itemcol].values,
               ratingsdf[ratingcol].values)
    for user_id, item_id, rating in rows:
        ratings_by_user.setdefault(user_id, {})[item_id] = rating
        users_by_item.setdefault(item_id, []).append(user_id)
    return ratings_by_user,users_by_item


def sign(x):
    """Return 1 if x is positive, -1 if x is negative, and 0 otherwise."""
    if x > 0:
        return 1
    # Anything that is neither > 0 nor < 0 (zero, NaN) maps to 0.
    return -1 if x < 0 else 0
        
def normalize(data):
    """Standardize a value given as a (value, mean, std) triple.

    ``data`` is indexed positionally: ``data[0]`` is the value, ``data[1]``
    the mean and ``data[2]`` the standard deviation. Returns
    ``(value - mean) / std``, or 0 when the standard deviation is 0.
    """
    value, mean, std = data[0], data[1], data[2]
    # A zero spread means there is no scale to divide by; report 0 instead.
    return (value-mean)/std if std != 0 else 0

def similarity(ratings1,ratings2):
    """Cosine similarity of two rating dictionaries over their shared keys.

    Only keys present in both dictionaries contribute. Returns 0 when there
    are no shared keys, or when either side's ratings on the shared keys are
    all zero (so one of the vectors has zero length).
    """
    shared = set(ratings1.keys()) & set(ratings2.keys())
    if not shared:
        return 0
    sq_norm1 = sq_norm2 = inner = 0
    for key in shared:
        a = ratings1[key]
        b = ratings2[key]
        sq_norm1 += a**2
        sq_norm2 += b**2
        inner += a*b
    # A zero squared length on either side would make the quotient undefined;
    # such pairs are reported as having no similarity.
    if sq_norm1*sq_norm2 == 0:
        return 0
    return inner/np.sqrt(sq_norm1*sq_norm2)

In [27]:
# Load the ratings data used by the cells below (they read its UserId,
# RecipeId and Rating columns).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
revdf = pd.read_pickle('../data/revdense.pk')

In [66]:
# Split the ratings 50/50 into train and test, stratified on UserId, with a
# fixed seed so the split is reproducible.
# Each part is copied explicitly: later cells add columns to rev_train and
# rev_test, and writing to independent frames avoids pandas treating them as
# slices of revdf (SettingWithCopyWarning on older pandas versions).
rev_train, rev_test = (
    part.copy()
    for part in train_test_split(
        revdf,
        test_size=0.5,
        random_state=1729,
        stratify=revdf.UserId.values,
    )
)

In [76]:
# Per-user rating dictionaries built from the training split only.
user_ratings,users_by_recipe = get_dictionaries(rev_train,'UserId','RecipeId','Rating')

# Mean and standard deviation of each user's training ratings.
mean_ratings = {uid: np.mean(list(rated.values())) for uid, rated in user_ratings.items()}
std_ratings = {uid: np.std(list(rated.values())) for uid, rated in user_ratings.items()}

# Attach the training statistics to both splits and standardize every rating
# with them, so test ratings are normalized using training information only.
for frame in (rev_train, rev_test):
    frame['UserMeanTrain'] = [mean_ratings[uid] for uid in frame.UserId]
    frame['UserStdTrain'] = [std_ratings[uid] for uid in frame.UserId]
    frame['NormalizedRating'] = [
        normalize(triple)
        for triple in frame[['Rating','UserMeanTrain','UserStdTrain']].values
    ]

In [83]:
# Rebuild the per-user dictionary from the NormalizedRating column of the
# training split; users_by_recipe is reassigned from the same training rows.
user_ratings_norm,users_by_recipe = get_dictionaries(rev_train,'UserId','RecipeId','NormalizedRating')

In [89]:
def make_prediction(pair,users_dic,recipes_dic,min_sim):
    """Predict a user's rating of a recipe from similar reviewers' ratings.

    Parameters
    ----------
    pair : sequence whose first two entries are ``(user_id, recipe_id)``.
    users_dic : dict ``user_id -> {recipe_id: rating}``.
    recipes_dic : dict ``recipe_id -> [user_id, ...]`` listing the reviewers
        of each recipe; every listed reviewer must be a key of ``users_dic``
        and have a rating for that recipe.
    min_sim : reviewers whose similarity to the user is below this threshold
        are ignored.

    Returns
    -------
    The weighted average of the retained reviewers' ratings of the recipe,
    each weighted by ``similarity * number of recipes shared with the user``.
    Returns the string ``'No prediction'`` when the recipe or the user is
    unknown, when no reviewer passes the threshold, or when the retained
    weights sum to zero.
    """
    user_id = pair[0]
    recipe_id = pair[1]
    # An unknown user is handled like an unknown recipe: there is nothing to
    # compare against, so report the sentinel instead of raising KeyError.
    if recipe_id not in recipes_dic or user_id not in users_dic:
        return 'No prediction'
    rating_dic = users_dic[user_id]
    other_recipes = set(rating_dic.keys())
    weights = []
    ratings = []
    # NOTE(review): a reviewer equal to user_id is not excluded here, so a
    # user's own rating of the recipe (if present in users_dic) contributes.
    for user_id2 in recipes_dic[recipe_id]:
        rating_dic1 = users_dic[user_id2]
        no_common_recipes = len(other_recipes.intersection(rating_dic1.keys()))
        if no_common_recipes == 0:
            continue
        simscore = similarity(rating_dic,rating_dic1)
        if simscore >= min_sim:
            # More shared recipes means more evidence behind the similarity.
            weights.append(simscore*no_common_recipes)
            ratings.append(rating_dic1[recipe_id])
    # np.average raises ZeroDivisionError when the weights sum to zero (which
    # can happen for min_sim <= 0); treat that as "no usable reviewers".
    if not ratings or np.sum(weights) == 0:
        return 'No prediction'
    return np.average(ratings,weights = weights)

In [159]:
# Minimum similarity (as computed by `similarity`) a training reviewer must
# reach to contribute to a prediction.
MIN_SIMILARITY = 0.9

# Predict every (user, recipe) pair of the test split from the normalized
# training ratings. The two columns are walked positionally, which avoids a
# label lookup per row and does not depend on the index labels being unique.
rev_test['Pred'] = [
    make_prediction((user_id, recipe_id),
                    user_ratings_norm,
                    users_by_recipe,
                    MIN_SIMILARITY)
    for user_id, recipe_id in zip(rev_test['UserId'].values,
                                  rev_test['RecipeId'].values)
]

In [160]:
# Keep only the test rows for which make_prediction returned a value rather
# than the 'No prediction' sentinel; work on an independent copy.
has_prediction = rev_test['Pred'] != 'No prediction'
rev_test_pr = rev_test.loc[has_prediction].copy()

In [161]:
# Reduce the prediction and the observed normalized rating to their signs.
# Despite its name, TrainSgn is the sign of NormalizedRating for the rows of
# rev_test_pr (test rows normalized with training statistics).
for source_col, sign_col in (('Pred', 'PredSgn'), ('NormalizedRating', 'TrainSgn')):
    rev_test_pr[sign_col] = rev_test_pr[source_col].apply(sign)

In [166]:
# Display the test rows that received a prediction, with the sign columns.
rev_test_pr

Unnamed: 0_level_0,UserId,RecipeId,Rating,UserMeanTrain,Pred,UserStdTrain,NormalizedRating,PredSgn,TrainSgn
ReviewId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
609699,597546,19724,5,4.800000,0.517028,0.541603,0.369274,1,1
468869,414969,76,5,4.200000,0.642137,1.469694,0.544331,1,1
1271436,533820,149257,5,4.750000,0.447214,0.433013,0.577350,1,1
389654,30435,63786,5,4.315789,-0.379902,1.299904,0.526355,-1,1
1446992,1671718,117892,5,3.833333,-0.146031,0.687184,1.697749,-1,1
...,...,...,...,...,...,...,...,...,...
1257205,74904,84612,0,3.800000,-0.124659,2.114500,-1.797115,-1,-1
559230,246616,26557,5,3.750000,0.382911,2.165064,0.577350,1,1
230745,196038,106627,5,4.214286,0.864356,1.520271,0.516825,1,1
1133338,887970,222188,5,4.000000,1.104245,1.825742,0.547723,1,1


In [163]:
def get_mean_sq_err(pairs):
    """Mean squared difference between the first two entries of each pair.

    ``pairs`` is an iterable of indexable items; for each one the squared
    difference ``(item[0] - item[1]) ** 2`` is taken, and the mean of those
    values (via ``np.mean``) is returned.
    """
    squared_errors = []
    for pair in pairs:
        diff = pair[0]-pair[1]
        squared_errors.append(diff**2)
    return np.mean(squared_errors)

In [164]:
# Mean squared difference between the predicted sign and the sign of the
# normalized rating, over the rows that received a prediction.
get_mean_sq_err(rev_test_pr[['PredSgn','TrainSgn']].values)

1.6781424581005586

In [165]:
# Mean squared difference between a constant 0 and PredSgn, i.e. the share of
# rows whose predicted sign is non-zero.
# NOTE(review): if this was meant as an "always predict 0" baseline for the
# cell above, it should presumably be scored against 'TrainSgn' rather than
# 'PredSgn' -- confirm the intent.
get_mean_sq_err([[0,x] for x in rev_test_pr['PredSgn'].values])

0.9976256983240224

In [None]:
revd