In [1]:
import pandas as pd
import numpy as np

In [4]:
reviews = pd.read_pickle('../data/reviewsred.pk')

We will drop recipes and users that appear too infrequently in the dataset. 

Note that we can't insist on both a minimum number of appearances for both user and item: if we drop the unpopular recipes, and then we drop the unpopular users, some of the recipes that are left in the dataframe may now be below the "popularity" threshold.

In [5]:
rev_df = (reviews.groupby('UserId').filter(lambda x: len(x) > 29))
rev_df = (rev_df.groupby('RecipeId').filter(lambda x: len(x) > 29))
rev_df = (rev_df.groupby('UserId').filter(lambda x: len(x) > 19)).copy()

In [None]:
def density(df):
    no_users = len(df.AuthorId.unique())
    no_recs = len(df.RecipeId.unique())
    no_revs = len(df)
    return {'Users':no_users,'Recipes':no_recs,'Ratings':no_revs,'Density':(no_revs)/(no_users*no_recs)}


In [6]:
rev_df.to_pickle('../data/revdense.pk')

In [None]:
density(reviews)

{'Users': 44084,
 'Recipes': 251937,
 'Ratings': 1102882,
 'Density': 9.930156690828349e-05}

By working with this restricted dataset, we are only using using 10 percent of the available data but the density of the available ratings increases from roughly $10^{-4}$ to $2 \cdot 10^{-2}$.

In [58]:
len(revdf.loc[revdf.AuthorCount>1000].RecipeId.unique())

126893

In [5]:
def get_dictionaries(ratingsdf,usercol,itemcol,ratingcol):
    users = list(set(ratingsdf[usercol].values))
    items = list(set(ratingsdf[itemcol].values))
    ratings_by_user = {user_id:{} for user_id in users}
    users_by_item = {item_id:[] for item_id in items}
    for review_index in ratingsdf.index:
        user_id = ratingsdf[usercol][review_index]
        item_id = ratingsdf[itemcol][review_index]
        rating = ratingsdf[ratingcol][review_index]
        ratings_by_user[user_id][item_id] = rating
        users_by_item[item_id].append(user_id)
    return ratings_by_user,users_by_item

In [40]:
user_ratings,item_reviewers = get_dictionaries(revdf,'AuthorId','RecipeId','Rating')

In [44]:
users = list(user_ratings.keys())
users.sort(key = lambda x:len(user_ratings[x]))

In [60]:
revdf.Rating.unique()

array([4, 2, 5, 1, 3, 0], dtype=int32)

In [69]:
def user_rating_desc(user):
    ratings = user_ratings[user]
    value_counts = {n:0 for n in range(6)}
    if len(ratings)==0:
        return value_counts
    for item in ratings:
        value_counts[ratings[item]]+=1
    for n in value_counts:
        value_counts[n] = value_counts[n]/len(ratings)
    description = {'Number':len(ratings),
                   'Mean':np.mean(list(ratings.values())),
                   'Std':np.std(list(ratings.values())),
                   'RatingCounts':value_counts
                  }
    return description

In [77]:
user_sublist = [user for user in users if user_rating_desc(user)['Std']>1 and user_rating_desc(user)['Number']>50]

In [82]:
revdf['AuthorStd'] = [np.std(list(user_ratings[user].values())) for user in revdf.AuthorId]


In [91]:
len(revdf.loc[revdf.RecipeCount>49].RecipeId.unique())

1933

In [120]:
len(revdense.loc[revdense.AuthorStd>1].loc[revdense.AuthorCount>9].RecipeId.unique())

1933

In [107]:
user_ratings_d,item_reviewers_d = get_dictionaries(revdense,'AuthorId','RecipeId','Rating')

In [19]:
revdf['AuthorCount'] = [len(user_ratings[user]) for user in revdf.AuthorId]
revdf['RecipeCount'] = [len(item_reviewers[item]) for item in revdf.RecipeId]

In [113]:
revdense['AuthorStd'] = [np.std(list(user_ratings_d[user].values())) for user in revdense.AuthorId]
revdense['AuthorCount'] = [len(list(user_ratings[user].values())) for user in revdense.AuthorId]

In [125]:
revdense.loc[revdense.AuthorStd>1].loc[revdense.AuthorCount>9].to_pickle('../data/revdense.pk')

In [122]:
len(revdense.loc[revdense.AuthorStd>1].loc[revdense.AuthorCount>19].AuthorId.unique())

2095

In [38]:
len(revdf.loc[revdf.Popularity>20000].RecipeId.unique())

29378

In [39]:
revdf

Unnamed: 0,ReviewId,RecipeId,AuthorId,Rating,AuthorCount,RecipeCount,Popularity
1,7,4384,1634,4,79,2,158
2,9,4523,2046,2,8,5,40
3,13,7435,1773,5,5,32,160
5,17,5221,2046,4,8,9,72
6,19,13307,2046,5,8,95,760
...,...,...,...,...,...,...,...
1401933,2090292,188578,2002871210,5,2,10,20
1401942,2090301,339905,35251,5,45,49,2205
1401943,2090302,537541,2002323049,3,2,1,2
1401968,2090329,103772,2002553884,2,2,6,12


In [218]:
authors0std = [author_id for author_id in ratings_by_author if np.std(list(ratings_by_author[author_id].values())) == 0]