In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Set-up
We start by defining a few helper functions and loading the dataframe.

In [16]:
def flatten_list(l:list)->list:
    """Concatenate the elements of every sub-list of ``l`` into one flat list.

    Only one level of nesting is removed, and the elements keep their
    original relative order.
    """
    return [element for sublist in l for element in sublist]

def tally(l:list)->dict:
    """Count how many times each distinct element occurs in ``l``.

    Returns a dict mapping every distinct element of ``l`` to its number of
    occurrences. The elements must be hashable.
    """
    occurrences = dict.fromkeys(set(l), 0)
    for element in l:
        occurrences[element] = occurrences[element] + 1
    return occurrences

In [5]:
# Load the reviews dataframe from a pickle file (path relative to the notebook's
# working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
revs = pd.read_pickle('../data/revdense.pk')

In [8]:
# Every distinct UserId in the full (pre-split) dataframe, in order of first appearance.
user_list = list(revs.UserId.unique())

## Train test split

We do a train-test split on our dataframe

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the review rows as a test set. Passing the UserId column as
# `stratify` makes the split stratified by user, and the fixed `random_state`
# makes the split reproducible across runs.
rev_train, rev_test = train_test_split(revs,
                                       test_size=0.2,
                                       random_state=193,
                                       stratify=revs['UserId'])

In [24]:
def get_dictionaries(ratingsdf,usercol,itemcol,ratingcol):
    """Build per-user and per-item lookup dictionaries from a ratings dataframe.

    Parameters
    ----------
    ratingsdf : pandas.DataFrame
        One row per review.
    usercol, itemcol, ratingcol : str
        Names of the columns holding the user id, the item id and the rating.

    Returns
    -------
    ratings_by_user : dict
        ``{user_id: {item_id: rating}}``. If a user has several rows for the
        same item, the last row wins.
    users_by_item : dict
        ``{item_id: [user_id, ...]}`` with one entry per review row, in row
        order.
    """
    user_ids = ratingsdf[usercol].values
    item_ids = ratingsdf[itemcol].values
    ratings = ratingsdf[ratingcol].values
    ratings_by_user = {user_id:{} for user_id in set(user_ids)}
    users_by_item = {item_id:[] for item_id in set(item_ids)}
    # Walk the rows positionally instead of looking every value up by index
    # label: label lookups are slow, and they return a Series (not a scalar)
    # when the dataframe's index contains duplicate labels.
    for user_id, item_id, rating in zip(user_ids, item_ids, ratings):
        ratings_by_user[user_id][item_id] = rating
        users_by_item[item_id].append(user_id)
    return ratings_by_user,users_by_item

# Build the lookups from the training reviews only.
# NOTE: despite its name, `items_by_user` receives the second return value of
# get_dictionaries, i.e. it maps each RecipeId to the list of UserIds that
# reviewed it (item -> users), not user -> items.
user_ratings,items_by_user = get_dictionaries(rev_train,
                                              'UserId',
                                              'RecipeId',
                                              'Rating')

## Models
Now that we've set aside our test set, we will build some models.

A model for us will consist of the following: for each user $u$, we will obtain a ranking of the recipes by how likely it is that $u$ will interact with a given recipe.

### Popularity
Our simplest model ranks all recipes by overall popularity (measured on the training set) and uses the same ranking for every user.

In [13]:
def series_to_ranking(value_counts):
    """Convert a key -> score mapping into a key -> rank mapping.

    ``value_counts`` may be any object offering ``keys()`` and item lookup
    (for example a dict or a pandas Series). The key with the largest score
    gets rank 1; keys with equal scores keep the order in which ``keys()``
    yields them.
    """
    ordered_keys = sorted(value_counts.keys(),
                          key=lambda k: value_counts[k],
                          reverse=True)
    return {k: position for position, k in enumerate(ordered_keys, start=1)}

In [15]:
# Global ranking: RecipeId -> rank by number of training reviews (rank 1 = most reviewed).
popularity_ranking = series_to_ranking(rev_train.RecipeId.value_counts())

### Popularity among related users

We can obtain more variation in the rankings by doing the following: to obtain rankings for user $u$, we again use popularity, but we compute it on a restricted dataset consisting of users we think are similar to our target user.

For a toy model, we will say that two users are similar if they have reviewed at least one recipe in common.

In [31]:
def related_users(user:int)->dict:
    """Tally the users who reviewed any recipe that ``user`` reviewed in training.

    Returns a dict mapping each such UserId to the number of times it appears
    among the reviewers of ``user``'s training recipes. ``user`` itself is
    included, since it is a reviewer of each of its own recipes.
    """
    reviewed_recipes = rev_train.loc[rev_train.UserId == user, 'RecipeId']
    reviewers_per_recipe = [items_by_user[recipe] for recipe in reviewed_recipes.values]
    return tally(flatten_list(reviewers_per_recipe))

def user_neighborhood(user):
    """Return the training reviews written by any user related to ``user``.

    The related users are the keys of ``related_users(user)``.
    """
    neighbor_counts = related_users(user)
    in_neighborhood = rev_train['UserId'].isin(neighbor_counts)
    return rev_train[in_neighborhood]

def user_ranking_v0(user):
    """Rank recipes by review count inside ``user``'s neighborhood.

    Returns a dict RecipeId -> rank, where rank 1 is the recipe with the most
    reviews among the rows of ``user_neighborhood(user)``.
    """
    recipe_counts = user_neighborhood(user)['RecipeId'].value_counts()
    return {recipe: position
            for position, recipe in enumerate(recipe_counts.index, start=1)}


### Weighted Popularity among related users

We can refine the previous model by taking into account the number of common reviews.

In [32]:
def user_neighborhood_wts(user):
    """Return ``user``'s neighborhood reviews with a per-row weight column.

    The result is a copy of the training rows written by users related to
    ``user``, with an extra column ``No_common_revs`` holding, for each row,
    the count that ``related_users(user)`` assigns to the row's author.
    """
    rel_user_cts = related_users(user)
    # Reuse the tally computed above rather than calling related_users a
    # second time: it scans the training frame and is the costly step here.
    user_df = rev_train[rev_train['UserId'].isin(rel_user_cts)].copy()
    # Every UserId kept by the filter is a key of rel_user_cts, so the mapping
    # is total and produces no missing values.
    user_df['No_common_revs'] = user_df['UserId'].map(rel_user_cts)
    return user_df

def user_ranking_v1(user):
    """Rank recipes by weighted popularity inside ``user``'s neighborhood.

    A recipe's score is the sum of ``No_common_revs`` over all rows of
    ``user_neighborhood_wts(user)`` that review it. Returns a dict
    RecipeId -> rank, where rank 1 is the highest score; recipes with equal
    scores are ordered by first appearance in the neighborhood frame.
    """
    usernbhd = user_neighborhood_wts(user)
    rec_scores = {}
    # Accumulate positionally over the two columns. Looking each value up by
    # index label (frame[col][label]) is slow and returns a Series instead of
    # a scalar when the index contains duplicate labels.
    for rec, weight in zip(usernbhd['RecipeId'].values,
                           usernbhd['No_common_revs'].values):
        rec_scores[rec] = rec_scores.get(rec, 0) + weight
    # Dict keys are in first-appearance order and sorted() is stable, so ties
    # keep that order.
    ranked_recs = sorted(rec_scores, key=rec_scores.get, reverse=True)
    return {rec: position for position, rec in enumerate(ranked_recs, start=1)}


## Making predictions

Now that we've "trained" our models (we've defined functions that will make predictions using the training set), we will see how they do on the test set.

We will view each user as a query, and we will view the set of recipes they've reviewed as "acceptable responses". This will allow us to judge the collection of rankings.

To quantify how well a model does, we use [mean reciprocal rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank).

In [35]:
# Column-wise accumulator for the training-set evaluation: one list entry per user.
user_data = {'UserId':[],
             'No_recs_user':[],
             'No_recs_rel_users':[],
             'ByPopularity_const_train':[],
             'ByPopularity_us_train':[],
             'ByPopularity_wts_train':[]}

# NOTE: user_list comes from the full dataframe while user_ratings is built from
# the training split, so user_ratings[user] raises KeyError for any user
# without a training review.
for user in user_list:
    user_data['UserId'].append(user)
    # Recipes this user reviewed in the training set.
    userrecs = list(user_ratings[user].keys())
    user_data['No_recs_user'].append(len(userrecs))
    rankingsv0 = user_ranking_v0(user)
    # Number of distinct recipes ranked for this user, i.e. recipes reviewed
    # within the user's neighborhood.
    user_data['No_recs_rel_users'].append(len(rankingsv0))
    # For each model, the rank it gives to every recipe the user reviewed.
    user_data['ByPopularity_const_train'].append([popularity_ranking[rec] for rec in userrecs])
    user_data['ByPopularity_us_train'].append([rankingsv0[rec] for rec in userrecs])
    rankingsv1 = user_ranking_v1(user)
    user_data['ByPopularity_wts_train'].append([rankingsv1[rec] for rec in userrecs])





In [36]:
# One row per user, indexed by UserId, built from the accumulated columns.
user_pred_df = pd.DataFrame(user_data).set_index('UserId')

In [42]:
# Collapse each per-user list of ranks to its best (smallest) rank.
# NOTE: this overwrites the list columns in place, so the cell is not
# re-runnable: on a second run the columns hold ints and min() raises TypeError.
for col in ['ByPopularity_const_train','ByPopularity_us_train', 'ByPopularity_wts_train']:
    user_pred_df[col] = user_pred_df[col].apply(min)

In [43]:
# Show the per-user best training-set rank under each model.
user_pred_df

Unnamed: 0_level_0,No_recs_user,No_recs_rel_users,ByPopularity_const_train,ByPopularity_us_train,ByPopularity_wts_train
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7802,27,2585,21,2,2
9869,186,2585,1,1,1
6357,74,2585,22,4,2
11297,54,2585,6,2,2
4470,146,2585,1,1,1
...,...,...,...,...,...
2001330613,30,2585,25,3,2
2001362355,24,2585,1,1,1
2001102678,27,2585,59,3,2
2001625595,18,2585,6,1,1


In [47]:
# Column-wise accumulator for the test-set evaluation: one list entry per user.
user_data_test = {'UserId':[],
             'No_recs_user':[],
             'Popularity':[],
             'Popularity_us':[],
             'Popularity_wt':[]}

for user in user_list:
    user_data_test['UserId'].append(user)
    # Recipes this user reviewed in the held-out test set.
    userrecs = rev_test.loc[rev_test['UserId']==user]['RecipeId'].values
    user_data_test['No_recs_user'].append(len(userrecs))
    # The rankings are built from the training data only; each lookup below
    # raises KeyError if a test recipe is absent from the corresponding ranking.
    rankingsv0 = user_ranking_v0(user)
    user_data_test['Popularity'].append([popularity_ranking[rec] for rec in userrecs])
    user_data_test['Popularity_us'].append([rankingsv0[rec] for rec in userrecs])
    rankingsv1 = user_ranking_v1(user)
    user_data_test['Popularity_wt'].append([rankingsv1[rec] for rec in userrecs])

# One row per user, indexed by UserId, built from the accumulated test columns.
user_test_pred_df = pd.DataFrame(user_data_test).set_index('UserId')


In [54]:
def _best_rank(ranks):
    """Return the smallest rank in ``ranks``, or NaN when ``ranks`` is empty."""
    # A user with no held-out reviews has an empty rank list; a bare min()
    # would raise ValueError on it and abort the whole column.
    return min(ranks, default=float('nan'))

# Best (smallest) test-set rank per user under each model, stored in new
# columns so the original rank lists stay available.
user_test_pred_df['Popularity_min']=user_test_pred_df['Popularity'].apply(_best_rank)
user_test_pred_df['Popularity_us_min']= user_test_pred_df['Popularity_us'].apply(_best_rank)
user_test_pred_df['Popularity_wt_min']= user_test_pred_df['Popularity_wt'].apply(_best_rank)
