In [1]:
import pandas as pd
import numpy as np

# Load only the columns that the rest of the notebook uses.
raw_interactions_df = pd.read_csv(
    'food.com/RAW_interactions.csv',
    usecols=['user_id', 'recipe_id', 'rating'],
)

# The recipe file calls its key `id`; rename it so both frames share `recipe_id`.
raw_recipes_df = pd.read_csv(
    'food.com/RAW_recipes.csv',
    usecols=['name', 'id', 'tags', 'description'],
).rename(columns={'id': 'recipe_id'})

In [2]:
# Keep only recipes that have a name and whose description and tags each
# contain at least one alphabetic character.
raw_recipes_df = raw_recipes_df.dropna(subset=['description'])
raw_recipes_df = raw_recipes_df[
    raw_recipes_df['description'].apply(lambda text: any(ch.isalpha() for ch in text))
]
raw_recipes_df = raw_recipes_df.dropna(subset=['name'])
raw_recipes_df = raw_recipes_df[
    raw_recipes_df['tags'].apply(lambda text: any(ch.isalpha() for ch in text))
]  # author's note: 226452

# Drop interactions whose recipe was removed by the filters above.
raw_interactions_df = raw_interactions_df[
    raw_interactions_df['recipe_id'].isin(raw_recipes_df['recipe_id'])
]

In [3]:
# Keep only interactions from users who wrote at least 5 reviews
# (author's note: 23086 users remain).
raw_interactions_df = raw_interactions_df[
    raw_interactions_df.groupby('user_id')['user_id'].transform('size') > 4
]

# Recipes that no remaining user has reviewed are dropped as well.
raw_recipes_df = raw_recipes_df[
    raw_recipes_df['recipe_id'].isin(raw_interactions_df['recipe_id'])
]

In [4]:
# Give both frames a clean positional index; for recipes that index (named
# `item_id`) becomes the dense item identifier used to address tf-idf rows.
raw_interactions_df = raw_interactions_df.reset_index(drop=True)
raw_recipes_df = (
    raw_recipes_df
    .sort_values(by='recipe_id')
    .reset_index(drop=True)
    .rename_axis('item_id')
)

# Lookup tables in both directions between item_id and recipe_id.
itemid_to_recipeid = raw_recipes_df[['recipe_id']].rename_axis('item_id')
recipeid_to_itemid = itemid_to_recipeid.reset_index().set_index('recipe_id')

# Attach the dense item_id to every interaction (raises KeyError on an unknown recipe_id).
raw_interactions_df['item_id'] = (
    recipeid_to_itemid.loc[raw_interactions_df['recipe_id'], 'item_id'].to_numpy()
)

In [1]:
# 22578 users
# 206177 recipes
# 851224 interactions

## Collaborative filtering (CF)
Implemented with the Surprise library: https://github.com/NicolasHug/Surprise

In [6]:
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import GridSearchCV

In [7]:
# Tell Surprise that ratings lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(
    raw_interactions_df.loc[:, ['user_id', 'recipe_id', 'rating']],
    reader,
)

# Hold out 25% of the ratings for evaluation; the seed is fixed at 7.
trainset, testset = surprise_train_test_split(
    data,
    random_state=7,
    test_size=0.25,
)

In [8]:
# Basic k-NN collaborative filter with k=9; fit() returns the fitted algorithm.
knn = KNNBasic(k=9).fit(trainset)
knn_predictions = knn.test(testset)

# Both helpers print their metric; the MAE value is also the cell's output.
accuracy.rmse(knn_predictions)
accuracy.mae(knn_predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0639
MAE:  0.5904


0.5904152785284371

In [9]:
# Matrix-factorisation model with fixed hyper-parameters and seed.
svd = SVD(
    n_factors=15,
    n_epochs=18,
    lr_all=0.005,
    reg_all=0.04,
    random_state=7,
).fit(trainset)
svd_predictions = svd.test(testset)

# Both helpers print their metric; the MAE value is also the cell's output.
accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)

RMSE: 0.9333
MAE:  0.5508


0.5507758441531742

In [10]:
from collections import defaultdict

# adapted from https://surprise.readthedocs.io/
def precision_recall_at_k(predictions, max_k=10, threshold=3.5):
    """Compute mean precision, recall and F-measure at every cutoff 1..max_k.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, true_r, est, details)``; only
        ``uid``, ``true_r`` and ``est`` are used.
    max_k : int, default 10
        Largest cutoff; one result row is produced for each k in 1..max_k.
    threshold : float, default 3.5
        An item is "relevant" when ``true_r >= threshold`` and "recommended"
        when ``est >= threshold``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k`` with columns ``precision``, ``recall`` and
        ``f-measure``, each averaged over users. Undefined ratios
        (no recommended items, no relevant items, precision + recall == 0,
        or no users at all) are reported as 0.
    """
    # Group the predictions by user once; this does not depend on k.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # Sort each user's items by estimated rating (best first) and count the
    # relevant items once, instead of repeating both for every cutoff.
    per_user = []
    for user_ratings in user_est_true.values():
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        per_user.append((user_ratings, n_rel))
    n_users = len(per_user)

    cutoffs = np.arange(1, max_k + 1)
    rows = []
    for k in cutoffs:
        precision_sum = 0.0
        recall_sum = 0.0
        for user_ratings, n_rel in per_user:
            top_k = user_ratings[:k]

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in top_k)

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in top_k
            )

            # Precision@K is undefined without recommendations, recall@K
            # without relevant items; both are counted as 0 in that case.
            precision_sum += n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
            recall_sum += n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        precision = precision_sum / n_users if n_users else 0.0
        recall = recall_sum / n_users if n_users else 0.0

        # Guard the harmonic mean: with precision == recall == 0 the original
        # expression divided by zero.
        denominator = precision + recall
        f_measure = (2 * precision * recall) / denominator if denominator else 0.0
        rows.append((precision, recall, f_measure))

    # Build the frame in one go rather than enlarging it row by row.
    return pd.DataFrame(
        rows,
        columns=['precision', 'recall', 'f-measure'],
        index=pd.Index(cutoffs, name='k'),
    )

In [11]:
# Ranking metrics of the k-NN model for k = 1..10 at relevance threshold 3.5.
knn_p_r = precision_recall_at_k(knn_predictions, max_k=10, threshold=3.5)

In [12]:
# Ranking metrics of the SVD model for k = 1..10 at relevance threshold 3.5.
svd_p_r = precision_recall_at_k(svd_predictions, max_k=10, threshold=3.5)

## Content-based filtering (CB)

In [125]:
from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
def stem(words):
    """Return the English Snowball stem of every word in ``words``.

    Leading and trailing punctuation is stripped from each word before it is
    stemmed. The result is a new list with one entry per input word.
    """
    english_stemmer = SnowballStemmer('english')
    return [english_stemmer.stem(token.strip(punctuation)) for token in words]

In [15]:
def tokenize(data):
    """Split ``data`` into word tokens with ``word_tokenize`` and stem each one."""
    return stem(word_tokenize(data))

In [51]:
def get_content(df, column_names):
    """Join the given columns of ``df`` row-wise, separated by single spaces.

    ``column_names`` must contain at least one column name. The result is a
    Series aligned with ``df``; with a single name it is that column itself.
    """
    pieces = [df[name] for name in column_names]
    merged = pieces[0]
    for piece in pieces[1:]:
        merged = merged + " " + piece
    return merged

In [52]:
def create_tfidf_matrix(column_names):
    """Fit a TF-IDF model on the text of ``raw_recipes_df`` and return the matrix.

    The columns named in ``column_names`` are concatenated per recipe with
    ``get_content`` and vectorised with the notebook's stemming ``tokenize``
    function. Returns the result of ``TfidfVectorizer.fit_transform``.
    """
    # The stop words are stemmed because the tokenizer emits stemmed tokens;
    # a hand-picked list of extra entries is appended to the stemmed list.
    extra_stopwords = ["", "'", "'d", 'could', 'might', 'must', "n't", 'need', 'r', 'sha', 'v', 'wo', 'would']
    all_stopwords = stem(stopwords.words('english')) + extra_stopwords

    corpus = get_content(raw_recipes_df, column_names)

    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=tokenize,
        stop_words=all_stopwords,
        min_df=0.003,
        max_df=0.5,
    )
    return tfidf.fit_transform(corpus)

In [17]:
def cb_make_predictions():
    """Fill ``cb_test['prediction']`` with content-based scores.

    Reads the notebook globals ``cb_train``, ``cb_test`` and ``tfidf_matrix``
    (``item_id`` values are used as row positions of ``tfidf_matrix``).
    For every user that has rows in ``cb_test``:

    1. the user profile is the rating-weighted sum of the tf-idf rows of the
       items the user rated in ``cb_train`` (all zeros without train rows);
    2. each test item is scored by its cosine similarity to that profile;
    3. the user's scores are min-max rescaled to the range 0-5. When all of a
       user's scores are equal they are left unscaled, i.e. they stay raw
       cosine similarities.

    Returns nothing; ``cb_test`` gains/overwrites the ``prediction`` column.
    """
    # Positional row numbers per user, computed once instead of re-filtering
    # both frames with a boolean mask for every user.
    train_rows_by_user = cb_train.groupby('user_id').indices
    test_rows_by_user = cb_test.groupby('user_id').indices

    train_items = cb_train['item_id'].to_numpy()
    train_ratings = cb_train['rating'].to_numpy()
    test_items = cb_test['item_id'].to_numpy()

    n_features = tfidf_matrix.shape[1]
    predictions = np.full(len(cb_test), np.nan)

    for user_id, test_rows in test_rows_by_user.items():
        train_rows = train_rows_by_user.get(user_id)
        if train_rows is None:
            user_profile = np.zeros(n_features)
        else:
            # Sparse-transpose times ratings: same weighted sum as the dense
            # dot product, without materialising the rows as a dense array.
            user_profile = np.asarray(
                tfidf_matrix[train_items[train_rows]].T.dot(train_ratings[train_rows])
            ).ravel()

        # One similarity call for all of the user's test items.
        similarities = cosine_similarity(
            user_profile.reshape(1, -1),
            tfidf_matrix[test_items[test_rows]],
        )[0]

        lowest = similarities.min()
        highest = similarities.max()
        if highest != lowest:
            similarities = (similarities - lowest) / (highest - lowest) * 5

        predictions[test_rows] = similarities

    # Single column write instead of one masked .loc assignment per row.
    cb_test['prediction'] = predictions

In [18]:
# adapted from https://surprise.readthedocs.io/
def cb_precision_recall_at_k(max_k=10, threshold=3.5, test_df=None, user_ids=None):
    """Compute mean precision, recall and F-measure at every cutoff 1..max_k.

    Parameters
    ----------
    max_k : int, default 10
        Largest cutoff; one result row is produced for each k in 1..max_k.
    threshold : float, default 3.5
        An item is "relevant" when ``rating >= threshold`` and "recommended"
        when ``prediction >= threshold``.
    test_df : pandas.DataFrame, optional
        Frame with ``user_id``, ``rating`` and ``prediction`` columns.
        Defaults to the notebook global ``cb_test``.
    user_ids : iterable, optional
        Users to average over (duplicates are counted once). Defaults to the
        unique ``user_id`` values of the notebook global
        ``raw_interactions_df``. Users without rows in ``test_df`` contribute
        precision 0 and recall 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k`` with columns ``precision``, ``recall`` and
        ``f-measure``. Undefined ratios (no recommended items, no relevant
        items, precision + recall == 0, or no users) are reported as 0.
    """
    if test_df is None:
        test_df = cb_test
    if user_ids is None:
        user_ids = raw_interactions_df['user_id'].unique()

    # Rank every user's test rows once (best prediction first) and keep only
    # the two boolean flags the metrics need; none of this depends on k.
    ranked_by_user = {}
    for user_id, user_rows in test_df.groupby('user_id', sort=False):
        ranked = user_rows.sort_values(by='prediction', ascending=False)
        recommended = (ranked['prediction'] >= threshold).to_numpy()
        relevant = (ranked['rating'] >= threshold).to_numpy()
        ranked_by_user[user_id] = (recommended, relevant, int(relevant.sum()))

    no_rows = np.zeros(0, dtype=bool)
    per_user = [
        ranked_by_user.get(user_id, (no_rows, no_rows, 0))
        for user_id in dict.fromkeys(user_ids)
    ]
    n_users = len(per_user)

    cutoffs = np.arange(1, max_k + 1)
    rows = []
    for k in cutoffs:
        precision_sum = 0.0
        recall_sum = 0.0
        for recommended, relevant, n_rel in per_user:
            # Number of recommended items in top k
            n_rec_k = int(recommended[:k].sum())

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = int((recommended[:k] & relevant[:k]).sum())

            precision_sum += n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
            recall_sum += n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        precision = precision_sum / n_users if n_users else 0.0
        recall = recall_sum / n_users if n_users else 0.0

        # Guard the harmonic mean: with precision == recall == 0 the original
        # expression divided by zero.
        denominator = precision + recall
        f_measure = (2 * precision * recall) / denominator if denominator else 0.0
        rows.append((precision, recall, f_measure))

    # Build the frame in one go rather than enlarging it row by row.
    return pd.DataFrame(
        rows,
        columns=['precision', 'recall', 'f-measure'],
        index=pd.Index(cutoffs, name='k'),
    )

In [19]:
# 75/25 split of the interactions, stratified on user_id, with the seed fixed at 7.
cb_train, cb_test = train_test_split(
    raw_interactions_df,
    stratify=raw_interactions_df['user_id'],
    test_size=0.25,
    random_state=7,
)

In [21]:
# Item features: tf-idf over each recipe's name and description.
tfidf_matrix = create_tfidf_matrix(column_names=['name', 'description'])

In [24]:
# Turn off pandas' chained-assignment check (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [25]:
# Score the held-out interactions: writes the 'prediction' column of cb_test in place.
cb_make_predictions()

In [27]:
# Ranking metrics of the content-based model for k = 1..10 at relevance threshold 3.5.
cb_p_r = cb_precision_recall_at_k(max_k=10, threshold=3.5)