In [1]:
import os
os.chdir('../code/')
from dataclasses import *

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
reviewsdf = pd.read_pickle('../data/reviews.pk')
cooktimes = pd.read_pickle('../data/cooktimesclean.pk')
keywords =  pd.read_pickle('../data/keywords.pk')

In [130]:
nutrition = pd.read_pickle('../data/nutrition.pk')

In [117]:
common_recs =set(keywords.index).intersection(set(reviewsdf['RecipeId'].values))
keywords = keywords[keywords.index.isin(common_recs)]
reviewsdf = reviewsdf[reviewsdf['RecipeId'].isin(common_recs)]

In [71]:
keywordlist_full = []
for l in keywords.Keywords:
    keywordlist_full+=l
keywordlist_full = list(set(keywordlist_full))

In [79]:
def get_user_dataframe(reviewsdf,featuredf,usercol,itemcol,featurecols, user):
    userdf = reviewsdf.loc[reviewsdf[usercol]==user].copy()
    for feature in featurecols:
        userdf[feature]=[featuredf[feature][item]
                         for item in userdf[itemcol].values]
    return userdf


def gen_keyword_df(user):
    userdf = get_user_dataframe(reviewsdf,
                                keywords,
                                'UserId',
                                'RecipeId',
                                ['Keywords'],
                                user)
    keywordlistoflists = userdf['Keywords'].values
    keywordlist_user = []
    for l in keywordlistoflists:
        keywordlist_user+=l
    keywordlist_user_nr = list(set(keywordlist_user))
    mat = []
    for kwl in keywordlistoflists:
        counts = {w:0 for w in keywordlist_user_nr}
        for w in kwl:
            counts[w]+=1
        mat.append([counts[w] for w in keywordlist_user_nr])
    cols = np.transpose(mat)
    newdf = pd.DataFrame({keywordlist_user_nr[i]:cols[i] for i in range(len(cols))})
    newdf['RecipeId']=userdf.RecipeId.values
    newdf=newdf.set_index('RecipeId')
    newdf['Rating']=userdf.Rating.values
    return newdf   

def get_rating_vecs(user):
    userdf = gen_keyword_df(user)
    counts = {r:{w:0 for w in keywordlist_full} for r in [-1,0,1]}
    keywords_in_df = [w for w in keywordlist_full if w in userdf.columns]
    for r in [-1,0,1]:
        dfr = userdf.loc[userdf.Rating==r]
        for word in keywords_in_df:
            counts[r][word] = sum(dfr[word].values)
    return [np.array([counts[r][word] for word in keywordlist_full])
                     for r in [-1,0,1]]
    



def get_dictionaries(ratingsdf:pd.core.frame.DataFrame,usercol:str,itemcol:str,ratingcol:str):
    users = list(set(ratingsdf[usercol].values))
    items = list(set(ratingsdf[itemcol].values))
    ratings_by_user = {user_id:{} for user_id in users}
    users_by_item = {item_id:[] for item_id in items}
    for review_index in ratingsdf.index:
        user_id = ratingsdf[usercol][review_index]
        item_id = ratingsdf[itemcol][review_index]
        rating = ratingsdf[ratingcol][review_index]
        ratings_by_user[user_id][item_id] = rating
        users_by_item[item_id].append(user_id)
    return ratings_by_user,users_by_item

In [118]:
user_ratings_all,items_to_users = get_dictionaries(reviewsdf,
                                                   'UserId',
                                                   'RecipeId',
                                                   'Rating')

In [119]:
userlist = list(user_ratings_all.keys())
users_50revs = [user for user in userlist if len(user_ratings_all[user])>49]
users_100revs = [user for user in userlist if len(user_ratings_all[user])>99]
users_100revs_x = [user for user in users_100revs
                   if np.mean(list(user_ratings_all[user].values()))<0.5]

In [126]:
reviewsdf.loc[reviewsdf.UserId==users_50revs[1]]['Rating'].value_counts()

Rating
 1    97
-1     1
Name: count, dtype: int64

In [98]:
len(users_100revs_x)

828

In [84]:
users_100revs_x[0]

265

In [127]:
len(keywords)

251934

In [111]:
user_0 = users_100revs[1]
user_0_kwdf = gen_keyword_df(user_0)
user0_vecpos = sum(user_0_kwdf.loc[user_0_kwdf.Rating==1].values)
user0_vecneg = sum(user_0_kwdf.loc[user_0_kwdf.Rating==-1].values)
lneg = np.dot(user0_vecneg[:-1],user0_vecneg[:-1])
lpos = np.dot(user0_vecpos[:-1],user0_vecpos[:-1])
np.dot(user0_vecneg[:-1],user0_vecpos[:-1])/np.sqrt(lneg*lpos)

0.8808481865659471

In [120]:
keywordcounts = {word:0 for word in keywordlist_full}
keywordcounts_var = {word:0 for word in keywordlist_full}

for recipe in keywords.index:
    kwl = keywords['Keywords'][recipe]
    for w in kwl:
        keywordcounts[w]+=1
        keywordcounts_var[w]+=len(items_to_users[recipe])

In [86]:
users_50revs.sort(key=lambda x: np.mean(list(user_ratings_all[x].values())))

In [122]:
keyword_probs = {word:keywordcounts[word]/len(keywords)
                 for word in keywordcounts}

In [125]:
wts = []
for w in keywordlist_full:
    if w in keyword_probs and keyword_probs[w]!=0:
        wts.append(1/keyword_probs[w])
    else:
        wts.append(0)

In [39]:
user0_vecneg_norm

array([-0.        ,  0.00169779,  0.01018676,  0.01018676,  0.04414261,
        0.06791171, -0.        ,  0.00339559,  0.05602716,  0.00509338,
        0.00848896, -0.        ,  0.00339559, -0.        , -0.        ,
        0.00848896, -0.        ,  0.00339559,  0.04074703,  0.03225806,
        0.01188455,  0.00509338, -0.        ,  0.01697793,  0.00169779,
        0.02546689,  0.00339559,  0.09847199,  0.00679117,  0.00169779,
        0.00679117, -0.        , -0.        ,  0.08319185,  0.01528014,
       -0.        ,  0.00339559, -0.        ,  0.00169779,  0.00509338,
        0.02037351,  0.00169779,  0.01697793, -0.        ,  0.03056027,
        0.00169779,  0.00848896,  0.00339559, -0.        ,  0.10526316,
        0.03056027,  0.02037351, -0.        ,  0.00339559,  0.00339559,
        0.00169779, -0.        , -0.        , -0.        , -0.        ,
        0.01867572, -0.        ,  0.00509338,  0.02716469, -0.        ,
        0.01528014, -0.        ,  0.10016978,  0.00339559, -0.  

In [43]:
cossim

-0.25489966197403113

In [47]:
len(user0_vecneg[:-1])

243

In [46]:
len(user0_vecpos[:-1])

243

0.891480143187728

In [49]:
lneg

264185

In [50]:
lpos

2870112

In [72]:
len(keywordlist_full)

303

In [73]:
keywordlist_full

['korean',
 'polynesian',
 'bass',
 'eggs breakfast',
 'cookie & brownie',
 'deep fried',
 'breads',
 'brunch',
 'pork loin',
 'duck',
 'savory',
 'turkish',
 'toddler friendly',
 'creole',
 'homeopathy/remedies',
 'somalian',
 'polish',
 'cuban',
 'scottish',
 'southwest asia (middle east)',
 'moose',
 'dutch',
 'nuts',
 'onions',
 'grains',
 'beverages',
 'bath/beauty',
 'southwestern u.s.',
 'mixer',
 'small appliance',
 'roast beef',
 'cambodian',
 'spicy',
 'from scratch',
 'cajun',
 'beef sauces',
 'pumpkin',
 'savory pies',
 'chard',
 'medium grain rice',
 'potato',
 'caribbean',
 'breakfast potatoes',
 'ethiopian',
 'filipino',
 'colombian',
 'indonesian',
 'chicken stews',
 'hawaiian',
 'coconut',
 'curries',
 'tuna',
 'gelatin',
 'whole duck',
 'roast',
 'salad dressings',
 'elk',
 'black beans',
 'mahi mahi',
 'swedish',
 'pressure cooker',
 'sweet',
 'pineapple',
 'chinese',
 'marinara sauce',
 'frozen desserts',
 'dairy free foods',
 'sauces',
 'chocolate chip cookies',
 '

In [131]:
len(nutrition)

173683

In [136]:
nutrition.loc[nutrition.Calories>2000]

Unnamed: 0_level_0,Name,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
141,Carrie's Pizza Rolls,3521.8,108.1,39.0,183.9,2905.4,548.4,96.2,29.7,134.2,1.0
180,Chocolate Pound Cake,2905.2,168.7,101.2,1031.4,1092.8,321.9,9.2,178.0,38.2,1.0
273,Winter Pear Butter,2059.7,2.2,0.1,0.0,23.5,484.4,57.0,379.9,7.0,1.0
396,Linda'a Guadalajara - Style Burritos,5890.2,215.8,86.7,541.8,12001.9,698.1,74.0,40.7,273.4,1.0
401,Banana Marshmallow Ice Cream (Still-Freeze),4680.1,319.1,198.0,1175.4,608.6,458.5,11.9,326.8,33.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...
513593,Kfc Copycat Chicken,2006.5,175.5,24.8,156.4,1877.7,83.9,8.3,2.2,30.8,4.0
514104,Popeye's Popcorn Shrimp and Chicken,12571.5,1332.7,176.0,994.8,8752.9,80.0,8.1,1.4,110.8,1.0
522194,Haleem,2484.8,74.0,35.7,250.3,2783.3,384.9,64.7,6.3,97.0,6.0
522517,Lettuce Wrap Fish Tacos,3041.6,284.2,29.2,177.5,2460.3,84.1,18.0,9.2,54.9,2.0
