# Recommender System Using Collaborative Filtering and the scikit-surprise ("surprise") Library

In [None]:
import numpy as np
import pandas as pd

In [None]:
from surprise import Reader, Dataset

In [None]:
from collections import defaultdict
from operator import itemgetter
import heapq

### Read and Manipulate Data

In [None]:
# Load the raw interactions and discard the two columns the recommender never reads.
unused_columns = ["date", "review"]
df = pd.read_csv("RAW_interactions.csv").drop(columns=unused_columns)

In [None]:
# Lookup table mapping raw recipe ids to new ids. Its "id" column is renamed
# to "recipe_id" so it can be matched against the interactions' "recipe_id".
df_map = pd.read_csv("recipe_id_map.csv").rename(columns={"id": "recipe_id"})

### Data manipulation

In [None]:
# Attach the id-map columns to every interaction, matching on "recipe_id".
# join() defaults to a left join, so interactions without a map entry are kept
# (their map columns are NaN).
recipe_lookup = df_map.set_index("recipe_id")
df_mapped = df.join(recipe_lookup, on="recipe_id")
df_mapped = df_mapped.reset_index(drop=True)
#df_mapped.head()

In [None]:
# Keep only interactions with a positive rating (the 1-5 ratings), replace the
# raw recipe id with the remapped "new_id", and fix the column order to
# user, recipe, rating.
has_rating = df_mapped["rating"] > 0
df1_5 = (
    df_mapped.loc[has_rating]
    .drop(columns=["recipe_id"])
    .rename(columns={"new_id": "recipe_id"})
    .loc[:, ["user_id", "recipe_id", "rating"]]
)

#df1_5.user_id.value_counts() #total unique recipes are 226590, users 196098

In [None]:
# Count the ratings each recipe received and keep the recipes that were rated
# more than 3 times.
# rename_axis()/reset_index(name=...) name both columns explicitly, so the
# result does not depend on how value_counts() labels its output. Renaming the
# "index" column, as done previously, stops working on pandas >= 2.0, where
# reset_index() already yields "recipe_id" and "count" columns.
df_recipes = (
    df1_5["recipe_id"]
    .value_counts()
    .rename_axis("recipe_id")
    .reset_index(name="num_ratings")
)
df_pop_recipes = df_recipes[df_recipes.num_ratings > 3]  # tunable minimum-ratings cut-off


In [None]:
# Restrict the interactions to the recipes that passed the popularity filter.
# A right join keeps only the recipe ids present in df_pop_recipes; the helper
# count column is dropped again afterwards.
popular_lookup = df_pop_recipes.set_index("recipe_id")
df_few_recipes = df1_5.join(popular_lookup, on="recipe_id", how="right")
df_few_recipes = df_few_recipes.reset_index(drop=True).drop(columns=["num_ratings"])
#df_few_recipes.head()

In [None]:
# Dataframe of users: one row per user with the number of ratings they gave.
# Both columns are named explicitly so the result is the same regardless of how
# value_counts() labels its output (the labels changed in pandas 2.0, which
# breaks a rename of the "index" column).
df_users = (
    df1_5["user_id"]
    .value_counts()
    .rename_axis("user_id")
    .reset_index(name="num_ratings")
)
#print(len(df_users))
#df_users.head()

In [None]:
# Active users: those who have rated more than 5 recipes (tunable cut-off).
is_active = df_users["num_ratings"] > 5
df_active_users = df_users.loc[is_active]

In [None]:
# Right join against the active users so only their interactions remain
# (without this filter there are 196098 users). The helper count column is
# dropped again afterwards.
active_lookup = df_active_users.set_index("user_id")
small_df = df_few_recipes.join(active_lookup, on="user_id", how="right")
small_df = small_df.reset_index(drop=True).drop(columns=["num_ratings"])

In [None]:
# Drop every row that is missing a recipe id, a user id, or a rating.
small_df = small_df.dropna(subset=["recipe_id", "user_id", "rating"])

In [None]:
# Cast the recipe id and rating columns from float to int.
cols = ["recipe_id", "rating"]
small_df = small_df.astype({name: int for name in cols})

In [None]:
# To check number of unique users and recipes

#small_df.recipe_id.value_counts() 

# filtering by users >100 and recipe ratings > 20 gives 6855 recipes and 1345 users
# >3 recipe ratings & >3 user ratings gives 65615 recipes & 26836 users
# >4 recipes ,>5 users gives 18810 users, 50694 recipes 

### Similarity Matrix for recommending top 5 recipes for a given user

In [None]:
# Wrap the interactions for surprise. (1, 5) is Reader's default rating scale;
# it is spelled out because the ratings kept above are the 1-5 ones.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(small_df, reader)

In [None]:
# similarity matrix using KNNBasic
from time import time

from surprise import KNNBasic

t0 = time()

data_matrix = Dataset.load_from_df(small_df, reader)
# Use the full set of ratings to build the trainset: everything already rated
# is training data, and recommendations are drawn from what a user has not
# rated yet.
trainset2 = data_matrix.build_full_trainset()

# Item-based similarity using mean squared difference ("msd").
sim_options = {
    "name": "msd",
    "user_based": False,  # Compute similarities between items
}
# fit() computes the similarity matrix and stores it on the model as `sim`.
# Chaining .compute_similarities() after fit() ran the whole (expensive)
# computation a second time just to obtain the same matrix.
knn_model = KNNBasic(sim_options=sim_options).fit(trainset2)
similarity_matrix = knn_model.sim

t1 = time()

print('matrix computation takes %f' %(t1-t0))

In [None]:
def recommend(test_subject, k=5, n_recs=5, trainset=None, sim=None):
    """Recommend unrated items for one user from item-item similarities.

    The user's ``k`` highest-rated items act as seeds. Every item is scored by
    summing its similarity to each seed, weighted by the seed's rating divided
    by 5. The best-scoring items the user has not rated are returned.

    Args:
        test_subject: Raw user id, as known to the trainset.
        k: Number of the user's top-rated items to use as seeds.
        n_recs: Maximum number of recommendations to return.
        trainset: Object providing ``to_inner_uid``, ``ur`` and ``to_raw_iid``.
            Defaults to the module-level ``trainset2``.
        sim: Item-item similarity matrix indexed by inner item id. Defaults to
            the module-level ``similarity_matrix``.

    Returns:
        List of raw item ids, best first; at most ``n_recs`` entries. Items
        with equal scores keep ascending inner-id order.

    Raises:
        Whatever ``trainset.to_inner_uid`` raises for an unknown user.
    """
    if trainset is None:
        trainset = trainset2
    if sim is None:
        sim = similarity_matrix

    # convert raw id to inner id and fetch that user's (item, rating) pairs
    user_inner_id = trainset.to_inner_uid(test_subject)
    user_ratings = trainset.ur[user_inner_id]

    # the k items the user rated highest
    seeds = heapq.nlargest(k, user_ratings, key=itemgetter(1))

    # Accumulate one weighted similarity row per seed. Whole rows are added at
    # once instead of looping over every item in Python.
    scores = None
    for item_id, rating in seeds:
        try:
            row = np.asarray(sim[item_id], dtype=float)
        except IndexError:
            # Seed has no row in the matrix: skip it (best effort), but let
            # every other kind of error surface instead of swallowing it.
            continue
        weighted = row * (rating / 5.0)
        scores = weighted if scores is None else scores + weighted

    if scores is None:
        return []

    # items the user has already rated must not be recommended again
    rated = {item_id for item_id, _ in user_ratings}

    # A stable sort on the negated scores gives "highest score first" while
    # keeping ties in ascending inner-id order.
    ranking = np.argsort(-scores, kind="stable").tolist()

    recommendations = []
    for item_id in ranking:
        if len(recommendations) >= n_recs:
            break
        if item_id not in rated:
            recommendations.append(trainset.to_raw_iid(item_id))
    return recommendations

#### Tests to ensure quality 

In [None]:
# Build the recommendation list for every user in the filtered data, timing the run.
t2 = time()
rec_dict = {user: recommend(user) for user in small_df['user_id'].unique()}
t3 = time()
print('dict computation takes %f' %(t3-t2))

In [None]:
# Count the users who received fewer than 5 recommendations; 0 means every
# user has a full list.
num_null_recs = sum(1 for user_recs in rec_dict.values() if len(user_recs) < 5)
print(num_null_recs)

### Producing .csv file containing user and their recommendations

In [None]:
# Create a dataframe from the dict: one column per user, one row per rank.
# Building it with orient="index" and transposing pads users that have fewer
# recommendations with NaN. The default orientation raises ValueError as soon
# as the lists differ in length.
rec_df = pd.DataFrame.from_dict(rec_dict, orient="index").T

In [None]:
# Transpose so that each user becomes a row and each rank a column.
rec_df_flip = rec_df.transpose()

In [None]:
# Move the user ids out of the index into a "user_id" column and label the
# positional columns rec1..rec5.
column_names = {'index': 'user_id', 0: 'rec1', 1: 'rec2', 2: 'rec3', 3: 'rec4', 4: 'rec5'}
rec_df_flip = rec_df_flip.reset_index().rename(columns=column_names)
rec_df_flip.head()

In [None]:
# Write the recommendations to CSV. The dataframe's positional index is also
# written, as the first (unnamed) column; that is to_csv's default.
rec_df_flip.to_csv("user_recommendations.csv", index=True)

### Code for finding recipes similar to one provided

In [None]:
# Look up one recipe by raw id and collect the recipes whose similarity to it
# is exactly 1.
raw_iid = 173430
inner_recipe_id = trainset2.to_inner_iid(raw_iid)
print(inner_recipe_id)
sim_recipes = similarity_matrix[inner_recipe_id]

recs = defaultdict(float)  # raw recipe id -> similarity to the query recipe
same_recipes = []
for item, score in enumerate(sim_recipes):
    raw_item = trainset2.to_raw_iid(item)  # convert once, use twice
    recs[raw_item] += score
    if score == 1:
        same_recipes.append(raw_item)

# Take the slice once, after the loop, so three_sim is always defined. It was
# previously bound only inside the `if`, so the print below raised NameError
# whenever no score was exactly 1.
# NOTE(review): the query recipe itself may appear in this list if its
# self-similarity is 1 - confirm whether it should be filtered out.
three_sim = same_recipes[:3]
#print(same_recipes)
print(three_sim)