In [1]:
import pandas as pd
import numpy as np
import itertools
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import json
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
## read in the business profile file

line_count = len(open("business.json").readlines())
business_ids, categories, attr= [], [], []
with open("business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        categories += [blob["categories"]]
        attr += [blob["attributes"]]
business = pd.DataFrame(
{"business_id": business_ids, "category": categories,"attr": attr}
)

100%|██████████| 192609/192609 [00:02<00:00, 92573.43it/s]


In [3]:
### Separate them into train, validation and test data and return indexes of the data.
def make_selection(data_sample, train_size=0.8, val_size=0.2):
    '''
    Divide the dataset into training and validation dataset. 
    To avoid the cold start problem, we add the constraint on the training set
    that it should contain at least one rating from all users and all movies
    should be rated at least once.

    data_sample - the data that we will split
    train_size - the size of the training set in percentage
    val_size - the size of the validation set in percentage
    '''
    indexes_selected = []
    pd_data_grouped = data_sample.groupby('user_id')
    for name, group in pd_data_grouped:
        indexes_selected.append(np.random.choice(group.index,size=1, replace=False))

    pd_data_grouped_item = data_sample.groupby('business_id')
    for name, group in pd_data_grouped_item:
        indexes_selected.append(np.random.choice(group.index,size=1, replace=False))
    
    indexes_selected = np.unique(np.asarray(indexes_selected))

    num_train = int(data_sample.shape[0]*train_size)
    num_validation = int(data_sample.shape[0]*val_size)

    num_selected = indexes_selected.size

    data_left = data_sample.drop(indexes_selected)

    training_index = np.random.choice(data_left.index, size=(num_train-num_selected), replace=False)
    validation_index = np.random.choice(data_left.drop(training_index).index, size=num_validation, replace=False)
    
    training_index = np.append(training_index, indexes_selected)

    return training_index, validation_index

In [4]:
def random_sample_size(small_sample):
    '''
    Divide the dataset into training, validation and testing dataset
    Return the subseted datasets.
    
    small_sample - the input of sample dataset
    '''
    training_index, validation_index = make_selection(small_sample, train_size=0.8, val_size = 0.2)
    train_data = small_sample.loc[training_index,]

    val_data = small_sample.loc[validation_index,]

    #test_data = small_sample.loc[test_index, ['rating', 'movieId', 'userId']]
    return train_data, val_data

In [5]:
##convert each attribute feature to a single word as new_attr
def get_attr(x):
    '''
    Convert each attribute feature to a single word as new_attr 
    Return the new dataset
    
    x - the input of sample dataset
    '''
    l=[]
    if x !=None:
        for k in x.keys():
            if x[k]=="True":
                l.append(k.lower())
    return ', '.join(map(str,l))

In [None]:
def cosine_similarity_matrix(business):
        '''
    Calculate the cosine similarity matrix of item(business)
    
    business - business profile
    
        '''
    # instantiating and generating the count matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(business["bag_of_words"])

    # generating the cosine similarity matrix
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    # creating a Series for the business_id  so they are associated to an ordered numerical
    # list I will use later to match the indexes
    indices = pd.Series(business['business_id'])
    
    return cosine_sim,indices

In [None]:
def recommendations(title,n = 5):
    '''
    function that takes in bisiness_id as input and returns 
    the top n = 5 recommended stores
    The returned value is index of the recommendations and the 
    set of category it belongs to.
    
    title - bisiness_id need recommendations
    n - number of recommendatios need
    '''
    recommended = []
    return_index =[]
    category = set()
    idx = 1
    # gettin the index of the business that matches the bisiness_id
    if(title in list(indices)):
        idx = indices[indices == title].index[0]

    if(idx == 14967):
        idx = 14966
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 5 most similar bisiness
    top_5_indexes = list(score_series.iloc[1:6].index)
    
    # populating the list with the titles of the best 5 matching bisiness_id
    for i in top_5_indexes:

        return_index.append(i)
    
        temp = list(business['category'])[i].split(",")
        temp = list(map(lambda x:x.strip() , temp))
        category.update(set(temp))
        
    return return_index,category

In [None]:
def accuracy(user_id):
    '''
    Calculate the accuracy of the recommendations we have for a single user
    by comparing the category of our recommendations with his or her actual interested 
    category in the test data.
    The higher the accuracy the higher the overlap of the recommendations category and the 
    actual interested category.
    Return the accuracy number
    
    user_id -- the user id of s single user
    '''
    ttt = train_data[train_data['user_id']==user_id]
    ttt = ttt[ttt["rating"]>3]
  
    recommend_cate_set = set()
    def fun1(business_id):
        category = recommendations(business_id)[1]
        recommend_cate_set.update(category)
        return recommend_cate_set
    
    list(map(lambda x:fun1(x),list(ttt["business_id"])));
    
    true_cate = set()
    def fun2(i):
        temp = list(business[business["business_id"] == i]['category'])[0].split(",")
        temp = list(map(lambda x:x.strip() ,temp))
        if (len(temp) >0):
            true_cate.update(temp)
        return true_cate
    
    true_list = list(set(ppp["business_id"]) & set(business["business_id"]))
    list(map(lambda x:fun2(x),true_list))
    
    accuracy = 0
    if(len(true_cate) != 0):
        accuracy = len( recommend_cate_set & true_cate)/len(true_cate)
        
    
    
    return (accuracy)

In [None]:
def single_serendipity(title1,title2):
    '''
    We care about the serendipity of our recomendtiosn
    The amount of relevant information that is new to the user in a
    recommendation
    Return the serendipity score
    
    title1- The original business_id the user have rated 
    title2 -The recommendedbusiness_id
    '''
    t1_word = business.loc[business['business_id'] ==title1]['bag_of_words'].map(lambda x: x.split(','))
    t1_word = set(t1_word.tolist()[0])
    
    t2_word = business.loc[business['business_id']==title2]["bag_of_words"].map(lambda x: x.split(','))
    t2_word= set(t2_word.tolist()[0])
    
    recommend_len =len(t2_word)
    existed_len = len(t1_word.intersection(t2_word))
    serendipity_score = (recommend_len - existed_len)/recommend_len
    
    
    return serendipity_score

In [None]:
def recommend_serendipity(title1,recommend):
    '''
    The overall serendipity of our recommendations
    Return the serendipity score
    
    title1-The original business_id the user have rated 
    recommend - The list of recommendations we provide
    
    '''
    recommend_id = business.loc[business['business_id'].isin(recommend)]["business_id"].tolist()
    
    recommend_serendipity = {}
    for title2 in recommend_id:
        recommend_serendipity[title2] = single_serendipity(title1,title2)
        
    return  recommend_serendipity  
        
 

In [236]:
## yelp.csv is the small sample we subset from the review dataset, which include useres with 5 or more reviews
sample = pd.read_csv('yelp.csv')

## hold out user's final review (by date) as the test data and make the rest of them as train data
sample.date = pd.to_datetime(sample.date)
sample.dropna(axis=0,how='any');
test=sample.sort_values(by=['user_id','date']).groupby("user_id")['date'].max()
test=pd.DataFrame(test)
test= test.merge(sample,on=['user_id','date'],how='left')
train=sample[~sample['Unnamed: 0'].isin(np.array(test['Unnamed: 0']))]

In [388]:
train_user = train.user_id.values
test  = test[test["user_id"].isin(train_user)]

In [89]:
##filter busuness so we only care about business rated by user in sample
##and the business_id ehich have info in business profile
t1 = sample.business_id.values 
t2 = business.business_id.values
t3 = np.intersect1d(t1,t2)

business = business[business['business_id'].isin(t3)]
sample = sample[sample['business_id'].isin(t3)]
##split train and validation data of the train data we have 
train_data, val_data = random_sample_size(train)

##reindex business id
reindex = pd.Series(list(range(len(business))))
business=business.set_index([reindex])

##combine new_attr and category as the item profile
business["new_attr"]=business['attr'].map(lambda x: get_attr(x))
business["bag_of_words"]=business["category"]+', '+business["new_attr"]

##manipulate bag_of_words and category
##remove null data and convert string to lower case
business["category"] = business["category"].str.lower()
business = business[pd.notnull(business["category"])]
business['bag_of_words'] = business['bag_of_words'].str.lower()
business = business[pd.notnull(business['bag_of_words'])]


In [19]:
## calculate the cosine matrix
cosine_sim,indices = cosine_similarity_matrix(business)

In [261]:
####This is a test code

# business_id_1 = "gnKjwL_1w79qoiV3IC_xQQ"
# business_id_2 = "PZ-LZzSlhSe9utkQYU8pFg"
user_id = "5JVY32_bmTBfIGpCCsnAfw"

# ttt = train_data[train_data['user_id']==user_id]
# ttt = ttt[ttt["rating"]>3]
# ppp = val_data[val_data['user_id']==user_id]
# recommend_cate_set = set()
# #recommend_index = []
# for business_id in ttt["business_id"]:
#     recommend_return_index,category = recommendations(business_id)
#     recommend_cate_set.update(category)
#     #recommend_index.extend(recommend_return_index)
# true_cate = set()
# #true_index = []
# for i in ppp["business_id"]:
#     #print(i)
#     #true_index_single = business[business["business_id"] == i].index.values
#     #true_index.extend(true_index_single)
#     temp = list(business[business["business_id"] == i]['category'])[0].split(",")
#     temp = list(map(lambda x:x.strip() ,temp))
#    # print(temp)
#     true_cate.update(temp)
# len(recommend_cate_set & true_cate)/len(true_cate)

# accuracy(user_id)

In [241]:
##The test result is the accuracy from validation data
test_result = [accuracy(i) for i in val_data.user_id.unique()]

In [242]:
##The mean Accuracy of the validation result
sum(test_result)/len(test_result)

0.3899222409484094

In [239]:
##The test result is the accuracy from the test data (last review)
final_result = [accuracy(i) for i in test.user_id.unique()]

In [240]:
##The mean Accuracy of the test result
sum(final_result)/len(final_result)

0.2696801015680017

In [430]:
def single_user_bus_rating(user_id):
    temp = train[train.user_id == user_id]
    return (temp.business_id, temp.rating)

In [392]:
#predict the rating of user for their last visited business
def predict(user_id,idx):
    '''
    predict the rating of user for their last visited business
    
    user_id - single user id
    idx- the index of their last visited business in the similarity matrix of business profile
    '''
    single_user_bus,single_user_bus_ratings = single_user_bus_rating(user_id)

    single_user_bus_idx = [indices[indices == i ].index[0] for i in single_user_bus]
    single_user_bus_sim = [cosine_sim[idx][i] for i in single_user_bus_idx]
    sum = 0
    for i in range(len(single_user_bus_sim)):

        sum = sum + single_user_bus_sim[i]*list(single_user_bus_ratings)[i]
    return sum/len(single_user_bus)

In [None]:
# Calculate MSE
def getMSE(prediction, true_val):
    '''
    Calculate the MSE of true value and predicted value
    
    prediction - the pridicted rating of test_user
    true_val - the true rating of the test_user
    '''
    # MSE on the train data
    mse = np.nanmean(((true_val -  prediction) ** 2))
    return mse

In [440]:
def calcRMSE(test):
    '''
    Calculate RMSE for the test data

    '''
    
    mse_sum = 0
    for user_id in test.user_id:
        test_bus = test[test.user_id == user_id].business_id
        idx = indices[indices == list(test_bus)[0]].index[0]
        test_rating = test[test.user_id == user_id].rating
        if (idx ==14967):
            idx = 14966
        predition = predict(user_id,idx)
        mse = getMSE(predition,test_rating)
        mse_sum += mse
    RMSE = math.sqrt(mse_sum / len(test.user_id))
    return RMSE


In [441]:
calcRMSE(test)

3.0383425605220977

In [442]:
calcRMSE(val_data)

2.695398142213499