# Full review based model and evaluation metrics for recommenders

## Preprocessing

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from functools import reduce

# NLP processing for lemmatizing
import spacy
import spacy_lookups_data

# Fast word2vect embedding
import fse
from fse import IndexedList
import gensim.downloader as api
from fse.models import uSIF

#Recommender
#loaders
from surprise import Dataset, Reader
# cross validation
from surprise.model_selection import cross_validate, KFold, LeaveOneOut
# Models for RS
from surprise import SVD, KNNBasic
# Hypertunning and metrics
from surprise import accuracy
#GridSearchCV

# Other metrics for recommender systems
import recmetrics

In [2]:
# Load dataframe
reviews = pd.read_csv('../data/reviews_filtered.csv')
reviews.drop_duplicates(inplace=True)
print(reviews.shape)
reviews.head()

(320528, 5)


Unnamed: 0,name,review_stars,cleaned_text,sent_rating,uid
0,Deagan's Kitchen & Bar,5,we walked into melt did you want to put your n...,3.505159,0
1,Deagan's Kitchen & Bar,4,brunch on saturday was excellent the bloody ma...,3.8,1
2,Deagan's Kitchen & Bar,4,great food great atmosphere great service some...,3.5875,2
3,Deagan's Kitchen & Bar,3,had a saturday evening dinner with friends goi...,3.436378,3
4,Deagan's Kitchen & Bar,2,i haven't been here for years i'm not from thi...,2.981408,4


In [3]:
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser']) 
nlp.max_length = 33000000

In [4]:
def get_lemmas(string):
    '''
    This function takes a sentence and returns a clean text
    '''
    doc = nlp(string)
    l_token = [token.lemma_ for token in doc if not token.is_punct | token.is_space | token.is_digit | 
               token.like_url | token.like_num | token.like_email & token.is_oov]
    return ' '.join(l_token)

In [5]:
# Lemmatize the words of each sentence
reviews['cleaned_text'] = reviews['cleaned_text'].apply(get_lemmas)
reviews.head()

Unnamed: 0,name,review_stars,cleaned_text,sent_rating,uid
0,Deagan's Kitchen & Bar,5,-PRON- walk into melt do -PRON- want to put -P...,3.505159,0
1,Deagan's Kitchen & Bar,4,brunch on saturday be excellent the bloody mar...,3.8,1
2,Deagan's Kitchen & Bar,4,great food great atmosphere great service some...,3.5875,2
3,Deagan's Kitchen & Bar,3,have a saturday evening dinner with friend go ...,3.436378,3
4,Deagan's Kitchen & Bar,2,i have not be here for year -PRON- be not from...,2.981408,4


In [6]:
# Tokenize (split by words)
sentences = reviews['cleaned_text'].apply(lambda x: x.split(' ')).to_list()

In [7]:
# Load word dictionary and apply it to create a model
glove = api.load("glove-wiki-gigaword-100")

model = uSIF(glove, workers=2, lang_freq="en")

In [8]:
# Train the fse model
s = IndexedList(sentences)

model.train(s)

(320528, 46440481)

In [9]:
matrix = []
for i in range(reviews.shape[0]):
    matrix.append(model.sv[i])
vector_df = pd.DataFrame(matrix)
del matrix

In [10]:
vector_df.to_csv('../data/vector_df.csv')

## Recommendations from models

### Custom functions in order to process and calculate performance metrics

In [103]:
# Function to get the index of all comments

def get_indices(df, threshold=3.5):
    '''
    Retrieve the index of the most rated item for each user(row)
    '''
    uids = []
    idx = []
    ratings = []
    df = df.sort_values('rating', ascending=False)
    for i in df.uid.unique():
        idx.append(df[(df.uid == i) & (df.rating > threshold)].index.to_list())
        ratings.append(df['rating'][(df.uid == i) & (df.rating > threshold)].to_list())
        uids.append(i)
        
    return pd.DataFrame({'user_id': uids, 'idx_actual': idx, 'ratings': ratings})

In [96]:
#Get recommendations based on rating:

def get_users_predictions(uid, df, r_column='rating', k=10):
    '''
    This functions takes an user(uid) from a dataframe(df) and returns a
    list the most k (int) rated items.
    '''
    recommended_items = df[['uid', r_column]][df['uid'] == uid]
    recommended_items = recommended_items.sort_values(r_column, ascending=False).head(k)  
    return recommended_items


In [91]:
def similar_comments(lst, k=50):
    '''
    This functions takes a list of indices and returns the most similar comments, 
    based on the model previously calculated before. The model computes the distances between vector 
    of words of each comments using cosine similarity.
    '''
    idx_comments = []
    for i in lst:
        try:
            similar = [idx for idx in model.sv.most_similar(i)]
            for idx in similar:
                if idx not in idx_comments and idx[0] not in lst:
                    idx_comments.append(idx)
        except KeyError: # The index may has been eliminated
            continue
    # Filter by the similarity
    idx_comments.sort(key=lambda x:x[1], reverse=True)
    # Get only the k indices of comments
    result = [i[0] for i in idx_comments][:k]
    return result

In [14]:
def get_restaurant_name(lst, df=reviews):
    '''
    Given a list of idx return the restaurant name
    '''
    result = []
    for i in lst:
        try:
            result.append(reviews.loc[i,'name'])
        except:
            result.append(np.nan)
    return result

In [117]:
def get_ratings(lst, df=reviews):
    '''
    Given a list of idx return the rating
    '''
    result = []
    for i in lst:
        try:
            result.append(reviews.loc[i,'review_stars'])
        except:
            result.append(np.nan)
    return result

In [144]:
df_ratings = reviews[['uid', 'name', 'review_stars']]
df_ratings.columns = ['uid', 'rid', 'rating'] #uid = user_id / rid = restaurant

In [145]:
# Get indices of the commments most rated for each user
rec_comments = get_indices(df_ratings)
rec_comments.head()

Unnamed: 0,user_id,idx_actual,ratings
0,0,"[0, 86324, 97228, 193467, 202727, 200461, 1179...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, ..."
1,3364,"[221796, 223264, 219084, 88614, 220118, 101854...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
2,3758,"[87036, 221870, 221910, 218872, 218873, 218874...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
3,679,"[221805, 220615, 219486, 88839, 81432, 208664,...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
4,1618,"[87032, 19734, 187599, 172053, 163425, 309358,...","[5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


In [147]:
# Drop this rating column because the rating that we want is from the list of similar comments 
rec_comments.drop('ratings', axis=1, inplace=True)

In [148]:
# Get indices of the predictions
rec_comments['idx_rec'] = rec_comments['idx_actual'].apply(similar_comments)
rec_comments.head()

Unnamed: 0,user_id,idx_actual,idx_rec
0,0,"[0, 86324, 97228, 193467, 202727, 200461, 1179...","[214946, 76842, 205866, 251498, 29551, 310233,..."
1,3364,"[221796, 223264, 219084, 88614, 220118, 101854...","[150470, 15316, 156026, 107962, 200274, 28839,..."
2,3758,"[87036, 221870, 221910, 218872, 218873, 218874...","[215471, 55182, 27552, 127685, 232276, 229124,..."
3,679,"[221805, 220615, 219486, 88839, 81432, 208664,...","[131776, 320005, 205805, 90956, 246851, 94252,..."
4,1618,"[87032, 19734, 187599, 172053, 163425, 309358,...","[237952, 113046, 280438, 145509, 261478, 31512..."


In [150]:
# Get the name of the restaurants
rec_comments['pred_com'] = rec_comments['idx_rec'].apply(get_ratings)
#rec_comments['pred_com'] = rec_comments['idx_rec'].apply(get_restaurant_name)
rec_comments.head()

Unnamed: 0,user_id,idx_actual,idx_rec,pred_com
0,0,"[0, 86324, 97228, 193467, 202727, 200461, 1179...","[214946, 76842, 205866, 251498, 29551, 310233,...","[4, 4, 5, 4, 4, 5, 4, 5, 3, 4, 4, 4, 3, 1, 4, ..."
1,3364,"[221796, 223264, 219084, 88614, 220118, 101854...","[150470, 15316, 156026, 107962, 200274, 28839,...","[4, 5, 3, 3, 5, 3, 3, 4, 4, 4, 4, 5, 2, 5, 5, ..."
2,3758,"[87036, 221870, 221910, 218872, 218873, 218874...","[215471, 55182, 27552, 127685, 232276, 229124,...","[5, 3, 5, 3, 5, 3, 4, 4, 5, 5, 4, 4, 3, 5, 4, ..."
3,679,"[221805, 220615, 219486, 88839, 81432, 208664,...","[131776, 320005, 205805, 90956, 246851, 94252,...","[4, 5, 4, 4, 5, 2, 5, 4, 3, 4, 3, 5, 4, 5, 3, ..."
4,1618,"[87032, 19734, 187599, 172053, 163425, 309358,...","[237952, 113046, 280438, 145509, 261478, 31512...","[4, 4, 4, 4, 3, 3, 4, 5, 4, 4, 1, 2, 3, 2, 5, ..."


In [160]:
ark(rec_comments.loc[10,'pred_com'], rec_comments.loc[10,'idx_rec'])

0.6

### List of recommendations for rating only

In [105]:
data_rating = Dataset.load_from_df(df_ratings, Reader(rating_scale=(1,5))) 

In [107]:
# Optimized Model
svd = SVD(n_factors = 50, n_epochs = 50, lr_all = 0.002, reg_all =  0.08)
trainset = data_rating.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ddc5498b08>

In [108]:
# Map iid to name restaurant
name_iid = {}
for i in trainset.ir:
    name_iid[trainset.to_raw_iid(i)] = i

In [109]:
predictions = {}
for i in range(df_ratings.shape[0]):
    try:
        predictions[i] = svd.predict(df_ratings.loc[i, 'uid'], df_ratings.loc[i,'rid']).est
    except:
        predictions[i] = np.nan

In [110]:
df_ratings['pred'] = predictions.values()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [111]:
df_rating_stars = df_ratings[['uid', 'rid', 'pred']]
df_rating_stars.columns = ['uid', 'rid', 'rating']
df_rating_stars

Unnamed: 0,uid,rid,rating
0,0,Deagan's Kitchen & Bar,3.928411
1,1,Deagan's Kitchen & Bar,4.162772
2,2,Deagan's Kitchen & Bar,4.445593
3,3,Deagan's Kitchen & Bar,3.164741
4,4,Deagan's Kitchen & Bar,3.572904
...,...,...,...
320574,3351,Swiss Chalet Rotisserie & Grill,2.882307
320575,337,Sultan of Samosas,3.192871
320576,729,Rickety Cricket Brewing Bistro,3.054259
320577,3840,Cibo Homemade Pasta & Grille,3.171281


In [112]:
# Index of the recommendations as well as the rated give by the user
rec_ratings = get_indices(df_rating_stars)
rec_ratings.head()

Unnamed: 0,user_id,idx_actual,ratings
0,2065,"[124449, 300345, 213539, 56049, 14388, 198913,...","[5.0, 4.694967206684223, 4.531108082390571, 4...."
1,3579,"[125393, 256627, 144243, 278157, 106777, 13315...","[5.0, 4.771475742765279, 4.460451304823717, 4...."
2,3873,"[250543, 313088, 158388, 161262, 127648, 15762...","[5.0, 5.0, 4.813952488910123, 4.74040077429284..."
3,3290,"[52569, 159738, 261182, 27915, 298668, 191280,...","[5.0, 4.7537147290152495, 4.659352952621169, 4..."
4,282,"[58085, 67763, 21144, 122561, 243624, 93394, 1...","[5.0, 4.9073136532044925, 4.866318636007135, 4..."


In [118]:
rec_ratings['rat_actual'] = rec_ratings['idx_actual'].apply(get_ratings)
rec_ratings

Unnamed: 0,user_id,idx_actual,ratings,rat_actual
0,2065,"[124449, 300345, 213539, 56049, 14388, 198913,...","[5.0, 4.694967206684223, 4.531108082390571, 4....","[2, 5, 5, 4, 5, 4, 4, 2, 3, 4, 4, 5, 5, 4, 3, ..."
1,3579,"[125393, 256627, 144243, 278157, 106777, 13315...","[5.0, 4.771475742765279, 4.460451304823717, 4....","[5, 2, 4, 1, 4, 5, 5, 3, 5, 4, 4, 5, 5, 4, 4, ..."
2,3873,"[250543, 313088, 158388, 161262, 127648, 15762...","[5.0, 5.0, 4.813952488910123, 4.74040077429284...","[2, 4, 3, 4, 3, 4, 3, 3, 4, 3, 4, 2, 1, 2, 2, ..."
3,3290,"[52569, 159738, 261182, 27915, 298668, 191280,...","[5.0, 4.7537147290152495, 4.659352952621169, 4...","[4, 4, 3, 3, 4, 5, 4, 4, 3, 5, 3, 2, 4, 3, 3, ..."
4,282,"[58085, 67763, 21144, 122561, 243624, 93394, 1...","[5.0, 4.9073136532044925, 4.866318636007135, 4...","[3, 4, 2, 2, 4, 4, 4, 4, 3, 3, 3, 4, 4, 5, 5, ..."
...,...,...,...,...
4001,2220,"[138370, 306741, 244732, 47757, 258232, 148054...","[4.3026396429647145, 4.301513496256174, 4.2691...","[3, 4, 2, 5, 5, 3, 4, 2, 3, 4, 4, 4, 2, 4, 4, ..."
4002,1274,"[229662, 50399, 53054, 95560, 150755, 223068, ...","[4.2884259132046, 4.2383906842238686, 4.211497...","[5, 4, 3, 4, 3, 3, 1, 3, 4, 3, 3, 1, 3, 4, 3, ..."
4003,593,"[89863, 245766, 110482, 171908, 73814, 149636,...","[4.2815255723159495, 4.259943725417783, 4.1472...","[4, 3, 3, 4, 4, 3, 4, 2, 4, 4, 3, 4, 5, 3, 3, ..."
4004,1083,"[98638, 96252, 106716, 26362, 300469, 1416, 15...","[4.243668162473115, 4.23864853505004, 4.201395...","[4, 4, 4, 5, 5, 4, 4, 4, 4, 5, 4, 5, 4, 5, 5, ..."


### Sentiment analysis based recommendation

In [161]:
df_sent = reviews[['uid', 'name', 'sent_rating']]
df_sent.columns = ['uid', 'rid', 'rating'] #uid = user_id / rid = restaurant
data_sent = Dataset.load_from_df(df_sent, Reader(rating_scale=(1,5)))

In [162]:
# Fine tunned model for sentiment analysis
kNN = KNNBasic(k=50, min_k=5, sim_options = {'name': 'msd', 'user_based': [False]})
trainset = data_rating.build_full_trainset()
kNN.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1ddd607bcc8>

In [163]:
predictions = {}
for i in range(df_sent.shape[0]):
    try:
        predictions[i] = kNN.predict(df_sent.loc[i, 'uid'], df_sent.loc[i,'rid']).est
    except:
        predictions[i] = np.nan

In [165]:
df_sent['pred'] = predictions.values()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [166]:
df_sent = df_sent[['uid', 'rid', 'pred']]
df_sent.columns = ['uid', 'rid', 'rating']
df_sent

Unnamed: 0,uid,rid,rating
0,0,Deagan's Kitchen & Bar,4.465961
1,1,Deagan's Kitchen & Bar,4.177944
2,2,Deagan's Kitchen & Bar,4.083518
3,3,Deagan's Kitchen & Bar,3.701011
4,4,Deagan's Kitchen & Bar,3.669555
...,...,...,...
320574,3351,Swiss Chalet Rotisserie & Grill,2.596003
320575,337,Sultan of Samosas,3.747885
320576,729,Rickety Cricket Brewing Bistro,3.361971
320577,3840,Cibo Homemade Pasta & Grille,3.159051


In [167]:
rec_sentiment = get_indices(df_sent)
#rec_sentiment['sent_pred'] = rec_ratings['idx_actual'].apply(get_restaurant_name)
rec_sentiment.head()

Unnamed: 0,user_id,idx_actual,ratings
0,109,"[283278, 257456, 102254, 162927, 180983, 15088...","[5.0, 4.967660935217688, 4.800075555195769, 4...."
1,2565,"[154710, 313457, 223195, 251668, 256879, 13099...","[5.0, 4.920427888181394, 4.905190937222596, 4...."
2,1891,"[284848, 252652, 282729, 302394, 151738, 16363...","[5.0, 4.868439926246033, 4.764330523958913, 4...."
3,1859,"[284849, 286755, 152200, 299498, 315961, 30125...","[5.0, 5.0, 4.864143425784075, 4.86214259344870..."
4,664,"[307615, 284718, 52670, 287148, 162687, 95016,...","[5.0, 4.861444876141891, 4.760370304334406, 4...."


In [168]:
rec_sentiment['rat_sent_actual'] = rec_sentiment['idx_actual'].apply(get_ratings)
rec_sentiment

Unnamed: 0,user_id,idx_actual,ratings,rat_sent_actual
0,109,"[283278, 257456, 102254, 162927, 180983, 15088...","[5.0, 4.967660935217688, 4.800075555195769, 4....","[4, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, 4, 4, 5, ..."
1,2565,"[154710, 313457, 223195, 251668, 256879, 13099...","[5.0, 4.920427888181394, 4.905190937222596, 4....","[3, 3, 3, 1, 2, 4, 4, 3, 2, 3, 4, 3, 4, 4, 3, ..."
2,1891,"[284848, 252652, 282729, 302394, 151738, 16363...","[5.0, 4.868439926246033, 4.764330523958913, 4....","[5, 4, 5, 1, 3, 5, 4, 4, 4, 5, 5, 5, 4, 4, 4, ..."
3,1859,"[284849, 286755, 152200, 299498, 315961, 30125...","[5.0, 5.0, 4.864143425784075, 4.86214259344870...","[2, 3, 4, 2, 5, 4, 4, 3, 2, 3, 2, 2, 3, 4, 4, ..."
4,664,"[307615, 284718, 52670, 287148, 162687, 95016,...","[5.0, 4.861444876141891, 4.760370304334406, 4....","[2, 2, 4, 4, 3, 4, 5, 5, 5, 3, 4, 4, 3, 4, 3, ..."
...,...,...,...,...
4001,3166,"[312807, 9213, 244605, 174177, 206464, 194905,...","[4.41767725056517, 4.392759260891076, 4.364845...","[4, 5, 5, 4, 5, 3, 4, 4, 5, 5, 4, 5, 5, 5, 4, ..."
4002,2954,"[210806, 102330, 238945, 118171, 309838, 18373...","[4.400801560301623, 4.381363589295059, 4.33183...","[2, 4, 3, 5, 5, 3, 3, 5, 4, 4, 5, 4, 4, 4, 3, ..."
4003,1886,"[37613, 170062, 164161, 195189, 16838, 65196, ...","[4.370883013503801, 4.331727024050968, 4.31485...","[4, 4, 5, 5, 4, 2, 4, 4, 5, 4, 2, 3, 5, 5, 3, ..."
4004,3989,"[192301, 248949, 238482, 70994, 126749, 158738...","[4.36544756902779, 4.34287365472376, 4.3411681...","[5, 3, 2, 5, 5, 5, 4, 5, 2, 4, 2, 4, 5, 4, 4, ..."


In [36]:
###  Create a matrix with the list of all predictions for all users

In [169]:
# Drop unnecessary columns
rec_comments.drop('idx_actual', axis=1, inplace=True)
rec_ratings.drop('idx_actual', axis=1, inplace=True)
rec_sentiment.drop('idx_actual', axis=1, inplace=True)

In [170]:
dfs = [rec_comments, rec_ratings, rec_sentiment]
recommendations = reduce(lambda left,right: pd.merge(left,right,on='user_id'), dfs)

In [172]:
recommendations.drop(['ratings_x', 'ratings_y'], axis=1, inplace=True)

In [173]:
recommendations.head()

Unnamed: 0,user_id,idx_rec,pred_com,rat_actual,rat_sent_actual
0,0,"[214946, 76842, 205866, 251498, 29551, 310233,...","[4, 4, 5, 4, 4, 5, 4, 5, 3, 4, 4, 4, 3, 1, 4, ...","[3, 4, 5, 4, 3, 5, 4, 4, 3, 3, 4, 4, 4, 5, 2, ...","[5, 1, 4, 3, 4, 3, 2, 4, 5, 5, 5, 4, 4, 5, 3, ..."
1,3364,"[150470, 15316, 156026, 107962, 200274, 28839,...","[4, 5, 3, 3, 5, 3, 3, 4, 4, 4, 4, 5, 2, 5, 5, ...","[4, 4, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 5, 2, 5, ...","[3, 5, 4, 4, 5, 2, 5, 4, 4, 4, 4, 2, 5, 3, 3, ..."
2,3758,"[215471, 55182, 27552, 127685, 232276, 229124,...","[5, 3, 5, 3, 5, 3, 4, 4, 5, 5, 4, 4, 3, 5, 4, ...","[5, 5, 3, 4, 4, 4, 5, 5, 4, 5, 5, 2, 4, 5, 5, ...","[5, 5, 4, 4, 4, 5, 5, 4, 5, 3, 5, 5, 3, 5, 5, ..."
3,679,"[131776, 320005, 205805, 90956, 246851, 94252,...","[4, 5, 4, 4, 5, 2, 5, 4, 3, 4, 3, 5, 4, 5, 3, ...","[4, 4, 5, 5, 5, 5, 5, 4, 5, 4, 4, 5, 4, 5, 5, ...","[4, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 4, 5, 4, 4, ..."
4,1618,"[237952, 113046, 280438, 145509, 261478, 31512...","[4, 4, 4, 4, 3, 3, 4, 5, 4, 4, 1, 2, 3, 2, 5, ...","[4, 4, 3, 4, 5, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, ...","[4, 4, 3, 3, 4, 4, 4, 4, 3, 4, 4, 3, 5, 4, 4, ..."


### Comparation of performance between recommenders

In [176]:
#### Average Precision at K

def apk(rating, pred, k=10):
    '''
    This functions calculates the precision at Kth metric for each user in the dataset.
    '''
    score = 0
    n_hits = 0
    
    for r,p in (zip(rating, pred)):
        if len(pred) > k:
            pred = pred[:k]
            rating = rating[:k]
            
        for i, p in enumerate(pred):
            # If the recommendation is relevant add it to the result
            if rating[i] >= 3.5:
                n_hits +=1
                score += n_hits / (i+1)
            
    
        if not rating:
            return np.nan

        return score / min(len(pred), k)
    

In [142]:
def ark(rating, pred, k=10):
    '''
    This functions calculates the recall at Kth metric for each user in the dataset.
    '''
    
    n_hits = 0
    
    for a,p in (zip(rating, pred)):
        if len(pred) > k:
            pred = pred[:k]
            rating = rating[:k]
    
        for i, p in enumerate(pred):
            if rating[i] > 3.5:
                n_hits +=1
    
        if not rating:
            return np.nan
        
        return n_hits / min(len(rating), k)
    
    

In [177]:
# Calculate the average precission and recall for each user

for i in range(recommendations.shape[0]):
    
    # Average Precision at K
    recommendations.loc[i, 'apk_comments'] = apk(recommendations.loc[i,'pred_com'], recommendations.loc[i,'idx_rec'])
    recommendations.loc[i, 'apk_rating'] = apk(recommendations.loc[i,'rat_actual'], recommendations.loc[i,'idx_rec'])
    recommendations.loc[i, 'apk_sentiment'] = apk(recommendations.loc[i,'rat_sent_actual'], recommendations.loc[i,'idx_rec'])
    
    # Average Recall at K
    recommendations.loc[i, 'ark_comments'] = ark(recommendations.loc[i,'pred_com'], recommendations.loc[i,'idx_rec'])
    recommendations.loc[i, 'ark_rating'] = ark(recommendations.loc[i,'rat_actual'], recommendations.loc[i,'idx_rec'])
    recommendations.loc[i, 'ark_sentiment'] = ark(recommendations.loc[i,'rat_sent_actual'], recommendations.loc[i,'idx_rec'])


In [178]:
## Calculate the global metric of mean average precision and recall for each dataset

mapk_comments = np.mean(recommendations.apk_comments)
mapk_rating = np.mean(recommendations.apk_rating)
mapk_sentiment = np.mean(recommendations.apk_sentiment)

mark_comments = np.mean(recommendations.ark_comments)
mark_rating = np.mean(recommendations.ark_rating)
mark_sentiment = np.mean(recommendations.ark_sentiment)

print(mapk_comments, mapk_rating, mapk_sentiment)
print(mark_comments, mark_rating, mark_sentiment)

0.4824763450062997 0.5344243258128684 0.5410717752735976
0.643135297054413 0.6739640539191204 0.6788067898152768


In [185]:
print('#### Mean Average Precission ####')
print(f'The Mean Average Precission at 10 (MAP@10) for rating dataset is: {round(mapk_rating,3)}')
print(f'The Mean Average Precission at 10 (MAP@10) for sentiment dataset is: {round(mapk_sentiment,3)}')
print(f'The Mean Average Precission at 10 (MAP@10) for comments dataset is: {round(mapk_comments,3)}')
print('\n')
print('#### Mean Average Recall ####')
print(f'The Mean Average Recall at 10 (MAR@10) for rating dataset is: {round(mark_rating,3)}')
print(f'The Mean Average Recall at 10 (MAR@10) for sentiment dataset is: {round(mark_sentiment,3)}')
print(f'The Mean Average Recall at 10 (MAR@10) for comments dataset is: {round(mark_comments,3)}')

#### Mean Average Precission ####
The Mean Average Precission at 10 (MAP@10) for rating dataset is: 0.534
The Mean Average Precission at 10 (MAP@10) for sentiment dataset is: 0.541
The Mean Average Precission at 10 (MAP@10) for comments dataset is: 0.482


#### Mean Average Recall ####
The Mean Average Recall at 10 (MAR@10) for rating dataset is: 0.674
The Mean Average Recall at 10 (MAR@10) for sentiment dataset is: 0.679
The Mean Average Recall at 10 (MAR@10) for comments dataset is: 0.643
