In [17]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import SVD, SVDpp, NMF
from surprise import SlopeOne, CoClustering
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [18]:
data = Dataset.load_builtin('ml-100k')

In [3]:
knnbasic_cv = cross_validate(KNNBasic(user_based= True), data, cv=5, n_jobs=5, verbose=False)
knnmeans_cv = cross_validate(KNNWithMeans(), data, cv=5, n_jobs=5, verbose=False)
knnz_cv = cross_validate(KNNWithZScore(), data, cv=5, n_jobs=5, verbose=False)
knnbaseline_cv = cross_validate(KNNWithZScore(), data, cv=5, n_jobs=5, verbose=False)
print (knnbasic_cv)

{'test_rmse': array([0.98208427, 0.97345831, 0.98158899, 0.97593729, 0.97522898]), 'test_mae': array([0.77563476, 0.77033525, 0.77439325, 0.76959169, 0.76960282]), 'fit_time': (0.7983694076538086, 0.816885232925415, 0.9210584163665771, 1.0037074089050293, 0.8278729915618896), 'test_time': (5.887953281402588, 6.174742221832275, 5.853468418121338, 5.698150157928467, 5.464995622634888)}


In [4]:
svd_cv = cross_validate(SVD(), data, cv=5)
svdpp_cv = cross_validate(SVDpp(), data, cv=5, n_jobs=5, verbose=False)
nmf_cv = cross_validate(NMF(), data, cv=5, n_jobs=5, verbose=False)

In [5]:
print('Algorithm\t RMSE\t\t MAE')
print()
print('KNN Basic', '\t', round(knnbasic_cv['test_rmse'].mean(), 4), '\t', round(knnbasic_cv['test_mae'].mean(), 4))
print('KNN Means', '\t', round(knnmeans_cv['test_rmse'].mean(), 4), '\t', round(knnmeans_cv['test_mae'].mean(), 4))
print('KNN ZScore', '\t', round(knnz_cv['test_rmse'].mean(), 4), '\t', round(knnz_cv['test_mae'].mean(), 4))
print('KNN Baseline', '\t', round(knnbaseline_cv['test_rmse'].mean(), 4), '\t', round(knnbaseline_cv['test_mae'].mean(), 4))
print()
print('SVD', '\t\t', round(svd_cv['test_rmse'].mean(), 4), '\t', round(svd_cv['test_mae'].mean(), 4))
print('SVDpp', '\t\t', round(svdpp_cv['test_rmse'].mean(), 4), '\t\t', round(svdpp_cv['test_mae'].mean(), 4))
print('NMF', '\t\t', round(nmf_cv['test_rmse'].mean(), 4), '\t', round(nmf_cv['test_mae'].mean(), 4))

Algorithm	 RMSE		 MAE

KNN Basic 	 0.9777 	 0.7719
KNN Means 	 0.9516 	 0.7496
KNN ZScore 	 0.9514 	 0.7464
KNN Baseline 	 0.9503 	 0.7456

SVD 		 0.9376 	 0.7393
SVDpp 		 0.9187 		 0.7201
NMF 		 0.963 	 0.7568


In [None]:
x_algo = ['KNN Basic', 'KNN Means', 'KNN ZScore', 'KNN Baseline', 'SVD', 'SVDpp', 'NMF']
#x_algo = ['KNN Basic', 'SVD']

all_algos_cv = [knnbasic_cv, knnmeans_cv, knnz_cv, knnbaseline_cv, svd_cv, svdpp_cv, nmf_cv]
#all_algos_cv = [knnbasic_cv, svd_cv]

rmse_cv = [round(res['test_rmse'].mean(), 4) for res in all_algos_cv]
mae_cv = [round(res['test_mae'].mean(), 4) for res in all_algos_cv]

plt.figure(figsize=(20,5))

plt.subplot(1, 2, 1)
plt.title('Comparison of Algorithms on RMSE', loc='center', fontsize=15)
plt.bar(x_algo, rmse_cv, label='RMSE', color='darkgreen')
plt.xlabel('Algorithm')
plt.ylabel('RMSE Value')
plt.ylim(.9,1)
plt.grid(ls='dashed')

plt.subplot(1, 2, 2)
plt.title('Comparison of Algorithms on MAE', loc='center', fontsize=15)
plt.bar(x_algo, mae_cv, label='MAE')
plt.xlabel('Algorithms', fontsize=15)
plt.ylabel('MAE Value', fontsize=15)
plt.ylim(.7,.8)
plt.grid(ls='dashed')

plt.show()

In [None]:
svd_param_grid = {'n_epochs': [20, 25], 
                  'lr_all': [0.007, 0.009, 0.01],
                  'reg_all': [0.4, 0.6]}

#svdpp_gs = GridSearchCV(SVDpp, svd_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
#svdpp_gs.fit(data)

svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=5)
svd_gs.fit(data)

In [None]:
#print('SVDpp - RMSE:', round(svdpp_gs.best_score['rmse'], 4), '; MAE:', round(svdpp_gs.best_score['mae'], 4))
print('SVD   - RMSE:', round(svd_gs.best_score['rmse'], 4), '; MAE:', round(svd_gs.best_score['mae'], 4))

In [None]:
print('RMSE =', svd_gs.best_params['rmse'])
print('MAE =', svd_gs.best_params['mae'])

In [None]:
param_grid = {'k': [10, 25, 30, 40],
              'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                              'min_support': [1, 5],
                              'user_based': [False, True]}
              }

knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
knnbasic_gs.fit(data)

In [None]:
knnbasic_gs.best_params

In [None]:
param_grid = {'k': [10, 20, 25, 30, 40]}

knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
knnbasic_gs.fit(data)

knnbasic_gs.best_params

In [None]:
x = [10, 20, 25, 30, 40]
y1 = knnbasic_gs.cv_results['mean_test_rmse']
y2 = knnbasic_gs.cv_results['mean_test_mae']

#y3 = knnmeans_gs.cv_results['mean_test_rmse']
#y4 = knnmeans_gs.cv_results['mean_test_mae']

#y5 = knnz_gs.cv_results['mean_test_rmse']
#y6 = knnz_gs.cv_results['mean_test_mae']
print(knnbasic_gs.cv_results['mean_test_rmse'])
print

In [None]:
plt.figure(figsize=(18,5))

plt.subplot(1, 2, 1)
plt.title('K Neighbors vs RMSE', loc='center', fontsize=15)
plt.plot(x, y1, label='KNNBasic', color='lightcoral', marker='o')
#plt.plot(x, y5, label='KNNWithZScore', color='indianred', marker='o')
#plt.plot(x, y3, label='KNNWithMeans', color='darkred', marker='o')
plt.xlabel('K Neighbor', fontsize=15)
plt.ylabel('RMSE Value', fontsize=15)
plt.legend()
plt.grid(ls='dotted')

plt.subplot(1, 2, 2)
plt.title('K Neighbors vs MAE', loc='center', fontsize=15)
plt.plot(x, y2, label='KNNBasic', color='lightcoral', marker='o')
#plt.plot(x, y4, label='KNNWithMeans', color='indianred', marker='o')
#plt.plot(x, y6, label='KNNWithZScore', color='darkred', marker='o')
plt.xlabel('K Neighbor', fontsize=15)
plt.ylabel('MAE Value', fontsize=15)
plt.legend()
plt.grid(ls='dotted')

plt.show()

In [None]:
print(knnbasic_gs.best_params['mae'])
print(svd_gs.best_params['mae'])
#svd = SVD(svd_gs.best_estimator['mae'])
#svd.fit(trainset)

In [6]:
from collections import defaultdict
from surprise import KNNBaseline

def get_top_n(predictions, n=10):

    
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))


    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [1]:
from collections import defaultdict

from surprise import Dataset
from surprise import SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [7]:
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [10]:
svd = SVD(n_epochs=25, lr_all=.007, reg_all=.4)
svd.fit(trainset)
svd_predictions = svd.test(testset)


({'196': 1.0,
  '186': 1.0,
  '22': 1.0,
  '244': 1.0,
  '166': 1.0,
  '298': 1.0,
  '115': 1.0,
  '253': 1.0,
  '305': 1.0,
  '6': 1.0,
  '62': 1.0,
  '286': 1.0,
  '200': 1.0,
  '210': 1.0,
  '224': 1.0,
  '303': 1.0,
  '122': 1.0,
  '194': 1.0,
  '291': 1.0,
  '234': 1.0,
  '119': 1.0,
  '167': 1.0,
  '299': 1.0,
  '308': 1.0,
  '95': 1.0,
  '38': 1.0,
  '102': 1.0,
  '63': 1.0,
  '160': 1.0,
  '50': 1.0,
  '301': 1.0,
  '225': 1.0,
  '290': 1.0,
  '97': 1.0,
  '157': 1.0,
  '181': 1,
  '278': 1.0,
  '276': 1.0,
  '7': 1.0,
  '10': 1.0,
  '284': 1.0,
  '201': 1.0,
  '287': 1.0,
  '246': 1.0,
  '242': 1.0,
  '249': 1.0,
  '99': 1.0,
  '178': 1.0,
  '251': 1.0,
  '81': 1.0,
  '260': 1.0,
  '25': 1.0,
  '59': 1.0,
  '72': 1.0,
  '87': 1.0,
  '42': 1.0,
  '292': 1.0,
  '20': 1.0,
  '13': 1.0,
  '138': 1.0,
  '60': 1.0,
  '57': 1.0,
  '223': 1.0,
  '189': 1.0,
  '243': 1.0,
  '92': 1.0,
  '241': 1.0,
  '254': 1.0,
  '293': 1.0,
  '127': 1.0,
  '222': 1.0,
  '267': 1.0,
  '11': 1.0,
  '8'

In [16]:
pres, rec = precision_recall_at_k(svd_predictions, k=10, threshold=3.5)
pres

{'196': 1.0,
 '186': 1.0,
 '22': 1.0,
 '244': 1.0,
 '166': 1.0,
 '298': 1.0,
 '115': 1.0,
 '253': 1.0,
 '305': 1.0,
 '6': 1.0,
 '62': 1.0,
 '286': 1.0,
 '200': 1.0,
 '210': 1.0,
 '224': 1.0,
 '303': 1.0,
 '122': 1.0,
 '194': 1.0,
 '291': 1.0,
 '234': 1.0,
 '119': 1.0,
 '167': 1.0,
 '299': 1.0,
 '308': 1.0,
 '95': 1.0,
 '38': 1.0,
 '102': 1.0,
 '63': 1.0,
 '160': 1.0,
 '50': 1.0,
 '301': 1.0,
 '225': 1.0,
 '290': 1.0,
 '97': 1.0,
 '157': 1.0,
 '181': 1,
 '278': 1.0,
 '276': 1.0,
 '7': 1.0,
 '10': 1.0,
 '284': 1.0,
 '201': 1.0,
 '287': 1.0,
 '246': 1.0,
 '242': 1.0,
 '249': 1.0,
 '99': 1.0,
 '178': 1.0,
 '251': 1.0,
 '81': 1.0,
 '260': 1.0,
 '25': 1.0,
 '59': 1.0,
 '72': 1.0,
 '87': 1.0,
 '42': 1.0,
 '292': 1.0,
 '20': 1.0,
 '13': 1.0,
 '138': 1.0,
 '60': 1.0,
 '57': 1.0,
 '223': 1.0,
 '189': 1.0,
 '243': 1.0,
 '92': 1.0,
 '241': 1.0,
 '254': 1.0,
 '293': 1.0,
 '127': 1.0,
 '222': 1.0,
 '267': 1.0,
 '11': 1.0,
 '8': 1.0,
 '162': 1.0,
 '279': 1.0,
 '145': 1.0,
 '28': 1.0,
 '135': 1.0,
 '3

In [None]:
knn = KNNWithMeans(k=25)
knn.fit(trainset)
knn_predictions = knn.test(testset)

In [None]:
from surprise.model_selection import KFold

def evaluation(model, k, threshold):
    algo = model
    kf = KFold(n_splits=5)

    precision_sum = 0 
    recall_sum = 0


    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=threshold)

    # Precision and recall can then be averaged over all users
        precision_sum += sum(prec for prec in precisions.values()) / len(precisions)
        recall_sum += sum(rec for rec in recalls.values()) / len(recalls)
    
    print('precision', precision_sum / 5)
    print('recall', recall_sum / 5)
    
    return precision_sum / 5, recall_sum / 5


In [None]:
from surprise.model_selection import KFold

def estimated_average(model):
    algo = model
    kf = KFold(n_splits=3)

    total_est_average = 0


    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        est_sum = 0
        for prediction in predictions:
            est_sum += prediction.est

        total_est_average += est_sum/len(predictions)
        
        
    return total_est_average/5


In [None]:
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import SVD, SVDpp, NMF

models = [KNNBasic(), KNNWithMeans(), KNNWithZScore(), KNNBaseline(), SVD(n_epochs=25, lr_all=.007, reg_all=.4), NMF()]
modelNames = ['Ground Truth', 'KNN Basic', 'KNN Means', 'KNN ZScore', 'KNN Baseline', 'SVD', 'NMF']
est_sums = []


for model in models:
    est_sum = estimated_average(model)
    est_sums.append(est_sum)
    


In [None]:
plt.figure(figsize=(10,5))
modelNames = ['Ground Truth', 'KNN Basic', 'KNN Means', 'KNN ZScore', 'KNN Baseline', 'SVD', 'NMF']
averages = []
averages.append(true_average_rating)
for est_sum in est_sums:
    averages.append(est_sum)

print(averages)


plt.title('Estimated Averages', loc='center', fontsize=15)
plt.bar(modelNames, averages, label='Recall')
plt.xlabel('Algorithm', fontsize=15)
plt.ylabel('Movie Rating', fontsize=15)
plt.ylim(3.4, 3.7)
plt.grid(ls='dashed')

plt.show()

In [None]:
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import SVD, SVDpp, NMF

models = [KNNBasic(k=25), KNNWithMeans(), KNNWithZScore(k=25), KNNBaseline(k=25), SVD(n_epochs=25, lr_all=.007, reg_all=.4), NMF()]
modelNames = ['KNN Basic', 'KNN Means', 'KNN ZScore', 'KNN Baseline', 'SVD', 'NMF']
precisions = []
recalls = []


for model in models:
    precision, recall = evaluation(model, 10, 3.5)
    precisions.append(precision)
    recalls.append(recall)


In [None]:
plt.figure(figsize=(20,5))

plt.subplot(1, 2, 1)
plt.title('Comparison of Algorithms on Precision', loc='center', fontsize=15)
plt.bar(modelNames, precisions, label='Precision', color='darkgreen')
plt.xlabel('Algorithm', fontsize=15)
plt.ylabel('Precision Value', fontsize=15)
plt.ylim(.7,.8)
plt.grid(ls='dashed')

plt.subplot(1, 2, 2)
plt.title('Comparison of Algorithms on Recall', loc='center', fontsize=15)
plt.bar(modelNames, recalls, label='Recall')
plt.xlabel('Algorithm', fontsize=15)
plt.ylabel('Recall Value', fontsize=15)
plt.ylim(.5,.6)
plt.grid(ls='dashed')

plt.show()

In [None]:
precisions = []
recalls = []
ks = [5, 10, 20, 25, 30, 40, 50, 100]

for k in ks:
    precision, recall = evaluation(KNNWithMeans(), k, 3.5)
    precisions.append(precision)
    recalls.append(recall)


In [None]:
plt.figure(figsize=(20,5))

plt.subplot(1, 2, 1)
plt.title('Precision at K for Different K Values', loc='center', fontsize=15)
plt.plot(ks, precisions, label='KNN Means', color='darkgreen')
plt.xlabel('K Value', fontsize=15)
plt.ylabel('Precision Value', fontsize=15)
plt.legend()
plt.grid(ls='dashed')

plt.subplot(1, 2, 2)
plt.title('Recall at K for Different K Values', loc='center', fontsize=15)
plt.plot(ks, recalls, label='KNN Means')
plt.xlabel('K Value', fontsize=15)
plt.ylabel('Recall Value', fontsize=15)
plt.legend()
plt.grid(ls='dashed')

plt.show()

In [None]:
precisions = []
recalls = []
thresholds = [3, 3.5, 4, 4.5, 5]

for threshold in thresholds:
    precision, recall = evaluation(KNNWithMeans(), 10, threshold)
    precisions.append(precision)
    recalls.append(recall)


In [None]:
plt.figure(figsize=(20,5))

plt.subplot(1, 2, 1)
plt.title('Comparison of Rating Thresholds on Precision', loc='center', fontsize=15)
plt.plot(thresholds, precisions, label='KNN Means', color='darkgreen')
plt.xlabel('Threshold', fontsize=15)
plt.ylabel('Precision Value', fontsize=15)
plt.legend()
plt.grid(ls='dashed')

plt.subplot(1, 2, 2)
plt.title('Comparison of Rating Thresholds on Recall', loc='center', fontsize=15)
plt.plot(thresholds, recalls, label='KNN Means')
plt.xlabel('Threshold', fontsize=15)
plt.ylabel('Threshold', fontsize=15)
plt.legend()
plt.grid(ls='dashed')

plt.show()

## Extras