In [None]:
!pip install surprise
import pandas as pd
import numpy as np
from surprise import SVD, KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt

In [None]:
#3a)
# Preview the raw ratings file with pandas. The DataFrame gets its own name so
# that `data` only ever refers to the Surprise dataset used for cross-validation.
ratings_preview = pd.read_csv('ratings_small.csv')
ratings_preview.head()

In [None]:
# Describe the CSV layout for Surprise: comma-separated columns in the order
# user, item, rating, timestamp, with one header line to skip.
# NOTE(review): rating_scale is declared as 1-5 — confirm the file contains no
# ratings outside that range.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    skip_lines=1,
    rating_scale=(1, 5),
)
# Parse the ratings file into the Surprise Dataset passed to cross_validate below.
data = Dataset.load_from_file("ratings_small.csv", reader=reader)

In [None]:
#PROBABILISTIC MATRIX FACTORIZATION
# Surprise's SVD includes user/item bias terms by default. Per the Surprise
# documentation, the unbiased variant (biased=False) is the one equivalent to
# Probabilistic Matrix Factorization, so it is requested explicitly here.
algorithm_svd = SVD(biased=False)
# 5-fold cross-validation reporting RMSE and MAE for every fold.
pmf_results = cross_validate(algorithm_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("Probabilistic Matrix Factorization Results : ",pmf_results)
# Average the per-fold test errors into a single figure per metric.
avg_mae_pmf = pmf_results['test_mae'].mean()
avg_rsme_pmf = pmf_results['test_rmse'].mean()

print(f'Average MAE of the  Probabilistic Matrix Factorization under the 5-folds cross-validation : {avg_mae_pmf}')
print(f'Average RMSE of the  Probabilistic Matrix Factorization under the 5-folds cross-validation : {avg_rsme_pmf}')

In [None]:
# User-based collaborative filtering: KNNWithMeans with similarities computed
# between users, evaluated with 5-fold cross-validation.
sim_options = dict(user_based=True)
knn = KNNWithMeans(sim_options=sim_options)

ubcf_results = cross_validate(
    knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print("User Based Collabirative Filtering Results : ",ubcf_results)

# Collapse the per-fold test errors into one mean per metric.
avg_mae_user = np.mean(ubcf_results['test_mae'])
avg_rsme_user = np.mean(ubcf_results['test_rmse'])

print(f'Average MAE of the User based Collaborative Filtering under the 5-folds cross-validation : {avg_mae_user}')
print(f'Average RMSE of the User based Collaborative Filtering under the 5-folds cross-validation : {avg_rsme_user}')

In [None]:
# Item-based collaborative filtering: KNNWithMeans with similarities computed
# between items, evaluated with 5-fold cross-validation.
sim_options = dict(user_based=False)
knn = KNNWithMeans(sim_options=sim_options)

ibcf_results = cross_validate(
    knn,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print("Item Based Collabirative Filtering Results : ",ibcf_results)

# Collapse the per-fold test errors into one mean per metric.
avg_mae_item = np.mean(ibcf_results['test_mae'])
avg_rsme_item = np.mean(ibcf_results['test_rmse'])

print(f'Average MAE of the Item based Collaborative Filtering under the 5-folds cross-validation : {avg_mae_item}')
print(f'Average RMSE of the Item based Collaborative Filtering under the 5-folds cross-validation : {avg_rsme_item}')

In [None]:
# Cosine similarity, user-based KNNWithMeans, 5-fold cross-validation.
sim_options = dict(name='cosine', user_based=True)
user_cosine = KNNWithMeans(sim_options=sim_options)
user_cosine_scores = cross_validate(
    user_cosine,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
print('Cosine similarity impact on User Based Collaborative Filtering : ', user_cosine_scores)

# Mean test error across the folds, one value per metric.
avg_mae_cosine = np.mean(user_cosine_scores['test_mae'])
avg_rsme_cosine = np.mean(user_cosine_scores['test_rmse'])
print(f'Cosine Average MAE of the User based Collaborative Filtering : {avg_mae_cosine}')
print(f'Cosine Average RMSE of the User based Collaborative Filtering : {avg_rsme_cosine}')


In [None]:
# MSD (mean squared difference) similarity, user-based KNNWithMeans,
# 5-fold cross-validation.
sim_options = dict(name='msd', user_based=True)
user_msd = KNNWithMeans(sim_options=sim_options)
user_msd_scores = cross_validate(
    user_msd,
    data,
    measures=['rmse', 'mae'],
    cv=5,
    verbose=True,
)
print('MSD similarity impact on User Based Collaborative Filtering : ', user_msd_scores)

# Mean test error across the folds, one value per metric.
avg_mae_msd = np.mean(user_msd_scores['test_mae'])
avg_rsme_msd = np.mean(user_msd_scores['test_rmse'])
print(f'MSD Average MAE of the User based Collaborative Filtering : {avg_mae_msd}')
print(f'MSD Average RMSE of the User based Collaborative Filtering : {avg_rsme_msd}')

In [None]:
#Pearson similarity for User based Collaborative Filtering
# Use the plain 'pearson' measure so the computation matches what the labels
# below report; 'pearson_baseline' is a separate Surprise similarity that
# centres ratings on baseline estimates rather than on means.
sim_options = {'name':'pearson','user_based': True }
user_pearson = KNNWithMeans(sim_options=sim_options)
# 5-fold cross-validation reporting RMSE and MAE for every fold.
user_pearson_scores = cross_validate(user_pearson,data,measures=['rmse', 'mae'],cv=5,verbose=True)
print('Pearson similarity impact on User Based Collaborative Filtering : ', user_pearson_scores)
# Mean test error across the folds, one value per metric.
avg_mae_pb = user_pearson_scores['test_mae'].mean()
avg_rsme_pb = user_pearson_scores['test_rmse'].mean()
print(f'Pearson Average MAE of the User based Collaborative Filtering : {avg_mae_pb}')
print(f'Pearson Average RMSE of the User based Collaborative Filtering : {avg_rsme_pb}')

In [None]:
# Cosine similarity, item-based KNNWithMeans, 5-fold cross-validation.
sim_options = dict(name='cosine', user_based=False)
item_cosine = KNNWithMeans(sim_options=sim_options)
item_cosine_scores = cross_validate(
    item_cosine,
    data,
    measures=['rmse', 'mae'],
    cv=5,
    verbose=True,
)
print('Cosine similarity impact on Item Based Collaborative Filtering : ', item_cosine_scores)

# Mean test error across the folds, one value per metric.
avg_mae_cosine = np.mean(item_cosine_scores['test_mae'])
avg_rsme_cosine = np.mean(item_cosine_scores['test_rmse'])
print(f'Cosine Average MAE of the Item based Collaborative Filtering : {avg_mae_cosine}')
print(f'Cosine Average RMSE of the Item based Collaborative Filtering : {avg_rsme_cosine}')

In [None]:
# MSD (mean squared difference) similarity, item-based KNNWithMeans,
# 5-fold cross-validation.
sim_options = dict(name='msd', user_based=False)
item_msd = KNNWithMeans(sim_options=sim_options)
item_msd_scores = cross_validate(
    item_msd,
    data,
    measures=['rmse', 'mae'],
    cv=5,
    verbose=True,
)
print('MSD similarity impact on Item Based Collaborative Filtering : ', item_msd_scores)

# Mean test error across the folds, one value per metric.
avg_mae_msd = np.mean(item_msd_scores['test_mae'])
avg_rsme_msd = np.mean(item_msd_scores['test_rmse'])
print(f'MSD Average MAE of the Item based Collaborative Filtering : {avg_mae_msd}')
print(f'MSD Average RMSE of the Item based Collaborative Filtering : {avg_rsme_msd}')

In [None]:
#Pearson similarity for Item based Collaborative Filtering
# Use the plain 'pearson' measure so the computation matches what the labels
# below report; 'pearson_baseline' is a separate Surprise similarity that
# centres ratings on baseline estimates rather than on means.
sim_options = {'name':'pearson','user_based': False }
item_pearson = KNNWithMeans(sim_options=sim_options)
# 5-fold cross-validation reporting RMSE and MAE for every fold.
item_pearson_scores = cross_validate(item_pearson,data,measures=['rmse', 'mae'],cv=5,verbose=True)
print('Pearson similarity impact on Item Based Collaborative Filtering : ', item_pearson_scores)
# Mean test error across the folds, one value per metric.
avg_mae_pearson = item_pearson_scores['test_mae'].mean()
avg_rsme_pearson = item_pearson_scores['test_rmse'].mean()
# These summaries were previously labelled "MSD" although they report the
# Pearson run.
print(f'Pearson Average MAE of the Item based Collaborative Filtering : {avg_mae_pearson}')
print(f'Pearson Average RMSE of the Item based Collaborative Filtering : {avg_rsme_pearson}')

In [None]:
# Compare the three similarity measures with KNNBasic, in both user-based and
# item-based mode, using 5-fold cross-validation. The outcome is stored as
# results[<mode>][<measure>] -> cross_validate output.
measures = ['cosine', 'msd', 'pearson']
metrics = ['MAE','RMSE']
results = {'user_based': {}, 'item_based': {}}

for measure_name in measures:
    # For each measure the user-based run is evaluated first, then the item-based one.
    for mode, is_user_based in (('user_based', True), ('item_based', False)):
        algo = KNNBasic(sim_options={'name': measure_name, 'user_based': is_user_based})
        results[mode][measure_name] = cross_validate(algo, data, metrics, cv=5, verbose=True)

In [None]:
# Reduce the cross-validation output to mean test errors, laid out as
# data_for_plot[<metric>][<measure>] -> one mean per entry of `results`,
# appended in the iteration order of `results`.
data_for_plot = {
    metric: {sim_name: [] for sim_name in ('cosine', 'msd', 'pearson')}
    for metric in ('mae', 'rmse')
}
for per_measure in results.values():
    for sim_name, cv_output in per_measure.items():
        for metric in ('mae', 'rmse'):
            data_for_plot[metric][sim_name].append(cv_output[f'test_{metric}'].mean())

data_for_plot

In [None]:
# MAE bar charts: one figure for user-based CF and one for item-based CF, each
# with one bar per similarity measure. Index 0 of every data_for_plot list is
# read for the user-based figure and index 1 for the item-based figure.
bar_specs = (('cosine', '#FF0000'), ('msd', '#FFFF00'), ('pearson', '#0000FF'))
for mode_idx, mode_name in enumerate(('User', 'Item')):
    label = [f'{mode_name} Based Collaborative Filtering']
    value = np.arange(len(label))
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    plt.title(f"MAE for {mode_name} Based")
    # Bars sit side by side, each shifted by one bar width (0.25).
    for slot, (sim_name, bar_color) in enumerate(bar_specs):
        ax.bar(value + 0.25 * slot, data_for_plot['mae'][sim_name][mode_idx], color=bar_color, width=0.25)
    plt.xticks(value, label)
    plt.ylabel("MAE metric")
    plt.legend(["Cosine", "MSD", "Pearson"])
plt.show()



In [None]:
# RMSE bar charts: one figure for user-based CF and one for item-based CF, each
# with one bar per similarity measure. Index 0 of every data_for_plot list is
# read for the user-based figure and index 1 for the item-based figure.
bar_specs = (('cosine', '#FF0000'), ('msd', '#FFFF00'), ('pearson', '#0000FF'))
for mode_idx, mode_name in enumerate(('User', 'Item')):
    label = [f'{mode_name} Based Collaborative Filtering']
    value = np.arange(len(label))
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    plt.title(f"RMSE for {mode_name} Based")
    # Bars sit side by side, each shifted by one bar width (0.25).
    for slot, (sim_name, bar_color) in enumerate(bar_specs):
        ax.bar(value + 0.25 * slot, data_for_plot['rmse'][sim_name][mode_idx], color=bar_color, width=0.25)
    plt.xticks(value, label)
    plt.ylabel("RMSE metric")
    plt.legend(["Cosine", "MSD", "Pearson"])
plt.show()


In [None]:


# Sweep the neighbourhood size k (1..19) for user-based KNNWithMeans and record
# the 5-fold mean MAE and RMSE obtained for each k.
ks = np.arange(1, 20, 1)
ub_mae = []
ub_rmse = []
for k in ks:
    sim_options = {
    "user_based": True,
    }
    knn = KNNWithMeans(k, sim_options=sim_options)
    # Bind the fold scores to `scores`, not `results`: `results` already holds
    # the similarity-comparison dict that the plotting cells read, and
    # overwriting it would break those cells when they are re-run.
    scores = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    mean_mae = scores['test_mae'].mean()
    mean_rsme = scores['test_rmse'].mean()
    ub_mae.append(mean_mae)
    ub_rmse.append(mean_rsme)

In [None]:
# Plot the user-based error curves against the evaluated neighbourhood sizes.
# Ticks are placed at the actual k values; the previous hard-coded ticks
# labelled k=1..6 as '0'..'100', which misrepresented the x-axis.
plt.figure(figsize=(10,5))
plt.plot(ks, ub_rmse)
plt.title('NEIGHBOURS FOR RMSE ON USER-BASED COLLAB. FILTERING')
plt.xlabel('neighbours')
plt.ylabel('RMSE')
plt.xticks(ks)
plt.show()

plt.figure(figsize=(10,5))
plt.plot(ks, ub_mae)
plt.title('NEIGHBOURS FOR MAE ON USER-BASED COLLAB. FILTERING')
plt.xlabel('neighbours')
plt.ylabel('MAE')
plt.xticks(ks)
plt.show()

In [None]:
# Sweep the neighbourhood sizes in `ks` for item-based KNNWithMeans and record
# the 5-fold mean MAE and RMSE obtained for each k.
ib_mae, ib_rmse = [], []

for k in ks:
    sim_options = dict(user_based=False)
    knn = KNNWithMeans(k, sim_options=sim_options)
    scores = cross_validate(
        knn,
        data,
        measures=['RMSE', 'MAE'],
        cv=5,
        verbose=True,
    )

    # Keep only the across-fold means for this k.
    ib_mae.append(scores['test_mae'].mean())
    ib_rmse.append(scores['test_rmse'].mean())

In [None]:
# Plot the item-based error curves against the evaluated neighbourhood sizes:
# first RMSE, then MAE, each in its own figure.
for metric_name, curve in (('RMSE', ib_rmse), ('MAE', ib_mae)):
    plt.figure(figsize=(10, 5))
    plt.plot(ks, curve)
    plt.title(f'NEIGHBOURS FOR {metric_name} ON ITEM-BASED COLLAB. FILTERING')
    plt.xlabel('neighbours')
    plt.ylabel(metric_name)
    plt.show()

In [None]:
# Overlay the user-based and item-based RMSE curves. The curves are plotted
# against `ks` so the x-axis shows the real K values (plotting the lists alone
# would use indices 0..18, shifting every point by one), and a legend
# identifies which colour is which method.
plt.plot(ks, ub_rmse, color='green', label='User-Based CF')
plt.plot(ks, ib_rmse, color='blue', label='Item-Based CF')
plt.xlabel("K")
plt.ylabel("RMSE")
plt.title("RMSE for values of K - User-Based CF and Item-Based CF")
plt.legend()
plt.show()

In [None]:
# Report, for each method, the K with the lowest mean RMSE and that RMSE value.
for heading, rmse_curve in (
    ('\nUSER-BASED COLLAB. FILTERING - RMSE', ub_rmse),
    ('\nITEM-BASED COLLAB. FILTERING - RMSE', ib_rmse),
):
    best_idx = np.argmin(rmse_curve)
    print(heading)
    print('Best K: ', ks[best_idx])
    print('Best K RMSE: ', rmse_curve[best_idx])