In [1]:
# Mount Google Drive at /content/drive; the data files read and written below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Installing Surprise

In [19]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Imports

In [1]:
import json
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import prediction_algorithms
from surprise import accuracy
from collections import defaultdict

## Data Selection and Preprocessing

In [2]:
# Read the Movies and TV JSON-lines file and drop the unnecessary columns.
# Only the first 500000 entries are loaded because the entire file is too big.
df = pd.read_json(
    '/content/drive/MyDrive/CSE 272/HW 2_Recommendation Systems/Movies_and_TV.json',
    lines=True,
    nrows=500000,
).drop(
    columns=[
        'verified', 'reviewTime', 'style', 'reviewerName', 'reviewText',
        'summary', 'unixReviewTime', 'vote', 'image',
    ]
)

In [3]:
# Sanity check: confirm that 500000 entries were loaded and that the other columns were dropped
print(df.shape)
df.head()

(500000, 3)


Unnamed: 0,overall,reviewerID,asin
0,5,A3478QRKQDOPQ2,1527665
1,5,A2VHSG6TZHU1OB,1527665
2,5,A23EJWOW1TLENE,1527665
3,5,A1KM9FNEJ8Q171,1527665
4,4,A38LY2SSHVHRYB,1527665


In [4]:
# Save the trimmed DataFrame as a CSV file (index=False keeps the row index out of the file)
df.to_csv('/content/drive/MyDrive/CSE 272/HW 2_Recommendation Systems/data.csv', index = False)

In [5]:
# Read the CSV back and arrange the columns as (user, item, rating), the order
# in which they are passed to Dataset.load_from_df.
ratings_df = pd.read_csv('/content/drive/MyDrive/CSE 272/HW 2_Recommendation Systems/data.csv')
ratings_df = ratings_df[['reviewerID', 'asin', 'overall']]
# (1, 5) is Reader's default rating scale; it is spelled out so the assumption is visible.
reader = Reader(rating_scale=(1, 5))
# `data` is only ever bound to the Surprise Dataset (the DataFrame has its own name),
# so later cells always see the same kind of object.
data = Dataset.load_from_df(ratings_df, reader)

## Train and Test Data

In [31]:
# Split the data into train and test sets: 20% is held out for testing,
# and random_state fixes the split so it is the same on every run.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 10)

## Baseline

In [32]:
# Baseline algorithm: fit on the training set, then predict the test set.
# MAE and RMSE are computed from pred_base in the next two cells.
baseline = prediction_algorithms.baseline_only.BaselineOnly(bsl_options={}, verbose=True)
baseline.fit(train_data)
pred_base = baseline.test(test_data)

Estimating biases using als...


In [37]:
# MAE of the Baseline predictions (verbose=True also prints the value)
mae_base = accuracy.mae(pred_base, verbose = True)

MAE:  0.7458


In [38]:
# RMSE of the Baseline predictions; the variable is spelled "rsme" and the summary cell reads it under that name
rsme_base = accuracy.rmse(pred_base, verbose = True)

RMSE: 1.0365


## Matrix Factorization

In [None]:
# Matrix Factorization (SVD): fit on the training set, then predict the test set.
# MAE and RMSE are computed from pred_mf in the next two cells.
# random_state pins SVD's random initialisation so that re-running the notebook
# reproduces the same metrics; 10 is the same seed used for the train/test split.
matrix_factor = prediction_algorithms.matrix_factorization.SVD(n_epochs=50, random_state=10, verbose=True)
matrix_factor.fit(train_data)
pred_mf = matrix_factor.test(test_data)

In [40]:
# MAE of the Matrix Factorization predictions (verbose=True also prints the value)
mae_mf = accuracy.mae(pred_mf, verbose = True)

MAE:  0.7419


In [41]:
# RMSE of the Matrix Factorization predictions (verbose=True also prints the value)
rmse_mf = accuracy.rmse(pred_mf, verbose = True)

RMSE: 1.0415


## Item-based CF

In [42]:
# Item-based CF: KNNWithMeans with k=5 and pearson_baseline similarity.
# user_based=False makes the similarities item-item rather than user-user.
# Fit on the training set, then predict the test set.
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
item_based_CF = prediction_algorithms.knns.KNNWithMeans(k=5, sim_options=item_sim_options)
item_based_CF.fit(train_data)
pred_cf = item_based_CF.test(test_data)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [43]:
# MAE of the Item-based CF predictions (verbose=True also prints the value)
mae_cf = accuracy.mae(pred_cf, verbose = True)

MAE:  0.7906


In [44]:
# RMSE of the Item-based CF predictions; the variable is spelled "rsme" and the summary cell reads it under that name
rsme_cf = accuracy.rmse(pred_cf, verbose = True)

RMSE: 1.0963


## Slope One

In [45]:
# Slope One algorithm: fit on the training set, then predict the test set.
# MAE and RMSE are computed from pred_so in the next two cells.
slope_one = prediction_algorithms.slope_one.SlopeOne()
slope_one.fit(train_data)
pred_so = slope_one.test(test_data)

In [46]:
# MAE of the Slope One predictions (verbose=True also prints the value)
mae_so = accuracy.mae(pred_so, verbose = True)

MAE:  0.7969


In [47]:
# RMSE of the Slope One predictions; the variable is spelled "rsme" and the summary cell reads it under that name
rsme_so = accuracy.rmse(pred_so, verbose = True)

RMSE: 1.1287


## Ranking

In [48]:
# ranking (getting the top 10 items for each user)
def get_top_n(predictions, n=10):
    """Return the n highest-estimated items for every user.

    Args:
        predictions: iterable of 5-tuples (uid, iid, true_r, est, details),
            such as the list produced by an algorithm's test() call.
        n: maximum number of items kept per user.

    Returns:
        A defaultdict(list) mapping each uid to a list of (iid, est) pairs,
        ordered by est from highest to lowest and at most n entries long.
    """
    per_user = defaultdict(list)
    # Group every (item, estimate) pair under the user it was predicted for.
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate and keep only the n best.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]
    return per_user
# Reference: https://surprise.readthedocs.io/en/stable/FAQ.html

In [None]:
# Top 10 for Baseline: print each user id followed by that user's recommended item ids
top_n_base = get_top_n(pred_base, n=10)
for uid, user_ratings in top_n_base.items():
    recommended_items = [item_id for item_id, _estimate in user_ratings]
    print(uid, recommended_items)

In [None]:
# Top 10 for Matrix Factorization: print each user id followed by that user's recommended item ids
top_n_mf = get_top_n(pred_mf, n=10)
for uid, user_ratings in top_n_mf.items():
    recommended_items = [item_id for item_id, _estimate in user_ratings]
    print(uid, recommended_items)

In [None]:
# Top 10 for Item-based CF: print each user id followed by that user's recommended item ids
top_n_cf = get_top_n(pred_cf, n=10)
for uid, user_ratings in top_n_cf.items():
    recommended_items = [item_id for item_id, _estimate in user_ratings]
    print(uid, recommended_items)

In [None]:
# Top 10 for Slope One: print each user id followed by that user's recommended item ids
top_n_so = get_top_n(pred_so, n=10)
for uid, user_ratings in top_n_so.items():
    recommended_items = [item_id for item_id, _estimate in user_ratings]
    print(uid, recommended_items)

## Precision, Recall, and F-measure

In [25]:
# precision, recall, f-measure
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    An item is "relevant" when its true rating is >= threshold and
    "recommended" when its estimated rating is >= threshold.

    Args:
        predictions: iterable of 5-tuples (uid, iid, true_r, est, details).
        k: number of top-estimated items per user taken as the recommendation list.
        threshold: rating cut-off used for both relevance and recommendation.

    Returns:
        (precisions, recalls): two dicts keyed by uid. A value is 0 when the
        corresponding ratio is undefined (nothing recommended in the top k,
        or no relevant item for that user).
    """
    ratings_by_user = defaultdict(list)
    # Collect (estimated, true) rating pairs per user.
    for user_id, _item_id, actual, estimated, _details in predictions:
        ratings_by_user[user_id].append((estimated, actual))

    precisions = {}
    recalls = {}
    for user_id, pairs in ratings_by_user.items():
        # Highest estimates first; the first k entries form the recommendation list.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among ALL of the user's rated items.
        relevant_total = sum(1 for _, actual in ranked if actual >= threshold)
        # Items in the top k that are actually recommended.
        recommended_in_k = sum(1 for estimated, _ in top_k if estimated >= threshold)
        # Items in the top k that are both relevant and recommended.
        hits_in_k = sum(
            1
            for estimated, actual in top_k
            if actual >= threshold and estimated >= threshold
        )

        # Precision@k: share of recommended items that are relevant (0 if undefined).
        if recommended_in_k != 0:
            precisions[user_id] = hits_in_k / recommended_in_k
        else:
            precisions[user_id] = 0

        # Recall@k: share of relevant items that are recommended (0 if undefined).
        if relevant_total != 0:
            recalls[user_id] = hits_in_k / relevant_total
        else:
            recalls[user_id] = 0
    return precisions, recalls
# Reference: https://surprise.readthedocs.io/en/stable/FAQ.html

def f_measure(precision, recall):
    """Return the F-measure (harmonic mean) of precision and recall.

    Args:
        precision: precision value.
        recall: recall value.

    Returns:
        2 * precision * recall / (precision + recall), or 0.0 when
        precision + recall is 0. The ratio is undefined in that case; 0 is
        used for consistency with precision_recall_at_k, which also maps
        undefined ratios to 0 instead of raising ZeroDivisionError.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

In [27]:
# Baseline: per-user precision@5 and recall@5 (threshold 4) averaged over all users,
# then the F-measure of the two averages
precisions, recalls = precision_recall_at_k(pred_base, k=5, threshold=4)
precision_base = sum(precisions.values()) / len(precisions)
print('Precision: ', precision_base, sep='\n')
recall_base = sum(recalls.values()) / len(recalls)
print('Recall: ', recall_base, sep='\n')
f_measure_base = f_measure(precision_base, recall_base)
print('F-measure: ', f_measure_base, sep='\n')

Precision: 
0.7785858909390405
Recall: 
0.7780736310289463
F-measure: 
0.7783296766976701


In [28]:
# Matrix Factorization: per-user precision@5 and recall@5 (threshold 4) averaged over all users,
# then the F-measure of the two averages
precisions, recalls = precision_recall_at_k(pred_mf, k=5, threshold=4)
precision_mf = sum(precisions.values()) / len(precisions)
print('Precision: ', precision_mf, sep='\n')
recall_mf = sum(recalls.values()) / len(recalls)
print('Recall: ', recall_mf, sep='\n')
f_measure_mf = f_measure(precision_mf, recall_mf)
print('F-measure: ', f_measure_mf, sep='\n')

Precision: 
0.7713990474992086
Recall: 
0.7686286642326603
F-measure: 
0.7700113640200459


In [29]:
# Item-based CF: per-user precision@5 and recall@5 (threshold 4) averaged over all users,
# then the F-measure of the two averages
precisions, recalls = precision_recall_at_k(pred_cf, k=5, threshold=4)
precision_cf = sum(precisions.values()) / len(precisions)
print('Precision: ', precision_cf, sep='\n')
recall_cf = sum(recalls.values()) / len(recalls)
print('Recall: ', recall_cf, sep='\n')
f_measure_cf = f_measure(precision_cf, recall_cf)
print('F-measure: ', f_measure_cf, sep='\n')

Precision: 
0.8242412976584031
Recall: 
0.8227677051567905
F-measure: 
0.823503842189931


In [30]:
# Slope One: per-user precision@5 and recall@5 (threshold 4) averaged over all users,
# then the F-measure of the two averages
precisions, recalls = precision_recall_at_k(pred_so, k=5, threshold=4)
precision_so = sum(precisions.values()) / len(precisions)
print('Precision: ', precision_so, sep='\n')
recall_so = sum(recalls.values()) / len(recalls)
print('Recall: ', recall_so, sep='\n')
f_measure_so = f_measure(precision_so, recall_so)
print('F-measure: ', f_measure_so, sep='\n')

Precision: 
0.812740365366796
Recall: 
0.8115586583215332
F-measure: 
0.8121490819874939


## Results Summary

In [50]:
# Recap of every metric computed above, one section per algorithm.
# Each row is (name, MAE, RMSE, precision, recall, F-measure). The RMSE
# variables are referenced under the names they were assigned with in their
# own cells (rsme_base, rmse_mf, rsme_cf, rsme_so).
summary_rows = [
    ('Baseline', mae_base, rsme_base, precision_base, recall_base, f_measure_base),
    ('Matrix Factorization', mae_mf, rmse_mf, precision_mf, recall_mf, f_measure_mf),
    ('Item-based CF', mae_cf, rsme_cf, precision_cf, recall_cf, f_measure_cf),
    ('Slope One', mae_so, rsme_so, precision_so, recall_so, f_measure_so),
]
# The printed metric name is "RMSE" (root mean squared error), not "RSME".
metric_labels = ('MAE: ', 'RMSE: ', 'Precision: ', 'Recall: ', 'F-measure: ')

print('The following are the results of MAE, RMSE, Precision, Recall, and F-measure for all of the algorithms used above.')
for algo_name, *metric_values in summary_rows:
    # A blank line separates the header and each algorithm's section.
    print()
    print(algo_name)
    for label, value in zip(metric_labels, metric_values):
        print(label, value)

The following are the results of MAE, RSME, Precision, Recall, and F-measure for all of the algorithms used above.

Baseline
MAE:  0.7457710712861408
RSME:  1.0365054998969583
Precision:  0.7785858909390405
Recall:  0.7780736310289463
F-measure:  0.7783296766976701

Matrix Factorization
MAE:  0.7418938679503214
RSME:  1.0415269294494476
Precision:  0.7713990474992086
Recall:  0.7686286642326603
F-measure:  0.7700113640200459

Item-based CF
MAE:  0.7906034375554847
RSME:  1.0962673048928995
Precision:  0.8242412976584031
Recall:  0.8227677051567905
F-measure:  0.823503842189931

Slope One
MAE:  0.7968704897598564
RSME:  1.1286997361807634
Precision:  0.812740365366796
Recall:  0.8115586583215332
F-measure:  0.8121490819874939
