In [1]:
import time
import pandas as pd
import json

from surprise import SVD, NMF, SlopeOne, NormalPredictor, CoClustering, BaselineOnly, KNNBasic, KNNWithMeans, Dataset, Reader, accuracy
from sklearn.model_selection import ParameterGrid

In [2]:
users = open("./MovieLens/20m/users_20m_sample_section.csv").read().split('\n')

In [3]:
train = pd.read_csv("./MovieLens/20m/train_ratings_20m_sample_section.csv")
test = pd.read_csv("./MovieLens/20m/test_ratings_20m_sample_section.csv")
train = train.rename(index=str, columns={"userId": "userID", "movieId": "itemID"})
test = test.rename(index=str, columns={"userId": "userID", "movieId": "itemID"})

In [4]:
reader = Reader()

db = Dataset.load_from_df(train[['userID', 'itemID', 'rating']], reader)
trainset = db.build_full_trainset()

algs = {
    'SVD': SVD(random_state=42),
    'NMF': NMF(random_state=42),
    'CoClustering': CoClustering(random_state=42),
    'SlopeOne': SlopeOne(),
    'NormalPredictor': NormalPredictor(),
    'BaselineOnly': BaselineOnly(bsl_options={'method':'sgd'}),
    'KNNBasic_UserBased': KNNBasic(sim_options={'user_based': True, 'name': 'cosine'}),
    'KNNWithMeans_UserBased': KNNWithMeans(sim_options={'user_based': True, 'name': 'cosine'}),
    'KNNBasic_ItemBased': KNNBasic(sim_options={'user_based': False, 'name': 'cosine'}),
    'KNNWithMeans_ItemBased': KNNWithMeans(sim_options={'user_based': False, 'name': 'cosine'})
}

for name in list(algs.keys()):
    print('Training {} algorithm'.format(name))
    algs[name] = algs[name].fit(trainset)

results = {}
for name, alg in algs.items():
    print('Running {} algorithm...'.format(name))
    results[name] = {}
    initial = time.time()
    for user in users:
        data_te = test.loc[test['userID'] == int(user)]
        db = Dataset.load_from_df(data_te[['userID', 'itemID', 'rating']], reader)
        testset = db.construct_testset(db.raw_ratings)
        predictions = alg.test(testset)
        results[name][user] = {'RMSE': accuracy.rmse(predictions, verbose=False)}
    final = time.time()
    print('{} took: {}s'.format(name, final-initial))
#     print('Results for {} algorithm:'.format(name))
#     print(results[name])

Training SVD algorithm
Training NMF algorithm
Training CoClustering algorithm
Training SlopeOne algorithm
Training NormalPredictor algorithm
Training BaselineOnly algorithm
Estimating biases using sgd...
Training KNNBasic_UserBased algorithm
Computing the cosine similarity matrix...
Done computing similarity matrix.
Training KNNWithMeans_UserBased algorithm
Computing the cosine similarity matrix...
Done computing similarity matrix.
Training KNNBasic_ItemBased algorithm
Computing the cosine similarity matrix...
Done computing similarity matrix.
Training KNNWithMeans_ItemBased algorithm
Computing the cosine similarity matrix...
Done computing similarity matrix.
Running SVD algorithm...
SVD took: 1.2320363521575928s
Running NMF algorithm...
NMF took: 1.0949034690856934s
Running CoClustering algorithm...
CoClustering took: 0.9882099628448486s
Running SlopeOne algorithm...
SlopeOne took: 10.139931440353394s
Running NormalPredictor algorithm...
NormalPredictor took: 0.9720468521118164s
Runni

In [5]:
with open('results.json', 'w') as result_file:
    json.dump(results, result_file, indent=4, sort_keys=True)