In [None]:
# importing libraries
import os
from collections import defaultdict

import numpy as np
import pandas as pd

from surprise import Dataset, Reader
from surprise import SVD, NMF, KNNWithZScore, CoClustering
from surprise import accuracy
from surprise.model_selection import ShuffleSplit

In [None]:
# Yahoo! Data
# Tab-separated file without a header row; the three columns are named below.
# The dataset directory can be overridden with the YAHOO_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used.
DATA_DIR = os.environ.get('YAHOO_DATA_DIR', 'K:/Datasets/yahoo/ydata-ymusic-user-song-ratings-meta-v1_0')
ratings_data = f'{DATA_DIR}/train_0.txt'
ratings = pd.read_csv(ratings_data, sep="\t", header=None)
ratings.columns = ['user_id', 'song_id', 'rating']
ratings.head()

In [None]:
# Per-user activity: number of (non-null) song_id entries each user_id has in `ratings`
user_counts = (
    ratings
    .groupby('user_id')['song_id']
    .count()
)
# Summary statistics of the per-user counts
user_counts.describe()

In [None]:
# Per-song popularity: number of (non-null) user_id entries each song_id has in `ratings`
song_user = (
    ratings
    .groupby('song_id')['user_id']
    .count()
)
# Summary statistics of the per-song counts
song_user.describe()

In [None]:
# keep only users with more than 50 rated songs
# (original note: "25%" -- presumably chosen from the describe() output above; confirm)
active_user_mask = user_counts > 50
flt_users = user_counts.loc[active_user_mask].index.to_list()

# keep only songs rated by more than 150 users
popular_song_mask = song_user > 150
flt_songs = song_user.loc[popular_song_mask].index.to_list()

# keep only the ratings whose user AND song both passed the thresholds above
user_ok = ratings['user_id'].isin(flt_users)
song_ok = ratings['song_id'].isin(flt_songs)
ratings_flt = ratings.loc[user_ok & song_ok].reset_index(drop=True)

In [None]:
# the 10,000 songs with the most ratings (counts come from `song_user`)
songs_by_popularity = song_user.sort_values(ascending=False)
top_songs = songs_by_popularity.head(10000).index.to_list()
# restrict the filtered ratings to those 10,000 songs
song_is_top = ratings_flt['song_id'].isin(top_songs)
ratings_flt_pop = ratings_flt.loc[song_is_top].reset_index(drop=True)

# the 10,000 users with the most ratings (counts come from `user_counts`)
users_by_activity = user_counts.sort_values(ascending=False)
most_ratings_user = users_by_activity.head(10000).index.to_list()
# further restrict to ratings made by those users
user_is_top = ratings_flt_pop['user_id'].isin(most_ratings_user)
ratings_flt_pop = ratings_flt_pop.loc[user_is_top].reset_index(drop=True)

In [None]:
# Distinct users and songs remaining after all filters
n_users = ratings_flt_pop['user_id'].nunique()
n_items = ratings_flt_pop['song_id'].nunique()
print(f'Users: {n_users}', f'Songs: {n_items}')

In [None]:
# Reader configured for ratings on a 1-to-5 scale
reader = Reader(rating_scale=(1, 5))
# Build the Surprise dataset from the (user, item, rating) columns, in that order
rating_columns = ['user_id', 'song_id', 'rating']
data = Dataset.load_from_df(ratings_flt_pop[rating_columns], reader)

In [None]:
# code adapted from the Surprise library website
# Source: https://surprise.readthedocs.io/en/stable/FAQ.html

# Return precision and recall at k metrics for each user
def precision_recall_at_k(predictions, k=20, threshold=3):
    """Compute per-user precision@k and recall@k.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``true_r`` and ``est`` are used.
        k: number of highest-estimated items considered per user.
        threshold: a rating (true or estimated) counts as relevant /
            recommended when it is ``>= threshold``.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id.

    Note:
        When a ratio is undefined (no recommended items in the top k, or no
        relevant items at all) the value is reported as 1, which raises any
        average taken over users.
    """
    # Group (estimated, true) rating pairs by user.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        ratings_by_user[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in ratings_by_user.items():
        # Rank the user's items from highest to lowest estimated rating.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything the user rated in this set.
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)
        # Items in the top k whose estimate clears the threshold.
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)
        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )

        # Precision@K: share of recommended items that are relevant.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 1

        # Recall@K: share of relevant items that are recommended.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 1

    return precisions, recalls

In [None]:
# Benchmark configuration:
#   algos     - the Surprise algorithms to compare
#   test_size - hold-out fractions; kept as strings because they are reused
#               verbatim in the result keys (e.g. 'testset_0.30')
#   n_splits  - number of random train/test splits per hold-out fraction
algos = [SVD(), NMF(), KNNWithZScore(), CoClustering()]
test_size = ["0.25", "0.30", "0.40"]
n_splits = 5

# results[algorithm_name][f'testset_{size}'] -> {'rmse', 'precision', 'recall'}
results = {}

for algorithm in algos:
    algo_dict = {}

    # Take the class name directly rather than parsing the object's repr.
    algorithm_name = type(algorithm).__name__
    print(f'Algorithm: {algorithm_name}')
    print('------------------------------')

    for size in test_size:
        test_dict = {}
        print(f'\nTest size: {size}')
        print('====================')

        # Fixed random_state so the same splits are produced on every run.
        kf = ShuffleSplit(n_splits=n_splits, test_size=float(size), shuffle=True, random_state=42)
        # Per-split metrics for this (algorithm, test size) combination.
        rmse_list = []
        precision_list = []
        recall_list = []

        for trainset, testset in kf.split(data):
            # training
            # NOTE(review): the same estimator instance is re-fitted on every
            # split; this assumes fit() starts from scratch -- confirm.
            print('Model is being trained...')
            algo = algorithm
            algo.fit(trainset)
            print('Training Successful\n')

            # testing
            print('Model is being tested...')
            predictions = algo.test(testset)
            result = round(accuracy.rmse(predictions, verbose=False), 4)
            print('Testing Successful\n')
            print(f'Testset RMSE is {result}')
            rmse_list.append(result)

            # recall and precision @ 20, averaged over the users in this test set
            precisions, recalls = precision_recall_at_k(predictions, k=20, threshold=3)
            precision = round(sum(precisions.values()) / len(precisions), 4)
            recall = round(sum(recalls.values()) / len(recalls), 4)

            print(f'Precision: {precision}')
            print(f'Recall: {recall}')
            print('--------------------')

            precision_list.append(precision)
            recall_list.append(recall)

        # Average across splits. np.mean is used because no bare `mean` is
        # imported in this notebook; float() keeps plain Python floats in
        # the results dictionary.
        avg_rmse = round(float(np.mean(rmse_list)), 4)
        avg_precision = round(float(np.mean(precision_list)), 4)
        avg_recall = round(float(np.mean(recall_list)), 4)

        print('********************')
        print(f'Mean RMSE: {avg_rmse}')
        print(f'Mean Precision: {avg_precision}')
        print(f'Mean Recall: {avg_recall}')

        test_dict['rmse'] = avg_rmse
        test_dict['precision'] = avg_precision
        test_dict['recall'] = avg_recall

        algo_dict[f'testset_{size}'] = test_dict

    results[algorithm_name] = algo_dict

In [None]:
# Nested dictionary of results: algorithm name -> 'testset_<size>' -> mean rmse / precision / recall
print(results)