In [None]:
# importing libraries
import pandas as pd
import numpy as np

from surprise import Dataset, Reader
from surprise import SVD, NMF, KNNWithZScore, CoClustering
from surprise.model_selection import ShuffleSplit
from surprise import accuracy

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from collections import defaultdict
from statistics import mean
import random

In [None]:
# Echo Nest train triplets: read as tab-separated text without a header row
train_triplets = 'K:/Datasets/EchoNest/Train Triplets/train_triplets.txt'
train_ratings = pd.read_csv(train_triplets, sep="\t", header=None)
train_ratings.columns = ['user_id', 'song_id', 'listen_count']

# Spotify metadata table
song_metadata = pd.read_csv('K:/Notebook Files/spotify_metadata.csv')

# keep only the triplets whose song_id appears in the metadata (column en_song_id)
ratings = train_ratings.loc[train_ratings['song_id'].isin(song_metadata['en_song_id'])]

In [None]:
# number of song rows recorded per user
user_counts = ratings['song_id'].groupby(ratings['user_id']).count()
# summary statistics rendered as fixed-point strings
user_counts.describe().map('{:f}'.format)

In [None]:
# number of user rows recorded per song
song_user = ratings['user_id'].groupby(ratings['song_id']).count()
song_user.describe()

In [None]:
# users kept: more than 15 song rows, i.e. at least 16 (original note: 25%)
flt_users = user_counts.loc[user_counts > 15].index.to_list()

# songs kept: more than 14 user rows, i.e. at least 15 (original note: 50%)
flt_songs = song_user.loc[song_user > 14].index.to_list()

# keep only the rows whose user AND song both passed the thresholds above
ratings_flt = ratings.loc[
    ratings['user_id'].isin(flt_users) & ratings['song_id'].isin(flt_songs)
].reset_index(drop=True)

In [None]:
# the 10000 songs with the most rows in `ratings`
top_songs = song_user.sort_values(ascending=False).head(10000).index.to_list()

# the 10000 users with the most rows in `ratings`
most_ratings_user = user_counts.sort_values(ascending=False).head(10000).index.to_list()

# restrict the filtered ratings to those songs and those users in one pass
ratings_flt_pop = ratings_flt.loc[
    ratings_flt['song_id'].isin(top_songs)
    & ratings_flt['user_id'].isin(most_ratings_user)
].reset_index(drop=True)

In [None]:
# Binning: turn listen counts into a 1-10 rating.
# A count in the interval (k-1, k] gets rating k for k = 1..9; everything above 9 gets 10.
# The last edge is open-ended (inf) instead of the observed maximum, so the edges stay
# strictly increasing even when no listen count in the data exceeds 9 (with the observed
# maximum as the last edge, pd.cut raises on duplicate / non-monotonic bin edges).
bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, np.inf]

ratings_flt_pop['rating'] = pd.cut(
    ratings_flt_pop['listen_count'],
    bins=bins,
    labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
).astype('int')

In [None]:
# Reader configured for the 1-10 rating scale
reader = Reader(rating_scale=(1, 10))

# build the Surprise dataset from the user, song and rating columns
data = Dataset.load_from_df(
    ratings_flt_pop.loc[:, ['user_id', 'song_id', 'rating']],
    reader,
)

In [None]:
# Sample 25 users for the hybrid evaluation.
# De-duplicate first: sampling rows of the ratings table directly can draw the same user
# more than once and favours users with many rows.
user_sample = ratings_flt_pop['user_id'].drop_duplicates().sample(n=25, random_state=10)

# scaler fitted on the song features below; kept under the name `ss`
ss = StandardScaler()

# standardise song feature values (features are selected by column position 6..15)
columns_to_cluster = song_metadata.columns[6:16]
songs_scaled = ss.fit_transform(song_metadata[columns_to_cluster])

# creating new dataframe with scaled song features
# NOTE: 'valence_scaleded' is a typo in the original column name; it is left unchanged
# because the name is part of the resulting dataframe's schema.
columns_to_cluster_scaled  = ['danceability_scaled', 'energy_scaled', 'key_scaled', 'loudness_scaled', 'mode_scaled',
                              'speechiness_scaled', 'instrumentalness_scaled', 'liveness_scaled', 'valence_scaleded', 
                              'tempo_scaled']
df_songs_scaled = pd.DataFrame(songs_scaled, columns=columns_to_cluster_scaled)

# k selected from silhouette score and elbow method
k = 7

# fit k-means on the scaled audio features; the training labels are read from
# `model.labels_`, so no separate predict() call on the training data is needed
model = KMeans(n_clusters=k, random_state=11).fit(songs_scaled)

# add cluster labels to songs in dataframe
df_songs_scaled['cluster'] = model.labels_
# index by cluster but also keep `cluster` as a regular column (drop=False), because
# the recommendation code looks it up with df_songs_joined['cluster']
df_songs_joined = pd.concat([song_metadata, df_songs_scaled], axis=1).set_index('cluster', drop=False)

In [None]:
def kmeans_recommendations(user_id, n_recs=20, random_state=None):
    """Content-based recommendations sampled from the user's preferred song clusters.

    The user's profile is the set of songs with ``listen_count >= 3`` when there are at
    least 10 of them, otherwise every song the user has a row for in ``train_ratings``.
    Each profile song is assigned to one of the fitted k-means clusters, and songs are
    then sampled from the catalogue cluster by cluster, in proportion to how often each
    cluster occurs in the profile.

    Relies on the notebook-level objects ``train_ratings``, ``song_metadata``,
    ``columns_to_cluster``, ``ss`` (the scaler fitted on the full song table),
    ``model`` (the fitted KMeans) and ``df_songs_joined``.

    Parameters
    ----------
    user_id : value compared against ``train_ratings['user_id']``
    n_recs : int, default 20
        Maximum number of songs to return.
    random_state : optional
        Forwarded to ``DataFrame.sample``; ``None`` keeps the sampling unseeded.

    Returns
    -------
    pandas.Series
        ``en_song_id`` values of at most ``n_recs`` songs. Empty when none of the
        user's songs appear in ``song_metadata``.
    """
    # user profile - users song_id list of liked songs
    user_songs = train_ratings[train_ratings['user_id'] == user_id]
    user_liked = user_songs[user_songs['listen_count'] >= 3]

    if user_liked['song_id'].count() >= 10:
        user_song_ids = user_liked['song_id'].to_list()
    else:
        user_song_ids = user_songs['song_id'].to_list()

    df_user = song_metadata[song_metadata['en_song_id'].isin(user_song_ids)]
    if df_user.empty:
        # nothing to build a profile from
        return pd.Series([], dtype=object, name='en_song_id')

    # Scale with the scaler that was fitted on the whole song table. Fitting a new
    # scaler on the user's few songs would put them in a different feature space
    # than the one the k-means model was trained on.
    user_scaled_features = ss.transform(df_user[columns_to_cluster])
    user_clusters = pd.Series(model.predict(user_scaled_features))

    # number of songs to draw per cluster, proportional to the cluster's share
    cluster_pred = user_clusters.value_counts(normalize=True) * n_recs
    if int(cluster_pred.round(0).sum()) < n_recs:
        # rounding lost some slots: bump clusters that would otherwise round to zero
        small_share = cluster_pred < 0.5
        cluster_pred.loc[small_share] = cluster_pred.loc[small_share] + 1.0

    # `cluster` may be a column or only the index of df_songs_joined
    if 'cluster' in df_songs_joined.columns:
        catalogue_clusters = df_songs_joined['cluster'].to_numpy()
    else:
        catalogue_clusters = df_songs_joined.index.get_level_values('cluster').to_numpy()

    # draw the songs cluster by cluster, collecting the pieces for a single concat
    sampled = []
    for n_cluster, pred in cluster_pred.items():
        pool = df_songs_joined[catalogue_clusters == n_cluster]
        # never ask for more songs than the cluster holds
        n_draw = min(int(round(pred, 0)), len(pool))
        if n_draw > 0:
            sampled.append(pool.sample(n=n_draw, random_state=random_state))

    if not sampled:
        return pd.Series([], dtype=object, name='en_song_id')

    song_rec = pd.concat(sampled, ignore_index=True).head(n_recs)
    return song_rec['en_song_id']

In [None]:
# Build a hybrid (collaborative + content-based) ranked list for one user
def hybrid_recommendations(user_id, predictions, n=20):
    """Merge collaborative-filtering predictions with content-based picks for one user.

    Relies on the notebook-level ``algo`` (a fitted Surprise algorithm) and on
    ``kmeans_recommendations``.

    Parameters
    ----------
    user_id : id of the user to build the list for
    predictions : iterable of 5-tuples ``(uid, iid, true_r, est, details)``
    n : int, default 20
        Number of song ids returned in the recommendation list.

    Returns
    -------
    (user_ratings, recommendations)
        ``user_ratings`` is the full list of ``(est, true_r)`` pairs sorted by ``est``
        in descending order; ``recommendations`` holds the first ``n`` song ids in
        that same order.
    """
    # collaborative-filtering entries that belong to this user
    merged = [
        (round(est, 2), round(true_r, 3), iid)
        for uid, iid, true_r, est, _ in predictions
        if uid == user_id
    ]

    # content-based candidates; materialised as list + set because `x in series`
    # on a pandas Series tests the index labels, not the values
    cb_song_ids = list(kmeans_recommendations(user_id))
    cb_lookup = set(cb_song_ids)

    # if a song is already in the cf list, drop the cf entry (filtering into a new
    # list avoids deleting from a list while iterating over it)
    merged = [entry for entry in merged if entry[2] not in cb_lookup]

    # Estimate the rating the user would give to each content-based song.
    # r_ui=3 is a placeholder "true" rating, since no observed rating is available here.
    for song_id in cb_song_ids:
        _, iid, true_r, est, _ = algo.predict(user_id, song_id, r_ui=3)
        # insert at a random position; this only influences the order of ties,
        # because the list is sorted by estimate below (the sort is stable)
        merged.insert(random.randint(0, 20), (round(est, 2), round(true_r, 2), iid))

    # create recommendation list and user ratings list, best estimate first
    merged.sort(key=lambda entry: entry[0], reverse=True)
    user_ratings = [(est, true_r) for est, true_r, _ in merged]
    recommendations = [song_id for _, _, song_id in merged]

    return user_ratings, recommendations[:n]

In [None]:
# adapted from the Surprise library website
# source: https://surprise.readthedocs.io/en/stable/FAQ.html

def precision_recall_at_k_hybrid(user_ratings, k=15, threshold=4):
    """Precision@k and recall@k for a single user's list of ratings.

    Parameters
    ----------
    user_ratings : sequence of ``(est, true_r)`` pairs
        Need not be pre-sorted; a sorted copy (highest estimate first) is used and
        the input is left untouched.
    k : int, default 15
        Number of top-ranked items considered as recommended.
    threshold : number, default 4
        Minimum rating for an item to count as relevant / recommended.

    Returns
    -------
    (precision, recall) : both rounded to 2 decimals. Each falls back to 1 when its
    denominator is zero, which includes the empty-input case.
    """
    # rank by estimated rating so that "top k" is well defined for any input order
    ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
    top_k = ranked[:k]

    # How many relevant items
    num_rel = sum((true_r >= threshold) for (_, true_r) in ranked)
    # Number of recommended items in top k
    n_rec_k = sum((est >= threshold) for (est, _) in top_k)
    # Number of relevant and recommended items in top k
    n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in top_k)

    # Precision@K: Proportion of recommended items that are relevant
    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    # Recall@K: Proportion of relevant items that are recommended
    recall = n_rel_and_rec_k / num_rel if num_rel != 0 else 1

    return round(precision, 2), round(recall, 2)

In [None]:
# code from the Surprise library website
# source: https://surprise.readthedocs.io/en/stable/FAQ.html

def precision_recall_at_k(predictions, k=15, threshold=4):
    """Per-user precision@k and recall@k.

    ``predictions`` is an iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
    Returns two dicts keyed by user id: ``precisions`` and ``recalls``. A metric
    whose denominator is zero is reported as 1.
    """
    # group the (estimate, true rating) pairs by user
    pairs_by_user = {}
    for uid, _, true_r, est, _ in predictions:
        pairs_by_user.setdefault(uid, []).append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in pairs_by_user.items():
        # highest estimates first; the first k entries are the recommended slots
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # relevant items over the user's whole list
        relevant_total = sum(1 for _, true_r in ranked if true_r >= threshold)
        # items in the top k whose estimate reaches the threshold
        recommended = sum(1 for est, _ in top_k if est >= threshold)
        # items in the top k that are both relevant and recommended
        hits = sum(1 for est, true_r in top_k if true_r >= threshold and est >= threshold)

        precisions[uid] = hits / recommended if recommended != 0 else 1
        recalls[uid] = hits / relevant_total if relevant_total != 0 else 1

    return precisions, recalls

In [None]:
# Evaluate every algorithm on several test-set sizes, then run the hybrid evaluation.
algos = [SVD(), NMF(), KNNWithZScore(), CoClustering()]
test_size = ['0.25', '0.30', '0.40']  # strings: also used verbatim in the result keys
n_splits = 5

results = {}         # algorithm name -> {'testset_<size>': {'rmse', 'precision', 'recall'}}
hybrid_results = {}  # algorithm name -> {'precision', 'recall'}

for algorithm in algos:
    algo_dict = {}

    # use the class name directly instead of parsing it out of repr()
    algorithm_name = type(algorithm).__name__
    print(f'\nAlgorithm: {algorithm_name}')
    print('------------------------------')
    
    for size in test_size:
        test_dict = {}
        print(f'\nTest size: {size}')
        print('====================')
        
        kf = ShuffleSplit(n_splits=n_splits, test_size=float(size), shuffle=True, random_state=42)
        # per-split metrics for this test size
        rmse_list = []
        precision_list = []
        recall_list = []
        
        for trainset, testset in kf.split(data):
            # training
            print('Model is being trained...')
            # `algo` is a notebook-level name: hybrid_recommendations() reads it
            algo = algorithm
            algo.fit(trainset) 
            print('Training Successful\n')

            # testing
            print('Model is being tested...')
            predictions = algo.test(testset)
            result = round(accuracy.rmse(predictions, verbose=False), 4)
            print('Testing Successful\n')
            print(f'Testset RMSE is {result}')
            rmse_list.append(result)
            
            # recall and precision @ 15, averaged over users
            precisions, recalls = precision_recall_at_k(predictions, k=15, threshold=4)
            precision = round(sum(prec for prec in precisions.values()) / len(precisions), 4)
            recall = round(sum(rec for rec in recalls.values()) / len(recalls), 4)
            
            print(f'Precision: {precision}')
            print(f'Recall: {recall}')
            print('--------------------')
            
            precision_list.append(precision)
            recall_list.append(recall)
        
        # average the split-level metrics for this test size
        avg_rmse = round(mean(rmse_list), 4)
        avg_precision = round(mean(precision_list), 4)
        avg_recall = round(mean(recall_list), 4)
        
        print('********************')
        print(f'Mean Precision: {avg_precision}')
        print(f'Mean Recall: {avg_recall}')
        
        test_dict['rmse'] = avg_rmse
        test_dict['precision'] = avg_precision
        test_dict['recall'] = avg_recall
        
        algo_dict[f'testset_{size}'] = test_dict
        
    results[algorithm_name] = algo_dict
        
    # Hybrid Testing
    # NOTE: this uses `algo` and `predictions` exactly as the loops above left them,
    # i.e. the model and test predictions of the last split of the last test size.
    hybrid_metrics = {}

    hybrid_precision = []
    hybrid_recall = []
    for user in user_sample:
        user_ratings, recommendations = hybrid_recommendations(user, predictions)
        precision, recall = precision_recall_at_k_hybrid(user_ratings)
        hybrid_precision.append(precision)
        hybrid_recall.append(recall)
    
    avg_hybrid_precision = np.mean(hybrid_precision)
    avg_hybrid_recall = np.mean(hybrid_recall)
    
    print(avg_hybrid_precision, avg_hybrid_recall)
    
    hybrid_metrics['precision'] = avg_hybrid_precision
    hybrid_metrics['recall'] = avg_hybrid_recall

    hybrid_results[algorithm_name] = hybrid_metrics

In [None]:
# collaborative-filtering metrics: algorithm name -> 'testset_<size>' -> rmse / precision / recall
print(results)

In [None]:
# hybrid metrics: algorithm name -> mean precision / recall over the sampled users
print(hybrid_results)