# Demo

1. Load data
2. Run a recommendation algorithm on it
3. Compute accuracy and diversity metrics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the user/song usage table and discard the 'Unnamed: 0' column
# (presumably a row index that was written into the CSV -- verify).
usage_df = pd.read_csv('msd_subset_usage.csv').drop(columns=['Unnamed: 0'])
usage_df.head()

Unnamed: 0,user_id,song_id,num_plays,track_id
0,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1,TRAUURC128E078EC6E
1,951945330eb5df161ac4f97729647514001cd102,SOLXDDC12A6701FBFD,3,TRAUURC128E078EC6E
2,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOLXDDC12A6701FBFD,1,TRAUURC128E078EC6E
3,e427f647c231c1bde8881eca5b2f5db9b3bcb2b4,SOLXDDC12A6701FBFD,1,TRAUURC128E078EC6E
4,02192554db8fe6d17b6309aabb2b7526a2e58534,SOLXDDC12A6701FBFD,1,TRAUURC128E078EC6E


In [3]:
# Load the per-song metadata and discard the two 'Unnamed' columns
# (presumably row indices left over from earlier CSV round-trips -- verify).
leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0.1']
metadata_df = pd.read_csv('msd_subset_metadata.csv').drop(columns=leftover_index_columns)
metadata_df.head()

Unnamed: 0,artist_mbid,artist_name,artist_playmeid,danceability,duration,energy,key,loudness,mode,release,release_7digitalid,song_hotttnesss,song_id,tempo,time_signature,title,track_7digitalid,track_id,year,genre
0,0e6524bd-6641-46a6-bce5-96f06c19aa46,Orlando Pops Orchestra,-1,0.0,199.99302,0.0,10,-16.477,1,Easy Listening: Cartoon Songs,767122,,SOGSOUE12A58A76443,120.382,4,Zip-A-Dee-Doo-Dah (Song of the South),8493899,TRARRPG12903CD1DE9,0,Stage
1,37c78aeb-d196-42b5-b991-6afb4fc9bc2e,Dead Kennedys,6004,0.0,216.842,0.0,10,-4.264,1,Milking The Sacred Cow,181162,0.788388,SOZQSGL12AF72A9145,92.897,4,Halloween,1959132,TRARREF128F422FD96,1982,Pop_Rock
2,17e137fb-59e5-4fd7-af48-afc34995396c,Atreyu,-1,0.0,218.90567,0.0,0,-4.707,0,The Curse,290671,0.681092,SOBTEHX12A6D4FBF18,157.715,4,You Eclipsed By Me (Album Version),3260887,TRARRQO128F427B5F5,2004,Pop_Rock
3,e8143d56-f2e9-4122-9666-6aa5352dadcd,Mistress,160877,0.0,580.70159,0.0,0,-4.523,1,In Disgust We Trust,682479,0.401487,SOXGDVW12AB01864E7,146.331,4,Shovel,7574139,TRARRMK12903CDF793,2005,Pop_Rock
4,2e3b75d8-00a8-486a-b4e7-a1f8f65e64c9,Shadows Fall,12510,0.0,283.48036,0.0,0,-4.076,0,The War Within,734726,0.687874,SOKMPKV12A67AE241B,84.992,4,What Drives The Weak,8138989,TRARUOP12903CF2384,2004,Pop_Rock


In [4]:
def metadata_diversity(song_ids, metadata=None):
    """Score how varied a collection of songs is, based on their metadata.

    The score is a weighted sum of the number of distinct values found in
    several metadata facets of the selected songs:

    ======================  ======  ==========================================
    facet                   weight  distinct values of
    ======================  ======  ==========================================
    genre                   3       ``genre``
    album                   0.5     ``release_7digitalid``
    artist                  2       ``artist_name``
    year                    0.01    ``year``
    decade                  0.1     ``floor(year / 10)``
    duration (minutes)      0.1     ``round(duration / 60)``
    tempo (tens of BPM)     1       ``round(tempo / 10)``
    ======================  ======  ==========================================

    Parameters
    ----------
    song_ids : list-like
        Song ids to score. Ids that do not occur in ``metadata['song_id']``
        are ignored, and repeated ids count once (rows are selected with
        ``isin``).
    metadata : pandas.DataFrame, optional
        Frame holding the columns listed above plus ``song_id``. When
        omitted, the notebook-level ``metadata_df`` is used, which keeps
        existing ``metadata_diversity(song_ids)`` calls working.

    Returns
    -------
    int or float
        The weighted diversity score; ``0`` when no song id matches.

    Notes
    -----
    NOTE(review): the sample metadata contains ``year == 0`` for at least one
    song, which looks like a "year unknown" marker. It is currently counted as
    its own year and decade -- confirm whether that is intended.
    """
    if metadata is None:
        # Backward-compatible fallback to the frame loaded earlier in the notebook.
        metadata = metadata_df

    songs = metadata[metadata['song_id'].isin(song_ids)]

    # (weight, values) pairs, accumulated in a fixed order so the floating
    # point result does not depend on iteration order.
    weighted_facets = [
        (3, songs['genre']),                        # genres
        (0.5, songs['release_7digitalid']),         # albums
        (2, songs['artist_name']),                  # artists
        (0.01, songs['year']),                      # years
        (0.1, np.floor(songs['year'] / 10)),        # decades
        (0.1, np.round(songs['duration'] / 60)),    # durations in minutes
        (1, np.round(songs['tempo'] / 10)),         # tempo in tens of BPM
    ]

    diversity = 0
    for weight, values in weighted_facets:
        diversity += weight * values.nunique()

    return diversity

In [5]:
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
import pandas as pd

# Every observed (user, song) pair is treated as an implicit rating of 1.
# NOTE(review): because every rating is the constant 1, the RMSE below only
# measures how close the estimates are to 1 -- confirm that this is the
# intended accuracy metric.
usage_df['rating'] = 1

# Fixed seed so the fold split and the SVD initialisation -- and therefore the
# printed diversity / RMSE values -- are the same after Restart & Run All.
SEED = 42

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(usage_df[['user_id', 'song_id', 'rating']], reader)
algo = SVD(random_state=SEED)
kf = KFold(n_splits=2, random_state=SEED)

for trainset, testset in kf.split(data):
    # Fit on the training fold, then predict the held-out fold.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Show the song id of the first prediction.
    print(predictions[0].iid)
    # NOTE(review): these are the song ids of the held-out test pairs, not a
    # ranked top-N recommendation list -- confirm this is what the diversity
    # metric should be computed on.
    recommended_song_ids = [pred.iid for pred in predictions]
    print(f'Metadata Diversity of predictions: {metadata_diversity(recommended_song_ids)}')
    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

SOAHYMX12AB0182021
Metadata Diversity of predictions: 2537
RMSE: 0.0187
SOIQENV12A8C133A29
Metadata Diversity of predictions: 2562
RMSE: 0.0190
