In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import time
import matplotlib.pyplot as plt

import warnings; warnings.simplefilter('ignore')



In [2]:
music_rat_df = pd.read_json("data/Digital_Music_5.json", lines = True)
#meta_df = pd.read_json("meta_Digital_Music.json", lines = True)

In [3]:
music_rat_df = music_rat_df[['asin', 'overall', 'reviewText', 'reviewerID', 'reviewerName', 'summary']]

In [4]:
music_rat_df.rename(columns = {'asin':'artist_id', 'overall':'rating', 'reviewerID':'reviewer_id', 'reviewerName':'reviewer_name', 'reviewText':'review_text'}, inplace = True)

In [73]:
music_rat_df.tail()

Unnamed: 0,artist_id,rating,review_text,reviewer_id,reviewer_name,summary
64701,B00KILDVEI,4,I like the reggae sound a lot in this song. I ...,A1PQ1PESSO8CMO,Ginger Christmas,Cool song
64702,B00KILDVEI,5,I first heard this on Sirius and had to have i...,A120RH58WVY4W6,"Kelly Dunwell ""avid reader""",Great Song
64703,B00KILDVEI,5,"I absolutely love this song, it downloaded fin...",A19VJ2IQLO50G0,melinda,Five Stars
64704,B00KILDVEI,3,"Reggae, island beats aren't really my cup of t...",AUDSM2CTLLW1Q,Patrick L. Randall,Well-crafted song
64705,B00KILDVEI,1,Magic! is a Canadian band that incorporates re...,A1GN8UJIZLCA59,P Magnum,Souless Reggae


In [6]:
us_rat_df = music_rat_df.pivot(index = 'reviewer_id', columns ='artist_id', values = 'rating').fillna(0)
us_rat_df.head()

artist_id,5555991584,B0000000ZW,B00000016T,B00000016W,B00000017R,B0000001P4,B0000002HZ,B0000002J9,B0000002JR,B0000002ME,...,B00II5VHBU,B00IOVH8AW,B00IXZ9QP4,B00J80ED9M,B00JJCQRDE,B00JJOG5D4,B00JRBLSR2,B00JTHVWO8,B00JYKU6BK,B00KILDVEI
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1020L7BWW9RAX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10323WWTFPSGP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103KNDW8GN92L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103W7ZPKGOCC9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
print ("Number of reviewers: "+ str(len(music_rat_df['reviewer_id'].unique())))
print ("Number of items rated: " + str(len(music_rat_df['artist_id'].unique())))

Number of reviewers: 5541
Number of items rated: 3568


In [8]:
n_unique_users = music_rat_df['reviewer_id'].nunique()
n_unique_artists = music_rat_df['artist_id'].nunique()

In [9]:
#Calculate sparsity level of dataset matrix

sparsity=round(1.0-len(music_rat_df)/float(n_unique_users*n_unique_artists),3)
print('The sparsity level of dataset is ' +  str(sparsity*100) + '%')

The sparsity level of dataset is 99.7%


# Train Test Split

In [10]:
train_data, test_data = train_test_split(us_rat_df, test_size = 0.25)

In [11]:
#Create two user-item matrices, one for training and another for testing

train_data_matrix = train_data.as_matrix()
test_data_matrix = test_data.as_matrix()

# Memory-Based Collaborative Filtering

- <h4> First calculate similarity between users </h4>
<p> For that purpose, we will use the following formula: 
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(u_k,u_a)=\frac{u_k&space;\cdot&space;u_a&space;}{&space;\left&space;\|&space;u_k&space;\right&space;\|&space;\left&space;\|&space;u_a&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{k,m}x_{a,m}}{\sqrt{\sum&space;x_{k,m}^2\sum&space;x_{a,m}^2}}"/>

In [12]:
#First calculate similarity between users|
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')

 - <h4> Then calculate similarity between artists </h4>
 <p> For that purpose, similar formula will be used:</p>
 
 <img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(i_m,i_b)=\frac{i_m&space;\cdot&space;i_b&space;}{&space;\left&space;\|&space;i_m&space;\right&space;\|&space;\left&space;\|&space;i_b&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{a,m}x_{a,b}}{\sqrt{\sum&space;x_{a,m}^2\sum&space;x_{a,b}^2}}
"/>

In [13]:
#Next calculate similarity between artists
artist_similarity = pairwise_distances(train_data_matrix.T, metric = 'cosine')

In [14]:
def predict_ratings (ratings, similarity, type = 'user'):
    if type =='user':
        mean_user_rating = ratings.mean(axis = 1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        prediction = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'artist':
        prediction = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return prediction

In [15]:
#Get predictions
user_prediction = predict_ratings(train_data_matrix, user_similarity, type ='user')
artist_prediction = predict_ratings(train_data_matrix, artist_similarity, type = 'artist')

- <h4> Evaluate model performance using RMSE : </h4>
<img src="https://latex.codecogs.com/gif.latex?RMSE&space;=\sqrt{\frac{1}{N}&space;\sum&space;(x_i&space;-\hat{x_i})^2}" title="RMSE =\sqrt{\frac{1}{N} \sum (x_i -\hat{x_i})^2}" />

In [16]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [17]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Artist-based CF RMSE: ' + str(rmse(artist_prediction, test_data_matrix)))

User-based CF RMSE: 4.326908108238543
Artist-based CF RMSE: 4.344369190577104


- <h3> Check performance of the model by using given predictions for rating to get information about artists that the given user would like </h3>

In [18]:
#Return for the given user_id 10 most similar users
#def most_similar_ten(user_id, user_similarity_matrix):
def highest_rated_indices(user_id, user_prediction, us_rat_df):
    index = 0
    for i in range(0, us_rat_df.shape[0]-1):
        if us_rat_df.index.values[i] == user_id:
            index = i
    
    pred_arr = np.argsort(user_prediction[index][::-1])
    pred_arr = pred_arr[0:10]
    
    return pred_arr    

array([0.17016655, 0.15505977, 0.13713975, ..., 0.00204533, 0.00179163,
       0.00131712])

In [36]:
hr = highest_rated_indices('A1020L7BWW9RAX', user_prediction, us_rat_df)
hr[1]

2820

In [47]:
us_rat_df.columns.values[hr[1]]

'B0013L5M08'

In [50]:
def get_artist_ids(us_rat_df, indices):
    ids = []
    
    for i in range(0, indices.shape[0]-1):
        ids.append(us_rat_df.columns.values[indices[i]])     
    return ids

In [56]:
ai = get_artist_ids(us_rat_df, hr)   

In [71]:
def get_artists_description(artist_ids, music_df):
    descriptions = []
    
    for i in range(0, len(artist_ids) - 1):
        res_df = music_df.query("artist_id == '" + artist_ids[i] + "'")
        descriptions.append(res_df['summary'].tolist())
    return descriptions
        

In [72]:
get_artists_description(ai, music_rat_df)

[['This album is an absolute dream!',
  'Lush??',
  'IVY Does It Again!',
  'Not a Good Effort',
  'A warm, enjoyable album of covers'],
 ['A Superb If Incomplete Introduction To The Reprise Years.',
  'Missing Quite a Lot',
  'wonderful compilation',
  "A Fine Summation of the Artist, But You'll Want Much More",
  'Several tracks contain some seriously misguided remixing attempts',
  "Nice introduction to Mr. Sinatra's music",
  'The voice: ten years after',
  'Sinatra sets the mood everytime!',
  "New Jersey's Best",
  'Wonderful Frank Sinatra!',
  "certainly not all of Frank's best--but it's still all right by me !!!",
  'classics from Ole Blue Eyes',
  "OLD BLUE EYES SINGING 'MANY' OF HIS VERY BEST!",
  'Everything Is Simply Beautiful!',
  'Wonderful!!!!!!!',
  "It's Been Done Before, but Great Sound",
  "Ol' Blue Eyes was the God of male vocalists",
  "Doubling the gold standard (deserves even a Sinatra collector's attention)",
  "What a fine selection of Sinatra's work",
  'For t

# Model-Based Collaborative filtering
  - <b>SVD</b>

In [80]:
#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 15)
s_diag_matrix=np.diag(s)
predictions = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(predictions, test_data_matrix)))

User-based CF MSE: 4.332039981419453


In [83]:
hr_svd = highest_rated_indices('A1020L7BWW9RAX', predictions, us_rat_df)

In [84]:
ai_svd = get_artist_ids(us_rat_df, hr_svd)   

In [85]:
get_artists_description(ai_svd, music_rat_df)

[["Eurythmics Doin' It For Themselves",
  'FIVE STARS YES!',
  'Good album but not their best one',
  'Eurythmics: Be Yourself Tonight (1985)',
  "Best Pop Album of the 80's?",
  'The Eurythmics At Their Peak? R&B and New Wave Pop Collide',
  'Their Most Cohesive',
  'Lots of soul',
  "I'm Thrown And Overblown With Bliss.",
  'mixed emotions',
  'A Landmark Album----The Eurythmics Best',
  "Eurythmics' best album",
  'Awesome and moving',
  'Great Album!',
  'Eurythmics "rock out" ... so to speak',
  'you may not know the album, but you know the songs',
  'Eurythmics cut loose',
  'Soulful Annie and Dave'],
 ["I can't believe I didn't own this sooner",
  'Great East Coast',
  'raw genius',
  'Uni-Bombs',
  'Just a couple of kids from Queens',
  'Untochable',
  'Real Rap',
  'Unparalleled Classic',
  'Mobb Deep at their best!!!',
  'The Infamous',
  'The Mobb',
  'Another "oldschool" classic - rap fans will LOVE it!',
  'The Best Mobb Deep Album [5 stars]',
  'There Famous 2nd Best Ever