In [14]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
matrix = pd.read_pickle('../data/item_matrix.pkl')
matrix.head()

userID,0,4,5,7,14,20,31,33,40,46,...,199956,199969,199973,199974,199975,199976,199980,199988,199990,199996
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,,,,,,,,,,,...,,,,,,,,,,5.0
2726,,,,,,,,,,,...,,,,,,,,5.0,,
3785,,,,,,,,,,,...,,,,,,,,,,
8063,,,2.0,,,,,,,,...,,,,,,,,,,
12709,,,,,,,,,,,...,2.0,,,,,,,,,


In [16]:
# modularizing getting normalized_matrix
def get_normalized_matrix(matrix):
    # Assuming matrix is a numpy array
    normalized_matrix = matrix

    # Calculate row means
    row_means = normalized_matrix.apply(lambda row: row.mean(), axis=1)
    normalized_matrix = normalized_matrix.sub(row_means, axis=0)
    normalized_matrix = normalized_matrix.fillna(0) # if there are NaN values, fill them with 0
    return normalized_matrix

In [17]:
# modularizing getting the similarity scores
def get_similarity_scores(normalized_matrix, picked_songID):
    try:
        picked_row_vector = normalized_matrix.loc[picked_songID].values.reshape(1, -1)
        rest_of_row_vectors = normalized_matrix.drop(picked_songID)
        similarity_scores = cosine_similarity(picked_row_vector, rest_of_row_vectors)
        similarity_scores_series = pd.Series(similarity_scores.flatten(), index=rest_of_row_vectors.index)
        return similarity_scores_series
    except:
        print(f"The song with ID {picked_songID} does not exist in the dataset. Please try again with a different song ID.")
        print(f"Here are the song IDs in the dataset: {normalized_matrix.index.values}")

In [18]:
picked_songID = 2263
similarity_scores_series = get_similarity_scores(get_normalized_matrix(matrix), picked_songID)
print(similarity_scores_series)

songID
2726      0.004418
3785      0.001116
8063     -0.000459
12709    -0.002472
13859     0.001723
16548    -0.001237
17029     0.001381
19299     0.000619
19670    -0.001655
22763     0.003601
24427     0.003214
25182     0.000439
28985    -0.001438
36561     0.004534
40712     0.003003
42781     0.000000
42906     0.003369
43267     0.002068
43827     0.000094
45026     0.004650
45934    -0.001879
48731     0.004632
52611     0.006285
54042     0.000761
55240     0.004053
55622    -0.001969
56660    -0.004000
60465     0.002585
60888     0.001450
62954    -0.000693
68572    -0.000261
71582     0.023981
72017    -0.001741
72309     0.001970
74640     0.003638
79622    -0.001558
86341     0.002698
90409     0.002973
91266     0.001613
92881    -0.000740
94535     0.002117
94604    -0.001311
105421    0.002466
105433    0.001606
112023    0.000739
113954    0.003367
119103    0.006552
120147    0.003552
122065   -0.000024
123176   -0.000011
125557    0.001655
126757    0.007487
13104

In [19]:
most_similar_songID = similarity_scores_series.idxmax()
print(f"The top most common songID to picked_songID {picked_songID} is: {most_similar_songID}")

The top most common songID to picked_songID 2263 is: 71582


Predicted ratings (formula)
$$r_{xi} = \frac{\sum_{j \in N} s_{ij} \cdot r_{xj}}{\sum_{j \in N} s_{ij}}$$

To obtain the predicted rating of user x on item i. We need to first get the top N items in terms of similarity with respect to user x. 

Suppose that the N similar items are y, then we take the rating as:
$$\frac{{\text{(similarity scores of $i$ and $j$)} \cdot \text{(rating given on item $j$ on $x$)}}}{{\text{sum of similarity scores of $i$ and $j$}}}
$$

In [20]:
#================================================================================================
# predict the rating of a song for a user
#================================================================================================

# suppose that we want to find the predicted rating of songID 2263 for picked_userID 4
# top N similar userIDs
N = 10
picked_songID = 8063
userID = 5

# modularizing
def get_predicted_ratings(picked_songID , N, userID, matrix):
    normalized_matrix = get_normalized_matrix(matrix)
    similarity_scores_series = get_similarity_scores(normalized_matrix, picked_songID)
    try:
        top_n_similar_songIDs = similarity_scores_series.nlargest(N).index.tolist()
        similarity_scores_series = similarity_scores_series[top_n_similar_songIDs]
        
        denom = np.sum(similarity_scores_series)
        matrix_adj = matrix.fillna(0)
        numer = np.dot(similarity_scores_series, matrix_adj.loc[top_n_similar_songIDs, userID])
        predicted_rating = numer / denom
        return predicted_rating
    
    except KeyError as e:
        print(f"The userID {userID} is not found in the columns; pick from the list of available UserIDs below:")
        print(f"{normalized_matrix.columns}")

In [21]:
predicted_rating = get_predicted_ratings(picked_songID, N, userID, matrix)
print(f"The predicted rating of songID {picked_songID} for userID {userID} is: {predicted_rating}")

The predicted rating of songID 8063 for userID 5 is: 0.788739024541422


In [22]:
# content-based + collaborative filtering
# item-based first

$$r_{xi} = b_{xi}+\frac{\sum_{j \in N} s_{ij} \cdot (r_{xj}-b_{xj})}{\sum_{j \in N} s_{ij}}$$

Where

$$b_{xi} = \mu + b_{x} + b_{i}$$

$\mu$ : overall song rating

$b_{x}$ : rating deviation of user x. (average rating of user x) - $\mu$

$b_{i}$ : rating deviation of item i. (average rating of item i) - $\mu$

In [23]:
# ASSUMING NAs CAN BE HANDLED
# modularizing the calculation of b_xi
def calc_b_xi(matrix, userID, picked_songID):
    try:
        mu = matrix.mean().mean()
        b_x = matrix.loc[:,userID].mean() - mu
        b_i = matrix.loc[picked_songID].mean() - mu
        return mu + b_x + b_i
    except:
        print(f"The userID {userID} or songID {picked_songID} does not exist in the dataset. Please try again with a different userID or songID.")
        print(f"Here are the song IDs in the dataset: {matrix.index.values}")
        print(f"Here are the user IDs in the dataset: {matrix.columns.values}")

In [24]:
# modularizing the calculation of r_xi
userID = 5
picked_songID = 8063
N = 10
def get_predicted_ratings_advanced(matrix, picked_songID, userID, N):
    normalized_matrix = get_normalized_matrix(matrix)
    similarity_scores_series = get_similarity_scores(normalized_matrix, picked_songID)
    matrix_adj = matrix.fillna(0)

    b_xi = calc_b_xi(matrix, userID, picked_songID) # if userID or picked_songID does not exist, exception will be raised
    denom = np.sum(similarity_scores_series)
    
    # getting top N similar songIDs
    top_n_similar_songIDs = similarity_scores_series.nlargest(N).index.tolist()
    similarity_scores_series = similarity_scores_series[top_n_similar_songIDs]
    
    r_xj = matrix_adj.loc[top_n_similar_songIDs, userID]
    b_xj = [calc_b_xi(matrix, userID, j) for j in top_n_similar_songIDs]
    numer = np.dot(similarity_scores_series, (r_xj-b_xj))
    predicted_ratings = b_xi + numer / denom
    return predicted_ratings

In [25]:
test = get_predicted_ratings_advanced(matrix, picked_songID, userID, N)

In [26]:
print(f"The predicted rating of songID {picked_songID} for userID {userID} is: {test:.2f}")

The predicted rating of songID 8063 for userID 5 is: 2.15
