In [17]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Load the pickled ratings matrix. Per the preview below, rows are labelled by
# userID, columns by songID, and songs a user has not rated are NaN.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
matrix = pd.read_pickle("../data/user_matrix.pkl")
matrix.head()

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,2.0,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,3.0
14,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# modularizing getting normalized_matrix
def get_normalized_matrix(matrix):
    """Mean-centre every row of a ratings matrix.

    Each row has its own mean (computed over the non-missing entries only)
    subtracted from it; entries that are still missing afterwards are set to 0.

    Parameters
    ----------
    matrix : pandas.DataFrame or 2-D array-like
        Ratings with one row per user; missing ratings are NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index/columns as the input (or a default
        RangeIndex for array input). The input object is not modified.
    """
    # pd.DataFrame(...) accepts both a DataFrame and a plain array, so the
    # function works for either kind of input.
    ratings = pd.DataFrame(matrix)

    # Vectorised row means; NaN entries are skipped by default, exactly like
    # calling row.mean() on every row but without a Python-level loop.
    row_means = ratings.mean(axis=1)

    # After centring, unrated entries (and rows with no ratings at all, whose
    # mean is NaN) are still NaN; 0 makes them neutral for cosine similarity.
    return ratings.sub(row_means, axis=0).fillna(0)

In [20]:
# modularizing getting the similarity scores
def get_similarity_scores(normalized_matrix, picked_userID):
    """Cosine similarity between one user and every other user.

    Parameters
    ----------
    normalized_matrix : pandas.DataFrame
        One row per user, indexed by user ID, with no NaN values.
    picked_userID : hashable
        Index label of the user to compare against all others.

    Returns
    -------
    pandas.Series or None
        Similarity scores indexed by the IDs of all *other* users, or None
        (after printing a message) when ``picked_userID`` is not in the index.
    """
    # Only an unknown user is handled here; any other failure (e.g. bad data
    # passed to cosine_similarity) propagates instead of being reported as a
    # missing user.
    if picked_userID not in normalized_matrix.index:
        print(f"The user with ID {picked_userID} does not exist in the dataset. Please try again with a different user ID.")
        print(f"Here are the user IDs in the dataset: {normalized_matrix.index.values}")
        return None

    # cosine_similarity expects 2-D input, hence the (1, n_columns) reshape.
    picked_row_vector = normalized_matrix.loc[picked_userID].values.reshape(1, -1)
    rest_of_row_vectors = normalized_matrix.drop(picked_userID)
    similarity_scores = cosine_similarity(picked_row_vector, rest_of_row_vectors)

    # Result has shape (1, n_other_users); flatten it and re-attach user IDs.
    return pd.Series(similarity_scores.flatten(), index=rest_of_row_vectors.index)

In [21]:
# Example: similarity of user 0 to every other user, computed on the
# mean-centred matrix.
picked_userID = 0
similarity_scores_series = get_similarity_scores(get_normalized_matrix(matrix), picked_userID)
# NOTE(review): get_similarity_scores returns None when the user lookup fails,
# in which case this prints "None" rather than a Series.
print(similarity_scores_series)

userID
4         0.0
5         0.0
7         0.0
14        0.0
20        0.0
         ... 
199976    0.0
199980    0.0
199988    0.0
199990    0.0
199996    0.0
Length: 53962, dtype: float64


In [22]:
# idxmax returns the label of the FIRST maximum. The scores printed above are
# all 0.0 for this user, so the reported "most similar" user is simply the
# first one in the index, not a meaningful match.
# NOTE(review): presumably the picked user has no ratings, giving an all-zero
# normalized row -- confirm before trusting this result.
most_similar_userID = similarity_scores_series.idxmax()
print(f"The top most common userID to picked_userID {picked_userID} is: {most_similar_userID}")

The top most common userID to picked_userID 0 is: 4


Predicted ratings (formula)
$$r_{xi} = \frac{\sum_{y \in N} s_{xy} \cdot r_{yi}}{\sum_{y \in N} s_{xy}}$$

To obtain the predicted rating of user $x$ on item $i$, we first need to find the top $N$ users most similar to $x$; their ratings of item $i$ are then combined.

Suppose that these users are $y \in N$; then the predicted rating is the similarity-weighted average of their ratings:
$$\frac{\sum_{y \in N} \text{(similarity score of $y$ and $x$)} \cdot \text{(rating given to item $i$ by $y$)}}{\sum_{y \in N} \text{(similarity score of $y$ and $x$)}}
$$

In [23]:
#================================================================================================
# predict the rating of a song for a user
#================================================================================================

# suppose that we want to find the predicted rating of songID 8063 for picked_userID 5
# (in the matrix.head() preview, user 5 already has a rating of 2.0 for song 8063)
# N = number of most-similar users whose ratings are combined
# NOTE: picked_userID is re-bound here; the earlier similarity cells used user 0.
N = 10
songID = 8063
picked_userID = 5

# modularizing the predicted-rating computation as a function
def get_predicted_ratings(picked_userID, N, songID, matrix):
    """Predict a user's rating of a song from the N most similar users.

    The prediction is the similarity-weighted average of the neighbours'
    ratings: sum(similarity * rating) / sum(similarity).

    Parameters
    ----------
    picked_userID : hashable
        Index label of the user to predict for.
    N : int
        Number of most-similar users to use.
    songID : hashable
        Column label of the song to predict.
    matrix : pandas.DataFrame
        Raw ratings, users as rows and songs as columns, NaN where unrated.

    Returns
    -------
    float or None
        The predicted rating; NaN when the neighbours' similarities sum to 0
        (the weighted average is undefined); None (after printing a message)
        when the song or the user is not present in ``matrix``.
    """
    # Validate the song first: it is cheap, and it keeps the error message
    # specific to a missing song rather than any KeyError raised later.
    if songID not in matrix.columns:
        print(f"The songID {songID} is not found in the columns; pick from the list of available songIDs below:")
        print(f"{matrix.columns}")
        return None

    normalized_matrix = get_normalized_matrix(matrix)
    similarity_scores_series = get_similarity_scores(normalized_matrix, picked_userID)
    if similarity_scores_series is None:
        # Unknown user: get_similarity_scores has already printed the reason.
        return None

    # Scores of the N most similar users, highest first.
    top_n_scores = similarity_scores_series.nlargest(N)

    # Only the neighbours' ratings of this one song are needed, so fill just
    # that slice instead of copying the whole matrix.
    # NOTE(review): an unrated song counts as a rating of 0 here, which pulls
    # the prediction down; consider averaging only over neighbours who
    # actually rated the song.
    neighbour_ratings = matrix.loc[top_n_scores.index, songID].fillna(0)

    denom = top_n_scores.sum()
    if denom == 0:
        # All weights cancel out or are zero; avoid a division by zero.
        return np.nan

    numer = np.dot(top_n_scores, neighbour_ratings)
    return numer / denom

In [24]:
# Run the prediction with the N, songID and picked_userID set in the cell above.
# NOTE(review): get_predicted_ratings returns None when the songID lookup
# fails, in which case the message below prints "None" as the rating.
predicted_rating = get_predicted_ratings(picked_userID, N, songID, matrix)
print(f"The predicted rating of songID {songID} for picked_userID {picked_userID} is: {predicted_rating}")

The predicted rating of songID 8063 for picked_userID 5 is: 3.3
