In [3]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
df = pd.read_csv('../data/songsDataset.csv')

In [6]:
df.head()

Unnamed: 0,'userID','songID','rating'
0,0,7171,5
1,0,8637,4
2,0,21966,4
3,0,35821,5
4,0,82446,5


In [7]:
# Remove extra quotes from column names
df.columns = df.columns.str.strip("'")

# Now the column names should be corrected
print(df.columns)

Index(['userID', 'songID', 'rating'], dtype='object')


In [8]:
# there is too much data at the moment, we must sample it
print(df.shape)
print(df['userID'].nunique())
print(df['songID'].nunique())

(2000000, 3)
200000
127771


In [9]:
# for ease of calculation, we are getting the songs with over 1000 ratings and also aggregating by songID
agg_ratings = df.groupby('songID').agg(number_of_ratings = ('rating', 'count')).reset_index()

agg_ratings_GT1000 = agg_ratings[agg_ratings['number_of_ratings']>1000]
agg_ratings_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56 entries, 2118 to 125910
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   songID             56 non-null     int64
 1   number_of_ratings  56 non-null     int64
dtypes: int64(2)
memory usage: 1.3 KB


In [11]:
agg_ratings_GT1000.head()

Unnamed: 0,songID,number_of_ratings
2118,2263,1413
2549,2726,1904
3538,3785,1092
7536,8063,1491
11887,12709,1089


In [12]:
# merge the two dataframes (to ensure that we only have songs with over 100 ratings)
df_GT1000 = pd.merge(df, agg_ratings_GT1000[['songID']], on='songID', how='inner')
df_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72046 entries, 0 to 72045
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  72046 non-null  int64
 1   songID  72046 non-null  int64
 2   rating  72046 non-null  int64
dtypes: int64(3)
memory usage: 1.6 MB


In [13]:
df_GT1000.head()

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


### Create the matrix

In [15]:
df_GT1000['rating'].value_counts()

rating
5    26664
1    13634
4    13429
3    11277
2     7042
Name: count, dtype: int64

In [16]:
matrix = df_GT1000.pivot_table(index='songID', columns='userID', values='rating')
matrix.head()

userID,0,4,5,7,14,20,31,33,40,46,...,199956,199969,199973,199974,199975,199976,199980,199988,199990,199996
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,,,,,,,,,,,...,,,,,,,,,,5.0
2726,,,,,,,,,,,...,,,,,,,,5.0,,
3785,,,,,,,,,,,...,,,,,,,,,,
8063,,,2.0,,,,,,,,...,,,,,,,,,,
12709,,,,,,,,,,,...,2.0,,,,,,,,,


In [17]:
normalized_matrix = matrix
row_means = normalized_matrix.apply(lambda row: row.mean(), axis=1)
normalized_matrix = normalized_matrix.sub(row_means, axis=0)
normalized_matrix = matrix.fillna(0)
print(normalized_matrix)

userID  0       4       5       7       14      20      31      33      \
songID                                                                   
2263       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2726       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3785       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
8063       0.0     0.0     2.0     0.0     0.0     0.0     0.0     0.0   
12709      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
13859      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
16548      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
17029      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
19299      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
19670      0.0     0.0     0.0     0.0     0.0     1.0     0.0     0.0   
22763      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
24427      0.0     0.0     4.0     0.0

In [27]:
try:
    # Define a picked userID
    picked_songID = 2263

    # Extract the first row vector from the normalized_matrix
    picked_row_vector = normalized_matrix.loc[picked_songID].values.reshape(1, -1)

    # Extract the rest of the row vectors excluding the picked_userID
    rest_of_row_vectors = normalized_matrix.drop(picked_songID)

    # Compute cosine similarity between the first row vector and the rest of the row vectors
    similarity_scores = cosine_similarity(picked_row_vector, rest_of_row_vectors)

    # Print the similarity scores
    print(similarity_scores)

except KeyError as e:
    print(f"The picked_songID {picked_songID} is not found in the index.")
    print("Available indices to choose from:")
    print(normalized_matrix.index)

[[0.01920403 0.00264817 0.00736182 0.00213104 0.01046116 0.00112748
  0.01390946 0.00483179 0.00313    0.00452364 0.00359277 0.0044136
  0.00205082 0.00935701 0.01300096 0.         0.0054429  0.01229933
  0.00043529 0.00744281 0.01543778 0.00176448 0.03240197 0.0016889
  0.02645205 0.00559758 0.00582338 0.00275966 0.00770865 0.00053245
  0.00884983 0.05147245 0.00254045 0.00537526 0.01664087 0.00155915
  0.01043156 0.00222591 0.00293732 0.00535181 0.00404868 0.00097047
  0.00352536 0.00484117 0.01318156 0.01455763 0.0111824  0.01122055
  0.01723894 0.00377352 0.00499477 0.01010678 0.00741551 0.00348913
  0.01105078]]


In [29]:
# testing out with a particular userID
import pandas as pd

# Convert similarity_scores to a pandas Series
similarity_scores_series = pd.Series(similarity_scores.flatten(), index=rest_of_row_vectors.index)

# Find the index label of the maximum similarity score
most_similar_songID = similarity_scores_series.idxmax()

print(f"The top most common songID to picked_songID {picked_songID} is: {most_similar_songID}")

The top most common songID to picked_songID 2263 is: 71582


Predicted ratings (formula)
$$r_{xi} = \frac{\sum_{y \in N} s_{xy} \cdot r_{yi}}{\sum_{y \in N} s_{xy}}$$

To obtain the predicted rating of user x on item i. We need to first get the top N users in terms of similarity with respect to item i. 

Suppose that the use users are y, then we take the rating as:
$$\frac{{\text{(similarity scores of $y$ and $x$)} \cdot \text{(rating given on item $i$ on $y$)}}}{{\text{sum of similarity scores of $y$ and $x$}}}
$$

In [34]:
#================================================================================================
# predict the rating of a song for a user
#================================================================================================

# suppose that we want to find the predicted rating of songID 2263 for picked_userID 4
# top N similar userIDs
N = 10
picked_songID = 2263
userID = 4

# modularizing similarity scores as a function
def similarity_scores(picked_songID ,normalized_matrix):
    try:
        # Extract the first row vector from the normalized_matrix
        picked_row_vector = normalized_matrix.loc[picked_songID].values.reshape(1, -1)

        # Extract the rest of the row vectors excluding the picked_songID
        rest_of_row_vectors = normalized_matrix.drop(picked_songID)

        # Compute cosine similarity between the first row vector and the rest of the row vectors
        similarity_scores = cosine_similarity(picked_row_vector, rest_of_row_vectors)

        # Print the similarity scores
        return(similarity_scores)

    except KeyError as e:
        print(f"The songID {picked_songID} is not found in the index.")
        print("Available indices to choose from:")
        print(normalized_matrix.index)

try:
    similarity_scores_series = pd.Series(similarity_scores(picked_songID, normalized_matrix).flatten(), index=rest_of_row_vectors.index)
    top_n_similar_songIDs = similarity_scores_series.nlargest(N).index.tolist()
    
    # retaining only the top N similar userIDs
    similarity_scores_series = similarity_scores_series[top_n_similar_songIDs]
    
    denom = np.sum(similarity_scores_series)
    matrix_adj = matrix.fillna(0)
    numer = np.sum(similarity_scores_series * matrix_adj.loc[picked_songID, userID])
    
    predicted_rating = numer / denom
    print(f"The predicted rating of songID {picked_songID} for userID {userID} is: {predicted_rating}")
    
except KeyError as e:
    print(f"The userID {userID} is not found in the columns; pick from the list of available UserIDs below:")
    print(f"{normalized_matrix.columns}")

The predicted rating of songID 2263 for userID 4 is: 0.0
