In [51]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
df = pd.read_csv('../data/songsDataset.csv')

In [53]:
df.head()

Unnamed: 0,'userID','songID','rating'
0,0,7171,5
1,0,8637,4
2,0,21966,4
3,0,35821,5
4,0,82446,5


In [54]:
# Remove extra quotes from column names
df.columns = df.columns.str.strip("'")

# Now the column names should be corrected
print(df.columns)

Index(['userID', 'songID', 'rating'], dtype='object')


In [55]:
# there is too much data at the moment, we must sample it
print(df.shape)
print(df['userID'].nunique())
print(df['songID'].nunique())

(2000000, 3)
200000
127771


In [56]:
# for ease of calculation, we are getting the songs with over 1000 ratings and also aggregating by songID
agg_ratings = df.groupby('songID').agg(number_of_ratings = ('rating', 'count')).reset_index()

agg_ratings_GT1000 = agg_ratings[agg_ratings['number_of_ratings']>1000]
agg_ratings_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56 entries, 2118 to 125910
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   songID             56 non-null     int64
 1   number_of_ratings  56 non-null     int64
dtypes: int64(2)
memory usage: 1.3 KB


In [57]:
# merge the two dataframes (to ensure that we only have songs with over 100 ratings)
df_GT1000 = pd.merge(df, agg_ratings_GT1000[['songID']], on='songID', how='inner')
df_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72046 entries, 0 to 72045
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  72046 non-null  int64
 1   songID  72046 non-null  int64
 2   rating  72046 non-null  int64
dtypes: int64(3)
memory usage: 1.6 MB


In [58]:
df_GT1000.head() 

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,161,90409,5
2,466,90409,3
3,587,90409,5
4,695,90409,5


In [59]:
# train test split to be done here first

### Create the matrix

In [60]:
df_GT1000['rating'].value_counts()

rating
5    26664
1    13634
4    13429
3    11277
2     7042
Name: count, dtype: int64

In [61]:
matrix = df_GT1000.pivot_table(index='userID', columns='songID', values='rating')
matrix.head()

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,2.0,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,3.0
14,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# dataset normalization
normalized_matrix = matrix
row_means = normalized_matrix.apply(lambda row: row.mean(), axis=1)
normalized_matrix = normalized_matrix.sub(row_means, axis=0)
normalized_matrix = matrix.fillna(0)
print(normalized_matrix)

songID  2263    2726    3785    8063    12709   13859   16548   17029   \
userID                                                                   
0          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5          0.0     0.0     0.0     2.0     0.0     0.0     0.0     0.0   
7          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
14         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...        ...     ...     ...     ...     ...     ...     ...     ...   
199976     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
199980     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
199988     0.0     5.0     0.0     0.0     0.0     0.0     0.0     0.0   
199990     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
199996     5.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

songID  19299   19670   ...  113954  

In [63]:
try:
    # Define a picked userID
    picked_userID = 4

    # Extract the first row vector from the normalized_matrix
    picked_row_vector = normalized_matrix.iloc[picked_userID].values.reshape(1, -1)

    # Extract the rest of the row vectors excluding the picked_userID
    rest_of_row_vectors = normalized_matrix.drop(picked_userID)

    # Compute cosine similarity between the first row vector and the rest of the row vectors
    similarity_scores = cosine_similarity(picked_row_vector, rest_of_row_vectors)

    # Print the similarity scores
    print(similarity_scores)

except KeyError as e:
    print(f"The picked_userID {picked_userID} is not found in the index.")
    print("Available indices to choose from:")
    print(normalized_matrix.index)

[[0. 0. 0. ... 0. 0. 0.]]


In [64]:
# for reference
normalized_matrix.index

Index([     0,      4,      5,      7,     14,     20,     31,     33,     40,
           46,
       ...
       199956, 199969, 199973, 199974, 199975, 199976, 199980, 199988, 199990,
       199996],
      dtype='int64', name='userID', length=53963)

In [65]:
# testing out with a particular userID
import pandas as pd

# Convert similarity_scores to a pandas Series
similarity_scores_series = pd.Series(similarity_scores.flatten(), index=rest_of_row_vectors.index)

# Find the index label of the maximum similarity score
most_similar_userID = similarity_scores_series.idxmax()

print(f"The top most common userID to picked_userID {picked_userID} is: {most_similar_userID}")

The top most common userID to picked_userID 4 is: 14


In [66]:
# attributions

# https://colab.research.google.com/drive/1cN44RlIEaB28FTD30qFiHkN3rqcDgcng?usp=sharing#scrollTo=1Zo4UEFWJVyt
# https://www.youtube.com/watch?v=h9gpufJFF-0&ab_channel=ArtificialIntelligence-AllinOne
# https://www.youtube.com/watch?v=6BTLobS7AU8&t=728s&ab_channel=ArtificialIntelligence-AllinOne
# ChatGPT and Copilot to format code