In [14]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Load the raw song-rating records from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/songsDataset.csv')

In [16]:
# Peek at the first rows; note the column names still carry literal quotes here.
df.head(n=5)

Unnamed: 0,'userID','songID','rating'
0,0,7171,5
1,0,8637,4
2,0,21966,4
3,0,35821,5
4,0,82446,5


In [17]:
# The CSV header wraps every column name in single quotes; drop them so the
# columns can be addressed as plain 'userID' / 'songID' / 'rating'.
df.columns = [column.strip("'") for column in df.columns]

# Show the cleaned-up column names as a sanity check.
print(df.columns)

Index(['userID', 'songID', 'rating'], dtype='object')


In [18]:
# The full dataset is too large to work with as-is and has to be cut down.
# Report, one per line: (rows, columns), distinct users, distinct songs.
print(df.shape, df['userID'].nunique(), df['songID'].nunique(), sep='\n')

(2000000, 3)
200000
127771


In [19]:
# To keep the computation manageable, count the ratings per songID and retain
# only the songs that were rated more than 1000 times.
agg_ratings = (
    df.groupby('songID')['rating']
    .count()
    .rename('number_of_ratings')
    .reset_index()
)

agg_ratings_GT1000 = agg_ratings.loc[agg_ratings['number_of_ratings'].gt(1000)]
agg_ratings_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56 entries, 2118 to 125910
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   songID             56 non-null     int64
 1   number_of_ratings  56 non-null     int64
dtypes: int64(2)
memory usage: 1.3 KB


In [20]:
# First few songs that passed the >1000-ratings filter, with their rating counts.
agg_ratings_GT1000.head(n=5)

Unnamed: 0,songID,number_of_ratings
2118,2263,1413
2549,2726,1904
3538,3785,1092
7536,8063,1491
11887,12709,1089


In [21]:
# Restrict the rating records to songs with more than 1000 ratings by
# inner-joining against the filtered songID list (the threshold is applied
# where agg_ratings_GT1000 is built).
# validate='many_to_one' makes the merge raise if a songID appears more than
# once on the right-hand side, which would otherwise silently duplicate
# rating rows.
df_GT1000 = pd.merge(
    df,
    agg_ratings_GT1000[['songID']],
    on='songID',
    how='inner',
    validate='many_to_one',
)
df_GT1000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72046 entries, 0 to 72045
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   userID  72046 non-null  int64
 1   songID  72046 non-null  int64
 2   rating  72046 non-null  int64
dtypes: int64(3)
memory usage: 1.6 MB


In [22]:
# First few rating records that survived the popular-song filter.
df_GT1000.head(n=5)

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


In [23]:
# How often each rating value occurs in the filtered data, most frequent first.
df_GT1000['rating'].value_counts(sort=True, ascending=False)

rating
5    26664
1    13634
4    13429
3    11277
2     7042
Name: count, dtype: int64

In [24]:
# Item-oriented view: one row per songID, one column per userID, cells hold
# the rating (NaN where that user did not rate that song).
# The mean is pivot_table's default aggregation; it is spelled out here so it
# is visible that repeated (song, user) pairs, if any exist, get averaged.
item_matrix = pd.pivot_table(
    df_GT1000,
    values='rating',
    index='songID',
    columns='userID',
    aggfunc='mean',
)
item_matrix.head(n=5)

userID,0,4,5,7,14,20,31,33,40,46,...,199956,199969,199973,199974,199975,199976,199980,199988,199990,199996
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,,,,,,,,,,,...,,,,,,,,,,5.0
2726,,,,,,,,,,,...,,,,,,,,5.0,,
3785,,,,,,,,,,,...,,,,,,,,,,
8063,,,2.0,,,,,,,,...,,,,,,,,,,
12709,,,,,,,,,,,...,2.0,,,,,,,,,


In [25]:
# User-oriented view: one row per userID, one column per songID, cells hold
# the rating (NaN where that user did not rate that song).
# The mean is pivot_table's default aggregation; it is spelled out here so it
# is visible that repeated (user, song) pairs, if any exist, get averaged.
user_matrix = pd.pivot_table(
    df_GT1000,
    values='rating',
    index='userID',
    columns='songID',
    aggfunc='mean',
)
user_matrix.head(n=5)

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,2.0,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,3.0
14,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Export both rating matrices as pickle files in the data directory.
item_matrix.to_pickle(path='../data/item_matrix.pkl')
user_matrix.to_pickle(path='../data/user_matrix.pkl')