## Collaborative Filtering: Item-Item with Matrix Factorization
- Using SVD to factorize the item-item matrix
- Matrix of 10,000 x 10,000 using 80% training data
- Using play counts as the feature

In [38]:
import random
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [8]:
df_song = pd.read_pickle('data/song_pairs.pkl')
df_song.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42597082 entries, 0 to 42597081
Data columns (total 3 columns):
 #   Column     Dtype
---  ------     -----
 0   song_num1  int32
 1   song_num2  int32
 2   plays      int32
dtypes: int32(3)
memory usage: 487.5 MB


## Sample of data for matrix factorization

In [9]:
# create a dataframe with a sample of 80% of the data
df_song_sample = df_song.sample(frac=0.9, random_state=42)
df_song_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38337374 entries, 23692561 to 2794874
Data columns (total 3 columns):
 #   Column     Dtype
---  ------     -----
 0   song_num1  int32
 1   song_num2  int32
 2   plays      int32
dtypes: int32(3)
memory usage: 731.2 MB


## Create song-song interaction matrix

In [10]:
song_song = df_song_sample.pivot(index='song_num1', columns='song_num2', values='plays').fillna(0)
song_song.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 77 to 998882
Columns: 10000 entries, 77 to 998882
dtypes: float64(10000)
memory usage: 763.0 MB


In [11]:
song_song.head(3)

song_num2,77,197,328,375,421,459,511,553,583,778,...,998152,998275,998388,998541,998697,998738,998755,998844,998870,998882
song_num1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77,0.0,0.0,0.0,0.0,0.0,30.0,3.0,5.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0
328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0


## Factorize the matrix using SVD

In [12]:
# Perform Truncated SVD with 100 latent features
svd = TruncatedSVD(n_components=100, random_state=42)
user_factors = svd.fit_transform(song_song)
song_factors = svd.components_ # svd.components_.T

print('Explained variance ratio', svd.explained_variance_ratio_.sum())

#song_song = None # save memory

Explained variance ratio 0.9181805847596275


## Using factorization to predict recommendations

In [47]:
df_song_list = pd.read_pickle('data/song_cleaned.pkl')
df_song_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999056 entries, 0 to 999055
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   song_num     999056 non-null  int32 
 1   title        999041 non-null  object
 2   release      999056 non-null  object
 3   artist_name  999056 non-null  object
 4   year         999056 non-null  int32 
dtypes: int32(2), object(3)
memory usage: 30.5+ MB


In [None]:
df_song_list[df_song_list['song_num'].isin(song_song.index) & df_song_list['artist_name'].str.contains('U2')].head(3)

In [60]:
def recommend_songs(row_id, top_n=10):
  user_index = song_song.index.get_loc(row_id)
  user_vector = user_factors[user_index]
  song_scores = user_vector.dot(song_factors)
  song_recommendations = song_scores.argsort()[::-1][:top_n]
  return song_recommendations

In [90]:
# Example usage
song_num = random.choice(song_song.index)
song_data = df_song_list[df_song_list.song_num == song_num]

song_recommendations = recommend_songs(song_num)
print(f"Recommended songs based on song {song_num} - {song_data.title.values[0]} - {song_data.artist_name.values[0]}:")

for song in song_recommendations:
  song_data = df_song_list[df_song_list.song_num == song]
  print(f"{song_data.song_num.values[0]} - {song_data.title.values[0]} - {song_data.artist_name.values[0]}")



Recommended songs based on song 430705 - Second Heartbeat - Avenged Sevenfold:
4255 - Outta My Mind - The Last Vegas
4035 - Staring At The Sun - Chesney Hawkes
6409 - Banks Of The Roses - Dublin City Ramblers
3160 - Je Te Veux - Valentina Madonna
5089 - Pigmeat Blues - Georgia White
739 - En Una Ensonación - Lino Borges
6410 - Step with Jungle Roots - Fire Ball
294 - Pocket Revolution - dEUS
9054 - El amor que me das - David deMaria
5801 - Feeling That - Nôze
