In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import Recommenders as Recommenders

In [2]:
# Source CSVs.
# NOTE(review): these are absolute, machine-specific Windows paths, so the
# notebook only runs as-is where this exact directory layout exists.
triplets_file = r"D:\ONE DRIVE MAHI\OneDrive\Documents\GitHub\Music-Recommender-KNN\song_dataset\triplets_file.csv"
songs_metadata_file = r"D:\ONE DRIVE MAHI\OneDrive\Documents\GitHub\Music-Recommender-KNN\song_dataset\song_data.csv"

# Listening triplets: read_csv consumes the file's own header row, after which
# the columns are relabelled with the names used in the rest of the notebook.
song_df_1 = pd.read_csv(triplets_file).set_axis(
    ['user_id', 'song_id', 'listen_count'], axis=1
)

# Song metadata, kept exactly as read
song_df_2 = pd.read_csv(songs_metadata_file)

# Left join keeps every triplet row. The metadata is de-duplicated on song_id
# first so that a song listed twice cannot multiply the triplet rows.
song_df = song_df_1.merge(
    song_df_2.drop_duplicates(subset=['song_id']),
    how="left",
    on="song_id",
)

print(song_df.head())


                                    user_id             song_id  listen_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1   

             title                        release    artist_name  year  
0         The Cove             Thicker Than Water   Jack Johnson     0  
1  Entre Dos Aguas            Flamenco Para Niños  Paco De Lucia  1976  
2         Stronger                     Graduation     Kanye West  2007  
3   Constellations              In Between Dreams   Jack Johnson  2005  
4     Learn To Fly  There Is Nothing Left To Lose   Foo Fighters  1999  


In [3]:
# Popularity per title.
# 'count' tallies the rows recorded for each title (one row per user/song
# record), rather than adding up the individual play counts.
song_grouped = song_df.groupby('title', as_index=False)['listen_count'].count()

# listen_count now holds that per-title row count; express it as a share of the total
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100

# Ranked copy: highest count first, title as the tie-breaker.
# song_grouped itself keeps its original row order.
sorted_song_grouped = song_grouped.sort_values(
    by=['listen_count', 'title'], ascending=[False, True]
)

# Show the top of the ranking
print(sorted_song_grouped.head())


                               title  listen_count  percentage
6836                   Sehr kosmisch          8277     0.41385
8725                            Undo          7032     0.35160
1964  Dog Days Are Over (Radio Edit)          6949     0.34745
9496                  You're The One          6729     0.33645
6498                         Revelry          6145     0.30725


In [4]:
# Distinct user ids present in the merged frame, and how many there are
users = pd.unique(song_df['user_id'])
len(users)


76353

In [10]:
# Distinct song titles present in the merged frame, and how many there are
songs = pd.unique(song_df['title'])
len(songs)

9567

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(song_df, random_state=0, test_size=0.2)

In [24]:
# Popularity recommender from the project-local Recommenders module.
# create() is given the training split plus the names of the user-id and
# title columns.
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'user_id', 'title')
# Use the popularity model to make some predictions for one sample user.
# NOTE(review): the titles displayed below match the top of the global
# popularity table, so the result is presumably the same for every user -
# confirm against the Recommenders implementation.
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,title,score,Rank
6836,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Sehr kosmisch,6630,1.0
8725,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Undo,5639,2.0
1964,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Dog Days Are Over (Radio Edit),5592,3.0
9496,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You're The One,5396,4.0
6498,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Revelry,4938,5.0
6825,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Secrets,4627,6.0
3437,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2595,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Fireflies,3835,8.0
3322,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hey_ Soul Sister,3819,9.0
8494,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Tive Sim,3707,10.0


In [17]:
# Item-similarity recommender from the project-local Recommenders module.
# create() is given the training split plus the names of the user-id and
# title columns, the same arguments as the popularity model.
is_model=Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'title')

In [25]:
# List the songs the sample user has in the training data
user_id = users[5]
user_items = is_model.get_user_items(user_id)

print("------------------------------------------------------------------------------------")
print(f"Training data songs for the user userid: {user_id}:")
print("------------------------------------------------------------------------------------")

# One title per line
for user_item in user_items:
    print(user_item)

# Banner announcing the recommendation step; the recommend() call itself
# lives in the following cell.
print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")


------------------------------------------------------------------------------------
Training data songs for the user userid: 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
The Real Slim Shady
Forgive Me
Say My Name
Speechless
Ghosts 'n' Stuff (Original Instrumental Mix)
Missing You
Without Me
Somebody To Love
Just Lose It
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------


In [26]:
#Recommend songs for the user using personalized model
# Relies on user_id (users[5]) and is_model defined in earlier cells, so those
# cells must have been run first.
is_model.recommend(user_id)

No. of unique songs for the user: 9
no. of unique songs in the training set: 9567
Non zero values in cooccurence_matrix :60155


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mockingbird,0.057687,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,My Name Is,0.056503,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile,0.044817,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Terre Promise,0.044756,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Eenie Meenie,0.043378,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman,0.042695,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hailie's Song,0.041082,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Drop The World,0.04093,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Love Me,0.040303,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,OMG,0.040012,10


In [7]:
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
import numpy as np
import math as mt

In [10]:
# Trying SVD on a small toy example.
# Dimensions of the toy user-rating matrix: 4 items (columns) and 5 users (rows).
MAX_PID, MAX_UID = 4, 5

def computeSVD(urm, K):
    """Truncated SVD of a rating matrix, returned as sparse float32 factors.

    Parameters
    ----------
    urm : sparse matrix
        Matrix to factorise; handed to scipy.sparse.linalg.svds unchanged.
    K : int
        Number of singular triplets to compute (the ``k`` argument of svds).

    Returns
    -------
    (U, S, Vt) : tuple of csc_matrix
        ``U`` and ``Vt`` are the singular vectors as returned by svds (not
        transposed). ``S`` is a K x K diagonal matrix holding the SQUARE
        ROOTS of the singular values.
    """
    left_vectors, singular_values, right_vectors_t = svds(urm, k=K)

    # NOTE(review): S stores sqrt(sigma), not sigma, so a caller that forms
    # U * S * Vt does not obtain the usual rank-K reconstruction U * Sigma * Vt.
    # Kept as-is here; confirm this weighting is intended.
    # The square root is taken in float64 and narrowed to float32 afterwards.
    root_values = np.sqrt(np.asarray(singular_values, dtype=np.float64))
    S = csc_matrix(np.diag(root_values), dtype=np.float32)

    # Sparse float32 copies of the singular vectors
    U = csc_matrix(left_vectors, dtype=np.float32)
    Vt = csc_matrix(right_vectors_t, dtype=np.float32)

    return U, S, Vt



In [12]:
#Compute estimated ratings for the test user(s) and rank the items
def computeEstimatedRatings(urm, U, S, Vt, uTest, K, test, top_n=250):
    """Rank items for the test users by their SVD-estimated ratings.

    For each user index in ``uTest`` the estimated rating row is computed as
    ``U[user, :] @ (S @ Vt)`` and the item indices are sorted from highest to
    lowest estimate.

    Parameters
    ----------
    urm : sparse matrix
        Original rating matrix. Not read; kept for interface compatibility.
    U, S, Vt : scipy sparse matrices
        Factors as produced by ``computeSVD``.
    uTest : iterable of int
        Row indices (users) to score.
    K : int
        Number of latent factors. Not read; kept for interface compatibility.
    test : bool
        Not read; kept for interface compatibility.
    top_n : int or None, optional
        Maximum number of item indices to return (default 250, the value that
        used to be hard-coded). ``None`` returns every item.

    Returns
    -------
    numpy.ndarray
        1-D integer array of item (column) indices for the LAST user in
        ``uTest``, ordered by descending estimated rating. Empty when
        ``uTest`` is empty.
    """
    # Loop-invariant part of the product, computed once
    rightTerm = S @ Vt

    # Defined up front so an empty uTest yields an empty result instead of
    # failing on an unbound name.
    recom = np.empty(0, dtype=np.intp)

    for userTest in uTest:
        # Dense 1-D row of estimated ratings for this user. The row is used
        # directly, so no fixed-size (MAX_UID x MAX_PID) buffer is needed and
        # matrices of any shape work. Scores keep the factors' own precision
        # rather than being rounded to float16, which merged distinct scores
        # and overflowed above 65504.
        estimated = (U[userTest, :] @ rightTerm).toarray().ravel()

        # Item indices with the best estimated ratings first; the stable sort
        # makes ties resolve deterministically towards the lower item index.
        recom = np.argsort(-estimated, kind="stable")[:top_n]

    return recom

# Number of latent factors kept by the truncated SVD
K = 2

# Sample user-rating matrix: one row per user (5), one column per item (4)
urm = csc_matrix(
    np.array([
        [3, 1, 2, 3],
        [4, 3, 4, 3],
        [3, 2, 1, 5],
        [1, 6, 5, 2],
        [5, 0, 0, 0],
    ]),
    dtype=np.float32,
)

# Factorise the rating matrix
U, S, Vt = computeSVD(urm, K)

# The test user is row 4 of urm, whose ratings are [5, 0, 0, 0]
uTest = [4]
print(f"User id for whom recommendations are needed: {uTest[0]:d}")

# What is printed under this heading is the array returned by
# computeEstimatedRatings: item indices ordered by estimated rating, best first.
print("Predicted ratings:")
uTest_recommended_items = computeEstimatedRatings(urm, U, S, Vt, uTest, K, True)
print(uTest_recommended_items)


User id for whom recommendations are needed: 4
Predicted ratings:
[0 3 2 1]
