# Building a song recommender

In [1]:
%matplotlib inline

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import time
import joblib
import Recommenders as Recommenders
import Evaluation as Evaluation

# Load music data

In [2]:
song_df_1 = pd.read_table('10000.txt',header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

song_df_2 =  pd.read_csv('song_data.csv')

# Hợp nhất 2 file dữ liệu
song_df = pd.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left") 

# Explore data

Hiển thị số lần người dùng đã nghe 1 bài hát, cũng như thông tin chi tiết của bài hát

In [3]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


## Tổng độ dài bộ dữ liệu

In [4]:
len(song_df)

2000000

## Chọn tập dữ liệu
10000 mẫu dữ liệu đầu tiên

In [5]:
song_df = song_df.head(10000)

# Hợp nhất cột tên bài hát và tên nghệ sĩ thành một cột duy nhất
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']
song_df.head(5)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly - Foo Fighters


## Hiển thị các bài hát phổ biến nhất trong tập dữ liệu

In [6]:
# Nhóm theo số lượng lượt nghe tăng dần
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
# Tính tổng số lượt nghe
grouped_sum = song_grouped['listen_count'].sum()
# Tính tỉ lệ phần trăm = số lượt nghe của mỗi bài hát / tổng số lượt nghe
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
# Sort theo thứ tự tăng dần về mức độ phổ biến
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch - Harmonia,45,0.45
4678,Undo - Björk,32,0.32
5105,You're The One - Dwight Yoakam,32,0.32
1071,Dog Days Are Over (Radio Edit) - Florence + Th...,28,0.28
3655,Secrets - OneRepublic,28,0.28
...,...,...,...
5139,high fives - Four Tet,1,0.01
5140,in white rooms - Booka Shade,1,0.01
5143,paranoid android - Christopher O'Riley,1,0.01
5149,¿Lo Ves? [Piano Y Voz] - Alejandro Sanz,1,0.01


## Đếm số lượng người dùng trong tập dữ liệu

In [7]:
users = song_df['user_id'].unique()

In [8]:
len(users)
# Trả về 365 người dùng

365

## Đếm số lượng bài hát trong tập dữ liệu

In [9]:
songs = song_df['song'].unique()
len(songs)
# Trả về 5151 bài hát

5151

# Create a song recommender

In [10]:
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
print(train_data.head(5))

                                       user_id             song_id  \
7389  94d5bdc37683950e90c56c9b32721edb5d347600  SOXNZOW12AB017F756   
9275  1012ecfd277b96487ed8357d02fa8326b13696a5  SOXHYVQ12AB0187949   
2995  15415fa2745b344bce958967c346f2a89f792f63  SOOSZAZ12A6D4FADF8   
5316  ffadf9297a99945c0513cd87939d91d8b602936b  SOWDJEJ12A8C1339FE   
356   5a905f000fc1ff3df7ca807d57edb608863db05d  SOAMPRJ12A8AE45F38   

      listen_count                 title  \
7389             2      Half Of My Heart   
9275             1  The Beautiful People   
2995             1     Sanctify Yourself   
5316             4     Heart Cooks Brain   
356             20                 Rorol   

                                                release      artist_name  \
7389                                     Battle Studies       John Mayer   
9275             Antichrist Superstar (Ecopac Explicit)   Marilyn Manson   
2995                             Glittering Prize 81/92     Simple Minds   
5316  Ever

## Đề xuất dựa trên mức độ phổ biến

In [11]:
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'user_id', 'song')
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
3194,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Sehr kosmisch - Harmonia,37,1.0
4083,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Undo - Björk,27,2.0
931,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Dog Days Are Over (Radio Edit) - Florence + Th...,24,3.0
4443,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You're The One - Dwight Yoakam,24,4.0
3034,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Revelry - Kings Of Leon,21,5.0
3189,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Secrets - OneRepublic,21,6.0
4112,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Use Somebody - Kings Of Leon,21,7.0
1207,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Fireflies - Charttraxx Karaoke,20,8.0
1577,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hey_ Soul Sister - Train,19,9.0
1626,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Horn Concerto No. 4 in E flat K495: II. Romanc...,19,10.0


## Đề xuất dựa trên Item similarity based collaborative filtering model

Áp dụng cho người dùng cụ thể

In [12]:
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'song')

user_id = users[5]
user_items = is_model.get_user_items(user_id)

print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
Just Lose It - Eminem
Without Me - Eminem
16 Candles - The Crests
Speechless - Lady GaGa
Push It - Salt-N-Pepa
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
Say My Name - Destiny's Child
My Dad's Gone Crazy - Eminem / Hailie Jade
The Real Slim Shady - Eminem
Somebody To Love - Justin Bieber
Forgive Me - Leona Lewis
Missing You - John Waite
Ya Nada Queda - Kudai
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 13
no. of unique songs in the training set: 4483
Non zero values in cooccurence_matrix :2097


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman - Eminem / Dina Rae,0.088692,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mockingbird - Eminem,0.067663,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm Back - Eminem,0.065385,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile - Justin Bieber,0.064525,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Here Without You - 3 Doors Down,0.062293,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hellbound - J-Black & Masta Ace,0.055769,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Seed (2.0) - The Roots / Cody Chestnutt,0.052564,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm The One Who Understands (Edit Version) - War,0.052564,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Falling - Iration,0.052564,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Armed And Ready (2009 Digital Remaster) - The ...,0.052564,10


Áp dụng cho bài hát cụ thể

In [13]:
is_model.get_similar_items(['Yellow - Coldplay'])

no. of unique songs in the training set: 4483
Non zero values in cooccurence_matrix :969


Unnamed: 0,user_id,song,score,rank
0,,Fix You - Coldplay,0.375,1
1,,Creep (Explicit) - Radiohead,0.291667,2
2,,Clocks - Coldplay,0.28,3
3,,Seven Nation Army - The White Stripes,0.25,4
4,,Paper Planes - M.I.A.,0.208333,5
5,,Halo - Beyoncé,0.2,6
6,,The Funeral (Album Version) - Band Of Horses,0.181818,7
7,,In My Place - Coldplay,0.181818,8
8,,Kryptonite - 3 Doors Down,0.166667,9
9,,When You Were Young - The Killers,0.166667,10


# Matrix Factorization based Recommender System

In [47]:
# Tính tổng bài hát của từng người dùng
song2Sum  = song_df[['user_id','listen_count']].groupby('user_id').sum().reset_index()
song2Sum.rename(columns={'listen_count':'total_listen_count'},inplace=True)
song_df = pd.merge(song_df,song2Sum)
# Tính % số lần nghe của từng bài hát = số lần nghe / tổng số lần nghe của người dùng
song_df['fractional_listen_count'] = song_df['listen_count']/song_df['total_listen_count']

In [48]:
song_df.head

<bound method NDFrame.head of                                        user_id             song_id  \
0     b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995   
1     b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B   
2     b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0   
3     b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D   
4     b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273   
...                                        ...                 ...   
9995  15cc706a7f24975ca831aaaf297bf0392746b3fe  SOFSETB12A8C134038   
9996  15cc706a7f24975ca831aaaf297bf0392746b3fe  SOHIROU12AB01852AF   
9997  15cc706a7f24975ca831aaaf297bf0392746b3fe  SOOAVGC12AB01821EC   
9998  15cc706a7f24975ca831aaaf297bf0392746b3fe  SOPKEIV12AB018220D   
9999  15cc706a7f24975ca831aaaf297bf0392746b3fe  SOQSUMQ12AB01821F7   

      listen_count                       title                        release  \
0                1                    The Cove  

In [70]:
from scipy.sparse import coo_matrix

small_set = song_df

user_codes = small_set.user_id.drop_duplicates().reset_index()
song_codes = small_set.song_id.drop_duplicates().reset_index()
user_codes.rename(columns={'index':'user_index'}, inplace=True)
song_codes.rename(columns={'index':'song_index'}, inplace=True)
song_codes['so_index_value'] = list(song_codes.index)
user_codes['us_index_value'] = list(user_codes.index)
small_set = pd.merge(small_set,song_codes,how='left')
small_set = pd.merge(small_set,user_codes,how='left')
mat_candidate = small_set[['us_index_value','so_index_value','fractional_listen_count']]
data_array = mat_candidate.fractional_listen_count.values
row_array = mat_candidate.us_index_value.values
col_array = mat_candidate.so_index_value.values

data_sparse = coo_matrix((data_array, (row_array, col_array)), dtype = float)
data_sparse

<365x5175 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in COOrdinate format>

In [79]:
user_codes[user_codes.user_id =='15cc706a7f24975ca831aaaf297bf0392746b3fe']

Unnamed: 0,user_index,user_id,us_index_value
364,9995,15cc706a7f24975ca831aaaf297bf0392746b3fe,364


In [93]:
import math as mt
import numpy as np
from scipy.sparse.linalg import *
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
import numpy as np


def compute_svd(urm, K):
    U, s, Vt = svds(urm, K)

    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(U, dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    
    return U, S, Vt

def compute_estimated_matrix(urm, U, S, Vt, uTest, K, MAX_PID, MAX_UID,test):
    rightTerm = S*Vt 
    max_recommendation = 250
    estimatedRatings = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float16)
    recomendRatings = np.zeros(shape=(MAX_UID,max_recommendation ), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        estimatedRatings[userTest, :] = prod.todense()
        recomendRatings[userTest, :] = (-estimatedRatings[userTest, :]).argsort()[:max_recommendation]
    return recomendRatings

K=50
utilityMatrix = data_sparse
MAX_PID = utilityMatrix.shape[1]
MAX_UID = utilityMatrix.shape[0]

U, S, Vt = compute_svd(utilityMatrix, K)

In [95]:
uTest = [100,200]

uTest_recommended_items = compute_estimated_matrix(utilityMatrix, U, S, Vt, uTest, K, MAX_PID, MAX_UID, True)

for  user  in  uTest:
    print("\n-------------------------------------- \
    \nRecommendation for user with id: {}\n--------------------------------------\n". format(user))
    rank_value = 1
    for i in uTest_recommended_items[user,0:10]:
        song_details = small_set[small_set.so_index_value == i].drop_duplicates('so_index_value')[['title','artist_name']]
        print("#{}: {} By {}".format(rank_value, list(song_details['title'])[0],list(song_details['artist_name'])[0]))
        rank_value+=1


--------------------------------------     
Recommendation for user with id: 100
--------------------------------------

#1: Undo By Björk
#2: Sehr kosmisch By Harmonia
#3: Revelry By Kings Of Leon
#4: What If I Do? By Foo Fighters
#5: Calabria 2007 By ENUR Feat NATASJA
#6: Alejandro By Lady GaGa
#7: Secrets By OneRepublic
#8: Marry Me By Train
#9: Sayonara-Nostalgia By Base Ball Bear
#10: Dog Days Are Over (Radio Edit) By Florence + The Machine

--------------------------------------     
Recommendation for user with id: 200
--------------------------------------

#1: Invalid By Tub Ring
#2: Los Aviones By Andres Calamaro
#3: Fuck Kitty By Frumpies
#4: Drunk Kid Catholic By Bright Eyes
#5: Revelry By Kings Of Leon
#6: Got Money By Lil Wayne / T-Pain
#7: Rise To The Challenge By Asian Dub Foundation
#8: The Only Moment We Were Alone By Explosions In The Sky
#9: Greece 2000 By Three Drives
#10: Fuck Kitty By Frumpies
