## Importing Libraries

In [8]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Reading Data

In [9]:
members = pd.read_csv("members.csv")
submission = pd.read_csv("sample_submission.csv", nrows = 50000)
songs_extra_info = pd.read_csv("song_extra_info.csv")
songs = pd.read_csv("songs.csv", nrows = 50000)
train = pd.read_csv("train.csv", nrows = 50000)
test = pd.read_csv("test.csv")

## Exploring Data

In [10]:
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


In [11]:
submission.head()

Unnamed: 0,id,target
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [12]:
songs_extra_info.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


## Data Preprocessing

In [13]:
songs_extra_info.drop('isrc', axis = 1)

Unnamed: 0,song_id,name
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網
...,...,...
2295966,hLnetpF6UbPg28sSfXnPE2vsdaGsLvddlXEdJR4VTIA=,Deep Breathing
2295967,N+6vJ8actKQm0S3Fpf4elipTjoAo9ev28aA5FJN5e40=,In Hiding
2295968,pv35uG0ts05mWtirM/AMOWEzbHxIVart5ZzRXqKUY1c=,Il Est Ne Le Divin Enfant
2295969,QSySnm8jt2Go7byY34/PxsZP6dPCins2j2cyYquNhBo=,The Exodus Song


In [14]:
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [15]:
songs.isnull().sum()

song_id            0
song_length        0
genre_ids        899
artist_name        0
composer       20998
lyricist       36296
language           0
dtype: int64

In [16]:
songs['genre_ids'].fillna(' ',inplace = True)
songs['composer'].fillna(' ',inplace = True)
songs['lyricist'].fillna(' ',inplace = True)

In [17]:
songs.isnull().sum()

song_id        0
song_length    0
genre_ids      0
artist_name    0
composer       0
lyricist       0
language       0
dtype: int64

In [18]:
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [19]:
train.isnull().sum()

msno                     0
song_id                  0
source_system_tab      132
source_screen_name    1817
source_type             85
target                   0
dtype: int64

## Data Transformation

In [20]:
train = train.drop(['source_system_tab', 'source_screen_name', 'source_type'], axis=1)

In [21]:
train.head()

Unnamed: 0,msno,song_id,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1


In [22]:
train.rename(columns = {'msno':'user_id'}, inplace = True)
train.head()

Unnamed: 0,user_id,song_id,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1


In [23]:
test.head()

Unnamed: 0,id,msno,song_id,source_system_tab,source_screen_name,source_type
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,,song-based-playlist
3,3,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,radio,Radio,radio
4,4,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=,radio,Radio,radio


## Merging & Cleaning Data

In [24]:
df = train.merge(songs,on = 'song_id')
df.head()

Unnamed: 0,user_id,song_id,target,song_length,genre_ids,artist_name,composer,lyricist,language
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,,52.0
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,,52.0
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,,52.0
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,,52.0
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,,52.0


In [25]:
df = df.drop(['song_length', 'language'], axis = 1)


In [26]:
df.head()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,


In [27]:
df = df.merge(songs_extra_info, on = 'song_id')
df.head()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist,name,isrc
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda,USUM71601094
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda,USUM71601094
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda,USUM71601094
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda,USUM71601094
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda,USUM71601094


In [28]:
df = df.drop('isrc', axis = 1)
df.head()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist,name
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda


In [29]:
df.rename(columns = {'name' : 'song_name'}, inplace = True)
df.head()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist,song_name
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,Desiigner,Sidney Selby| Adnan Khan,,Panda


In [30]:
df['genre_ids'].value_counts()

465             4593
458             1915
1609             437
444              331
921              290
                ... 
880                1
388|940            1
2122|947|958       1
726|242            1
516                1
Name: genre_ids, Length: 87, dtype: int64

In [31]:
df['genre_ids'] = df['genre_ids'].str.replace('|', ' ', regex = True)
df['genre_ids'].value_counts()

465             4593
458             1915
1609             437
444              331
921              290
                ... 
880                1
388 940            1
2122 947 958       1
726 242            1
516                1
Name: genre_ids, Length: 87, dtype: int64

In [32]:
df['artist_name'] = df['artist_name'].str.replace('|', ' ', regex = True)
df['artist_name'] = df['artist_name'].str.replace('/', ' ', regex = True)

df['composer'] = df['composer'].str.replace('|', ' ', regex = True)
df['composer'] = df['composer'].str.replace('/', ' ', regex = True)

df['lyricist'] = df['lyricist'].str.replace('|', ' ', regex = True)
df['lyricist'] = df['lyricist'].str.replace('/', ' ', regex = True)

In [33]:
df['artist_name'] = df['artist_name'].str.lower()
df['composer'] = df['composer'].str.lower()
df['lyricist'] = df['lyricist'].str.lower()

In [34]:
df['song_details'] = df ['artist_name'] + ' ' + df ['composer'] + ' ' + df ['lyricist']
df.tail()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
9058,px8eWjJqSqveznpopEtVWYhghz+wXqA6jqfy/IJmZ0g=,dyWJocgFvrmohlBad2IhPjVEHrG+55Ro+OfXbWGqEKA=,1,359,the 1975,george daniel matthew healy adam hann ross ...,,UGH!,the 1975 george daniel matthew healy adam ha...
9059,evsClIhkoLb1wrhXAo3kM1Po0E19D1OQsGcIULyPQKY=,3MO0Maz42/Bxm3Qbb2tGCyN24tRY2CgbT3Q65BdKCA8=,1,458,羅志祥 (show lo),羅志祥,羅銘松,獅子吼,羅志祥 (show lo) 羅志祥 羅銘松
9060,HmHoSnryX7Ih9wQWBcMBTZdpDoyZgx8xmk6T0jOjHSA=,VnRwD2SeX/g66Nq7Lr7gDP/P6gUnZL7YVxQq26vAzeQ=,1,2122,clementine,maurice gibb robin gibb barry gibb,,Melody Fair,clementine maurice gibb robin gibb barry gibb
9061,p56/iJa3NyqT0XayXDQhMVLqRF532rLyuyksGdsmwkE=,ISZdpC3A/+xvUw0I6Kw0z03/kaK3AmnqV9FU2p9M8NU=,1,465,梁靜茹 (fish leong),,,天燈,梁靜茹 (fish leong)
9062,EyU5ZaxILiSKaVo5w3XN83lEddVPTVT8mf0FRid/4fE=,MMdV9Hkl/I+ip/+TQ9+e8ZqgBxX6ZhowTqf4wpywFuk=,0,465,jason mraz,,,Sunshine Song,jason mraz


In [35]:
df.user_id.value_counts()

EozJegFxTFIWDb9aJ7O8kSUHAgx4ZIvqf7IuN5Zck50=    51
p56/iJa3NyqT0XayXDQhMVLqRF532rLyuyksGdsmwkE=    40
V5U4EGk2kaSKaUGSwhU6g3HBefxflEvAy1vWPu6UBQs=    36
W9NYSCff57nmfyYCiX6IbW0/G3YuwC18h/rld+BGxMY=    33
KRMGuUAJmY6wHmXjvJSYiIKht24aWw9tKPgcQbufhic=    27
                                                ..
EQUABG449zZu6A8xFYYnyk1g+M870Gvk6D4IN9z68Ic=     1
/QnrMls40DnnRzSuNezemNFv0UBVo0qy2899Do/mciI=     1
OKjyodiVXM56Jup+Q2LOTp3gPgsPY/rRI701VoF2qvA=     1
gzvmbIJUpL27IpnNvpGHRAE50WYIgnShjTdbdfgpemI=     1
EyU5ZaxILiSKaVo5w3XN83lEddVPTVT8mf0FRid/4fE=     1
Name: user_id, Length: 3012, dtype: int64

In [36]:
other_df = df.copy()
other_df.head()

Unnamed: 0,user_id,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
1,hZyOA+0yqClPLt6uIEndf8fG8szH/95eKMbaxLE5z30=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
2,XMkoCF4lkuInFshS1maokU4hZEQI2L1ZFQ3Cf7xg184=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
3,kpCpM9wsXhj+Maa6kRAvUJJzE1dyeyPmBDtPoUr4a1o=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
4,jiMKblWJS85kSNBCNvyW9Vz3nlfDfQQbuvPczyWMY1U=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan


In [37]:
other_df.shape

(9063, 9)

In [38]:
other_df.song_details.duplicated().sum()

6550

In [39]:
other_df = other_df.drop('user_id', axis = 1)
other_df.head()

Unnamed: 0,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
0,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
1,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
2,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
3,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
4,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan


In [40]:
other_df.duplicated().sum()

5243

In [41]:
other_df = other_df.drop_duplicates()
other_df.head()

Unnamed: 0,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
0,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
7,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,0,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
8,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,1,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
57,TE5ASd0CqSedf7+LBnqf5Z8Kfm9X9wfj8t2IXZNuDOw=,1,465,s.h.e,,藍小邪,宇宙小姐,s.h.e 藍小邪
58,2bj5oqCPPzY6E0TPgwySkfj8/l/c+DVQBqnABx0qPSk=,1,1259,蛋堡 (soft lipa),杜振熙,杜振熙 （ot：白鴿）伍佰,史詩,蛋堡 (soft lipa) 杜振熙 杜振熙 （ot：白鴿）伍佰


In [42]:
other_df.reset_index(inplace = True)
other_df.head()

Unnamed: 0,index,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
0,0,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
1,7,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,0,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
2,8,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,1,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
3,57,TE5ASd0CqSedf7+LBnqf5Z8Kfm9X9wfj8t2IXZNuDOw=,1,465,s.h.e,,藍小邪,宇宙小姐,s.h.e 藍小邪
4,58,2bj5oqCPPzY6E0TPgwySkfj8/l/c+DVQBqnABx0qPSk=,1,1259,蛋堡 (soft lipa),杜振熙,杜振熙 （ot：白鴿）伍佰,史詩,蛋堡 (soft lipa) 杜振熙 杜振熙 （ot：白鴿）伍佰


## TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer = 'word', stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(other_df['song_details'])

In [44]:
tfidf_matrix

<3820x5017 sparse matrix of type '<class 'numpy.float64'>'
	with 16892 stored elements in Compressed Sparse Row format>

In [45]:
other_df.shape

(3820, 9)

In [46]:
other_df.reset_index(drop = True, inplace = True)
other_df.head(30)

Unnamed: 0,index,song_id,target,genre_ids,artist_name,composer,lyricist,song_name,song_details
0,0,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1,1259,desiigner,sidney selby adnan khan,,Panda,desiigner sidney selby adnan khan
1,7,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,0,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
2,8,u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=,1,465,林俊傑 (jj lin),林俊傑,林怡鳳,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,林俊傑 (jj lin) 林俊傑 林怡鳳
3,57,TE5ASd0CqSedf7+LBnqf5Z8Kfm9X9wfj8t2IXZNuDOw=,1,465,s.h.e,,藍小邪,宇宙小姐,s.h.e 藍小邪
4,58,2bj5oqCPPzY6E0TPgwySkfj8/l/c+DVQBqnABx0qPSk=,1,1259,蛋堡 (soft lipa),杜振熙,杜振熙 （ot：白鴿）伍佰,史詩,蛋堡 (soft lipa) 杜振熙 杜振熙 （ot：白鴿）伍佰
5,59,2bj5oqCPPzY6E0TPgwySkfj8/l/c+DVQBqnABx0qPSk=,0,1259,蛋堡 (soft lipa),杜振熙,杜振熙 （ot：白鴿）伍佰,史詩,蛋堡 (soft lipa) 杜振熙 杜振熙 （ot：白鴿）伍佰
6,66,Prbb1SisBE80rnDae36BPKvkulEiXMssQvBfVCsHr38=,1,465,s.h.e,八三夭阿璞,五月天阿信,SHERO,s.h.e 八三夭阿璞 五月天阿信
7,67,Prbb1SisBE80rnDae36BPKvkulEiXMssQvBfVCsHr38=,0,465,s.h.e,八三夭阿璞,五月天阿信,SHERO,s.h.e 八三夭阿璞 五月天阿信
8,68,vM08WBQRO9eZo1K+qTJmjuw2IqbuA3L65ojbGwB4GI0=,0,921,various artists,,,零,various artists
9,69,skehue/d/R59G71dXYpntDwdjRRPlweN3JE8g40TgZU=,1,458,莊心妍,鄭建浩,鄭建浩,我過的很好,莊心妍 鄭建浩 鄭建浩


## Calculating Similarity

In [47]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(tfidf_matrix)

In [48]:
cosine_similarities

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [65]:
sorted(list(enumerate(cosine_similarities[1])),reverse = True, key = lambda x:x[1])[1:15]

[(2, 1.0000000000000002),
 (770, 1.0000000000000002),
 (771, 1.0000000000000002),
 (869, 0.8625226838909045),
 (2936, 0.8250798074222485),
 (686, 0.7992968199759728),
 (687, 0.7992968199759728),
 (2308, 0.7797161201608636),
 (118, 0.7749943507462418),
 (119, 0.7749943507462418),
 (1417, 0.7749943507462418),
 (3725, 0.7554733792970716),
 (1680, 0.748416726831863),
 (1681, 0.748416726831863)]

## Song Recommendation

In [85]:
def recommend(song):
    song_index = other_df[other_df['song_name'] == song].index[0]
    distances = cosine_similarities[song_index]
    song_list = sorted(list(enumerate(cosine_similarities[2])),reverse = True, key = lambda x:x[1])[1:10]
    for i in song_list:
        print(i[0])

In [86]:
recommend('Panda')

2
770
771
869
2936
686
687
2308
118


In [87]:
def recommend(song):
    song_index = other_df[other_df['song_name'] == song].index[0]
    distances = cosine_similarities[song_index]
    song_list = sorted(list(enumerate(cosine_similarities[2])),reverse = True, key = lambda x:x[1])[1:10]
    for i in song_list:
        print(other_df.iloc[i[0]].song_name)

In [88]:
recommend('Panda')

手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋
生生 (The Beacon)
生生 (The Beacon)
莎士比亞的天份
序曲：海邊 初 (The Beach Arrival)
浪漫血液 (The Romantic)
浪漫血液 (The Romantic)
黑武士
西界
