In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.decomposition import NMF
from numpy import save

In [7]:
# Track-level "like" events: keep the (user, track) pair and attach a constant
# score of 2 to every row (presumably the implicit-feedback weight for a like).
track_like = (
    pd.read_csv('../../raw_data/track_like.csv')
    .loc[:, ['USER_ID', 'TRACK_ID']]
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
    .assign(score=2)
)

In [8]:
# Track-level download events: keep the (user, track) pair and attach a constant
# score of 20 to every row (presumably the implicit-feedback weight for a download).
track_download = (
    pd.read_csv('../../raw_data/track_download.csv')
    .loc[:, ['USER_ID', 'TRACK_ID']]
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
    .assign(score=20)
)

In [9]:
# The purchase file mixes rows with and without a TRACK_ID; only rows that carry
# a TRACK_ID are kept here as track purchases.
album_track_purchase = pd.read_csv('../../raw_data/album_track_purchase.csv')
has_track = album_track_purchase['TRACK_ID'].notna()

track_purchase = (
    album_track_purchase.loc[has_track, ['USER_ID', 'TRACK_ID']]
    .assign(score=10)                # constant score of 10 per purchase row
    .drop_duplicates()               # one row per distinct (user, track) pair
    .astype({'TRACK_ID': 'int'})     # NaNs are gone, so the id can be an integer
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track'})
)

In [12]:
# Album likes already expanded to (user, track) rows in an earlier output file;
# each distinct pair gets a constant score of 1.
album_like = (
    pd.read_csv('outputs/user_track_album_like.csv')
    .loc[:, ['user', 'track']]
    .assign(score=1)
    .drop_duplicates()
)
album_like.shape

(1386766, 3)

In [13]:
# Album downloads already expanded to (user, track) rows in an earlier output
# file; each distinct pair gets a constant score of 10.
album_download = (
    pd.read_csv('outputs/user_track_album_download.csv')
    .loc[:, ['user', 'track']]
    .assign(score=10)
    .drop_duplicates()
)
album_download.shape

(15248356, 3)

In [14]:
# Album purchases already expanded to (user, track) rows in an earlier output
# file; each distinct pair gets a constant score of 5.
album_purchase = (
    pd.read_csv('outputs/user_track_album_purchase.csv')
    .loc[:, ['user', 'track']]
    .assign(score=5)
    .drop_duplicates()
)
album_purchase.shape

(11969473, 3)

In [15]:
# Peek at the first two album-like rows to confirm the user/track/score layout.
album_like.iloc[:2]

Unnamed: 0,user,track,score
0,62703789,2831412,1
1,62703789,2831413,1


In [16]:
# Peek at the first two album-download rows to confirm the user/track/score layout.
album_download.iloc[:2]

Unnamed: 0,user,track,score
0,3591560,2842079,10
1,3591560,2842080,10


In [11]:
# Peek at the first two album-purchase rows to confirm the user/track/score layout.
album_purchase.iloc[:2]

Unnamed: 0,user,track,score
0,3590040,2846416,5
1,3590040,2846417,5


In [12]:
# Print (rows, columns) for every interaction frame built above, one per line,
# in the order: track like/download/purchase, then album like/download/purchase.
for frame in (track_like, track_download, track_purchase,
              album_like, album_download, album_purchase):
    print(frame.shape)

(346043, 3)
(16266541, 3)
(3367956, 3)
(1386766, 3)
(15248356, 3)
(11969473, 3)


In [13]:
# Stack the three track-level event frames and collapse them to one row per
# (user, track) pair whose score is the sum of all contributing event scores.
# NOTE(review): the album_like / album_download / album_purchase frames loaded
# earlier are not part of this concat -- confirm whether that is intentional.
event_frames = [track_download, track_like, track_purchase]
total = pd.concat(event_frames)
total.columns = ['user', 'track', 'score']
total = (
    total
    .groupby(['user', 'track'])['score']
    .sum()
    .reset_index()
)

In [14]:
# Turn both id columns into categoricals so that `.cat.codes` yields dense
# 0-based integer positions for each distinct user and track.
for id_col in ('user', 'track'):
    total[id_col] = total[id_col].astype('category')

In [15]:
# Build the sparse interaction matrix: rows are user category codes, columns are
# track category codes, values are the summed scores as float32. The matrix is
# assembled in COO form and converted to CSR; its shape is inferred from the
# largest row/column code.
row_codes = total.user.cat.codes.values
col_codes = total.track.cat.codes.values
score_values = total.score.values.astype('float32')

mat = sparse.coo_matrix((score_values, (row_codes, col_codes))).tocsr()

In [16]:
# Factorise the interaction matrix with a 10-component NMF.
# W holds one 10-dimensional row per matrix row; H (the fitted components) holds
# one 10-dimensional column per matrix column. random_state=0 fixes the random
# initialisation so repeated runs use the same starting point.
nmf_settings = dict(n_components=10, init='random', random_state=0)
model = NMF(**nmf_settings)
W = model.fit_transform(mat)
H = model.components_

In [17]:
# Print the shapes of the aggregated interaction table and of both NMF factors,
# one per line, to check that the factor dimensions line up.
for obj in (total, W, H):
    print(obj.shape)

(16612906, 3)
(567821, 10)
(10, 96120)


In [18]:
# Dimensions of the sparse matrix that was handed to NMF (rows, columns).
n_rows, n_cols = mat.shape
print((n_rows, n_cols))

(567821, 96120)


In [19]:
# Persist the sparse interaction matrix to disk and immediately read it back.
# `sparse` is already bound to scipy.sparse by the notebook's import cell, so it
# is not re-imported here.
sparse.save_npz("mat.npz", mat)
mat_load = sparse.load_npz("mat.npz")

# Round-trip check: the reloaded matrix must have the same shape and no entry
# that differs from the in-memory one, otherwise the saved file is unusable.
assert mat_load.shape == mat.shape, "reloaded matrix has a different shape"
assert (mat_load != mat).nnz == 0, "reloaded matrix differs from the original"

In [20]:
# Persist both NMF factors as .npy files.
# W.npy receives the row-side factor W and H.npy receives the component matrix H.
# (Previously W was written to both files, so H was never saved.)
save('W.npy', W)
save('H.npy', H)