In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize

### Read needed files

In [2]:
# read interactions file
train_final = pd.read_csv('../input/train_final.csv', sep = '\t')
train_final['interaction'] = 1.0
train_final = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
numberInteractions = train_final.shape[0]
print(train_final.iloc[:20,])

        playlist_id  track_id  interaction
770769         7569    162463          1.0
925571         7569    421750          1.0
912580         7569    795606          1.0
852833         7569   1195736          1.0
287048         7569   2227105          1.0
92449          7569   2634448          1.0
519471         7569   2654612          1.0
173680         7569   2693660          1.0
903004         7569   2861386          1.0
863056         7614    415173          1.0
278077         7614   1384962          1.0
394480         7614   1609224          1.0
615615         7614   1614974          1.0
328664         7614   1714787          1.0
377085         7614   2141817          1.0
83840          7614   2285204          1.0
574199         7614   3361942          1.0
481594         7614   3504896          1.0
300753         7614   3711434          1.0
775317         7614   3833025          1.0


In [25]:
# read target playlists which should receive a recommendation
target_playlists = pd.read_csv('../input/target_playlists.csv', sep = '\t')
print(target_playlists.head())
print(target_playlists.shape)

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


In [34]:
target_tracks = list(pd.read_csv('../input/target_tracks.csv', sep = '\t')['track_id'])
target_tracks

[1316175,
 3885714,
 3091270,
 226759,
 230596,
 1093284,
 2293453,
 2994241,
 2871264,
 93835,
 1123498,
 2374491,
 1074412,
 2853886,
 821071,
 2709648,
 2933818,
 346111,
 66103,
 3638141,
 232971,
 2848406,
 3783264,
 2842670,
 3581982,
 3171591,
 2245591,
 871828,
 2713603,
 1175043,
 106982,
 3102164,
 1553840,
 2057826,
 3557550,
 2363848,
 1253516,
 503539,
 2617340,
 1790258,
 2956696,
 193328,
 303129,
 3608340,
 950730,
 1855305,
 3777248,
 1076555,
 1150702,
 3419354,
 726682,
 465271,
 2602747,
 1762406,
 1605205,
 1759620,
 2021651,
 709968,
 2972914,
 259990,
 1349043,
 3522735,
 1773876,
 2862475,
 106083,
 2820187,
 3612400,
 2881368,
 2083728,
 1935536,
 1585141,
 1444202,
 1248376,
 881160,
 1709123,
 1154985,
 2443976,
 2304545,
 4700754,
 1080646,
 1550729,
 793276,
 4486831,
 3868340,
 2642609,
 548753,
 2168844,
 5546,
 472924,
 1604514,
 3013929,
 2213950,
 717927,
 1245649,
 3250648,
 8287,
 881469,
 3242407,
 132362,
 2466040,
 2433491,
 426884,
 2843330,
 990

We seperate each column in a seperate list:

In [3]:
playlist_id = list(train_final['playlist_id'])
track_id = list(train_final['track_id'])
interaction = list(train_final['interaction'])
print(playlist_id[:10])
print(track_id[:10])
print(interaction[:10])

[7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7614]
[162463, 421750, 795606, 1195736, 2227105, 2634448, 2654612, 2693660, 2861386, 415173]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


Create index mapping for playlist and track IDs

In [4]:
playlist_id_unique = list(set(playlist_id))
track_id_unique = list(set(track_id))
playlist_id_unique.sort()
playlist_id_unique = pd.DataFrame(playlist_id_unique)
playlist_id_unique.reset_index(level=0, inplace=True)
playlist_id_unique.columns = ['index_playlist', 'playlist_id']
track_id_unique.sort()
track_id_unique = pd.DataFrame(track_id_unique)
track_id_unique.reset_index(level=0, inplace=True)
track_id_unique.columns = ['index_track', 'track_id']
print(track_id_unique.head())
print(playlist_id_unique.head())

   index_track  track_id
0            0       252
1            1       360
2            2       394
3            3       527
4            4       667
   index_playlist  playlist_id
0               0         7569
1               1         7614
2               2         7641
3               3         7692
4               4         7816


In [5]:
numPlaylists = len(playlist_id_unique)
numTracks = len(track_id_unique)

In [6]:
train_intermediate = train_final.merge(playlist_id_unique, how='inner', on='playlist_id')
train_index = train_intermediate.merge(track_id_unique, how='inner', on='track_id')
train_index = train_index.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
train_index

Unnamed: 0,playlist_id,track_id,interaction,index_playlist,index_track
0,7569,162463,1.0,0,3484
87,7569,421750,1.0,0,10864
116,7569,795606,1.0,0,20062
125,7569,1195736,1.0,0,29176
195,7569,2227105,1.0,0,55647
198,7569,2634448,1.0,0,66029
241,7569,2654612,1.0,0,66596
253,7569,2693660,1.0,0,67590
263,7569,2861386,1.0,0,73675
298,7614,415173,1.0,1,10624


In [7]:
index_playlist = list(train_index['index_playlist'])
index_track = list(train_index['index_track'])
interaction = list(train_index['interaction'])
print(index_playlist[:10])
print(index_track[:10])
print(interaction[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[3484, 10864, 20062, 29176, 55647, 66029, 66596, 67590, 73675, 10624]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [8]:
URM_all = sps.coo_matrix((interaction, (index_playlist, index_track)))
URM_all.tocsr()
URM_all

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in COOrdinate format>

In [None]:
URM = np.zeros(shape=(45649,99999))
print(URM.shape)
print(URM[0,2])

In [None]:
start_time = time.time()
for index, row in train_index.iterrows():
    URM[row['index_playlist'], row['index_track']] = row['interaction']
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
mean_URM = np.mean(URM, axis = 1)

In [None]:
mean_URM[:10]
mean_URM.shape

In [None]:
demeaned_URM = URM - mean_URM.reshape(-1, 1)

In [None]:
# taking too long
start_time = time.time()
U, sigma, Vt = svds(demeaned_URM, k = 50)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# kernel dies because matrix is not sparse anymore
demeaned_URM_all = URM_all - URM_all.mean(axis = 1)

In [9]:
start_time = time.time()
U, s, Vt = svds(URM_all, k = 50)
print("--- %s seconds ---" % (time.time() - start_time))

--- 4.0392842292785645 seconds ---


In [10]:
s_diag = np.diag(s)

In [11]:
Us = np.dot(U, s_diag)
Us.shape

(45649, 50)

In [12]:
start_time = time.time()
URM_estm = np.dot(Us, Vt)
print("--- %s seconds ---" % (time.time() - start_time))

--- 122.29415702819824 seconds ---


In [15]:
print(URM_estm.shape)
print(URM_estm[:10,:10])
print(URM_estm[0,:].max())

(45649, 99999)
[[  5.81494202e-05   1.77410812e-05  -2.46214609e-04   6.01678510e-05
    5.33451296e-04   1.05360826e-04   1.13592088e-04   7.80126434e-06
    3.70764055e-05   6.80325116e-06]
 [ -3.71196522e-04   9.81899151e-04   2.83356814e-04   9.18600462e-05
   -2.30382209e-04  -3.55141076e-04  -1.74383469e-04  -2.95536379e-04
   -8.24466316e-04   8.81044619e-05]
 [  1.84603545e-04  -1.39226204e-04  -9.92982645e-05   4.61793417e-05
   -3.52985639e-05   4.18437211e-04  -6.68809021e-05   4.22801439e-04
    1.39319761e-04   5.10532905e-05]
 [  5.71655875e-05   4.66534103e-04  -1.00827194e-04  -7.25278409e-05
    2.88187117e-04   2.22843352e-04   1.19197626e-04   1.29410012e-04
    5.66197509e-05  -2.84272579e-05]
 [ -1.53325782e-04   8.49497258e-05  -1.66764096e-05   4.46740507e-05
   -1.51105629e-05  -1.36704899e-05   2.92211141e-05  -7.88016852e-05
    2.65979904e-05   4.07161985e-05]
 [  4.06218404e-03  -3.24275523e-04   7.83355342e-05  -4.93838660e-05
    4.27323937e-04   2.2617931

In [26]:
# filter interaction dataframe, to retain only target playlists
train = train_index.merge(target_playlists, how='inner', on='playlist_id')
print(train.shape)
print(train['playlist_id'].nunique())

(362661, 5)
10000


In [29]:
# aggregate to playlist level and coerce tracks in that playlist to list
train_agg1 = train.groupby(by='playlist_id').track_id.apply(list).to_frame()
train_agg1.reset_index(level=0, inplace=True)
train_agg2 = train.groupby(by='playlist_id').index_track.apply(list).to_frame()
train_agg2.reset_index(level=0, inplace=True)
train_agg = train_agg1.merge(train_agg2, how='inner', on='playlist_id')
train_agg = train_agg.merge(playlist_id_unique, how='inner', on='playlist_id')
train_agg['recommend'] = np.empty((len(train_agg), 0)).tolist()
train_agg

Unnamed: 0,playlist_id,track_id,index_track,index_playlist,recommend
0,7614,"[415173, 1384962, 1609224, 1614974, 1714787, 2...","[10624, 34222, 39285, 39451, 42027, 53522, 573...",1,[]
1,7692,"[88210, 266898, 280844, 302730, 384386, 551534...","[1566, 6575, 6956, 7611, 9801, 14750, 16678, 2...",3,[]
2,7816,"[126414, 245217, 513821, 611201, 767305, 84510...","[2414, 6122, 13875, 15971, 19930, 21225, 24827...",4,[]
3,8225,"[13881, 261448, 311923, 500672, 676393, 906185...","[250, 6480, 7879, 13464, 17879, 22939, 23541, ...",12,[]
4,8337,"[451881, 1157460, 1205536, 1210884, 3131838, 3...","[11904, 28209, 29601, 29880, 81701, 82528]",15,[]
5,8369,"[701941, 826559, 1042548, 1886070, 2165768, 26...","[18870, 20841, 24863, 46786, 54184, 67822, 679...",16,[]
6,8446,"[161823, 276258, 287155, 307055, 331475, 35788...","[3419, 6861, 7156, 7762, 8321, 9265, 9715, 125...",18,[]
7,8559,"[396062, 949534, 1188811, 1831605, 2142207, 34...","[10044, 23803, 29006, 45385, 53536, 88748]",21,[]
8,8636,"[87720, 98950, 355175, 410709, 433616, 434786,...","[1545, 1861, 9141, 10438, 11327, 11361, 12842,...",22,[]
9,9344,"[497514, 693228, 1075400, 1452231, 1529980, 18...","[13344, 18653, 25813, 35600, 37371, 46921, 523...",32,[]


### Find tracks to recommend in URM_estm

In [None]:
for index, row in train_agg.iterrows():
    # get row from URM_estm
    estm = pd.DataFrame(URM_estm[row['index_playlist'],:])
    estm.reset_index(level=0, inplace=True)
    estm.columns = ['index_track','pred']
    

### Find tracks which appear in less than 5 playlists and are not in the target track set

In [37]:
np.arange(99999)

(99999,)

In [44]:
pd.DataFrame(URM_estm[0,:])

Unnamed: 0,0
0,0.000058
1,0.000018
2,-0.000246
3,0.000060
4,0.000533
5,0.000105
6,0.000114
7,0.000008
8,0.000037
9,0.000007


In [41]:
np.concatenate((URM_estm[0,:].reshape(-1,1), np.arange(99999).reshape(-1,1)), axis = 1)

array([[  5.81494202e-05,   0.00000000e+00],
       [  1.77410812e-05,   1.00000000e+00],
       [ -2.46214609e-04,   2.00000000e+00],
       ..., 
       [  2.87479099e-05,   9.99960000e+04],
       [  3.63382973e-04,   9.99970000e+04],
       [  5.82191567e-05,   9.99980000e+04]])