In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize

### Read needed files

In [2]:
# read interactions file
train_final = pd.read_csv('../input/train_final.csv', sep = '\t')
train_final['interaction'] = 1.0
train_final = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
numberInteractions = train_final.shape[0]
print(train_final.iloc[:20,])

        playlist_id  track_id  interaction
770769         7569    162463          1.0
925571         7569    421750          1.0
912580         7569    795606          1.0
852833         7569   1195736          1.0
287048         7569   2227105          1.0
92449          7569   2634448          1.0
519471         7569   2654612          1.0
173680         7569   2693660          1.0
903004         7569   2861386          1.0
863056         7614    415173          1.0
278077         7614   1384962          1.0
394480         7614   1609224          1.0
615615         7614   1614974          1.0
328664         7614   1714787          1.0
377085         7614   2141817          1.0
83840          7614   2285204          1.0
574199         7614   3361942          1.0
481594         7614   3504896          1.0
300753         7614   3711434          1.0
775317         7614   3833025          1.0


In [3]:
# read target playlists which should receive a recommendation
target_playlists = pd.read_csv('../input/target_playlists.csv', sep = '\t')
print(target_playlists.head())
print(target_playlists.shape)

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


In [4]:
# read target tracks
target_tracks = list(pd.read_csv('../input/target_tracks.csv', sep = '\t')['track_id'])
target_tracks

[1316175,
 3885714,
 3091270,
 226759,
 230596,
 1093284,
 2293453,
 2994241,
 2871264,
 93835,
 1123498,
 2374491,
 1074412,
 2853886,
 821071,
 2709648,
 2933818,
 346111,
 66103,
 3638141,
 232971,
 2848406,
 3783264,
 2842670,
 3581982,
 3171591,
 2245591,
 871828,
 2713603,
 1175043,
 106982,
 3102164,
 1553840,
 2057826,
 3557550,
 2363848,
 1253516,
 503539,
 2617340,
 1790258,
 2956696,
 193328,
 303129,
 3608340,
 950730,
 1855305,
 3777248,
 1076555,
 1150702,
 3419354,
 726682,
 465271,
 2602747,
 1762406,
 1605205,
 1759620,
 2021651,
 709968,
 2972914,
 259990,
 1349043,
 3522735,
 1773876,
 2862475,
 106083,
 2820187,
 3612400,
 2881368,
 2083728,
 1935536,
 1585141,
 1444202,
 1248376,
 881160,
 1709123,
 1154985,
 2443976,
 2304545,
 4700754,
 1080646,
 1550729,
 793276,
 4486831,
 3868340,
 2642609,
 548753,
 2168844,
 5546,
 472924,
 1604514,
 3013929,
 2213950,
 717927,
 1245649,
 3250648,
 8287,
 881469,
 3242407,
 132362,
 2466040,
 2433491,
 426884,
 2843330,
 990

We seperate each column in a seperate list:

In [5]:
playlist_id = list(train_final['playlist_id'])
track_id = list(train_final['track_id'])
interaction = list(train_final['interaction'])
print(playlist_id[:10])
print(track_id[:10])
print(interaction[:10])

[7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7614]
[162463, 421750, 795606, 1195736, 2227105, 2634448, 2654612, 2693660, 2861386, 415173]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


Create index mapping for playlist and track IDs

In [6]:
playlist_id_unique = list(set(playlist_id))
track_id_unique = list(set(track_id))
playlist_id_unique.sort()
playlist_id_unique = pd.DataFrame(playlist_id_unique)
playlist_id_unique.reset_index(level=0, inplace=True)
playlist_id_unique.columns = ['index_playlist', 'playlist_id']
track_id_unique.sort()
track_id_unique = pd.DataFrame(track_id_unique)
track_id_unique.reset_index(level=0, inplace=True)
track_id_unique.columns = ['index_track', 'track_id']
print(track_id_unique.head())
print(playlist_id_unique.head())

   index_track  track_id
0            0       252
1            1       360
2            2       394
3            3       527
4            4       667
   index_playlist  playlist_id
0               0         7569
1               1         7614
2               2         7641
3               3         7692
4               4         7816


In [7]:
numPlaylists = len(playlist_id_unique)
numTracks = len(track_id_unique)

Translate interaction matrix to new indexes:

In [8]:
train_intermediate = train_final.merge(playlist_id_unique, how='inner', on='playlist_id')
train_index = train_intermediate.merge(track_id_unique, how='inner', on='track_id')
train_index = train_index.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
train_index

Unnamed: 0,playlist_id,track_id,interaction,index_playlist,index_track
0,7569,162463,1.0,0,3484
87,7569,421750,1.0,0,10864
116,7569,795606,1.0,0,20062
125,7569,1195736,1.0,0,29176
195,7569,2227105,1.0,0,55647
198,7569,2634448,1.0,0,66029
241,7569,2654612,1.0,0,66596
253,7569,2693660,1.0,0,67590
263,7569,2861386,1.0,0,73675
298,7614,415173,1.0,1,10624


In [9]:
index_playlist = list(train_index['index_playlist'])
index_track = list(train_index['index_track'])
interaction = list(train_index['interaction'])
print(index_playlist[:10])
print(index_track[:10])
print(interaction[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[3484, 10864, 20062, 29176, 55647, 66029, 66596, 67590, 73675, 10624]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [10]:
URM_all = sps.coo_matrix((interaction, (index_playlist, index_track)))
URM_all.tocsr()
URM_all

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in COOrdinate format>

In [11]:
# calcualte singular value decomposition
start_time = time.time()
U, s, Vt = svds(URM_all, k = 252)
print("--- %s seconds ---" % (time.time() - start_time))

--- 47.39177227020264 seconds ---


In [None]:
# make diagonal matrix from sigma values
s_diag = np.diag(s)

In [None]:
# reconstruct URM matrix as prediction
Us = np.dot(U, s_diag)
Us.shape

In [None]:
# reconstruct URM matrix as prediction
start_time = time.time()
URM_estm = np.dot(Us, Vt)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Look at result
print(URM_estm.shape)
print(URM_estm[:10,:10])
print(URM_estm[0,:].max())

In [None]:
# filter interaction dataframe, to retain only target playlists
train = train_index.merge(target_playlists, how='inner', on='playlist_id')
print(train.shape)
print(train['playlist_id'].nunique())

In [None]:
# aggregate to playlist level and coerce tracks in that playlist to list
train_agg1 = train.groupby(by='playlist_id').track_id.apply(list).to_frame()
train_agg1.reset_index(level=0, inplace=True)
train_agg2 = train.groupby(by='playlist_id').index_track.apply(list).to_frame()
train_agg2.reset_index(level=0, inplace=True)
train_agg = train_agg1.merge(train_agg2, how='inner', on='playlist_id')
train_agg = train_agg.merge(playlist_id_unique, how='inner', on='playlist_id')
train_agg['recommend'] = np.empty((len(train_agg), 0)).tolist()
train_agg

### Find tracks to recommend in URM_estm

In [None]:
start_time = time.time()
for index, row in train_agg.iterrows():
    # get row from URM_estm
    estm = pd.DataFrame(URM_estm[row['index_playlist'],:])
    estm.reset_index(level=0, inplace=True)
    estm.columns = ['index_track','pred']
    # filter tracks which are already in the playlist, so they can't be recommended
    estm = estm[-estm["index_track"].isin(row['index_track'])]
    # translate track index back to track_id
    estm = estm.merge(track_id_unique, how='inner', on='index_track')
    # filter on target track set
    estm = estm[estm['track_id'].isin(target_tracks)]
    estm = estm.sort_values('pred',ascending=False)
    # print(estm)
    count = 1
    for index2, row2 in estm.iterrows():
        # insert 5 top recommendations into dataframe
        if count < 6:
            row['recommend'].append(int(row2['track_id']))
            count += 1
        else:
            break
print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
train_agg

In [None]:
train_agg.shape

In [None]:
# Convert list to string with spaces between track_ids
train_agg['recommend'] = train_agg['recommend'].apply(lambda x: " ".join(map(str, x)))

In [None]:
# rename columns for submission
final = train_agg[['playlist_id','recommend']]
final.columns = ['playlist_id','track_ids']

In [None]:
print(final.head())

In [None]:
# export file
final.to_csv('../submission/003_svd_basic_252fac.csv', index=False)