In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
from scipy.sparse.linalg import svds
from sklearn import preprocessing

In [2]:
# read interactions file
train_final = pd.read_csv('../../input/train_final.csv', sep = '\t')
train_final['interaction'] = 1.0
train_final = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
numberInteractions = train_final.shape[0]
print(train_final.iloc[:20,])

        playlist_id  track_id  interaction
770769         7569    162463          1.0
925571         7569    421750          1.0
912580         7569    795606          1.0
852833         7569   1195736          1.0
287048         7569   2227105          1.0
92449          7569   2634448          1.0
519471         7569   2654612          1.0
173680         7569   2693660          1.0
903004         7569   2861386          1.0
863056         7614    415173          1.0
278077         7614   1384962          1.0
394480         7614   1609224          1.0
615615         7614   1614974          1.0
328664         7614   1714787          1.0
377085         7614   2141817          1.0
83840          7614   2285204          1.0
574199         7614   3361942          1.0
481594         7614   3504896          1.0
300753         7614   3711434          1.0
775317         7614   3833025          1.0


In [3]:
# read target playlists which should receive a recommendation
target_playlists = pd.read_csv('../../input/target_playlists.csv', sep = '\t')
print(target_playlists.head())
print(target_playlists.shape)

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


In [4]:
# read target tracks
target_tracks = list(pd.read_csv('../../input/target_tracks.csv', sep = '\t')['track_id'])
target_tracks

[1316175,
 3885714,
 3091270,
 226759,
 230596,
 1093284,
 2293453,
 2994241,
 2871264,
 93835,
 1123498,
 2374491,
 1074412,
 2853886,
 821071,
 2709648,
 2933818,
 346111,
 66103,
 3638141,
 232971,
 2848406,
 3783264,
 2842670,
 3581982,
 3171591,
 2245591,
 871828,
 2713603,
 1175043,
 106982,
 3102164,
 1553840,
 2057826,
 3557550,
 2363848,
 1253516,
 503539,
 2617340,
 1790258,
 2956696,
 193328,
 303129,
 3608340,
 950730,
 1855305,
 3777248,
 1076555,
 1150702,
 3419354,
 726682,
 465271,
 2602747,
 1762406,
 1605205,
 1759620,
 2021651,
 709968,
 2972914,
 259990,
 1349043,
 3522735,
 1773876,
 2862475,
 106083,
 2820187,
 3612400,
 2881368,
 2083728,
 1935536,
 1585141,
 1444202,
 1248376,
 881160,
 1709123,
 1154985,
 2443976,
 2304545,
 4700754,
 1080646,
 1550729,
 793276,
 4486831,
 3868340,
 2642609,
 548753,
 2168844,
 5546,
 472924,
 1604514,
 3013929,
 2213950,
 717927,
 1245649,
 3250648,
 8287,
 881469,
 3242407,
 132362,
 2466040,
 2433491,
 426884,
 2843330,
 990

In [5]:
# read tracks_final which should receive a recommendation
tracks_final = pd.read_csv('../../input/tracks_final.csv', sep = '\t')
print(tracks_final.head())
print(tracks_final.shape)

   track_id  artist_id  duration  playcount   album  \
0   2972914        144    224000       49.0     [7]   
1   2750239        246    157000        1.0     [8]   
2   1550729        144    217000      554.0     [9]   
3   2169950        144    207000      200.0     [9]   
4   1903709        144    198000        5.0  [None]   

                                     tags  
0     [54087, 1757, 1718, 116712, 189631]  
1   [189631, 3424, 177424, 46208, 205245]  
2   [54087, 109806, 46869, 183258, 54337]  
3  [54087, 70618, 207003, 109806, 116712]  
4   [54087, 81223, 116712, 215342, 71028]  
(100000, 6)


In [6]:
playlist_id = list(train_final['playlist_id'])
track_id = list(train_final['track_id'])
interaction = list(train_final['interaction'])
print(playlist_id[:10])
print(track_id[:10])
print(interaction[:10])

[7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7614]
[162463, 421750, 795606, 1195736, 2227105, 2634448, 2654612, 2693660, 2861386, 415173]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [7]:
playlist_id_unique = list(set(playlist_id))
track_id_unique = list(set(track_id))
playlist_id_unique.sort()
playlist_id_unique = pd.DataFrame(playlist_id_unique)
playlist_id_unique.reset_index(level=0, inplace=True)
playlist_id_unique.columns = ['index_playlist', 'playlist_id']
#track_id_unique.sort()
track_id_unique = pd.DataFrame(track_id_unique)
track_id_unique.reset_index(level=0, inplace=True)
track_id_unique.columns = ['index_track', 'track_id']
print(track_id_unique.head())
print(playlist_id_unique.head())

   index_track  track_id
0            0   1048594
1            1   2359314
2            2   1835030
3            3   3670041
4            4   1048604
   index_playlist  playlist_id
0               0         7569
1               1         7614
2               2         7641
3               3         7692
4               4         7816


In [8]:
train_intermediate = train_final.merge(playlist_id_unique, how='inner', on='playlist_id')
train_index = train_intermediate.merge(track_id_unique, how='inner', on='track_id')
train_index = train_index.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
train_index

Unnamed: 0,playlist_id,track_id,interaction,index_playlist,index_track
0,7569,162463,1.0,0,62358
87,7569,421750,1.0,0,60999
116,7569,795606,1.0,0,3009
125,7569,1195736,1.0,0,55563
195,7569,2227105,1.0,0,49116
198,7569,2634448,1.0,0,4229
241,7569,2654612,1.0,0,12230
253,7569,2693660,1.0,0,26479
263,7569,2861386,1.0,0,93022
298,7614,415173,1.0,1,58038


In [9]:
tracks_final = tracks_final.merge(track_id_unique, how='inner', on='track_id')
tracks_final

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags,index_track
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]",33328
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]",48728
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]",93035
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]",26668
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]",25248
5,2256817,144,218000,2.0,[9],"[54087, 109806, 189631, 49166, 116712]",61032
6,2561768,928,223000,249.0,[26],"[50764, 4425, 11056, 205245, 81223]",79968
7,474864,928,193000,73.0,[22],"[205245, 81223, 11056, 267, 3982]",84079
8,1378455,928,304000,73.0,[22],"[11056, 205245, 81223, 189631, 84597]",24944
9,1523190,928,206000,10.0,[22],"[205245, 11056, 81223, 4425, 189631]",83966


In [10]:
index_playlist = list(train_index['index_playlist'])
index_track = list(train_index['index_track'])
interaction = list(train_index['interaction'])
print(index_playlist[:10])
print(index_track[:10])
print(interaction[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[62358, 60999, 3009, 55563, 49116, 4229, 12230, 26479, 93022, 58038]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [11]:
URM_all = sps.coo_matrix((interaction, (index_playlist, index_track)))
URM_all = URM_all.tocsr()
URM_all

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in Compressed Sparse Row format>

In [12]:
import json
tracks_final['tags'] = tracks_final.tags.apply(json.loads)

In [13]:
tags_list = []
for index, row in tracks_final.iterrows():
    if len(row['tags']) != 0:
        for i in row['tags']:
            tags_list.append([row['index_track'], i, 1])

In [14]:
test = pd.DataFrame(tags_list)
test.columns = ['index_track', 'tag', 'interaction']
test

Unnamed: 0,index_track,tag,interaction
0,33328,54087,1
1,33328,1757,1
2,33328,1718,1
3,33328,116712,1
4,33328,189631,1
5,48728,189631,1
6,48728,3424,1
7,48728,177424,1
8,48728,46208,1
9,48728,205245,1


In [15]:
test.columns = ['index_track', 'tag', 'interaction']
track_list = list(test['index_track'])
tag_list = list(test['tag'])
interaction_list = list(test['interaction'])

In [16]:
le = preprocessing.LabelEncoder()
le.fit(tag_list)
taglist_icm = le.transform(tag_list)
print(taglist_icm.max())
print(type(taglist_icm))


31897
<class 'numpy.ndarray'>


In [17]:
test['tag'].nunique()

31898

In [18]:
ICM = sps.coo_matrix((interaction_list, (track_list, taglist_icm)))
ICM

<99999x31898 sparse matrix of type '<class 'numpy.int64'>'
	with 483496 stored elements in COOrdinate format>