In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
import scipy as sp
import scipy.sparse as sps
import scipy.io as io
import time
import json
from scipy.sparse.linalg import svds
from scipy.sparse.linalg import spsolve
from sklearn import preprocessing
import matplotlib.pyplot as pyplot

In [2]:
# read interactions file
train_final = pd.read_csv('../../input/train_final.csv', sep = '\t')
train_final['interaction'] = 1.0
train_final = train_final.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
numberInteractions = train_final.shape[0]
print(train_final.iloc[:20,])

        playlist_id  track_id  interaction
770769         7569    162463          1.0
925571         7569    421750          1.0
912580         7569    795606          1.0
852833         7569   1195736          1.0
287048         7569   2227105          1.0
92449          7569   2634448          1.0
519471         7569   2654612          1.0
173680         7569   2693660          1.0
903004         7569   2861386          1.0
863056         7614    415173          1.0
278077         7614   1384962          1.0
394480         7614   1609224          1.0
615615         7614   1614974          1.0
328664         7614   1714787          1.0
377085         7614   2141817          1.0
83840          7614   2285204          1.0
574199         7614   3361942          1.0
481594         7614   3504896          1.0
300753         7614   3711434          1.0
775317         7614   3833025          1.0


In [3]:
# read target playlists which should receive a recommendation
target_playlists = pd.read_csv('../../input/target_playlists.csv', sep = '\t')
print(target_playlists.head())
print(target_playlists.shape)

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


In [4]:
# read target tracks
target_tracks = pd.read_csv('../../input/target_tracks.csv', sep = '\t')
target_tracks['dummy'] = 1

In [5]:
#create lists of playlist_id, track_id and interaction 
playlist_id = list(train_final['playlist_id'])
track_id = list(train_final['track_id'])
interaction = list(train_final['interaction'])
print(playlist_id[:10])
print(track_id[:10])
print(interaction[:10])

[7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7569, 7614]
[162463, 421750, 795606, 1195736, 2227105, 2634448, 2654612, 2693660, 2861386, 415173]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [6]:
playlist_id_unique = list(set(playlist_id))
track_id_unique = list(set(track_id))
playlist_id_unique.sort()
playlist_id_unique = pd.DataFrame(playlist_id_unique)
playlist_id_unique.reset_index(level=0, inplace=True)
playlist_id_unique.columns = ['index_playlist', 'playlist_id']
#track_id_unique.sort()
track_id_unique = pd.DataFrame(track_id_unique)
track_id_unique.reset_index(level=0, inplace=True)
track_id_unique.columns = ['index_track', 'track_id']
print(track_id_unique.head)
print(playlist_id_unique.head)
print(len(playlist_id_unique))

<bound method NDFrame.head of        index_track  track_id
0                0   1048594
1                1   2359314
2                2   1835030
3                3   3670041
4                4   1048604
5                5   1835044
6                6   3670053
7                7   3670054
8                8   1835048
9                9   1835052
10              10   2359342
11              11   2621486
12              12   2621488
13              13    786481
14              14   2621490
15              15   2621491
16              16    262193
17              17   1048631
18              18   1835066
19              19    262205
20              20   1310781
21              21   2621503
22              22   2621504
23              23   2883647
24              24   1835075
25              25   2883652
26              26   1310789
27              27   2621513
28              28   2621514
29              29   2097227
...            ...       ...
99969        99969   3145502
99970        

In [7]:
# merge train_final and index of playlists and tracks
train_intermediate = train_final.merge(playlist_id_unique, how='inner', on='playlist_id')
train_index = train_intermediate.merge(track_id_unique, how='inner', on='track_id')
train_index = train_index.sort_values(['playlist_id', 'track_id'], ascending=[True, True])
train_index

Unnamed: 0,playlist_id,track_id,interaction,index_playlist,index_track
0,7569,162463,1.0,0,62358
87,7569,421750,1.0,0,60999
116,7569,795606,1.0,0,3009
125,7569,1195736,1.0,0,55563
195,7569,2227105,1.0,0,49116
198,7569,2634448,1.0,0,4229
241,7569,2654612,1.0,0,12230
253,7569,2693660,1.0,0,26479
263,7569,2861386,1.0,0,93022
298,7614,415173,1.0,1,58038


In [8]:
# list with tracks in train_final and not in target_tracks 
test = track_id_unique.merge(target_tracks, on='track_id', how='left')
filter_tracks = list(test[test.dummy != 1.0].index_track)

In [9]:
#get lists of playlist index, track index and interaction
userid = list(train_index['index_playlist'])
itemid = list(train_index['index_track'])
interaction = list(train_index['interaction'])

In [10]:
#get unique playlist index and track index
userid_unique = list(set(userid))
itemid_unique = list(set(itemid))

In [11]:
#create sparse matrix
URM_all = sps.coo_matrix((interaction, (userid, itemid)))
URM_all = URM_all.tocsr()
URM_all

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in Compressed Sparse Row format>

In [12]:
item_user_data = URM_all

In [13]:
matrix_size = URM_all.shape[0]*URM_all.shape[1] # Number of possible interactions in the matrix
num_purchases = len(URM_all.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

99.97720580066762

In [14]:
print (target_playlists.head())
target_playlist_id = pd.merge(target_playlists, train_index, how='left', on=['playlist_id', 'playlist_id'])

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078


In [15]:
target_playlist_id_unique= list(set(target_playlist_id['index_playlist']))

In [16]:
print(len(target_playlist_id_unique))

10000


In [17]:
print(target_playlist_id.head())
print(target_playlist_id.shape)
userid_unique_2 = np.reshape(np.array(userid_unique),(1,45649))
userid_unique_2.T

   playlist_id  track_id  interaction  index_playlist  index_track
0     10024884     82829          1.0           36739        30737
1     10024884     98812          1.0           36739        36799
2     10024884    161455          1.0           36739        61727
3     10024884    162673          1.0           36739        62499
4     10024884    187052          1.0           36739        73540
(362661, 5)


array([[    0],
       [    1],
       [    2],
       ..., 
       [45646],
       [45647],
       [45648]])

In [18]:
target_tracks_id = pd.merge(target_tracks, train_index, how='left', on=['track_id', 'track_id'])
print(target_tracks_id[0:10])

   track_id  dummy  playlist_id  interaction  index_playlist  index_track
0   1316175      1    4242493.0          1.0         11511.0       1640.0
1   1316175      1    7175339.0          1.0         25591.0       1640.0
2   1316175      1    7541803.0          1.0         27888.0       1640.0
3   1316175      1   10030081.0          1.0         36771.0       1640.0
4   1316175      1   10837350.0          1.0         41507.0       1640.0
5   3885714      1      21567.0          1.0           125.0      85150.0
6   3885714      1     433235.0          1.0          1559.0      85150.0
7   3885714      1    3653000.0          1.0          8622.0      85150.0
8   3885714      1    6283961.0          1.0         21232.0      85150.0
9   3885714      1    6961378.0          1.0         24578.0      85150.0


In [19]:
print(item_user_data.shape)

(45649, 99999)


In [32]:
import implicit

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

# recommend items for a user
user_items = item_user_data.T.tocsr()
#recommendations = model.recommend(userid_unique[2], user_items)

In [33]:
item_user_data

<45649x99999 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in Compressed Sparse Row format>

In [34]:
user_items

<99999x45649 sparse matrix of type '<class 'numpy.float64'>'
	with 1040522 stored elements in Compressed Sparse Row format>

In [35]:
print(user_items.shape)
print(len(userid_unique))
recommendation_end = []
for i, playlist_id in enumerate(target_playlist_id_unique):
    if (i % 1000 == 0):
        print("Item %d of %d" % (i, len(target_playlist_id_unique)))
    for reco in model.recommend(playlist_id, user_items, N=5, filter_items=filter_tracks):
        recommendation_end.append([playlist_id, reco])

(99999, 45649)
45649
Item 0 of 10000
Item 1000 of 10000
Item 2000 of 10000
Item 3000 of 10000
Item 4000 of 10000
Item 5000 of 10000
Item 6000 of 10000
Item 7000 of 10000
Item 8000 of 10000
Item 9000 of 10000


In [24]:
print(recommendation_end[0:10])
print(len(recommendation_end))

[[1, (43043, 0.18176816678027302)], [1, (26112, 0.14306180895963444)], [1, (38940, 0.1245049423908306)], [1, (17014, 0.091222618999371771)], [1, (32438, 0.067835380502488732)], [3, (33000, 0.040821084926994017)], [3, (16300, 0.039566252537962657)], [3, (6230, 0.02023663125109031)], [3, (12320, 0.01849815761583417)], [3, (31976, 0.017797436743816866)]]
50000


In [115]:
train = train_index.merge(target_playlists, how='inner', on='playlist_id')
print(train.shape)
print(train['playlist_id'].nunique())

(362661, 5)
10000


In [25]:
#create list of playlists out of recommendations
playlist_rec = []
for i in range(len(recommendation_end)):
    playlist_rec.append(recommendation_end[i][0])

In [26]:
print(playlist_rec[0:10])
print(len(playlist_rec))

[1, 1, 1, 1, 1, 3, 3, 3, 3, 3]
50000


In [27]:
recommendation_end[1][1][1]

0.14306180895963444

In [None]:
recommendation_end[1][1][1]

In [28]:
score_rec = []
for p in range(len(recommendation_end)):
    score_rec.append(recommendation_end[p][1][1])

In [29]:
print(len(score_rec))
print(score_rec[0:10])

50000
[0.18176816678027302, 0.14306180895963444, 0.1245049423908306, 0.091222618999371771, 0.067835380502488732, 0.040821084926994017, 0.039566252537962657, 0.02023663125109031, 0.01849815761583417, 0.017797436743816866]


In [36]:
track_rec = []
for x in range(len(recommendation_end)):
    track_rec.append(recommendation_end[x][1][0])
    

In [37]:
#dritter Versuch
print(len(track_rec))
print(track_rec[0:100])

50000
[43043, 26112, 38940, 17014, 32438, 33000, 16300, 19314, 5048, 31976, 6699, 42362, 26838, 5849, 3379, 14596, 7484, 20698, 6476, 2110, 40174, 3288, 28525, 26511, 37702, 5849, 43213, 4268, 26171, 19314, 32655, 4822, 20790, 10914, 22089, 36799, 40514, 16663, 32076, 3803, 19632, 14665, 30636, 19095, 2353, 31133, 30620, 3953, 33269, 12294, 6476, 14596, 29947, 396, 43289, 13087, 22610, 2353, 43274, 19282, 6930, 31472, 13622, 4662, 43907, 22364, 28876, 38940, 13622, 4662, 14596, 37773, 17745, 308, 30631, 22000, 40142, 40447, 35192, 7866, 22000, 38369, 40142, 11802, 40447, 27992, 21488, 30318, 15131, 5849, 43719, 13622, 2907, 4662, 43907, 5849, 26171, 43213, 5599, 35384]


In [31]:
#zweiter Versuch
print(len(track_rec))
print(track_rec[0:100])

50000
[43043, 26112, 38940, 17014, 32438, 33000, 16300, 6230, 12320, 31976, 6699, 42362, 34733, 26838, 39515, 34392, 3110, 31069, 24083, 30727, 40174, 28525, 26511, 5847, 3288, 4468, 5849, 45578, 4268, 19314, 32655, 4822, 10914, 20790, 5284, 36799, 16663, 30113, 30218, 32076, 29413, 9104, 39539, 42608, 38049, 3110, 32131, 31069, 35151, 24083, 43289, 6476, 12427, 18001, 2974, 13087, 22610, 43274, 1846, 19282, 17838, 2012, 32583, 23024, 6930, 23024, 17838, 22364, 19280, 28309, 7846, 1732, 28525, 21496, 37773, 22000, 40142, 43274, 10153, 32583, 11802, 22000, 42770, 30941, 40142, 27992, 24261, 21488, 4468, 5701, 17838, 23024, 28309, 19280, 43719, 5849, 26171, 23396, 5599, 43213]


In [94]:
# erster Versuch
print(len(track_rec))
print(track_rec[0:100])

50000
[43043, 26112, 3984, 38940, 17014, 33000, 16300, 31133, 19314, 5048, 6699, 42362, 26838, 3379, 5849, 14596, 32131, 40045, 6476, 20698, 40174, 28525, 3288, 26511, 37702, 5849, 4268, 43213, 10664, 15131, 32655, 4822, 10914, 20790, 22089, 36799, 40514, 32076, 16663, 39310, 14665, 19095, 3232, 30636, 19734, 31133, 14596, 30620, 7225, 29947, 6476, 43289, 19314, 18670, 14596, 13087, 22610, 2353, 19282, 26171, 6930, 31472, 13622, 4662, 43719, 22364, 28876, 44681, 17992, 4662, 14596, 37773, 308, 28525, 30631, 22000, 40142, 40447, 35192, 7866, 22000, 40142, 44924, 40447, 28376, 27992, 21488, 12294, 30318, 40175, 13622, 43719, 2907, 4662, 43907, 5849, 26171, 43213, 40744, 39539]


In [98]:
rec_final = pd.DataFrame({'index_playlist': playlist_rec, 'index_track':track_rec})

In [105]:
rec_final[0:10]
recommend_final = rec_final.merge(playlist_id_unique, how='inner', on='index_playlist')
recommend_fin = recommend_final.merge(track_id_unique, how='inner', on='index_track')

In [132]:
recommend_fin[0:100]

Unnamed: 0,index_playlist,index_track,playlist_id,track_id
0,1,43043,7614,2473912
1,32883,43043,8460121,2473912
2,130,43043,21877,2473912
3,32917,43043,8467732,2473912
4,32942,43043,8471794,2473912
5,195,43043,28279,2473912
6,32999,43043,8483618,2473912
7,304,43043,47075,2473912
8,33077,43043,8493725,2473912
9,314,43043,49457,2473912


In [115]:
train_agg1 = recommend_fin.groupby(by='playlist_id').track_id.apply(list).to_frame()
train_agg1.reset_index(level=0, inplace=True)

In [116]:
print(train_agg1)

      playlist_id                                       track_id
0            7614    [2473912, 1905901, 274470, 628816, 3191649]
1            7692     [2185601, 830237, 3753914, 1887793, 15117]
2            7816   [281885, 3782748, 1121860, 3418668, 1590256]
3            8225       [2971260, 14407, 2651663, 580664, 60093]
4            8337    [98812, 2467425, 2445329, 1879846, 1678159]
5            8369   [3185177, 3722315, 3417894, 606837, 1102272]
6            8446   [3753914, 1874235, 868943, 3691048, 3226574]
7            8559    [806806, 1133302, 3706422, 800558, 2475526]
8            8636  [1083783, 2420781, 2891123, 2674170, 1381735]
9            9344     [1874235, 1911905, 887501, 2098236, 82527]
10           9444  [2419229, 3777203, 3778005, 2215788, 3746552]
11          10050  [1590256, 2212281, 1381735, 3516720, 3251698]
12          10732    [3185177, 606837, 1062530, 592555, 1850807]
13          11314    [3706422, 800558, 533104, 2213857, 3783305]
14          12268  [18877

In [117]:
# Convert list to string with spaces between track_ids
train_agg1['track_id'] = train_agg1['track_id'].apply(lambda x: " ".join(map(str, x)))

In [118]:
train_agg1

Unnamed: 0,playlist_id,track_id
0,7614,2473912 1905901 274470 628816 3191649
1,7692,2185601 830237 3753914 1887793 15117
2,7816,281885 3782748 1121860 3418668 1590256
3,8225,2971260 14407 2651663 580664 60093
4,8337,98812 2467425 2445329 1879846 1678159
5,8369,3185177 3722315 3417894 606837 1102272
6,8446,3753914 1874235 868943 3691048 3226574
7,8559,806806 1133302 3706422 800558 2475526
8,8636,1083783 2420781 2891123 2674170 1381735
9,9344,1874235 1911905 887501 2098236 82527


In [119]:
# rename columns for submission
train_agg1.columns = ['playlist_id','track_ids']

In [122]:
# export file
train_agg1.to_csv('../../submission/006_als_f100.csv', index=False)