In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the playlist/track interaction pairs (tab-separated) and show a preview plus the shape
train_final = pd.read_csv('../input/train_final.csv', sep='\t')
print(train_final.head(), train_final.shape, sep='\n')

# Load the tracks that are allowed as recommendations (tab-separated).
# The constant 'value' column marks every row of this frame as a target track.
target_tracks = pd.read_csv('../input/target_tracks.csv', sep='\t')
target_tracks['value'] = 1
print(target_tracks.head(), target_tracks.shape, sep='\n')

   playlist_id  track_id
0      3271849   2801526
1      5616275    727878
2     11267488   2805283
3     10103900   1515105
4      3836898   2945623
(1040522, 2)
   track_id  value
0   1316175      1
1   3885714      1
2   3091270      1
3    226759      1
4    230596      1
(32195, 2)


In [3]:
# Popularity ranking of the target tracks:
#   1. count in how many distinct playlists each track occurs,
#   2. sort by that count, most popular first,
#   3. keep only tracks that are listed in target_tracks (inner join).
tr_agg = (
    train_final
    .groupby('track_id')['playlist_id']
    .nunique()
    .rename('number_pl')
    .reset_index()
    .sort_values('number_pl', ascending=False)
    .merge(target_tracks, on='track_id', how='inner')
)
print(tr_agg.head(20), tr_agg.shape, sep='\n')

    track_id  number_pl  value
0    1563309        476      1
1    1363985        432      1
2    3705881        425      1
3    1595978        403      1
4    3166665        391      1
5    3779477        390      1
6     204966        387      1
7    2863395        370      1
8    1580480        349      1
9    1156143        346      1
10   1321053        339      1
11    209196        330      1
12    675104        324      1
13   3796108        317      1
14    276186        315      1
15   1074579        307      1
16   1286763        306      1
17   3628787        306      1
18   2339150        306      1
19   1495432        304      1
(32194, 3)


In [4]:
# Load the playlists for which a recommendation has to be produced (tab-separated)
target_playlists = pd.read_csv('../input/target_playlists.csv', sep='\t')
print(target_playlists.head(), target_playlists.shape, sep='\n')

   playlist_id
0     10024884
1     10624787
2      4891851
3      4267369
4        65078
(10000, 1)


In [5]:
# Keep only the interactions that belong to a target playlist.
# Merge on the key column alone: a later cell adds a 'track_ids' column to
# target_playlists in place, and re-running this cell afterwards would otherwise
# carry that column into `train`.
train = train_final.merge(target_playlists[['playlist_id']], how='inner', on='playlist_id')
print(train.shape)
# number of distinct playlists that survived the filter
print(train['playlist_id'].nunique())

(362661, 2)
10000


In [6]:
# One row per playlist: collect all track ids of a playlist into a Python list.
# The result is indexed by playlist_id and has a single 'track_id' column.
train_agg = train.groupby('playlist_id')['track_id'].apply(list).to_frame()
print(type(train_agg), train_agg.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
                                                      track_id
playlist_id                                                   
7614         [2285204, 1384962, 3711434, 1714787, 2141817, ...
7692         [3036454, 1439032, 302730, 384386, 1559442, 21...
7816         [2305305, 1039409, 2674817, 126414, 2350576, 2...
8225         [2285204, 1891198, 3102263, 1464491, 2800858, ...
8337         [1210884, 1157460, 1205536, 451881, 3131838, 3...


### Basic top popular submission without correction

In [7]:
# Number of tracks recommended to every playlist in the basic submission
n_top_tracks = 5

# tr_agg is sorted by number_pl (descending) and already restricted to the target
# tracks, so its first rows are the most popular candidates. Deriving the ids from
# it keeps the submission in sync with the data instead of relying on ids copied
# from an earlier printed output.
top_popular_ids = ' '.join(tr_agg['track_id'].head(n_top_tracks).astype(str))

# Every target playlist receives the same space-separated recommendation string
target_playlists['track_ids'] = top_popular_ids
print(target_playlists.head())

   playlist_id                                track_ids
0     10024884  1563309 1363985 3705881 1595978 3166665
1     10624787  1563309 1363985 3705881 1595978 3166665
2      4891851  1563309 1363985 3705881 1595978 3166665
3      4267369  1563309 1363985 3705881 1595978 3166665
4        65078  1563309 1363985 3705881 1595978 3166665


In [8]:
# write the basic top-popular submission (playlist_id, track_ids); index=False omits the row index
target_playlists.to_csv('../submission/toppopular_basic.csv', index=False)

### Top popular submission with correction when top tracks already in the playlist


In [9]:
# Give every playlist its own (distinct) empty list that will collect its recommendations
train_agg['recommend'] = [[] for _ in range(len(train_agg))]
# Turn the playlist_id index back into a regular column
train_agg = train_agg.reset_index(level=0)
train_agg

Unnamed: 0,playlist_id,track_id,recommend
0,7614,"[2285204, 1384962, 3711434, 1714787, 2141817, ...",[]
1,7692,"[3036454, 1439032, 302730, 384386, 1559442, 21...",[]
2,7816,"[2305305, 1039409, 2674817, 126414, 2350576, 2...",[]
3,8225,"[2285204, 1891198, 3102263, 1464491, 2800858, ...",[]
4,8337,"[1210884, 1157460, 1205536, 451881, 3131838, 3...",[]
5,8369,"[2165768, 826559, 2699378, 701941, 2890532, 10...",[]
6,8446,"[1081426, 3117244, 3785721, 3522428, 357883, 2...",[]
7,8559,"[1188811, 3455524, 396062, 949534, 1831605, 21...",[]
8,8636,"[3287306, 1786440, 3628787, 1955849, 813919, 2...",[]
9,9344,"[3108703, 497514, 1891090, 1529980, 2103878, 2...",[]


In [10]:
# inspect the popularity ranking of the target tracks (sorted by number_pl, descending)
tr_agg

Unnamed: 0,track_id,number_pl,value
0,1563309,476,1
1,1363985,432,1
2,3705881,425,1
3,1595978,403,1
4,3166665,391,1
5,3779477,390,1
6,204966,387,1
7,2863395,370,1
8,1580480,349,1
9,1156143,346,1


In [11]:
# Number of tracks to recommend per playlist
N_RECOMMENDATIONS = 5

# tr_agg is sorted by popularity (descending), so this list is in ranking order
popular_tracks = tr_agg['track_id'].tolist()


def top_popular_unseen(playlist_tracks, ranked_tracks, n):
    """Return the first `n` tracks of `ranked_tracks` that are not in `playlist_tracks`.

    playlist_tracks: iterable of track ids already contained in the playlist.
    ranked_tracks:   track ids ordered from most to least popular.
    n:               maximum number of tracks to return.

    Fewer than `n` tracks are returned when `ranked_tracks` is exhausted first.
    """
    # set lookup avoids scanning the whole playlist for every candidate track
    already_in_playlist = set(playlist_tracks)
    picks = []
    for track in ranked_tracks:
        if len(picks) >= n:
            break
        if track not in already_in_playlist:
            picks.append(track)
    return picks


# Assign the whole column at once instead of appending to the existing lists:
# re-running this cell then yields the same result rather than adding another
# N_RECOMMENDATIONS tracks to every playlist.
train_agg['recommend'] = [
    top_popular_unseen(playlist_tracks, popular_tracks, N_RECOMMENDATIONS)
    for playlist_tracks in train_agg['track_id']
]

In [12]:
# look at result: 'recommend' should now hold the popular tracks that are not yet in each playlist
train_agg

Unnamed: 0,playlist_id,track_id,recommend
0,7614,"[2285204, 1384962, 3711434, 1714787, 2141817, ...","[1563309, 1363985, 3705881, 1595978, 3166665]"
1,7692,"[3036454, 1439032, 302730, 384386, 1559442, 21...","[1563309, 1363985, 3705881, 1595978, 3166665]"
2,7816,"[2305305, 1039409, 2674817, 126414, 2350576, 2...","[1563309, 1363985, 3705881, 1595978, 3166665]"
3,8225,"[2285204, 1891198, 3102263, 1464491, 2800858, ...","[1563309, 1363985, 3705881, 1595978, 3166665]"
4,8337,"[1210884, 1157460, 1205536, 451881, 3131838, 3...","[1563309, 1363985, 3705881, 1595978, 3779477]"
5,8369,"[2165768, 826559, 2699378, 701941, 2890532, 10...","[1563309, 1363985, 3705881, 1595978, 3166665]"
6,8446,"[1081426, 3117244, 3785721, 3522428, 357883, 2...","[1563309, 1363985, 3705881, 1595978, 3166665]"
7,8559,"[1188811, 3455524, 396062, 949534, 1831605, 21...","[1563309, 1363985, 3705881, 1595978, 3166665]"
8,8636,"[3287306, 1786440, 3628787, 1955849, 813919, 2...","[1563309, 1363985, 1595978, 3166665, 204966]"
9,9344,"[3108703, 497514, 1891090, 1529980, 2103878, 2...","[1563309, 1363985, 3705881, 1595978, 3166665]"


In [13]:
def _format_track_ids(track_ids):
    """Join a list of track ids into one space-separated string.

    A value that is already a string is returned unchanged, so running this
    cell a second time does not split the formatted string into single characters.
    """
    if isinstance(track_ids, str):
        return track_ids
    return " ".join(map(str, track_ids))


# Convert list to string with spaces between track_ids
train_agg['recommend'] = train_agg['recommend'].apply(_format_track_ids)

In [14]:
# check that 'recommend' now holds space-separated strings instead of lists
train_agg

Unnamed: 0,playlist_id,track_id,recommend
0,7614,"[2285204, 1384962, 3711434, 1714787, 2141817, ...",1563309 1363985 3705881 1595978 3166665
1,7692,"[3036454, 1439032, 302730, 384386, 1559442, 21...",1563309 1363985 3705881 1595978 3166665
2,7816,"[2305305, 1039409, 2674817, 126414, 2350576, 2...",1563309 1363985 3705881 1595978 3166665
3,8225,"[2285204, 1891198, 3102263, 1464491, 2800858, ...",1563309 1363985 3705881 1595978 3166665
4,8337,"[1210884, 1157460, 1205536, 451881, 3131838, 3...",1563309 1363985 3705881 1595978 3779477
5,8369,"[2165768, 826559, 2699378, 701941, 2890532, 10...",1563309 1363985 3705881 1595978 3166665
6,8446,"[1081426, 3117244, 3785721, 3522428, 357883, 2...",1563309 1363985 3705881 1595978 3166665
7,8559,"[1188811, 3455524, 396062, 949534, 1831605, 21...",1563309 1363985 3705881 1595978 3166665
8,8636,"[3287306, 1786440, 3628787, 1955849, 813919, 2...",1563309 1363985 1595978 3166665 204966
9,9344,"[3108703, 497514, 1891090, 1529980, 2103878, 2...",1563309 1363985 3705881 1595978 3166665


In [15]:
# Build the submission frame: keep the playlist id and its recommendation string,
# and expose the latter under the column name 'track_ids'
final = train_agg[['playlist_id', 'recommend']].rename(columns={'recommend': 'track_ids'})

In [16]:
# preview the submission frame (columns: playlist_id, track_ids)
print(final.head())

   playlist_id                                track_ids
0         7614  1563309 1363985 3705881 1595978 3166665
1         7692  1563309 1363985 3705881 1595978 3166665
2         7816  1563309 1363985 3705881 1595978 3166665
3         8225  1563309 1363985 3705881 1595978 3166665
4         8337  1563309 1363985 3705881 1595978 3779477


In [18]:
# export the corrected top-popular submission; index=False omits the row index
final.to_csv('../submission/toppopular_advanced.csv', index=False)

Mögliche Gründe, warum der Score nicht besser wird:
- Eventuell müsste man die Top-Tracks nur aus den 10.000 Target-Playlists berechnen statt aus allen Playlists.
- Vielleicht lassen sich die Target-Tracks noch weiter einschränken.
