In [25]:
# Following the kaggle tutorial on creating a Kaggle submission
# The algorithm uses raw popularity to form recommendations
# Source: https://kaggle2.blob.core.windows.net/competitions-data/kaggle/2799/MSDChallengeGettingstarted.pdf?sv=2015-12-11&sr=b&sig=gI4ur5SDA%2FbzTJzu7SGJSC7P1FBAM7P2yBS9X2X3WrQ%3D&se=2016-12-06T21%3A17%3A44Z&sp=r

# Create counts of each song to get popularity
f = open('kaggle_visible_evaluation_triplets.txt', 'r')
song_to_count = dict()
for line in f:
    _, song, _ = line.strip().split('\t')
    if song in song_to_count:
        song_to_count[song] += 1
    else:
        song_to_count[song] = 1
f.close()

In [26]:
# Make sure we have counts
song_to_count['SONZTNP12A8C1321DF']

KeyError: 'SONZTNP12A8C1321DF'

In [27]:
# Re-order songs by decreasing popularity
songs_ordered = sorted(song_to_count.keys(),
                    key=lambda s: song_to_count[s],
                    reverse=True)

In [29]:
# Recommend most popular songs to every user
# Filter out songs already in user's library
# Let's create a map from user to songs they listened to
f = open(path_to_triplets, 'r')
user_to_songs = dict()
for line in f:
    user, song, _ = line.strip().split('\t')
    if user in user_to_songs:
        user_to_songs[user].add(song)
    else:
        user_to_songs[user] = set([song])

In [30]:
# For each user, we have list of songs
user_to_songs['d7083f5e1d50c264277d624340edaaf3dc16095b'] 

KeyError: 'd7083f5e1d50c264277d624340edaaf3dc16095b'

In [3]:
# Load canonical ordering of users
f = open('kaggle_users.txt', 'r')
canonical_users = map(lambda line: line.strip(), f.readlines())
f.close()

In [18]:
# Now have users in submission order
# Let's see the first two
canonical_users[:2]

['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d',
 'd7083f5e1d50c264277d624340edaaf3dc16095b']

In [14]:
# We do not submit actual song ids, but index in canonical
# list of songs
# Let's create map from song ID to song index
f = open('kaggle_songs.txt', 'r')
song_to_index = dict(map(lambda line: line.strip().split(' '),
                        f.readlines()))
f.close()

In [20]:
# Now for a given song ID we have the integer index
song_to_index['SOSOUKN12A8C13AB79']

'283892'

In [21]:
# We are ready to create submission file
# For each user in canonical list
# recommend songs in order of popularity
# except those already in the user's profile
f = open('submission.txt', 'w')
for user in canonical_users:
    songs_to_recommend = []
    for song in songs_ordered:
        if len(songs_to_recommend) >= 500:
            break
        if not song in user_to_songs[user]:
            songs_to_recommend.append(song)
    # Transform song IDs to song indexes
    indices = map(lambda s: song_to_index[s],
                 songs_to_recommend)
    # Write line for that user
    f.write(' '.join(indices) + '\n')
f.close()