In [1]:
import os
import csv
import time
import gensim
from gensim.models.callbacks import CallbackAny2Vec

# Raise the csv module's maximum field size to 1,000,000 so that very long
# fields in the input files can be parsed. The call returns the limit that was
# in effect before, which is the value echoed as this cell's output.
csv.field_size_limit(1000000)

131072

In [4]:
# Build the track lookup table: track id -> [display name, url].
print('Read in tracks...')
tracks = {}
# newline='' is what the csv module documents for files handed to csv.reader,
# so that line breaks inside quoted fields are handled by the parser itself.
with open('tracks.csv', 'r', encoding='utf-8', newline='') as csvfile:
    track_reader = csv.reader(csvfile, delimiter=';')
    for row in track_reader:
        if not row:
            continue  # blank line in the file: nothing to register
        # A record may arrive as one quoted field that still contains the
        # ';'-separated columns; split that field directly. The previous
        # str(row)[2:-2] trick sliced the repr() of the list instead, which
        # adds escape sequences (backslashes, mixed quotes, non-printable
        # characters) to the text and breaks when the reader returns more
        # than one field.
        columns = row[0].split(';') if len(row) == 1 else row
        # Display name is columns 1 and 2 joined by ' - '. Judging by the
        # training output this reads as "artist - title" -- TODO confirm the
        # column order against the file. Column 3 is stored as the url.
        tracks[columns[0]] = [columns[1] + ' - ' + columns[2], columns[3]]
print(f'Number of tracks: {len(tracks)}')

# Load the playlists as lists of track ids; these are the "sentences" that the
# Word2Vec model is trained on.
print('Read in playlists...')
playlists = []
# newline='' is what the csv module documents for files handed to csv.reader.
with open('playlists.csv', 'r', encoding='utf-8', newline='') as csvfile:
    playlist_reader = csv.reader(csvfile, delimiter=';')
    for row in playlist_reader:
        # A record may arrive as one quoted field that still contains the
        # ';'-separated columns; split that field directly rather than slicing
        # the repr() of the list (str(row)[2:-2]), which injects escape
        # sequences and breaks when the reader returns several fields.
        columns = row[0].split(';') if len(row) == 1 else row
        # Records with 1000 or more columns are dropped.
        if len(columns) < 1000:
            # The first two columns are not track ids (presumably playlist
            # metadata -- TODO confirm); everything after them is kept.
            playlists.append(columns[2:])
print(f'Number of playlists: {len(playlists)}')

Read in tracks...
Number of tracks: 9724993
Read in playlists...
Number of playlists: 717586


In [14]:
# Hyperparameters handed to gensim.models.Word2Vec in the training cell.
min_count = 100      # minimum number of occurrences of a track in all playlists
window = 4          # length of sequences
embedding_dim = 100 # number of dimensions in hidden layer (passed as `size`)
# batch_words is also the divisor the logger callback applies to the printed loss.
batch_words = 10000 # number of tracks to process in each batch
# NOTE(review): `iter` shadows Python's built-in iter() for the rest of the
# notebook; the name is kept because the training cell passes it as iter=iter.
iter = 15           # number of iterations
# sg = 1              # skip-gram (1) or Continuous Bag Of Words (0)

# ------------------------------------------------------------------------------

# Track ids used as qualitative probes: after every epoch the logger callback
# prints the nearest neighbours of each id listed here. The trailing comments
# give the track each id is meant to refer to.
# NOTE(review): the commented-out ids are presumably tracks that did not make
# it into the model vocabulary -- TODO confirm before re-enabling them.
valid_examples = [
    '2NMgVh5qaPprKTEzFe3501', # The Police - Roxanne
    '3Ti0GdlrotgwsAVBBugv0I', # A Tribe Called Quest - Can I Kick It?
    '0nyrltZrQGAJMBZc1bYvuQ', # James Brown - Get Up Offa That Thing
    '4hy4fb5D1KL50b3sng9cjw', # Nirvana - Smells Like Teen Spirit
    '1P49MJhU5vzttesFxw3dOM', # Bob Marley & The Wailers - Three Little Birds
    '76GlO5H5RT6g7y0gev86Nk', # The Cure - Just Like Heaven
    '40tAOP3DPqmVD6L1h45Jp6', # Frank Sinatra - My Way
    # '4IMvgp0WZqr9mRqpEvDKxI', # The Clash - Rock the Casbah
    '1iDcKYNvo6gglrOG6lvnHL', # The Rolling Stones - Sympathy For The Devil
    '5uvosCdMlFdTXhoazkTI5R', # The Doors - Light My Fire
    '15JINEqzVMv3SvJTAXAKED', # Eminem - Love The Way You Lie
    '69kOkLUCkxIZYexIgSG8rq', # Daft Punk - Get Lucky
    # '6oVY50pmdXqLNVeK8bzomn', # John Coltrane - My Favorite Things
    '6ui6l3ZNvlrGQZArwo8195', # Sex Pistols - God Save The Queen
    '0YammaEkYSeo9vQYZ1OwS6', # David Guetta - Say My Name
    # '4SHZsQIdS2N1E5yqvoXF8o'  # Andy Williams - Can't Take My Eyes Off You
]

# ------------------------------------------------------------------------------

class logger(CallbackAny2Vec):
    """Per-epoch progress reporter for Word2Vec training.

    After every epoch it prints the loss accumulated during that epoch
    (divided by the notebook-level ``batch_words``) and the time elapsed
    since training began, saves the model to 'word2vec.model', and prints
    the nearest neighbours of every id in the notebook-level
    ``valid_examples`` list, using the ``tracks`` lookup for readable names.
    """

    def __init__(self):
        print('Starting...')
        self.epoch = 0  # index of the epoch that will be reported next
        self.loss = 0   # loss value read at the end of the previous epoch

    def on_train_begin(self, model):
        """Remember when training started so elapsed time can be reported."""
        self.start = time.time()

    @staticmethod
    def _display_name(track_id):
        """Return the readable name stored for track_id, or the id itself if unknown."""
        entry = tracks.get(track_id)
        return entry[0] if entry else track_id

    def on_epoch_end(self, model):
        """Print loss and timing, checkpoint the model and show sample neighbours."""
        elapsed = time.time() - self.start
        # Read the loss once; the previous reading is subtracted so the
        # printed figure covers this epoch only.
        latest_loss = model.get_latest_training_loss()
        print('#{}'.format(self.epoch), 'loss =',
              (latest_loss - self.loss) / batch_words,
              'elapsed time =', elapsed // 60, 'minutes', elapsed % 60, 'seconds')
        self.epoch += 1
        self.loss = latest_loss
        print('Saving model...')
        model.save('word2vec.model')
        # Neighbours are queried on a copy reloaded from disk rather than on
        # the model being trained. NOTE(review): presumably so the query does
        # not touch the live model's state -- confirm before simplifying.
        _model = gensim.models.Word2Vec.load('word2vec.model')
        for track in valid_examples:
            try:
                similar = _model.wv.most_similar(positive=[track], topn=8)
            except KeyError:
                # The probe track is not in the model vocabulary; report it
                # and carry on instead of aborting the whole training run.
                print('  %s -> (not in model vocabulary)' % self._display_name(track))
                continue
            # Iterate over what was actually returned: there may be fewer
            # than 8 neighbours, which made the fixed range(0, 8) indexing fail.
            most_similar = ''.join(
                '%s (%.2f), ' % (self._display_name(track_id), score)
                for track_id, score in similar)
            print('  %s -> %s' % (self._display_name(track), most_similar))
        print()
        del _model

In [15]:
# Train the track embeddings on the playlists. All settings come from the
# configuration cell; sg=True selects skip-gram, compute_loss=True makes the
# training loss available to the logger callback, which reports after each epoch.
training_options = dict(
    sentences=playlists,
    size=embedding_dim,
    min_count=min_count,
    window=window,
    iter=iter,
    batch_words=batch_words,
    compute_loss=True,
    sg=True,
    callbacks=[logger()],
)
model = gensim.models.Word2Vec(**training_options)
print(model)

b Marley & The Wailers - Could You Be Loved (0.77), Bob Marley & The Wailers - Three Little Birds (0.75), Bob Marley & The Wailers - Jamming (0.75), Bob Marley & The Wailers - One Love / People Get Ready - Medley (0.73), Bob Marley & The Wailers - Buffalo Soldier (0.73), Bob Marley & The Wailers - Positive Vibration (0.73), 
  The Cure - Just like Heaven -> The Cure - Lovesong - 2010 Remaster (0.86), The Cure - Friday I'm in Love (0.85), The Cure - Close to Me - 2006 Remaster (0.83), The Cure - Boys Don't Cry - Single Version (0.83), The Cure - In Between Days - 2006 Remaster (0.79), The Cure - Pictures of You - 2010 Remaster (0.79), The Cure - Just like Heaven - 2006 Remaster (0.78), New Order - Age of Consent - 2015 Remaster (0.77), 
  Frank Sinatra - My Way -> Louis Armstrong - What A Wonderful World (0.78), The Beatles - Hey Jude - Remastered 2015 (0.74), Frank Sinatra - Somethin' Stupid (0.73), Louis Armstrong - La vie en rose - Single Version (0.71), Frank Sinatra - Strangers In 

In [17]:
# Persist the complete trained model to the file 'song2vec'.
model.save('song2vec')

In [16]:
# Export only the vectors in word2vec format (binary=True) to 'song2vec.bin',
# with the vocabulary written separately to 'songVocab.bin' via fvocab.
model.wv.save_word2vec_format(fname='song2vec.bin',fvocab='songVocab.bin',binary=True)