# Train a music recommender

In [2]:
import gensim
import sqlite3
from operator import itemgetter

In [2]:
# SQLite database that holds the `songs` table queried later in this notebook.
SONGS_DB_PATH = 'data/songs.db'
# NOTE(review): not referenced anywhere in the visible notebook -- presumably the
# raw playlists export the song-id corpus was derived from; confirm or remove.
PLAYLISTS_PATH = 'data/playlists.ndjson'
# Training corpus: a text file read line by line, each line split on whitespace
# into song-id tokens.
SONG_IDS_PATH = 'data/songs_ids.txt'
# Destination file for the trained Word2Vec model.
MODEL_PATH = 'zoo/15/songs.word2vec'

## Training the model

In [3]:
class WordSplitter(object):
    """Re-iterable view over a text file that yields one token list per line.

    Every call to ``__iter__`` reopens the file, so the same instance can be
    iterated several times (unlike a plain generator, which is exhausted after
    a single pass).

    Args:
        filename: Path of the text file to read.
        max_count: Maximum number of lines to yield per pass. ``None``
            (the default) means the whole file; ``0`` yields nothing.

    Yields:
        list[str]: The whitespace-separated tokens of each line
        (an empty list for a blank line).
    """

    def __init__(self, filename, max_count=None):
        self.filename = filename
        self.max_count = max_count

    def __iter__(self):
        with open(self.filename) as fin:
            for count, line in enumerate(fin):
                # `count` is the number of lines already yielded, so the pass
                # must stop as soon as it reaches the cap; comparing with `>`
                # would let one extra line through.
                if self.max_count is not None and count >= self.max_count:
                    return
                yield line.split()

In [4]:
# Stream the corpus lazily: each line of SONG_IDS_PATH becomes one list of tokens.
model_input = WordSplitter(SONG_IDS_PATH)

In [5]:
%%time

# min_count=4: tokens that occur fewer than 4 times in the corpus are left out
# of the vocabulary. Every other hyperparameter stays at gensim's default.
model = gensim.models.Word2Vec(model_input, min_count=4)

CPU times: user 1min 15s, sys: 2.3 s, total: 1min 18s
Wall time: 36.5 s


In [6]:
# Write through a context manager so the file handle is flushed and closed as
# soon as the model is saved, instead of staying open for the rest of the
# kernel session. The handle (not the path) is still what gets passed to
# `save`, so the on-disk format is unchanged.
with open(MODEL_PATH, 'wb') as model_file:
    model.save(model_file)

## Using the model to retrieve and recommend songs

In [7]:
# One connection/cursor pair, passed explicitly to the lookup helpers below.
conn = sqlite3.connect(SONGS_DB_PATH)
cursor = conn.cursor()

Search for songs by name and rank the matches by popularity (defined as the frequency of the song in the model's vocabulary):

In [8]:
def find_song(song_name, cursor, limit=10):
    """Search songs by name and rank the matches by popularity.

    Popularity is the token count stored in the vocabulary of the
    module-level ``model``; songs that are not in that vocabulary are
    skipped.

    Args:
        song_name: Substring to look for in the ``name`` column. It is
            inserted into a LIKE pattern unescaped, so ``%`` and ``_`` inside
            it act as wildcards.
        cursor: DB-API cursor on a database containing a ``songs`` table
            whose first column is the song id.
        limit: Maximum number of matches to return.

    Returns:
        list[tuple]: Up to ``limit`` rows of ``songs``, each extended with the
        song's vocabulary count as its last element, sorted by that count in
        descending order.
    """
    # Fold the pattern in SQL with the same UPPER() that is applied to the
    # column, so the match does not depend on LIKE being case-insensitive
    # (SQLite's default, but switchable per connection). Folding on the
    # Python side is deliberately avoided: SQLite's built-in UPPER only
    # handles ASCII, and the two would disagree on accented titles.
    cursor.execute(
        "SELECT * FROM songs WHERE UPPER(name) LIKE UPPER(?)",
        (f'%{song_name}%',)
    )
    vocab = model.wv.vocab
    # The element song_info[0] contains song_id
    results = [
        song_info + (vocab[song_info[0]].count,)
        for song_info in cursor.fetchall()
        if song_info[0] in vocab
    ]
    # Most frequent songs first; the sort is stable, so ties keep query order.
    results.sort(key=itemgetter(-1), reverse=True)
    return results[:limit]

In [138]:
# Search by name, then re-key the matches by song id.
results = find_song('black or white', cursor)
# NOTE: `results` is rebound here from a list of rows to a dict
# {song_id: (song_name, artist, score)}; `score` is the vocabulary count
# appended by find_song. Later cells read the dict form.
results = {
    song_id: (song_name, artist, score)
    for song_id, song_name, artist, score in results
}
for r in results:
    print(r)
    print(results[r], '\n')

2Cy7QY8HPLk925AyNAt6OG
('Black or White - Single Version', 'Michael Jackson', 96) 

7EsjkelQuoUlJXEw7SeVV4
('Black or White', 'Michael Jackson', 35) 

6wdviVWctiZnY6tVn6bh6A
('Black or White - Single Version', 'Michael Jackson', 19) 

2PdFSWL1Zpg0lRnujLnNJJ
('Black or White - Remastered Version', 'Michael Jackson', 18) 

62ECYqWLwhndUSjfVdpSPc
('Black or White', 'Michael Jackson', 9) 

3UHYHc72E3SXoMHI0gux7x
('Black or White (Glee Cast Version)', 'Glee Cast', 5) 

4NUSJn6eHJOOr4MzhQLzZT
('Black Or White', 'Honeywagon', 4) 

5VnOk8tmQBoU1vEiiNvSWx
('Black Or White (Bossa version)', 'Joana Duah', 4) 



In [139]:
# Take the first key of `results`. The dict was filled in the order returned by
# find_song (descending frequency), so this is the most frequent match.
song_id = next(iter(results))
song_info = results[song_id]
song_id, song_info

('2Cy7QY8HPLk925AyNAt6OG',
 ('Black or White - Single Version', 'Michael Jackson', 96))

In [132]:
def suggest_songs(song_id, cursor, limit=10):
    """Recommend the songs most similar to ``song_id``.

    Neighbours come from ``most_similar`` on the module-level ``model`` and
    are then looked up in the ``songs`` table. A neighbour with no row in the
    table is left out of the result.

    Args:
        song_id: Id of the seed song; it is passed to ``most_similar``
            unchanged.
        cursor: DB-API cursor on a database containing a ``songs`` table with
            an ``id`` column that is also the first column of each row.
        limit: Number of neighbours requested from the model (an int,
            default 10).

    Returns:
        list[tuple]: Rows of ``songs``, each extended with the similarity
        score as its last element, sorted by that score in descending order.
    """
    similar = dict(model.wv.most_similar(song_id, topn=limit))
    song_ids = list(similar)
    # One "?" placeholder per id keeps the query parameterised.
    placeholders = ','.join('?' * len(song_ids))
    query = 'SELECT * FROM songs WHERE id in ({})'.format(placeholders)
    cursor.execute(query, song_ids)
    results = [
        song_info + (similar[song_info[0]],)
        for song_info in cursor.fetchall()
    ]
    # The database returns rows in arbitrary order; restore the ranking by
    # similarity score.
    results.sort(key=itemgetter(-1), reverse=True)
    return results

In [136]:
# Print the seed song first, then one line per recommendation
# (row columns followed by the similarity score).
print(song_id, *song_info)
for t in suggest_songs(song_id, cursor):
    print(*t)

2Cy7QY8HPLk925AyNAt6OG Black or White - Single Version Michael Jackson 96
0gmbgwZ8iqyMPmXefof8Yf How You Remind Me Nickelback 0.9992876052856445
4eHbdreAnSOrDDsFfc4Fpm I Will Always Love You Whitney Houston 0.9992775917053223
2tUBqZG2AbRi7Q0BIrVrEj I Wanna Dance with Somebody (Who Loves Me) Whitney Houston 0.9990498423576355
3cfOd4CMv2snFaKAnMdnvK All Star Smash Mouth 0.9990429878234863
0COqiPhxzoWICwFCS4eZcp Bring Me To Life Evanescence 0.9990196228027344
5QpaGzWp0hwB5faV8dkbAz Wherever You Will Go The Calling 0.9990159869194031
37Q5anxoGWYdRsyeXkkNoI Heaven Is a Place on Earth Belinda Carlisle 0.9989984035491943
1Je1IMUlBXcx1Fz0WE7oPT Wannabe Spice Girls 0.998955249786377
6Qyc6fS4DsZjB2mRW9DsQs Iris The Goo Goo Dolls 0.9988621473312378
37qI0mchgzUSeUhPiwUWPY Wherever You Will Go The Calling 0.9988144636154175


In [141]:
# Same lookup for a different seed, this time passing the song id directly.
suggest_songs('0U0ldCRmgCqhVvD6ksG63j', cursor)

[('6ZFbXIJkuI1dVNWvzJzown', 'Time', 'Hans Zimmer', 0.9974794983863831),
 ('6VuUMo1TusnqBxLTnr3hOI', 'Brillas', 'León Larregui', 0.9974169731140137),
 ('1ykbtFnlIjmIFnZ8j6wg6i',
  'The Breaking of the Fellowship (feat. "In Dreams")',
  'Howard Shore',
  0.9974009990692139),
 ('4Gf68vwxa69hCiXmJ1jvgj', 'Azúcar Amargo', 'Fey', 0.9973743557929993),
 ('18z7tK7u9DcDw85LYRR5Fe',
  'Cornfield Chase',
  'Hans Zimmer',
  0.9973671436309814),
 ('1U3tv41tFRgOfbytpuYQrR',
  'Ferida Curada',
  'Zé Neto & Cristiano',
  0.9973652362823486),
 ('4p7XH4NhQ25iGYrrbg93gt', 'Suave', 'Luis Miguel', 0.9973102807998657),
 ('2FxXQtnQoryi52LA0V9pJV', 'On My Own', 'Drako', 0.9973098039627075),
 ('5nzEilkVhvnCD0xpW3raTy', 'No Dejes Que...', 'Caifanes', 0.9973037242889404),
 ('4fFgpzgSkmn3wajioLBPOr',
  'Mis Ojos Lloran Por Ti',
  'Big Boy',
  0.9973029494285583)]