# Training a Song Embedding Model to Find Similar Songs in Playlists

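In this notebook, a Word2Vec model is trained on playlists to learn song embeddings: each playlist is treated as a sentence and each song as a word. The trained embeddings are then used to find the songs most similar to a given example song.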

In [1]:
import pandas as pd
from urllib import request

In [2]:
# Open an HTTP connection to the playlist dataset.
# NOTE: despite its name, `df` is not a DataFrame; it is the raw
# http.client.HTTPResponse returned by urlopen. Its body is read and
# decoded in the next cell.
df = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')
df

<http.client.HTTPResponse at 0x78040717cac0>

In [3]:
# Read the whole response body, decode it as UTF-8 and split it into lines.
# The first 2 lines contain metadata, so the [2:] slice drops them.
lines = df.read().decode("utf-8").split("\n")[2:]

In [4]:
# Tokenise every line on whitespace (one token per song id) and keep only
# the playlists that contain at least two songs; empty lines and
# single-song playlists are dropped.
playlists_clean = [
    song_ids
    for song_ids in map(str.split, lines)
    if len(song_ids) > 1
]

In [5]:
# Load song metadata: one song per line, tab-separated as id, title, artist.
# The response is opened in a `with` block so the connection is closed as
# soon as the body has been read.
with request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt') as songs_response:
    # Trailing newlines are removed before splitting so that a final line
    # break in the file does not produce an empty record (which would become
    # a row with an empty id and no title/artist). Only the end of the text
    # is trimmed, so the position of every real row is unchanged.
    songs_file = songs_response.read().decode('utf_8').rstrip('\n').split('\n')

songs = [s.rstrip().split('\t') for s in songs_file]

# Index the table by song id; ids are kept as strings, exactly as read.
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

In [6]:
# Peek at the first two cleaned playlists; each one is a list of song-id strings.
print('Playlist #1:\n', playlists_clean[0], "\n")
print('Playlist #2:\n', playlists_clean[1])

Playlist #1:
 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
 ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117', '

In [7]:
# Training the model
from gensim.models import Word2Vec

# Train Word2Vec on the playlists: each playlist plays the role of a
# sentence and each song id the role of a word.
model = Word2Vec(
    playlists_clean,
    vector_size=32,  # dimensionality of each song embedding
    window=20,       # context: up to 20 songs on either side of the target song
    negative=50,     # negative sampling: 50 "noise" songs drawn per positive example
    min_count=1,     # keep every song, even those that occur only once
    workers=4,       # number of training threads
)

In [8]:
# Ask the model for songs similar to song #21

# The model was trained on string tokens, hence str(song_id) in the query below.
song_id = 21

# With no `topn` argument, most_similar returns the 10 nearest songs as
# (song id, similarity score) pairs, best match first.
model.wv.most_similar(positive=str(song_id))

[('5878', 0.9962319135665894),
 ('35723', 0.9959354996681213),
 ('21108', 0.995265543460846),
 ('387', 0.9952342510223389),
 ('1143', 0.9950798153877258),
 ('346', 0.9950380325317383),
 ('126', 0.9945993423461914),
 ('269', 0.9944682121276855),
 ('442', 0.9943040013313293),
 ('28', 0.9942772388458252)]

In [9]:
# Name and information of the query song (`song_id`, set in the previous cell).
# Reusing `song_id` instead of repeating the literal keeps this lookup in
# sync with the similarity query when a different song is chosen.
# NOTE(review): the lookup is positional, which assumes row i of songs_df
# is the song with id i; confirm against the metadata file.

print(songs_df.iloc[song_id])

title     Do For Love
artist           2Pac
Name: 21 , dtype: object


In [10]:
# Look up the title and artist of the recommended songs; for song 21 (2Pac) the recommendations fall in the same hip-hop / R&B genre
import numpy as np

def print_recommendations(song_id, topn=5):
    """Return the metadata rows of the songs most similar to ``song_id``.

    Despite the name, nothing is printed: the matching rows of ``songs_df``
    are returned so the notebook can render them as a table.

    Parameters
    ----------
    song_id : int or str
        Id of the query song. It is converted to ``str`` because the model
        was trained on string tokens.
    topn : int, default 5
        Number of similar songs to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``songs_df`` for the ``topn`` nearest songs, in the order
        returned by the model.
    """
    neighbours = model.wv.most_similar(positive=str(song_id), topn=topn)
    # most_similar yields (song id, similarity) pairs with the id as a string.
    # Convert the ids to real integers before handing them to .iloc instead
    # of relying on a string array being coerced to positions implicitly.
    positions = [int(similar_id) for similar_id, _similarity in neighbours]
    # NOTE(review): positional lookup assumes row i of songs_df is the song
    # with id i; confirm against the metadata file.
    return songs_df.iloc[positions]

# Show the recommendations for the query song chosen earlier (`song_id`),
# rather than repeating the literal id here.
print_recommendations(song_id)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5878,Invented Sex (w\/ Drake),Trey Songz
35723,I Won't Tell (w\/ J. Holiday),Fat Joe
21108,Playaz Club,Rappin' 4-Tay
387,Say Something (w\/ Drake),Timbaland
1143,Mad,Ne-Yo
