# Expand Spotify playlist with Word2Vec embeddings

In [1]:
from gensim.models import Word2Vec

model = Word2Vec.load("../models/w2v_v2")
len(set(model.wv.index_to_key))

281217

In [2]:
import pandas as pd

df = pd.read_csv("../data/track_uri_mappings.csv")
df = df.astype({"uri": "string", "name": "string", "artist": "string", "album": "string"})
df.head()

Unnamed: 0,uri,name,artist,album
0,spotify:track:58f6PxrFxQ8jEWoDoDVZ9C,Ease Back,The Meters,Funkify Your Life: The Meters Anthology
1,spotify:track:0WSlOSMLJWoWUpWci9nnRb,Cissy Strut,The Meters,The Meters
2,spotify:track:2fkmrMW5eV3VvHeUicem25,People Say - Single Version,The Meters,Funkify Your Life: The Meters Anthology
3,spotify:track:6YBc1KPVqvhB8ugdC9Enkh,Funky Miracle,The Meters,Look-Ka Py Py
4,spotify:track:0mfQH4OgwV8aQ7JUgVjkhd,Sing A Simple Song,The Meters,The Meters


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2262292 entries, 0 to 2262291
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uri     string
 1   name    string
 2   artist  string
 3   album   string
dtypes: string(4)
memory usage: 69.0 MB


In [4]:
df.describe()

Unnamed: 0,uri,name,artist,album
count,2262292,2262285,2262282,2262285
unique,2262292,1483758,287739,571626
top,spotify:track:58f6PxrFxQ8jEWoDoDVZ9C,Intro,Johann Sebastian Bach,Greatest Hits
freq,1,1559,5417,2171


In [5]:
playlist = [
    "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI",
    "spotify:track:6I9VzXrHxO9rA9A5euc8Ak",
    "spotify:track:7uKcScNXuO3MWw6LowBjW1",
    "spotify:track:3dFwpxh2yH7C7p9BGEKLVB",
    "spotify:track:7KXjTSCq5nL1LoYtL7XAwS",
]
missing = [track for track in playlist if track not in model.wv]
assert not missing, "There are tracks missing from model.wv"

In [6]:
pl_df = pd.DataFrame(playlist, columns=["uri"])
pl_df = pl_df.merge(df, on="uri", how="left")
pl_df

Unnamed: 0,uri,name,artist,album
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone
2,spotify:track:7uKcScNXuO3MWw6LowBjW1,"One, Two Step",Ciara,Goodies
3,spotify:track:3dFwpxh2yH7C7p9BGEKLVB,Goodies,Ciara,Goodies
4,spotify:track:7KXjTSCq5nL1LoYtL7XAwS,HUMBLE.,Kendrick Lamar,DAMN.


In [7]:
import numpy as np

vectors = [model.wv[track] for track in playlist]
centroid = np.mean(vectors, axis=0)

recommendations = model.wv.similar_by_vector(centroid, topn=20)
recommendations = [r for r, _ in recommendations if r not in playlist]
recommendations

['spotify:track:0RhXv8YzZD1jLiITZiMFe0',
 'spotify:track:2SUjRDIn23wnKXpO69apxl',
 'spotify:track:1GH5tpvz5g1ZGKNBPBWkTm',
 'spotify:track:6gyHX1r0K9fGUjRJKHSRNw',
 'spotify:track:3fIExmgYdyeMiIKgSUohZ4',
 'spotify:track:4MPo5nRV5Z1Mdv82DDCitK',
 'spotify:track:04KTF78FFg8sOHC1BADqbY',
 'spotify:track:4duAM9CrgspEQyP3B8ZyhT',
 'spotify:track:6nv0cOrhhVdzUakLrpL77h',
 'spotify:track:42D1iUtSH8CpktXbnVzAm6',
 'spotify:track:0aL8yL3w7z7H0l33spCKfj',
 'spotify:track:2gam98EZKrF9XuOkU13ApN',
 'spotify:track:3x9LFpV6tzq0qWaBMvY2UT',
 'spotify:track:2cMTIlktg3M9mXYqCPqw1J',
 'spotify:track:6RcQOut9fWL6FSqeIr5M1r',
 'spotify:track:3jagJCUbdqhDSPuxP8cAqF',
 'spotify:track:0vupCkmy497h49a74Xsxj1']

In [8]:
rec_df = pd.DataFrame(recommendations, columns=["uri"])
rec_df = rec_df.merge(df, on="uri", how="left")
rec_df

Unnamed: 0,uri,name,artist,album
0,spotify:track:0RhXv8YzZD1jLiITZiMFe0,"1,2 Step",Clara Pedro,Aqui T치-Se Bem
1,spotify:track:2SUjRDIn23wnKXpO69apxl,Lip Gloss/ No Music - Main Version - clean,Lil Mama,VYP - Voice of the Young People
2,spotify:track:1GH5tpvz5g1ZGKNBPBWkTm,London Bridge,Fergie,London Bridge
3,spotify:track:6gyHX1r0K9fGUjRJKHSRNw,Run It!,Chris Brown,Run It!
4,spotify:track:3fIExmgYdyeMiIKgSUohZ4,No Diggity,MC Cologne,"Move Ya Feet, Vol. 2"
5,spotify:track:4MPo5nRV5Z1Mdv82DDCitK,Hot in Here,Crib,The Very Best of Nelly
6,spotify:track:04KTF78FFg8sOHC1BADqbY,Hot In Herre,Nelly,Nellyville
7,spotify:track:4duAM9CrgspEQyP3B8ZyhT,2 Step - Feat. T-Pain & Jim Jones - Remix,Lydell Lucky,Making Dollars
8,spotify:track:6nv0cOrhhVdzUakLrpL77h,Shake Your Pom Pom - R.N. Remix - 32 Counts Edit,MC Joe,Latin Hits for Summer 2017
9,spotify:track:42D1iUtSH8CpktXbnVzAm6,Queen Bee,Gen. Mill$ (feat. Ryan),Queen Bee


In [13]:
recommendations = [r for r, _ in model.wv.most_similar(playlist, topn=20)]
recommendations = [r for r in recommendations if r not in playlist]
rec_df = pd.DataFrame(recommendations, columns=["uri"])
rec_df = rec_df.merge(df, on="uri", how="left")
rec_df

Unnamed: 0,uri,name,artist,album
0,spotify:track:0RhXv8YzZD1jLiITZiMFe0,"1,2 Step",Clara Pedro,Aqui T치-Se Bem
1,spotify:track:2SUjRDIn23wnKXpO69apxl,Lip Gloss/ No Music - Main Version - clean,Lil Mama,VYP - Voice of the Young People
2,spotify:track:1GH5tpvz5g1ZGKNBPBWkTm,London Bridge,Fergie,London Bridge
3,spotify:track:6gyHX1r0K9fGUjRJKHSRNw,Run It!,Chris Brown,Run It!
4,spotify:track:4MPo5nRV5Z1Mdv82DDCitK,Hot in Here,Crib,The Very Best of Nelly
5,spotify:track:3fIExmgYdyeMiIKgSUohZ4,No Diggity,MC Cologne,"Move Ya Feet, Vol. 2"
6,spotify:track:4duAM9CrgspEQyP3B8ZyhT,2 Step - Feat. T-Pain & Jim Jones - Remix,Lydell Lucky,Making Dollars
7,spotify:track:04KTF78FFg8sOHC1BADqbY,Hot In Herre,Nelly,Nellyville
8,spotify:track:6nv0cOrhhVdzUakLrpL77h,Shake Your Pom Pom - R.N. Remix - 32 Counts Edit,MC Joe,Latin Hits for Summer 2017
9,spotify:track:42D1iUtSH8CpktXbnVzAm6,Queen Bee,Gen. Mill$ (feat. Ryan),Queen Bee
