In [22]:
import pandas as pd
import numpy as np


In [23]:
# Load the raw track table from a CSV file expected in the working directory.
df = pd.read_csv("spotify_songs.csv")

In [24]:
# Work on a random subset of at most 25,000 rows to keep the later steps tractable.
# - random_state pins the draw so a fresh "Restart & Run All" reproduces the same
#   rows (and therefore the same similarity matrix and exported pickles).
# - min(...) avoids the ValueError that sample() raises when asked for more rows
#   than the frame contains.
df = df.sample(n=min(25000, len(df)), random_state=42).reset_index(drop=True)

In [25]:
# Keep a single row per (track_name, track_artist) pair; the first occurrence wins.
# The index is not reset here, so it keeps its pre-deduplication labels.
df = df.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

In [26]:
# Number the remaining rows 0..len(df)-1 in their current order; the helper
# functions below look songs up through this column.
df['id'] = range(len(df))

In [27]:
# Preview the first five rows as a quick sanity check of the steps above.
df.head(n=5)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,id
0,39shmbIHICJ2Wxnk1fPSdz,Should I Stay or Should I Go - Remastered,The Clash,79,1ZH5g1RDq3GY1OvyD0w0s2,Combat Rock (Remastered),1982,The Sound of Permanent Wave,4EYSGTuqe9cVfSVpX4gtGv,rock,...,-6.463,1,0.115,0.079,0.0,0.384,0.816,113.375,188987,0
1,1cU34sZG9kF4FYHCoAD0Ib,Some Kind Of Wonderful - Remastered 1999,Grand Funk Railroad,60,0ib2UtSmLGssyqyoY6X8cm,30 Years Of Funk: 1969-1999 The Anthology,1999-01-01,HARD ROCK CAFE,55ybnG2z8rpTIerRfnCsw6,rock,...,-6.51,1,0.0514,0.144,0.0,0.36,0.647,121.472,202733,1
2,5hOaRdlUqUSZ5nv8kQx1DF,Don't You (Forget About Me),The Wind and The Wave,45,34WkSeCkKh4wMsz7AFtU0F,Covers One,2015-02-17,Bluegrass Covers,37i9dQZF1DX56crgoe4TG3,r&b,...,-4.409,1,0.0387,0.00324,0.0295,0.367,0.666,119.978,249587,2
3,4i2OsaRNDW7Jr3KqUb4xfw,Her (Loving You),Glades,35,1Ezvo2RC12cFqbJSdSzQXv,Her (Loving You),2015-10-07,Indie Poptimism,21nxmipQe5xtXoHHPaYfnY,pop,...,-4.252,1,0.0649,0.0542,0.000349,0.13,0.311,176.0,215353,3
4,4yxSvdt8kl6jMdvFaiNtve,Lucky Love,Michael Seyer,54,5p3gsTRh2rd4cZfNjyVgln,Bad Bonez,2018-03-07,②⓪①⑨ mixed,2bOjjgN1S3Gqd8vSMyafvJ,rock,...,-13.288,1,0.0263,0.474,0.00366,0.105,0.234,143.966,286186,4


Specify features for content-based filtering

In [28]:
# Columns that are concatenated into one text document per song.
features = [
    'track_artist',
    'track_album_name',
    'playlist_genre',
    'energy',
    'key',
    'tempo',
]

In [29]:
# Replace missing values in every feature column with an empty string so the
# string-building step never has to deal with NaN.
for column_name in features:
    filled_column = df[column_name].fillna('')
    df[column_name] = filled_column

In [30]:
def combine_features(row):
    """Build the single text document that represents one song.

    The artist, album name and playlist genre are joined with the string forms
    of energy, key and tempo, separated by single spaces.

    Args:
        row: Any mapping-like object indexable by column name (a DataFrame row
            when used with ``df.apply(..., axis=1)``).

    Returns:
        str: ``"<artist> <album> <genre> <energy> <key> <tempo>"``.

    Text fields that are ``None`` or NaN contribute an empty string, and
    non-string text values are converted with ``str``; plain ``+``
    concatenation would raise ``TypeError`` on those. Numeric fields always go
    through ``str`` unchanged.
    """
    def as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    text_fields = ('track_artist', 'track_album_name', 'playlist_genre')
    numeric_fields = ('energy', 'key', 'tempo')

    parts = [as_text(row[name]) for name in text_fields]
    parts.extend(str(row[name]) for name in numeric_fields)
    return " ".join(parts)

In [31]:
# Build the per-song text document row by row and store it as a new column.
df = df.assign(combined_features=df.apply(combine_features, axis=1))

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
# Bag-of-words vectorizer using scikit-learn's default settings.
# NOTE(review): the documented default token pattern keeps only tokens of two or
# more alphanumeric characters and treats punctuation as a separator, so numbers
# rendered as text (e.g. "0.816", or a single-digit key) are split or dropped.
# Confirm this is the intended treatment of energy / key / tempo.
cv = CountVectorizer()

In [34]:
# Learn the vocabulary from the combined feature strings and build the
# token-count matrix, one row per entry of df["combined_features"].
count_matrix = cv.fit_transform(df["combined_features"])

Compute the pairwise cosine similarity between songs

In [35]:
# Cosine similarity between every pair of rows of count_matrix.
# NOTE(review): the result has one row and one column per song, so its size grows
# quadratically with the sample size — check memory before enlarging the sample.
similar = cosine_similarity(count_matrix)

Helper Functions

In [36]:
def get_title_from_index(index):
    """Return the track name stored for the given ``id``.

    Args:
        index: Value to match against the ``id`` column of the global ``df``.

    Returns:
        The ``track_name`` of the first matching row, or ``None`` when no row
        carries that id.
    """
    titles = df.loc[df["id"] == index, "track_name"]
    if titles.empty:
        return None
    return titles.values[0]

def get_index_from_title(title, artist=None):
    """Return the ``id`` of the first song whose track name equals ``title``.

    Args:
        title: Exact track name to look up in the global ``df``.
        artist: Optional exact artist name. Rows are de-duplicated on the
            (track_name, track_artist) pair, so the same title can appear under
            several artists; pass ``artist`` to pick a specific one. When
            omitted, the first row with a matching title is used.

    Returns:
        The value of the ``id`` column for the first matching row.

    Raises:
        IndexError: If no row matches. The message names the title (and artist)
            that was searched for.
    """
    matches = df["track_name"] == title
    if artist is not None:
        matches = matches & (df["track_artist"] == artist)

    ids = df.loc[matches, "id"].values
    if len(ids) == 0:
        by_artist = "" if artist is None else f" by {artist!r}"
        raise IndexError(f"no track named {title!r}{by_artist} in the dataset")
    return ids[0]

def get_artist_from_index(index):
    """Return the artist stored for the given ``id``.

    Args:
        index: Value to match against the ``id`` column of the global ``df``.

    Returns:
        The ``track_artist`` of the first matching row, or ``None`` when no row
        carries that id.
    """
    artists = df.loc[df["id"] == index, "track_artist"]
    if artists.empty:
        return None
    return artists.values[0]

Recommender Function

In [37]:
def recommend(song):
    """Rank every song by its similarity to ``song``.

    Args:
        song: Track name, resolved to a row of ``similar`` through
            ``get_index_from_title``.

    Returns:
        list[tuple]: ``(position, score)`` pairs for every entry of that row,
        ordered from highest to lowest score. Entries with equal scores keep
        their original relative order. The row's own entry is included in the
        result; nothing is filtered out.
    """
    scores = similar[get_index_from_title(song)]
    ranked = [(position, score) for position, score in enumerate(scores)]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

Test Model

In [38]:
# Rank all songs against one query title. The lookup fails with an IndexError
# if this exact title is not present in df["track_name"].
sorted_similar_songs = recommend("Adventure of a Lifetime")

In [39]:
# Print the 50 entries that follow the top-ranked one as "title, artist".
# The first entry is skipped on the assumption that the best match for a song is
# the song itself — TODO confirm this holds when other rows tie at the top score.
# An f-string is used instead of `+` so a missing title/artist (None or NaN)
# is printed rather than raising TypeError.
for song in sorted_similar_songs[1:51]:
    print(f"{get_title_from_index(song[0])}, {get_artist_from_index(song[0])}")

Hymn for the Weekend, Coldplay
The Scientist, Coldplay
Clocks, Coldplay
City Of Dreams - Radio Edit, Dirty South
Ghost Story, Coldplay
Locked Away, R. City
Down, Brandyn Burnette
O - Reprise, Coldplay
Sweet Dreams (Are Made of This), Winati
Sweet Dreams (Are Made of This) - Remastered, Eurythmics
Fix You, Coldplay
Peach, Broods
Daydreams - Sultan + Shepard Echoes Of Life Remix, ARTY
Little Talks, Of Monsters and Men
From Finner, Of Monsters and Men
Beautiful Life, Ace of Base
We Sink, Of Monsters and Men
King And Lionheart, Of Monsters and Men
A Sky Full of Stars - Robin Schulz Edit, Coldplay
Set Fire to the Rain, Adele
My Love (feat. Major Lazer, WizKid, Dua Lipa), Wale
Telecommunication, A Flock Of Seagulls
The More You Live, The More You Love, A Flock Of Seagulls
Honey, Erykah Badu
Fight Song, Rachel Platten
Happier - Breathe Carolina Remix, Marshmello
Lemontree, Julian Daniel
i miss the old u, blackbear
Paradise, Coldplay
In The Middle, dodie
Yellow, Coldplay
Silver Bombs, [:SITD:]

Export pickle files for use in Python

In [40]:
import pickle

In [41]:
# Persist the similarity matrix. The context manager closes (and flushes) the
# file even if pickling fails, which the bare open(...) argument never did.
# Assumes the "models" directory already exists.
with open("models/similarity.pkl", "wb") as similarity_file:
    pickle.dump(similar, similarity_file)

In [42]:
# Persist the processed song table. The context manager closes (and flushes) the
# file even if pickling fails, which the bare open(...) argument never did.
# Assumes the "models" directory already exists.
with open("models/df.pkl", "wb") as df_file:
    pickle.dump(df, df_file)