# Spotify Recommender
#### Setting up the environment

In [103]:
# import necessary libraries
from dotenv import load_dotenv
import os
import base64
from requests import post, get
import json
import pandas as pd
import numpy as np
import matplotlib as plt

# to suppress warnings that I may get
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# set up the environment
load_dotenv()
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET") 


#### Functions to set up token authorization

In [104]:
# function to generate the token
def get_token():
    auth_string = client_id + ":" + client_secret
    auth_bytes = auth_string.encode("utf-8")
    auth_base64 = str(base64.b64encode(auth_bytes), "utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}
    result = post(url, headers=headers, data=data)
    json_result = json.loads(result.content)
    token = json_result["access_token"]
    return token

# function to generate the authorization header
def get_auth_header(token):
    return {"Authorization": "Bearer " + token}

#### Functions getting info from Spotify's Web API

In [105]:
# search for artist by name
def search_for_artist(token, artist_name):
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    query = f"?q={artist_name}&type=artist&limit=1"

    query_url = url + query
    result = get(query_url, headers=headers)
    json_result = json.loads(result.content)["artists"]["items"]
    if len(json_result) == 0:
        print("No artist with this name exists...")
        return None
    return json_result[0]

# search for track by name
def search_for_track(token, track_name):
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    query = f"?q={track_name}&type=track&limit=1"

    query_url = url + query
    result = get(query_url, headers=headers)
    json_result = json.loads(result.content)["tracks"]["items"]
    if len(json_result) == 0:
        print("No track with this name exists...")
        return None
    return json_result[0]

# get artist by artist id
def get_artist(token, artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)
    return json_result

# get songs by artist from artist id
def get_songs_by_artist(token, artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?country=CA"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)["tracks"]
    return json_result

# get track info from track id
def get_track(token, track_id):
    url = f"https://api.spotify.com/v1/tracks/{track_id}"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)
    return json_result

# get track audio features from track id
def get_track_features(token, track_id):
    url = f"https://api.spotify.com/v1/audio-features/{track_id}"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)
    return json_result

# get all available markets on Spotify (CA, US, etc)
def get_markets(token):
    url = "https://api.spotify.com/v1/markets"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)
    return json_result

# get all available genres in the database
def get_genres(token):
    url = "https://api.spotify.com/v1/recommendations/available-genre-seeds"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)
    return json_result

# get recommendations based on tracks
def get_recommendations(token, seed_tracks):
    url = f"https://api.spotify.com/v1/recommendations?market=CA&limit=1&seed_tracks={seed_tracks}"
    headers = get_auth_header(token)
    result = get(url, headers=headers)
    json_result = json.loads(result.content)['tracks']
    return json_result


#### Set up token and play around with Spotify's API

In [106]:
# set up token that allows us to use the Spotify API
token = get_token()

# getting familiar with Spotify API, printing top songs of an artist given their artist name
result = search_for_artist(token, "Of Monsters and Men")
artist_id = result["id"]
print(result['name'], "Top Tracks")
# by default this gives top 10 tracks, but we'll use top 3 for the sake of smaller output
songs = get_songs_by_artist(token, artist_id)[:3]
for idx, song in enumerate(songs):
    print(f"{idx + 1}. {song['name']}")

Of Monsters and Men Top Tracks
1. Little Talks
2. Dirty Paws
3. Mountain Sound


## PROJECT PLAN
**Problem:** Given a song, recommend a new song.<br><br>
**Solution:**
1. Load in the dataset
2. Perform exploratory data analysis to gain info on the dataset
3. Perform feature engineering to remove irrelevant data, mutate data, etc
4. Get the user's song/track
5. Feature engineer the user's track
6. Use cosine similarity to find similarity between user's track, and the tracks in the dataset
7. Reveal to the user the track that is the most similar
8. Enjoy the recommended song :)

## EDA and Feature Engineering

#### Load in data from Kaggle dataset

In [107]:
# load in the data and display first track
all_song_data = pd.read_csv("spotify_data.csv")

# we will be modifying all_song_data so raw_data is to keep original info on tracks in the dataset
raw_data = all_song_data
all_song_data.head(1)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3


In [108]:
# drop the first column since it is just the index
all_song_data = all_song_data.drop(all_song_data.columns[0], axis=1)
all_song_data.head(1)

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3


In [109]:
# check the format of genres in Spotify's API (since it could be different from the format in the dataset)

# the artist stored in artist_id is Of Monsters and Men from above
artist_genre = get_artist(token, artist_id)['genres']
# print their genres
print('Of Monsters and Men Genres:', artist_genre)

# showing how to get an artist's genres given the artist name
jason_mraz = search_for_artist(token, "Jason Mraz")['id']
jason_genre = get_artist(token, jason_mraz)['genres']
print('Jason Mraz Genres:', jason_genre)

# get all unique genres in the dataset
kaggle_genres = list(all_song_data['genre'].unique())
# we store in a variable to show reduction in genres size which we will perform later
orig_len_genres = len(kaggle_genres)

Of Monsters and Men Genres: ['folk-pop', 'metropopolis', 'modern rock', 'stomp and holler']
Jason Mraz Genres: ['acoustic pop', 'dance pop', 'neo mellow', 'pop']


#### Manually reduce amount of genres

In [110]:
# list of genres to remove
remove_genres = []
# loop through all genres
for genre in kaggle_genres:
    # there are 4 subgenres of metal that can be reduced to just metal
    if 'metal' in genre and genre != 'metal':
        remove_genres.append(genre)
    # there are 3 subgenres of house that can be reduced to just house
    if 'house' in genre and genre != 'house':
        remove_genres.append(genre)
    # there are 5 subgenres of rock that can be reduced to just rock
    if 'rock' in genre and genre != 'rock':
        remove_genres.append(genre)
    # change these two pop genres to just pop
    if genre == "pop-film" or genre == "power-pop":
        remove_genres.append(genre)

# all these fit into edm
edm_subs = ['techno', 'minimal-techno', 'electronic', 'electro', 'dubstep']
for genre in edm_subs:
    remove_genres.append(genre)

# remove the genres in remove_genres
for genre in remove_genres:
    kaggle_genres.remove(genre)

print('We\'ve reduced the number of genres by', str(orig_len_genres - len(kaggle_genres)) + '.')


We've reduced the number of genres by 19.


#### Map values in dataset to remaining genres

In [111]:
map_genre = {
    'alt-rock': 'rock', 'hard-rock': 'rock', 'psych-rock': 'rock', 'punk-rock': 'rock', 'rock-n-roll': 'rock',
    'pop-film': 'pop', 'power-pop': 'pop',
    'black-metal': 'metal', 'death-metal': 'metal', 'heavy-metal': 'metal', 'metalcore': 'metal',
    'chicago-house': 'house', 'deep-house': 'house', 'progressive-house': 'house',
    'techno': 'edm', 'minimal-techno': 'edm', 'electronic': 'edm', 'electro': 'edm', 'dubstep': 'edm'}
all_song_data['genre'] = all_song_data['genre'].map(map_genre).fillna(all_song_data['genre'])
print(len(all_song_data['genre'].unique()))


63


#### One Hot Encode the Genres in Kaggle Dataset

In [112]:
from sklearn.preprocessing import OneHotEncoder


ohe = OneHotEncoder()
genre_array = ohe.fit_transform(all_song_data[['genre']]).toarray()
genre_labels = ohe.categories_[0]
genres = pd.DataFrame(genre_array, columns=genre_labels)

In [113]:
# concatenate the original DataFrame to the One Hot Encoded genres
all_song_data = pd.concat([all_song_data, genres], axis=1)

# drop the genres column
all_song_data = all_song_data.drop('genre', axis=1)

In [114]:
# dropping columns that don't help with machine learning to suggest songs (note that artist contributes already through their genre)
all_song_data = all_song_data.drop(['track_id', 'track_name', 'artist_name', 'popularity', 'duration_ms'], axis=1)

In [115]:
# checking data to see make sure we've removed all useless columns
all_song_data.iloc[0:1159764, 0:13]

Unnamed: 0,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2012,0.483,0.303,4,-10.058,1,0.0429,0.6940,0.000000,0.1150,0.1390,133.406,3
1,2012,0.572,0.454,3,-10.286,1,0.0258,0.4770,0.000014,0.0974,0.5150,140.182,4
2,2012,0.409,0.234,3,-13.711,1,0.0323,0.3380,0.000050,0.0895,0.1450,139.832,4
3,2012,0.392,0.251,10,-9.845,1,0.0363,0.8070,0.000000,0.0797,0.5080,204.961,4
4,2012,0.430,0.791,6,-5.419,0,0.0302,0.0726,0.019300,0.1100,0.2170,171.864,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159759,2011,0.373,0.742,10,-6.453,0,0.0736,0.3250,0.000141,0.1590,0.5220,107.951,3
1159760,2011,0.516,0.675,7,-7.588,0,0.0326,0.7880,0.000129,0.1300,0.2640,119.897,4
1159761,2011,0.491,0.440,5,-8.512,1,0.0274,0.4770,0.003130,0.0936,0.0351,100.076,4
1159762,2011,0.480,0.405,0,-13.343,1,0.0276,0.4310,0.000063,0.1250,0.2020,133.885,3


#### Scale the features
We use the StandardScaler from scikit-learn to scale the features so that each feature has the same affect on the similarities between songs when we later use cosine similarity to calculate similarities between two tracks. The StandardScaler makes the mean of each column 0 and the standard deviation of each column 1.

Note: if we were to use a model to create the parameters then we would not need to scale since the model would do it for us.

In [116]:
from sklearn.preprocessing import StandardScaler


original_data = all_song_data
scaler = StandardScaler()
scaled_songs = scaler.fit_transform(all_song_data)
all_song_data = pd.DataFrame(scaled_songs, columns=all_song_data.columns, index=all_song_data.index)
all_song_data


Unnamed: 0,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,singer-songwriter,ska,sleep,songwriter,soul,spanish,swedish,tango,trance,trip-hop
0,0.006614,-0.295093,-1.244617,-0.362224,-0.189477,0.758725,-0.393523,1.049230,-0.691229,-0.537219,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,-0.099126
1,0.006614,0.187349,-0.686393,-0.643503,-0.229602,0.758725,-0.528337,0.437940,-0.691191,-0.624750,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,-0.099126
2,0.006614,-0.696225,-1.499700,-0.643503,-0.832360,0.758725,-0.477092,0.046376,-0.691092,-0.664040,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,-0.099126
3,0.006614,-0.788377,-1.436853,1.325447,-0.151991,0.758725,-0.445556,1.367551,-0.691229,-0.712779,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,-0.099126
4,0.006614,-0.582391,0.559444,0.200333,0.626931,-1.318001,-0.493648,-0.701256,-0.638363,-0.562085,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,-0.099126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159759,-0.140361,-0.891371,0.378299,1.325447,0.444959,-1.318001,-0.151487,0.009755,-0.690842,-0.318390,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,10.088132
1159760,-0.140361,-0.116210,0.130610,0.481611,0.245213,-1.318001,-0.474727,1.314028,-0.690875,-0.462618,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,10.088132
1159761,-0.140361,-0.251728,-0.738149,-0.080946,0.082600,0.758725,-0.515723,0.437940,-0.682655,-0.643649,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,10.088132
1159762,-0.140361,-0.311356,-0.867539,-1.487338,-0.767597,0.758725,-0.514146,0.308358,-0.691055,-0.487485,...,-0.110679,-0.109641,-0.12503,-0.022542,-0.088307,-0.129601,-0.100584,-0.114735,-0.091322,10.088132


#### Load in the user's track based on ID

In [117]:
# ex. Little Talks by Of Monsters and Men track id = 2ihCaVdNZmnHZWt0fvAM7B
# Here we search for song by name and then use the id given to us to get the track info
track2 = search_for_track(token, 'Dancing With Your Ghost')
print(track2['id'])

track = get_track(token, '1TQXIltqoZ5XXyfCbAeSQQ')
track_features = get_track_features(token, '1TQXIltqoZ5XXyfCbAeSQQ')

# only taking the first artist on the track for now
track_artist = track['artists']
first_artist = track_artist[0]
artist_id = first_artist['id']

artist = get_artist(token, artist_id)
print(artist['genres'])

# we see here that the genres do not perfectly line up
genres = get_genres(token)['genres']
print('alt z' in genres)


1TQXIltqoZ5XXyfCbAeSQQ
['alt z']
False


#### Match up columns of existing tracks data with the new track

In [118]:
# release date might be slightly off due to this being the release date of the album rather than the track
release_year = track['album']['release_date'][0:4]

danceability = track_features['danceability']
energy = track_features['energy']
key = track_features['key']
loudness = track_features['loudness']
mode = track_features['mode']
speechiness = track_features['speechiness']
acousticness = track_features['acousticness']
instrumentalness = track_features['instrumentalness']
liveness = track_features['liveness']
valence = track_features['valence']
tempo = track_features['tempo']
time_signature = track_features['time_signature']

track_info = [release_year, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature]

tracks_genres = []
artist_genres = artist['genres']
for genre in kaggle_genres:
    for tGenre in artist_genres:
        if genre in tGenre:
            tracks_genres.append(genre)

for tGenre in artist_genres:
    if 'indie-pop' not in tracks_genres and 'alt' in tGenre:
        tracks_genres.append('indie-pop')
print(tracks_genres)

tracks_genres = []

# example to show that this works for more common genres
artist_genres = ['french-pop']
for genre in kaggle_genres:
    for tGenre in artist_genres:
        if genre in tGenre:
            tracks_genres.append(genre)

print(tracks_genres)

# set genre to 1 if it is the track's genre, else 0
for genre in kaggle_genres:
    if genre in tracks_genres:
        track_info.append(1)
    else:
        track_info.append(0)

['indie-pop']
['french', 'pop']


#### Shift and rescale the track's feature values based on how we shifted database

In [119]:
# use the original values to get mean and std of each feature so that we can shift and rescale the user's track features
features_mean = original_data.mean()
features_std = original_data.std()
num_features = len(track_info)
for i in range(num_features):
    track_info[i] = float(track_info[i])
    track_info[i] = (track_info[i] - features_mean[i]) / features_std[i]

#### Use cosine similarity to find track in database most similar to given track

In [120]:
# import function that computes cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity


track_info_arr = np.array(track_info, dtype='float64')
cos_sim = cosine_similarity(all_song_data, track_info_arr.reshape((1, -1))).reshape(-1)
sim_index = np.argmax(cos_sim)
sim_song_row = raw_data.iloc[sim_index]
rec_artist = sim_song_row['artist_name']
rec_track = sim_song_row['track_name']

#### TIME TO GIVE RECOMMENDATION :)

In [121]:
print('Song Recommendation: If you like', track['name'], 'by', track['artists'][0]['name'] + ',', 'then we would recommend', rec_track, 'by', rec_artist + '.')

Song Recommendation: If you like Dancing With Your Ghost by Sasha Alex Sloan, then we would recommend Ivory - Please by Joe Bel.
