I'm addicted to Spotify. What gets me somewhat excited for Monday morning is a little nugget called *__Discover Weekly__*. It's a playlist of recommended songs based on a user's preferences, which I'm guessing is based on play history. It's a machine learning-powered playlist generated by those magicians at Spotify. A software engineer explains how these song recommendations are made [here](https://medium.com/s/story/spotifys-discover-weekly-how-machine-learning-finds-your-new-music-19a41ab76efe).

My Discover Weekly playlist is hit or miss. Sometimes I find a few really good songs in there, but other times the majority of the songs are just "meh". So I decided to create a playlist of songs that I KNOW I like, and a playlist of songs that I KNOW I do not like. I'll combine these tracks into one playlist and use them as training data to feed into a machine learning algorithm. Once the algorithm is sufficiently trained, the hope is that it will be able to create me a filtered Discover Weekly playlist.

I use the [Spotipy](https://spotipy.readthedocs.io/en/latest/) Python library to access the Spotify Web API and obtain data on song features.

In [1]:
import spotipy
import spotipy.util as util
from config import client_id, client_secret, redirect_uri, username, good_playlist_id, bad_playlist_id
from dw_id import dw_playlist_id
import numpy as np
import pandas as pd

scope = 'playlist-modify-private playlist-modify-public playlist-read-private user-library-read'
token = util.prompt_for_user_token(username, scope, client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri)
if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

# Pull data for good and bad playlists

In [2]:
def get_playlist_tracks(username, playlist_id):
    results = sp.user_playlist_tracks(username, playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

def tracks_to_df(username, playlist_id, data_array):
    tracks = get_playlist_tracks(username, playlist_id)
    for i in range(len(tracks)):
        row = [tracks[i]['track']['id'],
              tracks[i]['track']['name'],
              tracks[i]['track']['artists'][0]['name'],
              tracks[i]['track']['popularity']]
        data_array.append(row)
        
    data_df = pd.DataFrame(data=data_array,columns=['id','name','artist','popularity'])
    return data_df

In [3]:
data_good = []
df_good = tracks_to_df(username, good_playlist_id, data_good)

data_bad = []
df_bad = tracks_to_df(username, bad_playlist_id, data_bad)

# Pull features

In [4]:
def chunks(mylist, chunk_size):
    # For item i in a range that is a length of l,
    for i in range(0, len(mylist), chunk_size):
        # Create an index range for l of n items:
        yield mylist[i:i+chunk_size]

In [5]:
def features_to_df(ids, data_array):
    # Create a list from the results of the function chunks, get features for batch of ids, append to array
    for i in range(0, len(list(chunks(ids, 50)))):
        ids_batch = list(chunks(ids, 50))[i]
        features_temp = sp.audio_features(tracks=ids_batch)
        data_array.append(features_temp)

    columns = list(data_array[0][0].keys())
    columns.sort()

    # convert to df
    # instantiate empty dataframe
    df_features = pd.DataFrame(columns = columns)

    for i in range(0, len(data_array)):
        df_temp = pd.DataFrame(data_array[i], columns = columns)
        df_features = df_features.append(df_temp, ignore_index=True)
    
    return df_features

In [6]:
good_ids=df_good['id'].tolist()
bad_ids=df_bad['id'].tolist()

In [7]:
good_features = []
df_features_good = features_to_df(good_ids, good_features)

bad_features = []
df_features_bad = features_to_df(bad_ids, bad_features)

df_features_good['target'] = 1
df_features_bad['target'] = 0

data = df_features_good.append(df_features_bad, ignore_index=True)

# Create test and training data

In [90]:
#Define the set of features that we want to look at
features_full = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence", "key", "duration_ms"]
features_variation = ["danceability", "energy", "tempo", "valence", "key", "duration_ms"]
features = features_variation

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data[features], data['target'], test_size = 0.25)

In [91]:
x_train.shape

(378, 6)

In [92]:
x_test.shape

(127, 6)

# Models

## 1. Decision Tree Classifier

In [93]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(min_samples_split=100)
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)
score = accuracy_score(y_test, tree_pred) * 100
print("Accuracy using Decision Tree: ", round(score, 1), "%")

Accuracy using Decision Tree:  72.4 %


In [94]:
# # https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
# # # https://towardsdatascience.com/interactive-visualization-of-decision-trees-with-jupyter-widgets-ca15dd312084

# from sklearn.externals.six import StringIO  
# from IPython.display import Image  
# from sklearn.tree import export_graphviz
# import pydotplus
# dot_data = StringIO()
# export_graphviz(tree_model, out_file=dot_data,  
#                 filled=True, rounded=True,
#                 special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

## 2. K Neighbours Classifier

In [95]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(3)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
score = accuracy_score(y_test, knn_pred) * 100
print("Accuracy using Knn Tree: ", round(score, 1), "%")

Accuracy using Knn Tree:  52.8 %


## 3. Multi-layer Perceptron

In [96]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(x_train, y_train)
mlp_pred = mlp.predict(x_test)
score = accuracy_score(y_test, mlp_pred) * 100
print("Accuracy using mlp Tree: ", round(score, 1), "%")

Accuracy using mlp Tree:  48.8 %


## 4. Random Forest Classifier

In [97]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, forest_pred) * 100
print("Accuracy using random forest: ", round(score, 1), "%")

Accuracy using random forest:  70.9 %


## 5. AdaBoost Classifier

In [98]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(x_train, y_train)
ada_pred = ada.predict(x_test)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, ada_pred) * 100
print("Accuracy using ada: ", round(score, 1), "%")

Accuracy using ada:  71.7 %


## 6. Naive Bayes

In [99]:
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB()
gauss.fit(x_train, y_train)
gauss_pred = gauss.predict(x_test)
score = accuracy_score(y_test, gauss_pred)*100
print("Accuracy using gauss: ", round(score, 1), "%")

Accuracy using gauss:  65.4 %


## 7. K Means Clustering

In [100]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)
k_means.fit(x_train, y_train)
predicted= k_means.predict(x_test)
score = accuracy_score(y_test, predicted)*100
print("Accuracy using Kmeans: ", round(score, 1), "%")

Accuracy using Kmeans:  19.7 %


## 8. Gradient Boosting Classifier

In [101]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
predicted = gbc.predict(x_test)
score = accuracy_score(y_test, predicted)*100
print("Accuracy using Gbc: ", round(score, 1), "%")

Accuracy using Gbc:  73.2 %


# Now apply predictions to my Discover Weekly playlist

In [102]:
data_dw = []
df_dw = tracks_to_df(username, dw_playlist_id, data_dw)

df_dw

Unnamed: 0,id,name,artist,popularity
0,5GukxVkcnm6wyuw17nYevK,Done - R3hab Remix,Nikki Vianna,46
1,0ap4E0W70EcUjXqItoM74l,Walk Away - 3LAU Deep Mix,3LAU,39
2,3ZuLTogqYwaL7DLqAP43t3,Growing Pains - Justin Caruso Remix,Alessia Cara,28
3,2bcTdyGjBUR8fknw2GeH0z,Gone (feat. Marvin Brooks) - Flyboy Remix,Maan On The Moon,42
4,1MqBckcnN45W32KSSHnylW,Sometimes,DallasK,59
5,7mAYdYyUrkUSArOdSrC7rR,Drew Barrymore,LU2VYK,46
6,0xARbGHzPT1o5t1sFlmyO2,Grip - Jay Pryor Remix,Seeb,54
7,4MuYNxE0Dgw0PFXz9Aquw6,Trampoline - BKAYE Remix,SHAED,53
8,1UBDqRniw09drFPk7hgzOF,All That She Wants,Jordan Jay,44
9,3SoHRFBuaJ11rD7uxxG5Uq,Off My Back,Thoreau,30


In [103]:
dw_ids=df_dw['id'].tolist()

In [104]:
dw_features = []
data_discover_weekly = features_to_df(dw_ids, dw_features)

In [120]:
pred_gbc = gbc.predict(data_discover_weekly[features])
pred_tree = tree.predict(data_discover_weekly[features])

In [121]:
likedSongs = 0
i = 0
for prediction in pred_tree:
    if(prediction == 1):
        print ("Song " + str(likedSongs+1) + ": " + df_dw["name"][i] + ", By: "+ df_dw["artist"][i])
        #sp.user_playlist_add_tracks(username, '2RARDnZLQGVPo0sXScDA8g', [df_dw['id'][i]]) # add to playlist
        likedSongs= likedSongs + 1
    i = i +1

Song 1: Done - R3hab Remix, By: Nikki Vianna
Song 2: Walk Away - 3LAU Deep Mix, By: 3LAU
Song 3: Gone (feat. Marvin Brooks) - Flyboy Remix, By: Maan On The Moon
Song 4: Sometimes, By: DallasK
Song 5: Drew Barrymore, By: LU2VYK
Song 6: Grip - Jay Pryor Remix, By: Seeb
Song 7: All That She Wants, By: Jordan Jay
Song 8: Bad Habits, By: dzill
Song 9: What About Us, By: WizG
Song 10: IDWK, By: DVBBS
Song 11: Congratulations, By: Carda
Song 12: Always on My Mind, By: Nick Martin
Song 13: Wish You Well (feat. Trove) - Club Mix, By: Famba
Song 14: Into My Bed, By: Harpoon
Song 15: All U Need, By: Dizaro
Song 16: Stay Here, By: Zaxx
Song 17: Wild Like The Wind, By: Deorro
Song 18: Treat Me Like A Lady (feat. Jeanne Naylor), By: Francis Mercier
Song 19: Getting Closer - Watson Remix, By: NEW CITY


# Other algorithms

In [24]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
qda_pred = qda.predict(x_test)
score = accuracy_score(y_test, qda_pred)*100
print("Accuracy using qda: ", round(score, 1), "%")

Accuracy using qda:  67.8 %


In [25]:
from sklearn.svm import SVC
svc_lin = SVC(kernel="linear", C=0.025)
svc_lin.fit(x_train, y_train)
svc_pred = svc_lin.predict(x_test)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, svc_pred) * 100
print("Accuracy using svc linear: ", round(score, 1), "%")

Accuracy using svc linear:  51.3 %


In [26]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
gpc = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
gpc.fit(x_train, y_train)
gpc_pred = gpc.predict(x_test)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, gpc_pred) * 100
print("Accuracy using gpc: ", round(score, 1), "%")

Accuracy using gpc:  55.3 %
