In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   length            9769 non-null   int64  
 2   popularity        9769 non-null   int64  
 3   danceability      9769 non-null   float64
 4   acousticness      9769 non-null   float64
 5   energy            9769 non-null   float64
 6   instrumentalness  9769 non-null   float64
 7   liveness          9769 non-null   float64
 8   loudness          9769 non-null   float64
 9   speechiness       9769 non-null   float64
 10  tempo             9769 non-null   float64
 11  time_signature    9769 non-null   int64  
 12  favorite          9769 non-null   int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 992.3+ KB


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   length            50 non-null     int64  
 2   popularity        50 non-null     int64  
 3   danceability      50 non-null     float64
 4   acousticness      50 non-null     float64
 5   energy            50 non-null     float64
 6   instrumentalness  50 non-null     float64
 7   liveness          50 non-null     float64
 8   loudness          50 non-null     float64
 9   speechiness       50 non-null     float64
 10  tempo             50 non-null     float64
 11  time_signature    50 non-null     int64  
 12  favorite          50 non-null     int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 5.2+ KB


### Preparing data for model

In [5]:
# Remove song name, artist, album
df = pd.concat([df, df_fav], axis=0)
df.shape

(9819, 13)

In [6]:
df.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [7]:
df['favorite'].value_counts()

0    9769
1      50
Name: favorite, dtype: int64

### Data Prep Contd, Model Selection & Hyperparameter Tuning

In [8]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [10]:
train_set.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
2755,2Y4lgpOT0sPU1TiTmsbpZh,141608,60,0.409,0.00384,0.816,3e-06,0.0932,-4.59,0.0743,164.841,4,0
15,60ynsPSSKe6O3sfwRnIBRf,226986,93,0.749,0.208,0.463,0.0371,0.337,-8.433,0.0828,90.028,4,0
1146,0XqrIEnDoHvodUzGRmyKYQ,188962,73,0.898,0.0699,0.529,0.0,0.536,-5.428,0.2,110.032,4,0
2156,1vXt2d3QwSTYKhgw1pZwMZ,225717,41,0.511,0.00199,0.639,0.32,0.162,-6.493,0.0253,164.044,4,0
1789,28gqksZy2cRqteQDwLOaTj,198874,32,0.535,5.6e-05,0.863,0.000378,0.0672,-8.405,0.0463,124.055,4,0


In [11]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
2755,141608,60,0.409,0.00384,0.816,3e-06,0.0932,-4.59,0.0743,164.841,4
15,226986,93,0.749,0.208,0.463,0.0371,0.337,-8.433,0.0828,90.028,4
1146,188962,73,0.898,0.0699,0.529,0.0,0.536,-5.428,0.2,110.032,4
2156,225717,41,0.511,0.00199,0.639,0.32,0.162,-6.493,0.0253,164.044,4
1789,198874,32,0.535,5.6e-05,0.863,0.000378,0.0672,-8.405,0.0463,124.055,4


In [12]:
# Checking for imbalance
y.value_counts()

0    7819
1      36
Name: favorite, dtype: int64

In [13]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y) 

In [14]:
X_train.head()

Unnamed: 0,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,141608,60,0.409,0.00384,0.816,3e-06,0.0932,-4.59,0.0743,164.841,4
1,226986,93,0.749,0.208,0.463,0.0371,0.337,-8.433,0.0828,90.028,4
2,188962,73,0.898,0.0699,0.529,0.0,0.536,-5.428,0.2,110.032,4
3,225717,41,0.511,0.00199,0.639,0.32,0.162,-6.493,0.0253,164.044,4
4,198874,32,0.535,5.6e-05,0.863,0.000378,0.0672,-8.405,0.0463,124.055,4


In [15]:
# Checking if imbalance is gone
y_train.value_counts()

0    7819
1    7819
Name: favorite, dtype: int64

In [16]:
test_set.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
1455,6aKDdFwPJLgHT4aNktrQmb,170266,75,0.891,0.00225,0.552,2e-06,0.162,-6.877,0.223,135.976,4,0
313,2UJsKjM595pEyWUcd8JEIR,270710,65,0.695,0.0867,0.689,0.0159,0.136,-8.176,0.0492,95.013,4,0
6024,1XP0VR8KMArstV37bfzkt8,339310,48,0.331,0.0463,0.668,0.0,0.227,-3.167,0.0415,145.133,4,0
4409,3HAaEJKZPKHJY4YqYRvDas,174558,0,0.524,0.876,0.602,0.0034,0.133,-10.544,0.0327,97.484,4,0
9113,4Cr7xmAJyVbX6qgS4Pxl66,108253,26,0.492,0.994,0.00591,0.931,0.156,-31.205,0.0575,130.719,5,0


In [17]:
X_test = test_set.drop(columns=['favorite', 'track_id'])
y_test = test_set['favorite']

In [18]:
X_test.head()

Unnamed: 0,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
1455,170266,75,0.891,0.00225,0.552,2e-06,0.162,-6.877,0.223,135.976,4
313,270710,65,0.695,0.0867,0.689,0.0159,0.136,-8.176,0.0492,95.013,4
6024,339310,48,0.331,0.0463,0.668,0.0,0.227,-3.167,0.0415,145.133,4
4409,174558,0,0.524,0.876,0.602,0.0034,0.133,-10.544,0.0327,97.484,4
9113,108253,26,0.492,0.994,0.00591,0.931,0.156,-31.205,0.0575,130.719,5


In [19]:
# Checking for imbalance in test set
y_test.value_counts()

0    1950
1      14
Name: favorite, dtype: int64

#### Testing models

In [20]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.8221827050829876
CPU times: user 2.51 s, sys: 411 ms, total: 2.92 s
Wall time: 820 ms


In [21]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [22]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 1.56 s, sys: 11.5 ms, total: 1.57 s
Wall time: 1.58 s


0.9949613401906703

In [23]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 23.8 s, sys: 258 ms, total: 24 s
Wall time: 24.3 s


{'max_depth': 20, 'n_estimators': 10}

In [24]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 10, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 2.41 s, sys: 22 ms, total: 2.44 s
Wall time: 2.45 s


0.9972561335198975

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [25]:
# Checking for imabalance
y_test.value_counts()

0    1950
1      14
Name: favorite, dtype: int64

In [27]:
# Building a pipeline to use on regular data
# predict__proba
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])

pipe.score(X_test, y_test)

0.9913441955193483

## Predicting songs and saving to dataset

In [29]:
df = pd.read_csv('data/encoded_playlist_songs.csv')

In [30]:
df.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [31]:
prob_preds = pipe.predict_proba(df.drop(['favorite','track_id'], axis=1))
threshold = 0.30 # define threshold here
preds = [1 if prob_preds[i][1]> threshold else 0 for i in range(len(prob_preds))]
df['prediction'] = preds

In [33]:
df['prediction'].value_counts()

0    9734
1      35
Name: prediction, dtype: int64

## Building the playlist from recommended songs

In [34]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [35]:
# Insert your Spotify username and the credentials that you obtained from spotify developer
cid = 'XXXXXX'
secret = 'XXXXX'
redirect_uri='http://localhost:7777/callback'
username = 'XXXXXX'

In [36]:
# Once the Authorisation is complete, we just need to `sp` to call the APIs
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

In [37]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [38]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [39]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [40]:
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,2IILGkkdPYQhCm5FhleTH4,Your New Jams,0
1,7vBNuND6ANSxetwQP56lbB,Your New Jams,19
2,3zdUVBgr30KKu1lFPimbRQ,Python Playlist #1,50
3,4OqhYtuar9Zcjj4djxIHMo,GOSPEL,34
4,3ALg99PJwQoQTZUgVOxZCr,LUCKI,38


In [41]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [42]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [43]:
list_track = df.loc[df['prediction']  == 1]['track_id']
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,2IILGkkdPYQhCm5FhleTH4,Your New Jams,35
1,7vBNuND6ANSxetwQP56lbB,Your New Jams,19
2,3zdUVBgr30KKu1lFPimbRQ,Python Playlist #1,50
3,4OqhYtuar9Zcjj4djxIHMo,GOSPEL,34
4,3ALg99PJwQoQTZUgVOxZCr,LUCKI,38
