In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   name              9769 non-null   int64  
 2   album             9769 non-null   int64  
 3   artist            9769 non-null   int64  
 4   release_date      9769 non-null   int64  
 5   length            9769 non-null   int64  
 6   popularity        9769 non-null   int64  
 7   danceability      9769 non-null   float64
 8   acousticness      9769 non-null   float64
 9   energy            9769 non-null   float64
 10  instrumentalness  9769 non-null   float64
 11  liveness          9769 non-null   float64
 12  loudness          9769 non-null   float64
 13  speechiness       9769 non-null   float64
 14  tempo             9769 non-null   float64
 15  time_signature    9769 non-null   int64  
 16  favorite          9769 non-null   int64  


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   name              50 non-null     int64  
 2   album             50 non-null     int64  
 3   artist            50 non-null     int64  
 4   release_date      50 non-null     int64  
 5   length            50 non-null     int64  
 6   popularity        50 non-null     int64  
 7   danceability      50 non-null     float64
 8   acousticness      50 non-null     float64
 9   energy            50 non-null     float64
 10  instrumentalness  50 non-null     float64
 11  liveness          50 non-null     float64
 12  loudness          50 non-null     float64
 13  speechiness       50 non-null     float64
 14  tempo             50 non-null     float64
 15  time_signature    50 non-null     int64  
 16  favorite          50 non-null     int64  
dtyp

### Preparing data for model

In [5]:
df = pd.concat([df, df_fav], axis=0)
df.shape

(9819, 17)

In [6]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [7]:
df['favorite'].value_counts()

0    9769
1      50
Name: favorite, dtype: int64

### Model Selection & Hyperparameter Tuning

In [8]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [10]:
train_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
3071,6nlA3j4n3KAY8AgEGmos2o,4078,3383,3699,1881,230738,51,0.606,0.142,0.781,0.00803,0.0734,-5.846,0.0703,170.007,4,0
4973,5BEfxZNZZa4wN6mlz4LH4U,3071,6960,3974,2211,241838,42,0.383,0.626,0.478,0.0,0.155,-6.687,0.0428,98.153,4,0
1916,4bPQs0PHn4xbipzdPfn6du,3382,132,2855,1009,186634,75,0.569,0.0807,0.83,0.0,0.114,-4.106,0.14,170.094,4,0
4902,2rkVoKVEMuct8SmEIGKzBw,764,861,482,447,299800,58,0.67,0.325,0.468,0.0,0.0516,-6.725,0.0328,140.069,4,0
3380,1vbtH9ZIuGzTMtcl45QVg9,6362,5525,4155,2454,322813,41,0.261,0.00012,0.86,0.0154,0.15,-5.006,0.0585,159.924,4,0


In [11]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
3071,4078,3383,3699,1881,230738,51,0.606,0.142,0.781,0.00803,0.0734,-5.846,0.0703,170.007,4
4973,3071,6960,3974,2211,241838,42,0.383,0.626,0.478,0.0,0.155,-6.687,0.0428,98.153,4
1916,3382,132,2855,1009,186634,75,0.569,0.0807,0.83,0.0,0.114,-4.106,0.14,170.094,4
4902,764,861,482,447,299800,58,0.67,0.325,0.468,0.0,0.0516,-6.725,0.0328,140.069,4
3380,6362,5525,4155,2454,322813,41,0.261,0.00012,0.86,0.0154,0.15,-5.006,0.0585,159.924,4


In [12]:
# Checking for imbalance
y.value_counts()

0    7813
1      42
Name: favorite, dtype: int64

In [13]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y) 

In [14]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,4078,3383,3699,1881,230738,51,0.606,0.142,0.781,0.00803,0.0734,-5.846,0.0703,170.007,4
1,3071,6960,3974,2211,241838,42,0.383,0.626,0.478,0.0,0.155,-6.687,0.0428,98.153,4
2,3382,132,2855,1009,186634,75,0.569,0.0807,0.83,0.0,0.114,-4.106,0.14,170.094,4
3,764,861,482,447,299800,58,0.67,0.325,0.468,0.0,0.0516,-6.725,0.0328,140.069,4
4,6362,5525,4155,2454,322813,41,0.261,0.00012,0.86,0.0154,0.15,-5.006,0.0585,159.924,4


In [15]:
# Checking if imbalance is gone
y_train.value_counts()

0    7813
1    7813
Name: favorite, dtype: int64

In [16]:
test_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
1801,3XiQt1q17AatYNvhyVK64S,1694,1503,3632,2524,214889,42,0.427,0.00749,0.738,0.00012,0.0755,-3.241,0.0433,88.769,4,0
304,5VlTQnZO89Ioku8ssdbqJk,1689,1498,556,2399,247079,83,0.505,0.279,0.267,0.0,0.096,-13.763,0.05,143.929,4,0
4914,6OLlACmD4AlW2N7CWVhVxO,6050,5210,2840,2087,212973,49,0.502,0.591,0.409,0.0,0.113,-7.581,0.0512,75.231,4,0
1052,6ZmLu4NAhMsuAXLkcLMBrb,3169,255,21,528,287546,60,0.763,0.0346,0.84,0.0,0.0601,-3.704,0.0929,92.477,4,0
3387,2tmWeWTmfTX2TpLnNtnUlR,2367,2044,1501,2454,225250,38,0.16,0.000832,0.932,0.00477,0.15,-3.419,0.0833,80.607,4,0


In [17]:
X_test = test_set.drop(columns=['favorite', 'track_id'])
y_test = test_set['favorite']

In [18]:
X_test.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
1801,1694,1503,3632,2524,214889,42,0.427,0.00749,0.738,0.00012,0.0755,-3.241,0.0433,88.769,4
304,1689,1498,556,2399,247079,83,0.505,0.279,0.267,0.0,0.096,-13.763,0.05,143.929,4
4914,6050,5210,2840,2087,212973,49,0.502,0.591,0.409,0.0,0.113,-7.581,0.0512,75.231,4
1052,3169,255,21,528,287546,60,0.763,0.0346,0.84,0.0,0.0601,-3.704,0.0929,92.477,4
3387,2367,2044,1501,2454,225250,38,0.16,0.000832,0.932,0.00477,0.15,-3.419,0.0833,80.607,4


In [19]:
# Checking for imbalance in test set
y_test.value_counts()

0    1956
1       8
Name: favorite, dtype: int64

#### Testing models

In [20]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.8121918207896656
CPU times: user 3.68 s, sys: 661 ms, total: 4.34 s
Wall time: 1.39 s


In [21]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [22]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 2.25 s, sys: 25.6 ms, total: 2.28 s
Wall time: 2.31 s


0.99533530029983

In [23]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 25.1 s, sys: 334 ms, total: 25.5 s
Wall time: 25.8 s


{'max_depth': 20, 'n_estimators': 30}

In [24]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 10, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 2.55 s, sys: 35.7 ms, total: 2.58 s
Wall time: 2.62 s


0.9984673390801653

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [25]:
# Checking for imabalance
y_test.value_counts()

0    1956
1       8
Name: favorite, dtype: int64

In [26]:
# Building a pipeline to use on regular data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
# pipe.set_params(logisticrergession__n_estimators=30)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
pipe.score(X_test, y_test)

0.9959266802443992

In [27]:
df.shape

(9819, 17)

In [28]:
len(pipe.predict(df.drop(['favorite','track_id'], axis=1)))

9819

## Predicting songs and saving to dataset

In [29]:
df = pd.read_csv('data/encoded_playlist_songs.csv')

In [30]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [31]:
prediction = pipe.predict(df.drop(['favorite','track_id'], axis=1))
df['prediction'] = prediction

In [32]:
df['prediction'].head()

0    0
1    0
2    0
3    0
4    0
Name: prediction, dtype: int64

In [33]:
df['prediction'].value_counts()

0    9758
1      11
Name: prediction, dtype: int64

## Building the playlist from recommended songs

In [34]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [35]:
# Insert your Spotify username and the credentials that you obtained from spotify developer
cid = 'XXXXX'
secret = 'XXXXX'
redirect_uri='http://localhost:7777/callback'
username = 'XXXXXX'

In [36]:
# Once the Authorisation is complete, we just need to `sp` to call the APIs
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

In [37]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [38]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [39]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [40]:
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,3HvrwHSmvuJnKZm1mhxKSK,Your New Jams,0
1,3zdUVBgr30KKu1lFPimbRQ,Your New Jams,50
2,4OqhYtuar9Zcjj4djxIHMo,GOSPEL,34
3,3ALg99PJwQoQTZUgVOxZCr,LUCKI,38
4,4STXJNA3A9Hkz7soJiPz4s,POSITIVITY,77


In [41]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [42]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [44]:
list_track = df.loc[df['prediction']  == 1]['track_id']
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,3HvrwHSmvuJnKZm1mhxKSK,Your New Jams,11
1,3zdUVBgr30KKu1lFPimbRQ,Your New Jams,50
2,4OqhYtuar9Zcjj4djxIHMo,GOSPEL,34
3,3ALg99PJwQoQTZUgVOxZCr,LUCKI,38
4,4STXJNA3A9Hkz7soJiPz4s,POSITIVITY,77
