In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   name              9769 non-null   int64  
 2   album             9769 non-null   int64  
 3   artist            9769 non-null   int64  
 4   release_date      9769 non-null   int64  
 5   length            9769 non-null   int64  
 6   popularity        9769 non-null   int64  
 7   danceability      9769 non-null   float64
 8   acousticness      9769 non-null   float64
 9   energy            9769 non-null   float64
 10  instrumentalness  9769 non-null   float64
 11  liveness          9769 non-null   float64
 12  loudness          9769 non-null   float64
 13  speechiness       9769 non-null   float64
 14  tempo             9769 non-null   float64
 15  time_signature    9769 non-null   int64  
 16  favorite          9769 non-null   int64  


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   name              50 non-null     int64  
 2   album             50 non-null     int64  
 3   artist            50 non-null     int64  
 4   release_date      50 non-null     int64  
 5   length            50 non-null     int64  
 6   popularity        50 non-null     int64  
 7   danceability      50 non-null     float64
 8   acousticness      50 non-null     float64
 9   energy            50 non-null     float64
 10  instrumentalness  50 non-null     float64
 11  liveness          50 non-null     float64
 12  loudness          50 non-null     float64
 13  speechiness       50 non-null     float64
 14  tempo             50 non-null     float64
 15  time_signature    50 non-null     int64  
 16  favorite          50 non-null     int64  
dtyp

### Preparing data for model

In [5]:
df = pd.concat([df, df_fav], axis=0)
df.shape

(9819, 17)

In [6]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [7]:
df['favorite'].value_counts()

0    9769
1      50
Name: favorite, dtype: int64

### Model Selection & Hyperparameter Tuning

In [8]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [10]:
train_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
7230,39SAR2aRZiNOFS5e9T76g8,2947,4723,2706,1728,225000,37,0.527,0.0153,0.885,0.000686,0.405,-7.017,0.087,128.057,4,0
5941,6KpGyJIyA75C0XyAkIcpZn,4678,3895,2270,2302,193132,60,0.645,0.0773,0.96,0.00641,0.594,-6.497,0.0399,127.999,4,0
4291,3cWPcCrbkfnqvpexzbx0ZR,6312,5467,839,2315,176813,33,0.654,0.0487,0.852,0.0,0.367,-4.629,0.0325,105.999,4,0
1641,3r6Y0k1n0fn8FXwq5Hbeac,2920,2551,3084,2424,140559,56,0.7,0.328,0.764,0.0,0.376,-7.143,0.198,92.045,4,0
6224,0nKh47v1SFiznHpzvWrypr,3211,2848,2090,2530,218813,50,0.71,0.456,0.49,0.00218,0.142,-9.705,0.211,80.836,4,0


In [11]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
7230,2947,4723,2706,1728,225000,37,0.527,0.0153,0.885,0.000686,0.405,-7.017,0.087,128.057,4
5941,4678,3895,2270,2302,193132,60,0.645,0.0773,0.96,0.00641,0.594,-6.497,0.0399,127.999,4
4291,6312,5467,839,2315,176813,33,0.654,0.0487,0.852,0.0,0.367,-4.629,0.0325,105.999,4
1641,2920,2551,3084,2424,140559,56,0.7,0.328,0.764,0.0,0.376,-7.143,0.198,92.045,4
6224,3211,2848,2090,2530,218813,50,0.71,0.456,0.49,0.00218,0.142,-9.705,0.211,80.836,4


In [12]:
# Checking for imbalance
y.value_counts()

0    7818
1      37
Name: favorite, dtype: int64

In [13]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y) 

In [14]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,2947,4723,2706,1728,225000,37,0.527,0.0153,0.885,0.000686,0.405,-7.017,0.087,128.057,4
1,4678,3895,2270,2302,193132,60,0.645,0.0773,0.96,0.00641,0.594,-6.497,0.0399,127.999,4
2,6312,5467,839,2315,176813,33,0.654,0.0487,0.852,0.0,0.367,-4.629,0.0325,105.999,4
3,2920,2551,3084,2424,140559,56,0.7,0.328,0.764,0.0,0.376,-7.143,0.198,92.045,4
4,3211,2848,2090,2530,218813,50,0.71,0.456,0.49,0.00218,0.142,-9.705,0.211,80.836,4


In [15]:
# Checking if imbalance is gone
y_train.value_counts()

0    7818
1    7818
Name: favorite, dtype: int64

In [16]:
test_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
4359,2SGjJdLuSIvyVB5eXAJXm7,8100,1116,2588,1538,262760,51,0.721,0.013,0.944,0.0,0.32,-5.982,0.159,85.002,4,0
8627,5qaQXGQlrANAeeZM8Kw8En,3902,2337,909,542,1343000,10,0.289,0.991,0.0622,0.921,0.0628,-24.899,0.0333,70.876,4,0
1559,5dfMZKLa5AmXmsMmkQgu0t,3758,3224,183,2418,118627,59,0.811,0.0475,0.526,0.0,0.101,-6.365,0.26,150.084,4,0
2126,78gke8uqCWgeqeQm6I9CGi,3743,3215,2338,2430,159393,64,0.461,0.0246,0.825,0.0,0.224,-3.583,0.115,198.045,4,0
8441,5W7IiH2jDV6q6EQwBkq8ld,3954,5918,3345,1056,295973,45,0.379,0.986,0.251,0.0122,0.094,-15.488,0.0991,73.316,4,0


In [17]:
X_test = test_set.drop(columns=['favorite', 'track_id'])
y_test = test_set['favorite']

In [18]:
X_test.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
4359,8100,1116,2588,1538,262760,51,0.721,0.013,0.944,0.0,0.32,-5.982,0.159,85.002,4
8627,3902,2337,909,542,1343000,10,0.289,0.991,0.0622,0.921,0.0628,-24.899,0.0333,70.876,4
1559,3758,3224,183,2418,118627,59,0.811,0.0475,0.526,0.0,0.101,-6.365,0.26,150.084,4
2126,3743,3215,2338,2430,159393,64,0.461,0.0246,0.825,0.0,0.224,-3.583,0.115,198.045,4
8441,3954,5918,3345,1056,295973,45,0.379,0.986,0.251,0.0122,0.094,-15.488,0.0991,73.316,4


In [19]:
# Checking for imbalance in test set
y_test.value_counts()

0    1951
1      13
Name: favorite, dtype: int64

#### Testing models

In [20]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.787081904306151
CPU times: user 4.63 s, sys: 740 ms, total: 5.37 s
Wall time: 1.48 s


In [21]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [22]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 2.13 s, sys: 20.7 ms, total: 2.15 s
Wall time: 2.17 s


0.9964828202843522

In [23]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 24.9 s, sys: 312 ms, total: 25.2 s
Wall time: 25.5 s


{'max_depth': 20, 'n_estimators': 30}

In [24]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 10, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 2.38 s, sys: 26 ms, total: 2.41 s
Wall time: 2.42 s


0.9991694922861608

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [25]:
# Checking for imabalance
y_test.value_counts()

0    1951
1      13
Name: favorite, dtype: int64

In [26]:
# Building a pipeline to use on regular data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
# pipe.set_params(logisticrergession__n_estimators=30)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
pipe.score(X_test, y_test)

0.9938900203665988

In [27]:
df.shape

(9819, 17)

In [28]:
len(pipe.predict(df.drop(['favorite','track_id'], axis=1)))

9819

## Predicting songs and saving to dataset

In [29]:
df = pd.read_csv('data/encoded_playlist_songs.csv')

In [30]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [31]:
prediction = pipe.predict(df.drop(['favorite','track_id'], axis=1))
df['prediction'] = prediction

In [32]:
df['prediction'].head()

0    0
1    0
2    0
3    0
4    0
Name: prediction, dtype: int64

In [33]:
df['prediction'].value_counts()

0    9761
1       8
Name: prediction, dtype: int64

## Building the playlist from recommended songs

In [None]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [None]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [None]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [None]:
fetch_playlists(sp,username).head()

In [None]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [None]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [None]:
list_track = df2.index
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()