In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   name              9769 non-null   int64  
 2   album             9769 non-null   int64  
 3   artist            9769 non-null   int64  
 4   release_date      9769 non-null   int64  
 5   length            9769 non-null   int64  
 6   popularity        9769 non-null   int64  
 7   danceability      9769 non-null   float64
 8   acousticness      9769 non-null   float64
 9   energy            9769 non-null   float64
 10  instrumentalness  9769 non-null   float64
 11  liveness          9769 non-null   float64
 12  loudness          9769 non-null   float64
 13  speechiness       9769 non-null   float64
 14  tempo             9769 non-null   float64
 15  time_signature    9769 non-null   int64  
 16  favorite          9769 non-null   int64  


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   name              50 non-null     int64  
 2   album             50 non-null     int64  
 3   artist            50 non-null     int64  
 4   release_date      50 non-null     int64  
 5   length            50 non-null     int64  
 6   popularity        50 non-null     int64  
 7   danceability      50 non-null     float64
 8   acousticness      50 non-null     float64
 9   energy            50 non-null     float64
 10  instrumentalness  50 non-null     float64
 11  liveness          50 non-null     float64
 12  loudness          50 non-null     float64
 13  speechiness       50 non-null     float64
 14  tempo             50 non-null     float64
 15  time_signature    50 non-null     int64  
 16  favorite          50 non-null     int64  
dtyp

### Preparing data for model

In [5]:
# Remove song name, artist, album
df = pd.concat([df, df_fav], axis=0)
df.shape

(9819, 17)

In [6]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [7]:
df['favorite'].value_counts()

0    9769
1      50
Name: favorite, dtype: int64

### Data Prep Contd, Model Selection & Hyperparameter Tuning

In [8]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [10]:
train_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
1448,7eX3um6NpOQKWJMGCi97XD,5705,4893,2195,2523,192804,78,0.779,0.00471,0.705,0.0,0.0971,-5.891,0.163,140.005,4,0
408,3zPfyVThoCzQ6IB5CSGDTz,8416,7241,1462,2495,213546,73,0.685,0.016,0.791,0.153,0.703,-6.628,0.0337,125.052,4,0
4485,7L3JMPBLzzVdEeYRnHLI4d,1914,1684,2044,2430,207626,56,0.689,0.000267,0.747,1e-06,0.0909,-5.861,0.0283,100.069,4,0
9649,3O34kfvxjR1hRGOdHbHCqu,1242,6238,791,1074,268840,0,0.568,0.369,0.428,0.136,0.0981,-10.305,0.0511,119.951,4,0
3290,4t0YSm5foVoTYHLpxoK9t2,7097,895,3507,388,281666,2,0.521,0.0063,0.722,0.741,0.0682,-13.8,0.031,106.784,4,0


In [11]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
1448,5705,4893,2195,2523,192804,78,0.779,0.00471,0.705,0.0,0.0971,-5.891,0.163,140.005,4
408,8416,7241,1462,2495,213546,73,0.685,0.016,0.791,0.153,0.703,-6.628,0.0337,125.052,4
4485,1914,1684,2044,2430,207626,56,0.689,0.000267,0.747,1e-06,0.0909,-5.861,0.0283,100.069,4
9649,1242,6238,791,1074,268840,0,0.568,0.369,0.428,0.136,0.0981,-10.305,0.0511,119.951,4
3290,7097,895,3507,388,281666,2,0.521,0.0063,0.722,0.741,0.0682,-13.8,0.031,106.784,4


In [12]:
# Checking for imbalance
y.value_counts()

0    7815
1      40
Name: favorite, dtype: int64

In [13]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y) 

In [14]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,5705,4893,2195,2523,192804,78,0.779,0.00471,0.705,0.0,0.0971,-5.891,0.163,140.005,4
1,8416,7241,1462,2495,213546,73,0.685,0.016,0.791,0.153,0.703,-6.628,0.0337,125.052,4
2,1914,1684,2044,2430,207626,56,0.689,0.000267,0.747,1e-06,0.0909,-5.861,0.0283,100.069,4
3,1242,6238,791,1074,268840,0,0.568,0.369,0.428,0.136,0.0981,-10.305,0.0511,119.951,4
4,7097,895,3507,388,281666,2,0.521,0.0063,0.722,0.741,0.0682,-13.8,0.031,106.784,4


In [15]:
# Checking if imbalance is gone
y_train.value_counts()

0    7815
1    7815
Name: favorite, dtype: int64

In [16]:
test_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
1707,2Sxo09VIgJDYNrytLGkkSJ,5273,25,3928,1390,254266,15,0.732,0.445,0.15,0.0,0.371,-17.737,0.92,70.341,4,0
7879,4ZD8KtkiNJI9Jh4joc7gFA,7821,3462,1335,1954,166320,22,0.316,0.991,0.0736,0.88,0.0824,-25.506,0.0414,77.974,4,0
171,7vrJn5hDSXRmdXoR30KgF1,596,483,1933,2424,178946,93,0.863,0.212,0.666,0.000493,0.103,-4.158,0.152,163.908,4,0
6460,02CkWazM7YqypIxYKy3iSy,3544,5118,3189,2410,225494,48,0.781,0.0185,0.632,4.8e-05,0.116,-7.774,0.101,114.018,4,0
9071,53lNcoa49wX4lmLBw5Sqay,1431,5295,2710,1201,450413,26,0.154,0.972,0.0118,0.847,0.0709,-31.067,0.0467,80.151,3,0


In [17]:
X_test = test_set.drop(columns=['favorite', 'track_id'])
y_test = test_set['favorite']

In [18]:
X_test.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
1707,5273,25,3928,1390,254266,15,0.732,0.445,0.15,0.0,0.371,-17.737,0.92,70.341,4
7879,7821,3462,1335,1954,166320,22,0.316,0.991,0.0736,0.88,0.0824,-25.506,0.0414,77.974,4
171,596,483,1933,2424,178946,93,0.863,0.212,0.666,0.000493,0.103,-4.158,0.152,163.908,4
6460,3544,5118,3189,2410,225494,48,0.781,0.0185,0.632,4.8e-05,0.116,-7.774,0.101,114.018,4
9071,1431,5295,2710,1201,450413,26,0.154,0.972,0.0118,0.847,0.0709,-31.067,0.0467,80.151,3


In [19]:
# Checking for imbalance in test set
y_test.value_counts()

0    1954
1      10
Name: favorite, dtype: int64

#### Testing models

In [20]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.804164180311262
CPU times: user 4.07 s, sys: 773 ms, total: 4.84 s
Wall time: 1.57 s


In [21]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [22]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 2.4 s, sys: 26.7 ms, total: 2.43 s
Wall time: 2.45 s


0.9931676657346721

In [23]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 25.7 s, sys: 339 ms, total: 26 s
Wall time: 26.3 s


{'max_depth': 20, 'n_estimators': 30}

In [24]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 10, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 2.59 s, sys: 49.2 ms, total: 2.64 s
Wall time: 2.7 s


0.9988501654805519

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [25]:
# Checking for imabalance
y_test.value_counts()

0    1954
1      10
Name: favorite, dtype: int64

In [44]:
# Building a pipeline to use on regular data
# predict__proba
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
pipe.predict__proba # apply threshold here
# pipe.score(X_test, y_test)

AttributeError: 'Pipeline' object has no attribute 'predict__proba'

In [27]:
df.shape

(9819, 17)

In [28]:
len(pipe.predict(df.drop(['favorite','track_id'], axis=1)))

9819

## Predicting songs and saving to dataset

In [29]:
df = pd.read_csv('data/encoded_playlist_songs.csv')

In [30]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [31]:
prediction = pipe.predict(df.drop(['favorite','track_id'], axis=1))
df['prediction'] = prediction

In [32]:
df['prediction'].head()

0    0
1    0
2    0
3    0
4    0
Name: prediction, dtype: int64

In [33]:
df['prediction'].value_counts()

0    9763
1       6
Name: prediction, dtype: int64

## Building the playlist from recommended songs

In [34]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [35]:
# Insert your Spotify username and the credentials that you obtained from spotify developer
cid = '58ecd7aadf294b9aa038a3080ef670cb'
secret = '4a277ac2c0a744eea5c839b1ecb27002'
redirect_uri='http://localhost:7777/callback'
username = 'yvngflash_'

In [36]:
# Once the Authorisation is complete, we just need to `sp` to call the APIs
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

In [37]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [38]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [39]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [40]:
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,45lcKuY3x2s1xp92NxNPHu,Your New Jams,0
1,4YrlQcV8l2fXIut0ghVlsR,Your New Jams,9
2,4SUx4dz3ZuCvpcdCCDnGXF,Your New Jams,11
3,5BvxGNFRftGX1qrDgnOfe5,Your New Jams,3
4,3HvrwHSmvuJnKZm1mhxKSK,Your New Jams,11


In [41]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [42]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [43]:
list_track = df.loc[df['prediction']  == 1]['track_id']
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()

Unnamed: 0,id,name,#tracks
0,45lcKuY3x2s1xp92NxNPHu,Your New Jams,6
1,4YrlQcV8l2fXIut0ghVlsR,Your New Jams,9
2,4SUx4dz3ZuCvpcdCCDnGXF,Your New Jams,11
3,5BvxGNFRftGX1qrDgnOfe5,Your New Jams,3
4,3HvrwHSmvuJnKZm1mhxKSK,Your New Jams,11
