In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   name              9769 non-null   int64  
 2   album             9769 non-null   int64  
 3   artist            9769 non-null   int64  
 4   release_date      9769 non-null   int64  
 5   length            9769 non-null   int64  
 6   popularity        9769 non-null   int64  
 7   danceability      9769 non-null   float64
 8   acousticness      9769 non-null   float64
 9   energy            9769 non-null   float64
 10  instrumentalness  9769 non-null   float64
 11  liveness          9769 non-null   float64
 12  loudness          9769 non-null   float64
 13  speechiness       9769 non-null   float64
 14  tempo             9769 non-null   float64
 15  time_signature    9769 non-null   int64  
 16  favorite          9769 non-null   int64  


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   name              50 non-null     int64  
 2   album             50 non-null     int64  
 3   artist            50 non-null     int64  
 4   release_date      50 non-null     int64  
 5   length            50 non-null     int64  
 6   popularity        50 non-null     int64  
 7   danceability      50 non-null     float64
 8   acousticness      50 non-null     float64
 9   energy            50 non-null     float64
 10  instrumentalness  50 non-null     float64
 11  liveness          50 non-null     float64
 12  loudness          50 non-null     float64
 13  speechiness       50 non-null     float64
 14  tempo             50 non-null     float64
 15  time_signature    50 non-null     int64  
 16  favorite          50 non-null     int64  
dtyp

### Preparing data for model

In [5]:
df = pd.concat([df, df_fav], axis=0)
df.drop(columns=['track_id'])
df.shape

(9819, 17)

### Model Selection & Hyperparameter Tuning

In [6]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [8]:
train_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
4130,6nGAVc9i8gmC0IBUVf2DoG,824,3157,1863,1139,150653,0,0.626,0.807,0.584,0.000422,0.125,-10.317,0.0887,183.243,4,0
811,0wJoRiX5K5BxlqZTolB2LD,5623,396,1805,58,170813,72,0.533,0.00876,0.905,0.578,0.0698,-5.27,0.0754,108.9,4,0
9733,0cXWMzsFVYkb8rTti0IvO1,7791,223,1857,229,210826,38,0.448,0.802,0.0992,0.0138,0.166,-23.85,0.0361,67.025,4,0
3722,3GpSelbkykBVq1HqFQh9gN,2925,2556,315,2270,224027,5,0.533,0.00191,0.833,0.0,0.0764,-6.556,0.0356,99.981,4,0
9444,3mgO7YAlNvmGoZNEaEjfaA,4388,237,1788,1075,252186,0,0.137,0.335,0.0616,0.966,0.114,-21.753,0.038,68.854,3,0


In [13]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
4130,824,3157,1863,1139,150653,0,0.626,0.807,0.584,0.000422,0.125,-10.317,0.0887,183.243,4
811,5623,396,1805,58,170813,72,0.533,0.00876,0.905,0.578,0.0698,-5.27,0.0754,108.9,4
9733,7791,223,1857,229,210826,38,0.448,0.802,0.0992,0.0138,0.166,-23.85,0.0361,67.025,4
3722,2925,2556,315,2270,224027,5,0.533,0.00191,0.833,0.0,0.0764,-6.556,0.0356,99.981,4
9444,4388,237,1788,1075,252186,0,0.137,0.335,0.0616,0.966,0.114,-21.753,0.038,68.854,3


In [14]:
# Checking for imbalance
y.value_counts()

0    7815
1      40
Name: favorite, dtype: int64

In [17]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y)
# X_test, y_test = 

In [18]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,824,3157,1863,1139,150653,0,0.626,0.807,0.584,0.000422,0.125,-10.317,0.0887,183.243,4
1,5623,396,1805,58,170813,72,0.533,0.00876,0.905,0.578,0.0698,-5.27,0.0754,108.9,4
2,7791,223,1857,229,210826,38,0.448,0.802,0.0992,0.0138,0.166,-23.85,0.0361,67.025,4
3,2925,2556,315,2270,224027,5,0.533,0.00191,0.833,0.0,0.0764,-6.556,0.0356,99.981,4
4,4388,237,1788,1075,252186,0,0.137,0.335,0.0616,0.966,0.114,-21.753,0.038,68.854,3


In [21]:
# Checking if imbalance is gone
y_train.value_counts()

0    7815
1    7815
Name: favorite, dtype: int64

In [13]:
y_train.value_counts()

1    7842
0    7788
Name: favorite, dtype: int64

In [14]:
y_test.value_counts()

0    1981
1    1927
Name: favorite, dtype: int64

In [13]:
# Checking for imbalance again
y_train.value_counts()

1    7816
0    7814
Name: favorite, dtype: int64

In [14]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
10229,3406,1871,3470,2297,135640,38,0.852028,0.174427,0.489884,0.000371,0.294738,-9.364194,0.276863,129.3719,4
17353,6128,1686,2167,1892,135084,34,0.814834,0.389966,0.40952,0.009649,0.239826,-10.613884,0.315841,146.424723,4
14872,4562,1993,618,1974,174273,64,0.633099,0.140757,0.502026,2e-06,0.263168,-7.8388,0.403331,159.531825,4
11098,1160,729,2107,1938,95328,38,0.604338,0.253974,0.555528,0.021466,0.23582,-9.374091,0.304657,119.663414,4
4826,6281,5447,2437,2413,198299,61,0.789,0.513,0.653,0.000415,0.0992,-8.506,0.094,101.985,4


#### Testing models

In [15]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.8289648231501412
CPU times: user 3.87 s, sys: 1.32 s, total: 5.19 s
Wall time: 1.66 s


In [16]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [17]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=5)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 865 ms, sys: 10.9 ms, total: 876 ms
Wall time: 883 ms


0.9650589699146167

In [18]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 25.5 s, sys: 211 ms, total: 25.7 s
Wall time: 25.8 s


{'max_depth': 20, 'n_estimators': 30}

In [19]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 7.58 s, sys: 47.5 ms, total: 7.63 s
Wall time: 7.65 s


0.9987871626850143

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [20]:
# Checking for imabalance
y_test.value_counts()

0    1955
1    1953
Name: favorite, dtype: int64

In [21]:
rf_test_scores = cross_val_score(rf, X_test, y_test, cv=10, scoring="f1")
np.mean(rf_test_scores)

0.9961714954306162

In [22]:
# Building a pipeline to use on regular data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
# pipe.set_params(logisticrergession__n_estimators=30)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
pipe.score(X_test, y_test)

0.9987205731832139

In [32]:
X.shape

(19538, 16)

In [30]:
len(pipe.predict(X.drop(['favorite'], axis=1)))

19538

## Predicting songs and saving to dataset

In [33]:
prediction = pipe.predict(X.drop(['favorite'], axis=1))
df['prediction'] = prediction

ValueError: Length of values (19538) does not match length of index (9819)

## Building the playlist from recommended songs

In [None]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [None]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [None]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [None]:
fetch_playlists(sp,username).head()

In [None]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [None]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [None]:
list_track = df2.index
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()