In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Importing playlist dataframes
df = pd.read_csv('data/encoded_playlist_songs.csv')
df_fav = pd.read_csv('data/favorite_songs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9769 non-null   object 
 1   name              9769 non-null   int64  
 2   album             9769 non-null   int64  
 3   artist            9769 non-null   int64  
 4   release_date      9769 non-null   int64  
 5   length            9769 non-null   int64  
 6   popularity        9769 non-null   int64  
 7   danceability      9769 non-null   float64
 8   acousticness      9769 non-null   float64
 9   energy            9769 non-null   float64
 10  instrumentalness  9769 non-null   float64
 11  liveness          9769 non-null   float64
 12  loudness          9769 non-null   float64
 13  speechiness       9769 non-null   float64
 14  tempo             9769 non-null   float64
 15  time_signature    9769 non-null   int64  
 16  favorite          9769 non-null   int64  


In [4]:
df_fav.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50 non-null     object 
 1   name              50 non-null     int64  
 2   album             50 non-null     int64  
 3   artist            50 non-null     int64  
 4   release_date      50 non-null     int64  
 5   length            50 non-null     int64  
 6   popularity        50 non-null     int64  
 7   danceability      50 non-null     float64
 8   acousticness      50 non-null     float64
 9   energy            50 non-null     float64
 10  instrumentalness  50 non-null     float64
 11  liveness          50 non-null     float64
 12  loudness          50 non-null     float64
 13  speechiness       50 non-null     float64
 14  tempo             50 non-null     float64
 15  time_signature    50 non-null     int64  
 16  favorite          50 non-null     int64  
dtyp

### Preparing data for model

In [5]:
df = pd.concat([df, df_fav], axis=0)
df.shape

(9819, 17)

In [6]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [7]:
df['favorite'].value_counts()

0    9769
1      50
Name: favorite, dtype: int64

### Model Selection & Hyperparameter Tuning

In [8]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.8 * len(df))

# Split dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [10]:
train_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
5934,0id7TrBjYYzQ5v5mHWTUxC,2473,2115,2047,2377,205440,59,0.68,0.00485,0.761,0.00139,0.0644,-6.142,0.0288,124.981,4,0
1683,3Rp8FNCr3yxM3fGFy99LN8,6350,5512,4021,2360,251846,55,0.673,0.0258,0.74,0.0,0.127,-4.945,0.0847,90.026,4,0
7820,28bl4jzGKXm5BUaEvCm973,1417,6830,251,1140,220333,42,0.423,0.834,0.231,0.439,0.277,-16.658,0.0381,138.586,3,0
497,0M3WxzvPruZ2i5dvOOGAOC,3957,3309,1456,2466,189060,59,0.408,0.000142,0.949,5e-06,0.26,-3.187,0.168,155.124,4,0
7086,65gBp1aZvizgSWsXBSsHp0,7515,3045,887,174,462066,0,0.806,0.0245,0.937,0.834,0.101,-6.933,0.0466,121.93,4,0


In [11]:
X = train_set.drop(columns=['favorite', 'track_id'])
y = train_set.favorite

X.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
5934,2473,2115,2047,2377,205440,59,0.68,0.00485,0.761,0.00139,0.0644,-6.142,0.0288,124.981,4
1683,6350,5512,4021,2360,251846,55,0.673,0.0258,0.74,0.0,0.127,-4.945,0.0847,90.026,4
7820,1417,6830,251,1140,220333,42,0.423,0.834,0.231,0.439,0.277,-16.658,0.0381,138.586,3
497,3957,3309,1456,2466,189060,59,0.408,0.000142,0.949,5e-06,0.26,-3.187,0.168,155.124,4
7086,7515,3045,887,174,462066,0,0.806,0.0245,0.937,0.834,0.101,-6.933,0.0466,121.93,4


In [12]:
# Checking for imbalance
y.value_counts()

0    7815
1      40
Name: favorite, dtype: int64

In [13]:
# Train / Split Data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X, y) 

In [14]:
X_train.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,2473,2115,2047,2377,205440,59,0.68,0.00485,0.761,0.00139,0.0644,-6.142,0.0288,124.981,4
1,6350,5512,4021,2360,251846,55,0.673,0.0258,0.74,0.0,0.127,-4.945,0.0847,90.026,4
2,1417,6830,251,1140,220333,42,0.423,0.834,0.231,0.439,0.277,-16.658,0.0381,138.586,3
3,3957,3309,1456,2466,189060,59,0.408,0.000142,0.949,5e-06,0.26,-3.187,0.168,155.124,4
4,7515,3045,887,174,462066,0,0.806,0.0245,0.937,0.834,0.101,-6.933,0.0466,121.93,4


In [15]:
# Checking if imbalance is gone
y_train.value_counts()

0    7815
1    7815
Name: favorite, dtype: int64

In [16]:
test_set.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
1476,7lSXb6XzrrHCxuE1KQcfRK,812,646,4038,2524,123402,54,0.856,0.241,0.522,0.0,0.156,-7.382,0.337,143.078,4,0
1140,1XZRnzOrLuUhrahFwSao51,2732,2354,4041,2519,169842,64,0.899,0.00442,0.501,0.0,0.0548,-7.15,0.32,150.028,4,0
8515,63JvZ8pIRAKsHBD1TZqDNn,65,4550,118,680,251266,15,0.132,0.977,0.112,0.549,0.114,-17.674,0.0385,98.089,3,0
3887,13hCOEVO2ErbVS7pB2dbCc,4703,6619,1995,1909,203360,0,0.532,0.197,0.632,0.0,0.129,-6.432,0.0376,189.905,3,0
1805,3OEi89tc0xveSTHAhwrFKT,8489,7287,3168,2519,188141,58,0.656,0.0946,0.56,6.3e-05,0.363,-7.499,0.0465,91.038,4,0


In [17]:
X_test = test_set.drop(columns=['favorite', 'track_id'])
y_test = test_set['favorite']

In [18]:
X_test.head()

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
1476,812,646,4038,2524,123402,54,0.856,0.241,0.522,0.0,0.156,-7.382,0.337,143.078,4
1140,2732,2354,4041,2519,169842,64,0.899,0.00442,0.501,0.0,0.0548,-7.15,0.32,150.028,4
8515,65,4550,118,680,251266,15,0.132,0.977,0.112,0.549,0.114,-17.674,0.0385,98.089,3
3887,4703,6619,1995,1909,203360,0,0.532,0.197,0.632,0.0,0.129,-6.432,0.0376,189.905,3
1805,8489,7287,3168,2519,188141,58,0.656,0.0946,0.56,6.3e-05,0.363,-7.499,0.0465,91.038,4


In [19]:
# Checking for imbalance in test set
y_test.value_counts()

0    1954
1      10
Name: favorite, dtype: int64

#### Testing models

In [20]:
%%time
# Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=400)
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
print(np.mean(lr_scores))

0.8261404185691497
CPU times: user 4.36 s, sys: 634 ms, total: 5 s
Wall time: 1.33 s


In [21]:
# Hyperparameter optimization for Decision Tree Classifier
parameters = {
    'max_depth':[3, 4, 5, 6, 10, 15,20,30],
}
dtc = Pipeline([('CV',GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5))])
dtc.fit(X_train, y_train)
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [22]:
%%time
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 2.27 s, sys: 27.4 ms, total: 2.3 s
Wall time: 2.33 s


0.9939422910994944

In [23]:
%%time
# Hyperparameter optimization of RandomForestClassifier
parameters = {
    'max_depth':[3, 6,12,15,20],
    'n_estimators':[10, 20,30]
}
clf = Pipeline([('CV',GridSearchCV(RandomForestClassifier(), parameters, cv = 5))])
clf.fit(X_train, y_train)
clf.named_steps['CV'].best_params_

CPU times: user 24.7 s, sys: 270 ms, total: 25 s
Wall time: 25.2 s


{'max_depth': 20, 'n_estimators': 20}

In [24]:
%%time
# RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 10, max_depth = 20))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 2.56 s, sys: 29 ms, total: 2.59 s
Wall time: 2.61 s


0.9984664516911563

Since the RandomForestClassifier has the highest accuracy, I will use it to recommend songs.

### Using algorithm on test data

In [25]:
# Checking for imabalance
y_test.value_counts()

0    1954
1      10
Name: favorite, dtype: int64

In [26]:
# Building a pipeline to use on regular data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 30, max_depth = 20))
pipe.fit(X_train, y_train)  # apply scaling on training data
# pipe.set_params(logisticrergession__n_estimators=30)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators = 30, max_depth = 20))])
pipe.score(X_test, y_test)

0.9974541751527495

In [27]:
df.shape

(9819, 17)

In [28]:
len(pipe.predict(df.drop(['favorite','track_id'], axis=1)))

9819

## Predicting songs and saving to dataset

In [29]:
df = pd.read_csv('data/encoded_playlist_songs.csv')

In [30]:
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,7MAibcTli4IisCtbHKrGMh,4008,3340,581,2524,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4,0
1,5QO79kh1waicV47BqGRL3g,6038,212,3746,2293,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4,0
2,1diS6nkxMQc3wwC4G1j0bh,8118,2222,1091,2505,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4,0
3,4u4NyuceXP7Uzh7XFJKCr1,3096,2743,1917,2524,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4,0
4,3Ofmpyhv5UAQ70mENzB277,542,433,2439,2478,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4,0


In [31]:
prediction = pipe.predict(df.drop(['favorite','track_id'], axis=1))
df['prediction'] = prediction

In [32]:
df['prediction'].head()

0    0
1    0
2    0
3    0
4    0
Name: prediction, dtype: int64

In [33]:
df['prediction'].value_counts()

0    9765
1       4
Name: prediction, dtype: int64

## Building the playlist from recommended songs

In [None]:
def create_playlist(sp, username, playlist_name, playlist_description):
    playlists = sp.user_playlist_create(username, playlist_name, description = playlist_description)

In [None]:
create_playlist(sp, username, 'Your New Jams', 'This playlist was created using python!')

In [None]:
def fetch_playlists(sp, username):
    """
    Returns the user's playlists.
    """
        
    id = []
    name = []
    num_tracks = []
    
    # Make the API request
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        id.append(playlist['id'])
        name.append(playlist['name'])
        num_tracks.append(playlist['tracks']['total'])

    # Create the final df   
    df_playlists = pd.DataFrame({"id":id, "name": name, "#tracks": num_tracks})
    return df_playlists

In [None]:
fetch_playlists(sp,username).head()

In [None]:
playlist_id = fetch_playlists(sp,username)['id'][0]

In [None]:
def enrich_playlist(sp, username, playlist_id, playlist_tracks):
    index = 0
    results = []
    
    while index < len(playlist_tracks):
        results += sp.user_playlist_add_tracks(username, playlist_id, tracks = playlist_tracks[index:index + 50])
        index += 50

In [None]:
list_track = df2.index
enrich_playlist(sp, username, playlist_id, list_track)
fetch_playlists(sp,username).head()