In [53]:
import datetime
import json
import os
import sys
from time import time
import pickle 
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import random

import warnings
warnings.filterwarnings("ignore")

In [2]:
cd ..

/Users/jaredyanis/Desktop/playlist-generation


# Importing the Data

In [3]:
# Getting all of the playlist data
playlists = pd.DataFrame()

for i in ['0-999', '1000-1999', '2000-2999', '3000-3999']:
    data = json.load(open(f'Data/mpd.slice.{i}.json'))
    playlists_concat = pd.DataFrame(data["playlists"])
    
    playlists = pd.concat([playlists, playlists_concat], ignore_index = True)

In [4]:
# Getting all of the song information
from spotify_api_database import Track 

########################
########################
# NOTE: First, you need to go to: https://drive.google.com/open?id=14h1Hpdg1aLosORY6qRENhjTSqIt680SZ
# and download the file named 'spotify_api_database.pickle'
# Second, place this file in the "Data/" directory of this Github repo
########################
########################
INPUT_FILE = 'Data/spotify_api_database.pickle'

########################
########################
# Read in .pickle file
########################
########################

with open(INPUT_FILE, 'rb') as fd:
    start_time = time()
    print("Reading Spotify API Database .pickle file...")
    tracks = pickle.load(fd)
    print("Finished reading file (" + str(time() - start_time) +"s)...")

########################
########################
# The "tracks" variable is now an array of ~2M Track() objects
# The definition of the Track() class is in "spotify_api_database" -- note how we need to import this on line 7
########################
########################
# Print out first 100 tracks
for idx, t in enumerate(tracks):
    print(str(t) + ' | ' + t.get_audio_feats())
    # NOTE: Some tracks did not have audio feature data on Spotify (~600)
    if idx > 100:
        break


Reading Spotify API Database .pickle file...
Finished reading file (117.62706208229065s)...
You Lay A Whole Lot Of Love On Me, by Shania Twain | Danceability: 0.61 | Energy: 0.458
Time Has Come, by Hugh Mundell | Danceability: 0.795 | Energy: 0.302
What The Hell Did I Say, by Dierks Bentley | Danceability: 0.574 | Energy: 0.821
Higher Ground, by Wrekonize | Danceability: 0.467 | Energy: 0.842
Canta (En Vivo) [feat. Jorge Luis Chacín], by Guaco, Jorge Luis Chacin | Danceability: 0.81 | Energy: 0.622
Sunlight, by Imaginary Future | Danceability: 0.675 | Energy: 0.604
Avant, by Sonia Wieder-Atherton | Danceability: 0.183 | Energy: 0.23
Dorothy, by The Lion and Atilla | Danceability: 0.591 | Energy: 0.381
Love Is... (feat. Jan, Sy Smith), by The Brand New Heavies, Jan, Sy Smith | Danceability: 0.777 | Energy: 0.605
Hail Mary (feat. The Outlawz), by 2Pac, Outlawz | Danceability: 0.481 | Energy: 0.844
Za unuka mog, by Fazlija | Danceability: 0.566 | Energy: 0.886
Ballistic, by Skorge | Dance

In [5]:
### LOAD TRACK DATA TO RECTANGULAR MATRIX

my_tracks = []
for i in range(len(tracks)):
    my_dict = dict()
    my_dict["id"] = tracks[i].id
    my_dict["name"] = tracks[i].name
    my_dict["artists"] = tracks[i].artists[0][1]
    my_dict["album"] = tracks[i].album

    try: 
        my_dict["danceability"] = tracks[i].danceability
    except: 
        my_dict["danceability"] = None

    try:
        my_dict["energy"] = tracks[i].energy
    except:
        my_dict["energy"] = None

    try:
        my_dict["key"] = tracks[i].key
    except:
        my_dict["key"] = None

    try:
        my_dict["loudness"] = tracks[i].loudness
    except:
        my_dict["loudness"] = None

    try:
        my_dict["mode"] = tracks[i].mode
    except:
        my_dict["mode"] = None

    try:
        my_dict["speechiness"] = tracks[i].speechiness
    except: 
        my_dict["speechiness"] = None

    try:
        my_dict["acousticness"] = tracks[i].acousticness
    except:
        my_dict["acousticness"] = None

    try:
        my_dict["instrumentalness"] = tracks[i].instrumentalness
    except: 
        my_dict["instrumentalness"] = None

    try:
        my_dict["liveness"] = tracks[i].liveness
    except:
        my_dict["liveness"] = None

    try:
        my_dict["valence"] = tracks[i].valence
    except:
        my_dict["valence"] = None

    try:
        my_dict["tempo"] = tracks[i].tempo
    except:
        my_dict["tempo"] = None

    try:
        my_dict["duration_ms"] = tracks[i].duration_ms
    except:
        my_dict["duration_ms"] = None

    try:
        my_dict["time_signature"] = tracks[i].time_signature
    except:
        my_dict["time_signature"] = None

    my_tracks.append(my_dict)

my_tracks = pd.DataFrame(my_tracks)
my_tracks.head()

Unnamed: 0,acousticness,album,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,speechiness,tempo,time_signature,valence
0,0.501,,Shania Twain,0.61,168933.0,0.458,006gwoshpgwOisFogU85xF,0.0,11.0,0.104,-10.819,1.0,You Lay A Whole Lot Of Love On Me,0.0274,108.374,4.0,0.44
1,0.00953,,Hugh Mundell,0.795,224760.0,0.302,00FtrGVrRAIjjMfqAWktvt,0.0406,9.0,0.163,-17.141,0.0,Time Has Come,0.144,128.561,4.0,0.738
2,0.000446,,Dierks Bentley,0.574,207333.0,0.821,00AxNl4D4jHL2AEf1W55j5,2e-06,2.0,0.191,-4.742,0.0,What The Hell Did I Say,0.0268,108.0,4.0,0.299
3,0.23,,Wrekonize,0.467,238032.0,0.842,008bSgDpnWOAdcxcHxJxQr,0.0,2.0,0.0897,-5.183,0.0,Higher Ground,0.429,94.499,3.0,0.408
4,0.419,,Guaco,0.81,233190.0,0.622,00N0HLqc3ebGR2DKcH8cU4,7e-05,1.0,0.153,-12.715,1.0,Canta (En Vivo) [feat. Jorge Luis Chacín],0.0671,103.915,4.0,0.961


In [6]:
# Getting a subset of the playlists
playlist_subset = playlists[:1000]
playlist_subset.shape

(1000, 12)

In [7]:
# Getting the subset of the songs in the playlists
id_dict = {}
    
# Get a dictionary of all the song id's used in a playlist
def populate_track_subset(row, ids):
    tracks = row['tracks']
    
    for track in tracks:
        track_id = track['track_uri'][14:]
        if track_id not in ids:
            ids[track_id] = True    
    
playlist_subset.apply(lambda row: populate_track_subset(row, id_dict), axis=1);

track_subset = my_tracks[my_tracks["id"].isin(id_dict)]
track_subset = track_subset.set_index('id')
track_subset.drop(['artists', 'name', 'album'], inplace=True, axis=1)
track_subset.shape

(34441, 13)

In [8]:
track_subset.head()

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
00CmjeeHvAVKvx3tcIiZTy,0.416,0.679,166733.0,0.412,0.0,2.0,0.128,-16.962,1.0,0.0305,105.903,4.0,0.963
00DlEKhhlQNtjnJk7xqB9O,0.000404,0.5,282464.0,0.89,0.0,10.0,0.085,-4.094,0.0,0.042,138.743,4.0,0.36
006yrnQMCZpiUgkR612gC8,0.0529,0.751,123411.0,0.829,7.6e-05,10.0,0.498,-2.007,0.0,0.109,140.118,4.0,0.275
00JvkzL9d727hk4Mzy1eyg,0.866,0.56,206293.0,0.794,9e-06,11.0,0.155,-6.395,0.0,0.0527,124.996,4.0,0.464
00NUqFMIpCsrYPbM9YpVHQ,0.0114,0.504,198280.0,0.856,0.0,11.0,0.256,-5.024,0.0,0.286,101.866,5.0,0.335


# Creating the average playlist statistics

In [9]:
# Getting an index row
playlist_subset.reset_index(inplace=True)

playlist_subset.head()

Unnamed: 0,index,collaborative,description,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid,tracks
0,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't..."
1,1,False,,11656470,1506556800,Awesome Playlist,23,21,5,1,39,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_..."
2,2,False,,14039958,1505692800,korean,51,31,18,1,64,2,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri..."
3,3,False,,28926058,1501027200,mat,107,86,4,1,126,3,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën..."
4,4,False,,4335282,1401667200,90s,16,16,7,2,17,4,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk..."


In [10]:
# Creating the new empty dataframe
ps = []

def populate_playlist_statistics(row, ps):
    # Get all the tracks, the list of ids, and the index
    tracks, ids, num_tracks = row['tracks'], [], row['num_tracks']
        
#     Iterate through the tracks
    for track in tracks:
        ids.append(track['track_uri'][14:])
                        
    playlist_track_subset = track_subset.loc[ids, :].sum()/num_tracks
    playlist_track_subset = playlist_track_subset.tolist() + [num_tracks]

    ps.append(playlist_track_subset)
    
# Applying the function
playlist_subset.apply(lambda row: populate_playlist_statistics(row, ps), axis=1);

In [11]:
playlist_statistics=pd.DataFrame(ps,columns=track_subset.columns.tolist() + ['num_tracks'])
playlist_statistics.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,num_tracks
0,0.08344,0.659288,221777.461538,0.782173,0.000676,5.038462,0.192127,-4.881942,0.692308,0.107021,123.006885,4.0,0.642904,52
1,0.1631,0.496459,298844.128205,0.691077,0.22227,4.461538,0.178433,-8.291667,0.538462,0.088449,122.669615,3.769231,0.476667,39
2,0.26923,0.671875,219374.875,0.693203,0.000638,5.0,0.169028,-4.874156,0.515625,0.096288,114.600672,4.0,0.565078,64
3,0.27387,0.513714,229575.055556,0.621282,0.202042,5.103175,0.188585,-9.614937,0.714286,0.067186,125.032413,3.952381,0.451623,126
4,0.177148,0.576765,255014.352941,0.650535,0.081875,3.352941,0.166524,-7.634471,0.823529,0.041159,127.759882,3.941176,0.490765,17


# Creating the song playlist dataset

**Adding the songs in the playlists**

In [77]:
# Creating the new empty dataframe
ss = []

def populate_playlist_statistics(row, ss):
    # Get all the tracks, the list of ids, and the index
    tracks, index = row['tracks'], row['index']
        
    # Iterate through the tracks
    for track in tracks:
        track_id = track['track_uri'][14:]
        
        # Some have error and I dont know why
        try:
            # Get track and playlist info
            track_info = np.append(track_subset.loc[track_id, :].values, [0])            
            playlist_info = playlist_statistics.loc[index, :].values
            
            # Get the difference
            track_info = playlist_info - track_info
            track_info = track_info.tolist() + [1]
                        
            # Add to ss
            ss.append(track_info)
        except:
            print(track_id)

# Applying the function
playlist_subset.apply(lambda row: populate_playlist_statistics(row, ss), axis=1);

656TZlNdVe90zHvmebFt9U
5GiU7GOYjDH2yp7fMf9w9j


In [13]:
song_playlist_statistics=pd.DataFrame(ss, columns=playlist_statistics.columns.tolist() + ['in_playlist'])
song_playlist_statistics.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,num_tracks,in_playlist
0,0.05234,-0.244712,-5086.538462,-0.030827,-0.006294,1.038462,0.145027,2.223058,0.692308,-0.013979,-2.454115,0.0,-0.167096,52.0,1
1,0.05854,-0.114712,22977.461538,-0.055827,-0.024324,0.038462,-0.049873,-0.967942,0.692308,-0.006979,-20.033115,0.0,-0.281096,52.0,1
2,0.08106,-0.004712,-14155.538462,0.024173,0.000676,3.038462,0.132327,1.701058,0.692308,-0.102979,23.747885,0.0,-0.058096,52.0,1
3,-0.11856,-0.231712,-45489.538462,0.068173,0.000442,1.038462,0.140027,1.173058,0.692308,-0.032979,22.034885,0.0,-0.175096,52.0,1
4,0.02734,-0.193712,-5822.538462,0.176173,0.000676,5.038462,-0.120873,-0.285942,-0.307692,0.035721,28.247885,0.0,-0.011096,52.0,1


In [14]:
song_playlist_statistics.shape

(67501, 15)

**Randomly Choosing 67501 Songs To Use As "Not In Playlist Songs"**

In [67]:
# Initializitng a matrix with all song playlist relationships
# Use like "tpm_d[playlist] [tpm_d[track it]]"
tpm = [[0 for i in range(len(track_subset.index))] for j in range(len(playlist_statistics.index))]

# Dictionaries to go id --> column_number and column_number --> id
tpm_d = {}
tpm_d2 = {}
for i, track_id in zip([i for i in range(len(track_subset.index))], track_subset.index):
    tpm_d[track_id] = i
    tpm_d2[i] = track_id

In [16]:
def populate_track_playlist_matrix(row, tpm):
    # Get all the tracks, the list of ids, and the index
    tracks, index = row['tracks'], row['index']
        
    # Iterate through the tracks
    for track in tracks:
        # Updating the shit
        track_id = track['track_uri'][14:]
        
        # Some have error and I dont know why
        try:
            tpm[index][tpm_d[track_id]] = 1
        except:
            print(track_id)

# Applying the function
playlist_subset.apply(lambda row: populate_track_playlist_matrix(row, tpm), axis=1);

656TZlNdVe90zHvmebFt9U
5GiU7GOYjDH2yp7fMf9w9j


In [17]:
# Making the matrix
track_playlist_matrix = pd.DataFrame(tpm)
track_playlist_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34431,34432,34433,34434,34435,34436,34437,34438,34439,34440
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
track_playlist_matrix.shape

(1000, 34441)

In [46]:
# Getting all of the index combos
tpm_ind = track_playlist_matrix.stack()
tpm_ind = tpm_ind[tpm_ind == 0]
tpm_ind = tpm_ind.index.tolist()

# Sort randomly
random.shuffle(tpm_ind)

In [61]:
# Saving out the file
with open('tpm_indicies.pkl', 'wb') as f:
    pickle.dump(tpm_ind, f)

**Adding "Not in Playlist Songs" to the song_playlist_statistic df**

In [75]:
ss2 = []

for i in range(song_playlist_statistics.shape[0]):
    # Getting the playlist, song combo
    row, col = tpm_ind[i]
    index, track_id = row, tpm_d2[col]
    
    # Get track and playlist info
    track_info = np.append(track_subset.loc[track_id, :].values, [0])            
    playlist_info = playlist_statistics.loc[index, :].values

    # Get the difference
    track_info = playlist_info - track_info
    track_info = track_info.tolist() + [0]

    # Add to ss
    ss2.append(track_info)

In [168]:
full_song_playlist_statistics=pd.DataFrame(ss + ss2, columns=playlist_statistics.columns.tolist() + ['in_playlist'])
full_song_playlist_statistics.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,num_tracks,in_playlist
0,0.05234,-0.244712,-5086.538462,-0.030827,-0.006294,1.038462,0.145027,2.223058,0.692308,-0.013979,-2.454115,0.0,-0.167096,52.0,1
1,0.05854,-0.114712,22977.461538,-0.055827,-0.024324,0.038462,-0.049873,-0.967942,0.692308,-0.006979,-20.033115,0.0,-0.281096,52.0,1
2,0.08106,-0.004712,-14155.538462,0.024173,0.000676,3.038462,0.132327,1.701058,0.692308,-0.102979,23.747885,0.0,-0.058096,52.0,1
3,-0.11856,-0.231712,-45489.538462,0.068173,0.000442,1.038462,0.140027,1.173058,0.692308,-0.032979,22.034885,0.0,-0.175096,52.0,1
4,0.02734,-0.193712,-5822.538462,0.176173,0.000676,5.038462,-0.120873,-0.285942,-0.307692,0.035721,28.247885,0.0,-0.011096,52.0,1


In [169]:
full_song_playlist_statistics.shape

(135002, 15)

# Making the Logorithmic Regression

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [244]:
fsps = full_song_playlist_statistics.drop(columns=['num_tracks'])

In [245]:
X_train, X_test, y_train, y_test = train_test_split(fsps.loc[:, fsps.columns != 'in_playlist'], 
                                                             fsps.in_playlist, test_size=0.2, 
                                                             random_state = 109, 
                                                             stratify = fsps.in_playlist)

In [246]:
print('Shape before splitting: ' + str(fsps.shape))
print('Shapes after splitting: ' + str((X_train.shape, X_test.shape, y_train.shape, y_test.shape)))

Shape before splitting: (135002, 14)
Shapes after splitting: ((108001, 13), (27001, 13), (108001,), (27001,))


In [247]:
# Making sure the data is 
Log_cv = LogisticRegressionCV(max_iter=10000, cv = 4, penalty='l1', solver='liblinear').fit(X_train, y_train)

In [248]:
# Printing the results
print(f"Accuracy on the training set: {accuracy_score(Log_cv.predict(X_train), y_train)}")
print(f"Accuracy on the testing set: {accuracy_score(Log_cv.predict(X_test), y_test)}")

Accuracy on the training set: 0.536643179229822
Accuracy on the testing set: 0.5304618347468613


In [249]:
# Getting all the coefficient values
Log_cv_coef = Log_cv.coef_.tolist()[0]

for predictor, coef in zip(fsps.columns.tolist(), Log_cv_coef):
    print(f"Predictor: {predictor} {coef}")

Predictor: acousticness 0.18473753513022292
Predictor: danceability -0.5068449982107626
Predictor: duration_ms 6.801670881373405e-08
Predictor: energy 0.5998336386001216
Predictor: instrumentalness 0.37436134520330366
Predictor: key 0.006428636471033001
Predictor: liveness 0.235494867154996
Predictor: loudness -0.01769832167117757
Predictor: mode 0.0010890205536775274
Predictor: speechiness 0.13455792396961666
Predictor: tempo -3.334363168559714e-05
Predictor: time_signature -0.03602011219387513
Predictor: valence 0.014511369714046683


# Evaluating The Logistic Model #

In [252]:
# Creating the new empty dataframe
clicks = []

# To calculate the scores once you have the adjusted playlist
def calculate_all_scores(row, playlist_info, df_maker):
    # Getting the difference between the scores
    track_info = row.values
    playlist_track_difference = playlist_info - track_info
    
    df_maker.append(playlist_track_difference)

# To get the average number of clicks needed to find the first ommitted song
def populate_playlist_statistics(row, clicks):
    # Get all the tracks, the list of ids, and the index
    tracks, index = row['tracks'], row['index']
    
    # Make sure there is enough songs
    if row['num_tracks'] >= 50:

        # Pick 2 random tracks
        track_1, track_2 = random.sample(range(0, len(tracks)-1), 2)
        track_1_id, track_2_id = tracks[track_1]['track_uri'][14:], tracks[track_2]['track_uri'][14:]

        # Get the list of the rest of the track id's
        omitted_tracks = tracks[:track_1] + tracks[track_1+1:track_2] + tracks[track_2+1:]
        omitted_tracks = {omitted_tracks[i]['track_uri'][14:] for i in range(0, len(omitted_tracks))}

        # Get the playlist information
        playlist_info = (track_subset.loc[track_1_id, :].values + track_subset.loc[track_2_id, :].values)/2

        # Make the dataframe
        df_maker = []
        track_subset.apply(lambda row: calculate_all_scores(row, playlist_info, df_maker), axis=1)
        df = pd.DataFrame(df_maker, columns=track_subset.columns.tolist())

        # Changing index
        df.set_index(track_subset.index, inplace=True)

        # Get all of the recomended clicks
        df['score'] = Log_cv.predict_proba(df)[:,1]

        # Sort values by score
        df.sort_values(by=['score'], inplace=True, ascending=False)

        # Geting the indecies
        df_index = df.index
        
        # Get the number of clicks (max 50)
        inputed = False
        
        for i in range(10*100):
            if df_index[i] in omitted_tracks:
                clicks.append((i+1)/10)
                inputed = True
            break
                
        if not inputed:
            clicks.append(101)
                
# Applying the function
playlist_subset.apply(lambda row: populate_playlist_statistics(row, clicks), axis=1);

In [253]:
sum(clicks)/len(clicks)

101.0