In [295]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import json
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from numpyencoder import NumpyEncoder
import time
import warnings
warnings.filterwarnings("ignore")

### Read in dataset, and get list of playlists for each track

In [517]:
# Load the track/playlist table from the working directory. Judging by the
# displayed rows further down, a track is repeated once for every playlist id
# (Pid) it appears in.
playlists_feats = pd.read_csv("playlists_with_dates_genres.csv")

Drop unnecessary columns (those describing the playlist, which are never used)

In [518]:
# Discard the columns that nothing later in the notebook reads.
# NOTE: this cell is not idempotent -- running it a second time raises KeyError
# because the columns are already gone.
playlists_feats = playlists_feats.drop(
    columns=["Num_Artists", "Num_Tracks", "Num_Albums", "Follow", "Collab", "Playlist"]
)

Convert string-like columns to string type

In [464]:
# Force the text-like columns to str so later equality and split() calls see strings.
# NOTE(review): astype(str) turns a missing value into the literal text "nan",
# so two rows with a missing value in the same column will compare equal on it
# afterwards -- confirm that this is acceptable.
for text_column in ("Album", "Artist", "Genre"):
    playlists_feats[text_column] = playlists_feats[text_column].astype(str)

In [465]:
# Show the frame to check the remaining columns (the notebook renders only the
# first and last rows of a long frame).
playlists_feats

Unnamed: 0,Track URI,Track,Artist,Album,Date,Duration,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Pid,Genre
0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone,2003,198800,0.774,0.838,5.0,-3.914,0.0,0.1140,0.0249,0.025000,0.2420,0.9240,143.040,4.0,0,"dance pop,pop,Emo,Pop"
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone,2003,198800,0.774,0.838,5.0,-3.914,0.0,0.1140,0.0249,0.025000,0.2420,0.9240,143.040,4.0,38,"dance pop,pop,Emo,Pop"
2,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone,2003,198800,0.774,0.838,5.0,-3.914,0.0,0.1140,0.0249,0.025000,0.2420,0.9240,143.040,4.0,123,"dance pop,pop,Emo,Pop"
3,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone,2003,198800,0.774,0.838,5.0,-3.914,0.0,0.1140,0.0249,0.025000,0.2420,0.9240,143.040,4.0,262,"dance pop,pop,Emo,Pop"
4,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,In The Zone,2003,198800,0.774,0.838,5.0,-3.914,0.0,0.1140,0.0249,0.025000,0.2420,0.9240,143.040,4.0,355,"dance pop,pop,Emo,Pop"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626272,spotify:track:5yPxMEPP3nXbMZIsuXHGxd,Graves,Ugly Heroes,Ugly Heroes,2017,199014,0.557,0.833,8.0,-5.436,0.0,0.2470,0.0610,0.000000,0.0681,0.8920,160.402,4.0,142603,Underground Rap
626273,spotify:track:2EieK2FUAxikMgLoTSZu51,That's That,Doom,Born Like This,2008,135067,0.500,0.728,8.0,-4.922,0.0,0.3850,0.2350,0.000000,0.6860,0.8530,192.178,4.0,142677,Hiphop
626274,spotify:track:0pEmCGi4ZwJvXmGVvPeaDi,38 to the Face,Iamjakehill,38 to the Face,2012,151131,0.784,0.453,0.0,-5.545,0.0,0.3500,0.0378,0.000375,0.3560,0.0733,139.040,4.0,142790,"Trap Metal,Underground Rap"
626275,spotify:track:6WPf7Nn6Fd8M32uUDrAYD1,Death on Me,Dex Osama,Nine 11,2008,161453,0.716,0.882,9.0,-6.984,1.0,0.2130,0.0422,0.000000,0.1510,0.3550,181.970,4.0,142932,Underground Rap


In [466]:
unique_tracks = playlists_feats["Track URI"].unique()

# track_pids maps every track URI to the list of playlist ids (Pids) containing it.
# The mapping is cached in track_pids.json: it is loaded when the file exists and
# built (then saved) when it does not, so no code has to be commented in or out.
# Delete track_pids.json to force a rebuild after the CSV changes.
try:
    with open("track_pids.json", "r", encoding="utf-8") as file:
        track_pids = json.load(file)
except FileNotFoundError:
    # One grouped pass over the frame instead of re-filtering it once per track.
    # sort=False keeps first-appearance order; the dump below sorts the keys anyway.
    pids_by_track = playlists_feats.groupby("Track URI", sort=False)["Pid"].unique()
    track_pids = {track: pids.tolist() for track, pids in pids_by_track.items()}

    with open("track_pids.json", "w", encoding="utf-8") as outfile:
        # NumpyEncoder is kept so any remaining numpy scalar still serialises.
        json.dump(track_pids, outfile, indent=4, sort_keys=True,
                  separators=(', ', ': '), ensure_ascii=False,
                  cls=NumpyEncoder)

In [467]:
# Model inputs, in the order the regressor sees them. The last three entries
# (Genre, Artist, Album) are match/no-match flags; everything before them is a
# numeric feature that gets turned into an absolute difference.
features_considered = [
    "Date",
    "Duration",
    "Danceability",
    "Energy",
    "Key",
    "Loudness",
    "Mode",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Tempo",
    "Time Signature",
    "Genre",
    "Artist",
    "Album",
]

### Functions

In [468]:
def find_common_tracks(occurrences_cutoff):
    """Return the rows of ``playlists_feats`` whose track is frequent enough.

    A track qualifies when its "Track URI" occurs in at least
    ``occurrences_cutoff`` rows of the global ``playlists_feats`` frame. The
    result keeps every row (and the original index labels) of qualifying tracks.
    """
    uri_column = playlists_feats["Track URI"]
    appearances = uri_column.value_counts()
    frequent_uris = appearances.loc[appearances >= occurrences_cutoff].index
    is_frequent = uri_column.isin(frequent_uris)
    return playlists_feats[is_frequent]

In [469]:
def calc_playlist_similarity(track_1, track_2, this_playlists_feats):
    """Return the share of playlists the two tracks have in common.

    The playlist ids of each track are read from the global ``track_pids``
    mapping. The score is the number of ids in ``track_1``'s list that also
    occur in ``track_2``'s list, divided by the length of the shorter list.

    Args:
        track_1: Track URI used as a key into ``track_pids``.
        track_2: Track URI used as a key into ``track_pids``.
        this_playlists_feats: Unused; kept so existing callers keep working.

    Returns:
        A float, or 0.0 when either track has no playlists at all (this used
        to be a ZeroDivisionError).

    Raises:
        KeyError: if either URI is missing from ``track_pids``.
    """
    playlists_track_1 = track_pids[track_1]
    playlists_track_2 = track_pids[track_2]

    shorter_length = min(len(playlists_track_1), len(playlists_track_2))
    if shorter_length == 0:
        return 0.0

    # Set membership is constant time; testing against the list made every
    # comparison a linear scan of the second track's playlists.
    playlists_track_2_lookup = set(playlists_track_2)
    count_together = sum(1 for playlist in playlists_track_1 if playlist in playlists_track_2_lookup)

    return count_together/shorter_length

In [470]:
def calc_playlist_similarity_random_pairs(n_pairs, min_track_occurrences, printing=False):
    """Score randomly drawn pairs of sufficiently common tracks.

    Both tracks of a pair are drawn uniformly from the unique tracks that occur
    in at least ``min_track_occurrences`` rows. A draw that picks the same track
    twice is skipped, so the result can hold fewer than ``n_pairs`` rows.

    Args:
        n_pairs: Number of draws to attempt (converted with ``int``).
        min_track_occurrences: Cutoff handed to ``find_common_tracks``.
        printing: When true, print how many tracks were considered and the
            share of kept pairs whose similarity is exactly 0.

    Returns:
        DataFrame with columns "Track URI 1", "Track URI 2" and
        "Playlist Similarity". The two URIs of a row are stored in sorted
        order, and the index label of a row is the number of the draw that
        produced it (so labels have gaps where a draw was skipped).

    Raises:
        IndexError: if draws are requested but no track meets the cutoff.
    """
    common_playlists_feats = find_common_tracks(min_track_occurrences)
    unique_common_tracks = common_playlists_feats["Track URI"].unique()
    num_unique_tracks = len(unique_common_tracks)

    if printing:
        num_total_unique_tracks = len(playlists_feats["Track URI"].unique())
        percent_total_unique_tracks = num_unique_tracks/num_total_unique_tracks*100 if num_total_unique_tracks else 0.0
        print("Number of unique tracks considered: %d (%.1f%% of total %d)"%(num_unique_tracks,percent_total_unique_tracks,num_total_unique_tracks))

    if num_unique_tracks == 0 and int(n_pairs) > 0:
        raise IndexError("no track occurs at least %r times, so no pairs can be drawn" % (min_track_occurrences,))

    # Collect rows in plain lists and build the frame once; enlarging a
    # DataFrame one .loc row at a time copies it on every insertion.
    draw_numbers = []
    rows = []
    for i in range(int(n_pairs)):
        random_track_1 = unique_common_tracks[int(num_unique_tracks*random.random())]
        random_track_2 = unique_common_tracks[int(num_unique_tracks*random.random())]
        if random_track_1 != random_track_2:
            playlist_similarity = calc_playlist_similarity(random_track_1, random_track_2, common_playlists_feats)
            draw_numbers.append(i)
            rows.append([min(random_track_1, random_track_2), max(random_track_1, random_track_2), playlist_similarity])

    playlist_similarities = pd.DataFrame(
        rows,
        columns=["Track URI 1","Track URI 2","Playlist Similarity"],
        index=draw_numbers,
    )

    if printing:
        # Divide by the pairs actually kept: skipped same-track draws are not
        # pairs, and dividing by n_pairs crashed when n_pairs was 0.
        num_pairs_kept = len(playlist_similarities)
        num_zero = sum(playlist_similarities["Playlist Similarity"] == 0)
        percent_zero = num_zero/num_pairs_kept*100 if num_pairs_kept else 0.0
        print("%% of pairs having 0%% playlist similarity: %.1f%%"%(percent_zero,))
    return playlist_similarities

In [471]:
def calc_playlist_similarity_random_pairs_weighted(n_pairs, printing=False):
    """Score random track pairs, drawing each track by picking a random row.

    Rows of the global ``playlists_feats`` are drawn uniformly, so a track that
    fills more rows is picked proportionally more often. A draw that picks the
    same track twice is skipped, so the result can hold fewer than ``n_pairs``
    rows.

    Args:
        n_pairs: Number of draws to attempt (converted with ``int``).
        printing: When true, print the share of kept pairs whose similarity is
            exactly 0.

    Returns:
        DataFrame with columns "Track URI 1", "Track URI 2" and
        "Playlist Similarity". The two URIs of a row are stored in sorted
        order, and the index label of a row is the number of the draw that
        produced it.
    """
    # Take the column out once and index it by position. The previous
    # playlists_feats.loc[<int>, ...] lookup treated the drawn position as an
    # index *label*, which only works while the frame has a default 0..n-1 index.
    track_uris = playlists_feats["Track URI"].to_numpy()
    num_rows = len(track_uris)

    # Collect rows in plain lists and build the frame once; enlarging a
    # DataFrame one .loc row at a time copies it on every insertion.
    draw_numbers = []
    rows = []
    for i in range(int(n_pairs)):
        random_track_1 = track_uris[int(num_rows*random.random())]
        random_track_2 = track_uris[int(num_rows*random.random())]
        if random_track_1 != random_track_2:
            playlist_similarity = calc_playlist_similarity(random_track_1, random_track_2, playlists_feats)
            draw_numbers.append(i)
            rows.append([min(random_track_1, random_track_2), max(random_track_1, random_track_2), playlist_similarity])

    playlist_similarities = pd.DataFrame(
        rows,
        columns=["Track URI 1","Track URI 2","Playlist Similarity"],
        index=draw_numbers,
    )

    if printing:
        # Divide by the pairs actually kept: skipped same-track draws are not
        # pairs, and dividing by n_pairs crashed when n_pairs was 0.
        num_pairs_kept = len(playlist_similarities)
        num_zero = sum(playlist_similarities["Playlist Similarity"] == 0)
        percent_zero = num_zero/num_pairs_kept*100 if num_pairs_kept else 0.0
        print("%% of pairs having 0%% playlist similarity: %.1f%%"%(percent_zero,))
    return playlist_similarities

In [508]:
def calc_differential_features(track_frame):
    """Add pairwise difference features to a frame of track pairs, in place.

    For every row, the two tracks named in "Track URI 1" and "Track URI 2" are
    looked up in the global ``playlists_feats`` (first row per track). Each
    numeric entry of ``features_considered`` becomes the absolute difference of
    the two tracks' values. "Genre" becomes 1 when the comma-separated,
    lower-cased genre lists share at least one entry, else 0. "Artist" and
    "Album" become 1 when the two values are equal, else 0.

    The columns are written onto ``track_frame`` itself and the same object is
    returned. Keep that contract: code elsewhere in the notebook may read the
    added columns from the frame it passed in.

    Args:
        track_frame: DataFrame with "Track URI 1" and "Track URI 2" columns.

    Returns:
        ``track_frame`` with one column per entry of ``features_considered``.

    Raises:
        KeyError: if a pair references a URI that is not in ``playlists_feats``.
    """
    categorical_features = ("Genre", "Artist", "Album")
    quantitative_features = [feature for feature in features_considered if feature not in categorical_features]

    # Index the per-track features by URI once, then fetch both sides of all
    # pairs in two lookups. Scanning the table twice per pair was the main
    # cost, and filling int-initialised columns row by row with floats is
    # rejected by recent pandas versions.
    track_lookup = playlists_feats.drop_duplicates(subset="Track URI").set_index("Track URI")
    first_tracks = track_lookup.loc[track_frame["Track URI 1"].to_numpy()]
    second_tracks = track_lookup.loc[track_frame["Track URI 2"].to_numpy()]

    differences = np.abs(
        first_tracks[quantitative_features].to_numpy(dtype=float)
        - second_tracks[quantitative_features].to_numpy(dtype=float)
    )
    # Assign by position so the result does not depend on track_frame's index.
    for column_position, feature in enumerate(quantitative_features):
        track_frame[feature] = differences[:, column_position]

    # NOTE(review): str() maps a missing genre to "nan", so two tracks with a
    # missing genre count as sharing one -- same as before this rewrite.
    track_frame["Genre"] = [
        int(bool(set(str(genres_1).lower().split(",")) & set(str(genres_2).lower().split(","))))
        for genres_1, genres_2 in zip(first_tracks["Genre"], second_tracks["Genre"])
    ]
    track_frame["Artist"] = [
        int(artist_1 == artist_2)
        for artist_1, artist_2 in zip(first_tracks["Artist"], second_tracks["Artist"])
    ]
    track_frame["Album"] = [
        int(album_1 == album_2)
        for album_1, album_2 in zip(first_tracks["Album"], second_tracks["Album"])
    ]

    return track_frame

In [482]:
def train_model(playlist_similarities, n_estimators=100, random_state=None):
    """Fit a random forest that predicts playlist similarity from pair features.

    The pair frame is passed through ``calc_differential_features`` and the
    forest is then fitted on the ``features_considered`` columns against the
    "Playlist Similarity" column.

    Args:
        playlist_similarities: DataFrame with "Track URI 1", "Track URI 2" and
            "Playlist Similarity" columns.
        n_estimators: Number of trees; the default matches the previous
            hard-coded value.
        random_state: Optional seed forwarded to the forest. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to get
            the same model on every run.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    similarity_with_differential_features = calc_differential_features(playlist_similarities)
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(
        similarity_with_differential_features.loc[:,features_considered],
        similarity_with_differential_features.loc[:,"Playlist Similarity"],
    )
    return model

In [483]:
def predict_similarity(model, track_frame):
    """Return the model's predictions for the track pairs in ``track_frame``.

    The frame is first passed through ``calc_differential_features``; the
    ``features_considered`` columns of the result are what ``model.predict``
    receives. Whatever ``model.predict`` returns is passed back unchanged.
    """
    feature_frame = calc_differential_features(track_frame)
    model_inputs = feature_frame.loc[:, features_considered]
    return model.predict(model_inputs)

In [484]:
def calc_mse(testing_pairs, prediction):
    """Return the mean squared error of ``prediction``.

    The reference values are read from ``testing_pairs["Playlist Similarity"]``;
    the summed squared differences are divided by ``len(prediction)``.
    """
    observed = np.array(testing_pairs["Playlist Similarity"])
    residuals = observed - prediction
    squared_error_total = np.sum(residuals**2)
    return squared_error_total/len(prediction)

In [485]:
def get_uri(track,artist):
    """Return the Track URI of the first row matching a track title and artist.

    Both values are compared for exact equality against the "Track" and
    "Artist" columns of the global ``playlists_feats``.

    Raises:
        IndexError: if no row matches. The exception type is unchanged from
            the previous positional lookup on an empty selection, but the
            message now names the track and artist that were not found.
    """
    is_match = (playlists_feats["Artist"] == artist) & (playlists_feats["Track"] == track)
    matching_uris = playlists_feats.loc[is_match, "Track URI"]
    if matching_uris.empty:
        raise IndexError("no track %r by artist %r in playlists_feats" % (track, artist))
    return matching_uris.iloc[0]

In [541]:
def get_artist_songs(artist):
    """Return the distinct "Track" values of rows whose "Artist" equals ``artist``.

    The rows come from the global ``playlists_feats``; titles are returned in
    order of first appearance.
    """
    by_artist = playlists_feats["Artist"] == artist
    return playlists_feats.loc[by_artist, "Track"].unique()

In [530]:
def predict_most_similar(model, base_track, min_track_occurrences, sorting=True, index=None):
    """Predict the similarity of every common track to ``base_track``.

    Every unique track that occurs at least ``min_track_occurrences`` times is
    paired with ``base_track`` and scored with ``predict_similarity``. The
    base track itself is not removed from the candidates here.

    Args:
        model: Fitted model handed to ``predict_similarity``.
        base_track: Track URI every candidate is compared with.
        min_track_occurrences: Cutoff handed to ``find_common_tracks``.
        sorting: When true, rows are sorted by predicted similarity
            (descending), re-indexed from 0, and the "Track URI" column is
            dropped. When false, rows stay in candidate order and keep
            "Track URI".
        index: Optional suffix appended to the prediction column name so the
            results for several base tracks can be merged side by side.

    Returns:
        DataFrame with "Track", "Artist", the prediction column and, when
        ``sorting`` is false, "Track URI".
    """
    common_tracks = find_common_tracks(min_track_occurrences)["Track URI"].unique()
    pairs = pd.DataFrame(common_tracks, columns=["Track URI 1"])
    pairs["Track URI 2"] = base_track

    col_name = "Predicted Playlist Similarity"
    if index is not None:
        col_name = col_name + " " + str(index)

    predictions = predict_similarity(model, pairs)

    # Build the result from the URIs and predictions only. The earlier code
    # dropped an "Artist" column from the pair frame, which only existed when
    # the feature helper had added it to that frame as a side effect.
    scored = pd.DataFrame({"Track URI": common_tracks, col_name: predictions})

    # Reduce the lookup table to one (Track, Artist) row per URI *before*
    # merging, instead of merging against every playlist row and de-duplicating
    # afterwards; the first row per URI is kept either way.
    track_names = playlists_feats[["Track URI", "Track", "Artist"]].drop_duplicates("Track URI")
    df = pd.merge(scored, track_names, on=["Track URI"], how="left")

    df = df[["Track","Artist",col_name,"Track URI"]]

    if sorting:
        df = df.sort_values(by=col_name, ascending=False).reset_index(drop=True)
        df = df[["Track","Artist",col_name]]

    return df

In [487]:
def predict_most_similar_for_playlist(model, base_tracks, min_track_occurrences):
    """Rank candidate tracks by their average predicted similarity to a playlist.

    ``predict_most_similar`` is run once per base track, the per-track
    predictions are joined on ("Track", "Artist", "Track URI"), the base
    tracks themselves are removed, and the remaining candidates are sorted by
    the mean of their prediction columns (highest first).

    Args:
        model: Fitted model handed to ``predict_most_similar``.
        base_tracks: Sequence of Track URIs; it is indexed with ``[0]`` and
            sliced, so it must hold at least one entry.
        min_track_occurrences: Cutoff handed to ``predict_most_similar``.

    Returns:
        DataFrame with "Track", "Artist" and "Average Predicted Similarity",
        indexed from 0.
    """
    prediction_prefix = "Predicted Playlist Similarity"

    df = predict_most_similar(model, base_tracks[0], min_track_occurrences, sorting=False)

    for index, base_track in enumerate(base_tracks[1:]):
        df2 = predict_most_similar(model, base_track, min_track_occurrences, sorting=False, index=index)
        df = pd.merge(df, df2, on=["Track","Artist","Track URI"], how="inner")

    # Copy the filtered rows so the new column is not written onto a slice.
    df = df[~df["Track URI"].isin(base_tracks)].copy()

    # Average the prediction columns by name. The positional slice iloc[:, 2:]
    # also covered the string "Track URI" column, which makes mean() raise on
    # pandas versions that no longer drop non-numeric columns silently.
    prediction_columns = [column for column in df.columns if str(column).startswith(prediction_prefix)]
    df["Average Predicted Similarity"] = df[prediction_columns].mean(axis=1)
    df = df.sort_values(by="Average Predicted Similarity", ascending=False).reset_index(drop=True)

    return df[["Track","Artist","Average Predicted Similarity"]]

# Trying it out

Generate pairs and train the model:

In [555]:
n_training_pairs = 7000
n_testing_pairs = 3000
min_track_occurrences = 10
pair_sampling_seed = 0

# Seed Python's `random` module so the same training and testing pairs are
# drawn on every run. This covers pair sampling only; the forest is seeded
# separately, if at all, inside train_model.
random.seed(pair_sampling_seed)

# time.perf_counter() is the clock meant for measuring elapsed intervals.
start = time.perf_counter()
playlist_similarities = calc_playlist_similarity_random_pairs_weighted(n_training_pairs,printing=True)
print("Generating %d pairs took %.1f seconds"%(n_training_pairs, time.perf_counter()-start))

start = time.perf_counter()
model = train_model(playlist_similarities)
print("Training on %d pairs took %.1f seconds"%(n_training_pairs, time.perf_counter()-start))


# The testing pairs are an independent second sample; nothing prevents a pair
# from appearing in both the training and the testing sample.
start = time.perf_counter()
testing_pairs = calc_playlist_similarity_random_pairs_weighted(n_testing_pairs,printing=False)
print("Generating %d pairs took %.1f seconds"%(n_testing_pairs, time.perf_counter()-start))

start = time.perf_counter()
prediction = predict_similarity(model, testing_pairs)
mse = calc_mse(testing_pairs, prediction)
print("Prediction and error calcs on %d pairs took %.1f seconds"%(n_testing_pairs, time.perf_counter()-start))
print("RMSE: %.2f%% similarity"%((mse**.5)*100))

% of pairs having 0% playlist similarity: 25.7%
Generating 7000 pairs took 17.4 seconds
Training on 7000 pairs took 38.1 seconds
Generating 3000 pairs took 7.5 seconds
Prediction and error calcs on 3000 pairs took 9.4 seconds
RMSE: 6.01% similarity


Show order of importance for features

In [533]:
# Pair every feature name with its importance in the fitted forest, most
# important first (negating the importances makes argsort descending).
ranking_order = np.argsort(-model.feature_importances_)
ranked_features = []
for feature_position in ranking_order:
    ranked_features.append(
        (features_considered[feature_position], model.feature_importances_[feature_position])
    )
ranked_features

[('Genre', 0.1509630291034687),
 ('Date', 0.1459615940540826),
 ('Acousticness', 0.07276287510799069),
 ('Speechiness', 0.07042217302912378),
 ('Liveness', 0.0668282233386714),
 ('Tempo', 0.06076103962525524),
 ('Energy', 0.05897613897010203),
 ('Duration', 0.05647755245242572),
 ('Loudness', 0.05607731333308247),
 ('Valence', 0.0538437619147277),
 ('Danceability', 0.052603685566614856),
 ('Instrumentalness', 0.05061602570405478),
 ('Artist', 0.04019293597225631),
 ('Key', 0.0345493803889013),
 ('Album', 0.019740540173251313),
 ('Mode', 0.006732363666231184),
 ('Time Signature', 0.0024913675997599435)]

### Trying out recommendations

Testing out two happy, upbeat pop songs from around 2010 by female artists.

In [534]:
# Seed playlist: two pop tracks, looked up by (title, artist).
playlist = [
    get_uri("Party In The U.S.A.","Miley Cyrus"),
    get_uri("Call Me Maybe","Carly Rae Jepsen"),
]
recommendations = predict_most_similar_for_playlist(model, playlist, min_track_occurrences)
# Show the ten best-ranked candidates.
recommendations.head(10)

Unnamed: 0,Track,Artist,Average Predicted Similarity
0,Adore You,Miley Cyrus,0.241238
1,Wrecking Ball,Miley Cyrus,0.233847
2,Wings,Little Mix,0.224153
3,We Can't Stop,Miley Cyrus,0.222359
4,When I Look At You,Miley Cyrus,0.219679
5,Try,P!nk,0.1936
6,Raise Your Glass,P!nk,0.190875
7,New Rules,Dua Lipa,0.188896
8,Blow Me (One Last Kiss),P!nk,0.186444
9,One Thing,One Direction,0.18596


In [547]:
# List every distinct track title the dataset holds for this artist.
get_artist_songs("Miley Cyrus")

array(['Party In The U.S.A.', "We Can't Stop", 'Wrecking Ball',
       'When I Look At You', 'Adore You', 'Malibu', 'Hands Of Love',
       'Younger Now'], dtype=object)

In [548]:
# List every distinct track title the dataset holds for this artist.
get_artist_songs("Carly Rae Jepsen")

array(['Call Me Maybe'], dtype=object)

Lots of Miley Cyrus here. The four recommended tracks are her oldest songs in the dataset (closest in time to the two provided songs). Meanwhile, Carly Rae Jepsen doesn't have any other songs in the dataset, so nothing else of hers ends up here.

The two songs that are sort of outliers are New Rules by Dua Lipa (a more recent song that isn't as happy) and One Thing by One Direction (male artist but otherwise a good fit). However, they're both still high-energy pop songs from roughly the same time frame.

Artist gender wasn't in the dataset, but may have been correlated with other features.

Testing out two Kendrick Lamar songs.

In [535]:
# Seed playlist: two tracks by the same artist, looked up by (title, artist).
playlist = [
    get_uri("Money Trees","Kendrick Lamar"),
    get_uri("HUMBLE.","Kendrick Lamar"),
]
recommendations = predict_most_similar_for_playlist(model, playlist, min_track_occurrences)
# Show the ten best-ranked candidates.
recommendations.head(10)

Unnamed: 0,Track,Artist,Average Predicted Similarity
0,ELEMENT.,Kendrick Lamar,0.484866
1,LUST.,Kendrick Lamar,0.475052
2,GOD.,Kendrick Lamar,0.46806
3,LOYALTY. FEAT. RIHANNA.,Kendrick Lamar,0.465291
4,FEAR.,Kendrick Lamar,0.463079
5,XXX. FEAT. U2.,Kendrick Lamar,0.452618
6,YAH.,Kendrick Lamar,0.451394
7,Swimming Pools (Drank) - Extended Version,Kendrick Lamar,0.449418
8,LOVE. FEAT. ZACARI.,Kendrick Lamar,0.444363
9,PRIDE.,Kendrick Lamar,0.441463


...and everything is Kendrick Lamar. Makes sense. Notice the extremely high predicted similarity.

Testing out three heavy-sounding rap songs from adjacent years with male artists, although HUMBLE. is less electronic/autotuned than the other two.

In [553]:
# Seed playlist: three rap tracks by different artists, looked up by (title, artist).
playlist = [
    get_uri("HUMBLE.","Kendrick Lamar"),
    get_uri("goosebumps","Travis Scott"),
    get_uri("Trap Queen","Fetty Wap"),
]
recommendations = predict_most_similar_for_playlist(model, playlist, min_track_occurrences)
# Show the ten best-ranked candidates.
recommendations.head(10)

Unnamed: 0,Track,Artist,Average Predicted Similarity
0,Skateboard P (feat. Big Sean),MadeinTYO,0.269042
1,ELEMENT.,Kendrick Lamar,0.259452
2,Bank Account,21 Savage,0.255595
3,beibs in the trap,Travis Scott,0.254595
4,LOYALTY. FEAT. RIHANNA.,Kendrick Lamar,0.254001
5,LUST.,Kendrick Lamar,0.252952
6,You Was Right,Lil Uzi Vert,0.251003
7,wokeuplikethis*,Playboi Carti,0.245141
8,outside,Travis Scott,0.244971
9,Plain Jane,A$AP Ferg,0.244437


The three Kendrick Lamar songs are on the same album as HUMBLE., and the same is true for the Travis Scott songs as goosebumps. Trap Queen is the only Fetty Wap song in the dataset. They aren't upbeat, heavy-sounding songs, however.

The rest of the songs are all rap songs from the same couple of years and all seem to match the vibe better.

Same thing with gender as last time -- all are by male artists.

Testing out two low-tempo rock songs by Pink Floyd.

In [536]:
# Seed playlist: two tracks by the same band, looked up by (title, artist).
playlist = [
    get_uri("Wish You Were Here","Pink Floyd"),
    get_uri("Comfortably Numb","Pink Floyd"),
]
recommendations = predict_most_similar_for_playlist(model, playlist, min_track_occurrences)
# Show the ten best-ranked candidates.
recommendations.head(10)

Unnamed: 0,Track,Artist,Average Predicted Similarity
0,"Another Brick in the Wall, Pt. 2",Pink Floyd,0.4415
1,Let My Love Open The Door,Pete Townshend,0.151276
2,Life's Been Good,Joe Walsh,0.150531
3,Wuthering Heights,Kate Bush,0.149801
4,T.N.T.,AC/DC,0.149022
5,Livin' Thing,Electric Light Orchestra,0.149003
6,It's Still Rock and Roll to Me,Billy Joel,0.146006
7,Come Sail Away,Styx,0.145833
8,Rich Girl,Daryl Hall & John Oates,0.143075
9,Breakdown,Tom Petty and the Heartbreakers,0.140622


In [545]:
# List every distinct track title the dataset holds for this artist.
get_artist_songs("Pink Floyd")

array(['Another Brick in the Wall, Pt. 2', 'Wish You Were Here',
       'Comfortably Numb'], dtype=object)

The only other Pink Floyd song in the dataset is the top song, and has a much higher predicted similarity than the others.

I'm not as familiar with these songs as I am with the other tests. They are all older rock songs, although the vibe of these songs seems to vary a lot.