In [82]:
import numpy as np
import pandas as pd
import time

In [109]:
# Loadings of each Spotify audio feature on the five MUSIC factors.
# Every array lists its values in M, U, S, I, C order.
feature_MUSIC_dict = dict(
    danceability=np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    energy=np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    instrumentalness=np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    liveness=np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    loudness=np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    valence=np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
)
# One row of loadings per feature, in the dict's insertion order.
feature_MUSIC_matrix = list(feature_MUSIC_dict.values())

# Load the song table, then keep only the song id and the audio-feature
# columns that have an entry in feature_MUSIC_dict.
msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')
msd = msd.loc[:, ['song_id', *feature_MUSIC_dict]]

def get_MUSIC(song_ids):
    """Return the mean MUSIC factor vector for a collection of songs.

    Each requested song's audio-feature row in ``msd`` is projected onto the
    MUSIC factors with ``feature_MUSIC_matrix`` and the projections are
    averaged.

    Parameters
    ----------
    song_ids : iterable
        Song ids to look up in ``msd['song_id']``. An id that is listed
        several times contributes that many times to the average.

    Returns
    -------
    numpy.ndarray
        One value per MUSIC factor. All-NaN when none of the requested ids
        is present in ``msd`` (this includes empty input).

    Notes
    -----
    The average is taken over the requested ids that were actually found:

    * ids absent from ``msd`` are ignored instead of diluting the mean;
    * if ``msd`` holds several rows for one ``song_id`` only the first row
      is used, which is what ``get_MUSIC_slow`` does as well;
    * when every id is unique, present and has exactly one row, the result
      is the plain mean over ``len(song_ids)`` songs.
    """
    requested = pd.Series(list(song_ids), dtype=object)
    loadings = np.asarray(feature_MUSIC_matrix)  # (n_features, n_factors)
    features = list(feature_MUSIC_dict.keys())

    # One feature row per distinct requested id that exists in msd.
    rows = (
        msd.loc[msd['song_id'].isin(requested), ['song_id'] + features]
        .drop_duplicates(subset='song_id')
        .set_index('song_id')
    )

    # Number of times each matched id was requested, aligned with `rows`.
    weights = requested.value_counts().reindex(rows.index).to_numpy(dtype=float)
    total = weights.sum()
    if total == 0:
        # Nothing to average; make the NaN explicit instead of relying on 0/0.
        return np.full(loadings.shape[1], np.nan)

    weighted_features = weights @ rows.to_numpy()  # (n_features,)
    return np.dot(weighted_features, loadings) / total

def get_MUSIC_slow(song_ids):
    """Loop-based reference computation of the mean MUSIC vector.

    For every id in ``song_ids`` the first matching row of ``msd`` is read,
    and each audio-feature value is multiplied by that feature's loadings
    from ``feature_MUSIC_dict`` and added to a running total. The total is
    divided by ``len(song_ids)``.

    Raises ``IndexError`` when an id has no row in ``msd``.
    """
    totals = np.zeros(5)  # one accumulator slot per MUSIC factor
    for sid in song_ids:
        matches = msd[msd['song_id'] == sid]
        for name in feature_MUSIC_dict:
            loadings = feature_MUSIC_dict[name]
            # Only the first matching row contributes.
            totals += loadings * matches[name].tolist()[0]
    return totals / len(song_ids)

# Benchmark the vectorized implementation on the first 1000 songs.
song_ids = list(msd.head(1000).song_id)
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is a wall clock and can jump if the system time changes.
start = time.perf_counter()
v1 = get_MUSIC(song_ids)
print(time.perf_counter() - start)
print(v1)

# Optional cross-check against the loop-based reference (much slower, so it
# is left disabled).
# start = time.perf_counter()
# v2 = get_MUSIC_slow(song_ids)
# print(time.perf_counter() - start)
# print(v2)
# `assert v1 == v2` compares elementwise and raises ValueError on arrays;
# compare with a tolerance instead:
# np.testing.assert_allclose(v1, v2)

0.046942949295043945
[ 4.78334964  1.50616185  4.0751191  -7.10651683  2.30224419]


In [90]:
# Preview the first rows of every column except the song id.
msd.loc[:, msd.columns.difference(['song_id'])].head()

Unnamed: 0,danceability,energy,instrumentalness,liveness,loudness,valence
0,0.471,0.91,0.764,0.403,-4.847,0.282
1,0.479,0.315,4e-06,0.0911,-12.951,0.309
2,0.678,0.787,8e-05,0.714,-6.344,0.969
3,0.804,0.676,0.919,0.0797,-8.48,0.527
4,0.604,0.603,0.0,0.185,-4.419,0.399


In [3]:
# Factor loadings (columns M, U, S, I, C) are the values from this paper:
# https://projects.ori.org/lrg/PDFs_papers/RentfrowEtal2012MUSICReplicationMP.pdf
# The Spotify audio features come from this endpoint:
# https://developer.spotify.com/documentation/web-api/reference/tracks/get-several-audio-features/
feature_MUSIC_values = [
    dict(spotify_feature=spotify_name, paper_feature=paper_name, M=m, U=u, S=s, I=i, C=c)
    # (Spotify feature, paper attribute, M, U, S, I, C)
    for spotify_name, paper_name, m, u, s, i, c in [
        ('danceability', 'Danceable', -0.37, 0.05, -0.35, 0.08, 0.43),
        ('energy', 'Thrilling', -0.64, -0.46, -0.13, 0.66, -0.03),
        ('instrumentalness', 'Instrumental', 0.20, -0.47, 0.28, 0.09, -0.01),
        ('liveness', 'Lively', -0.69, -0.12, -0.07, 0.43, 0.02),
        ('loudness', 'Loud', -0.58, -0.19, -0.44, 0.79, -0.21),
        ('valence', 'Happy', -0.04, 0.18, 0.24, -0.34, 0.18),
    ]
]
MUSIC_df = pd.DataFrame(feature_MUSIC_values)
MUSIC_df.head()

Unnamed: 0,C,I,M,S,U,paper_feature,spotify_feature
0,0.43,0.08,-0.37,-0.35,0.05,Danceable,danceability
1,-0.03,0.66,-0.64,-0.13,-0.46,Thrilling,energy
2,-0.01,0.09,0.2,0.28,-0.47,Instrumental,instrumentalness
3,0.02,0.43,-0.69,-0.07,-0.12,Lively,liveliness
4,-0.21,0.79,-0.58,-0.44,-0.19,Loud,loudness


In [36]:

# List the audio-feature names that have MUSIC loadings defined.
feature_MUSIC_dict.keys()

dict_keys(['danceability', 'energy', 'instrumentalness', 'liveliness', 'loudness', 'valence'])

In [56]:
# Spot-check a single lookup: energy of the song at position 50 among the
# first 100 ids (first matching row only).
song_ids = msd['song_id'].head(100).tolist()
msd.loc[msd['song_id'] == song_ids[50], 'energy'].tolist()[0]

0.776

In [84]:
# Not vectorized

# Loadings of each Spotify audio feature on the five MUSIC factors.
feature_MUSIC_dict = dict(
    danceability=np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    energy=np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    instrumentalness=np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    liveness=np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    loudness=np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    valence=np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
)
def get_MUSIC(song_ids):
    """Compute the mean MUSIC vector with plain Python loops.

    Walks over ``song_ids`` one id at a time, takes the first row of ``msd``
    with that id and accumulates ``feature value * loadings`` for every
    feature in ``feature_MUSIC_dict``. The accumulated vector is divided by
    ``len(song_ids)``.

    Raises ``IndexError`` when an id has no row in ``msd``.
    """
    running_total = np.zeros(5)  # one slot per MUSIC factor
    for current_id in song_ids:
        rows_for_song = msd[msd['song_id'] == current_id]
        for feature_name in feature_MUSIC_dict:
            first_value = rows_for_song[feature_name].tolist()[0]
            running_total += feature_MUSIC_dict[feature_name] * first_value

    return running_total / len(song_ids)

# Time the loop-based implementation on the first 1000 songs.
song_ids = list(msd.head(1000).song_id)
# perf_counter() is the high-resolution clock intended for measuring
# durations; time.time() is a wall clock and can jump if the system time changes.
start = time.perf_counter()
get_MUSIC(song_ids)
print(time.perf_counter() - start)

36.068581104278564


In [42]:
# Peek at the first two rows of the song table.
msd.head(2)

Unnamed: 0,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,track_id,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b'29762c82-bb92-4acd-b1fb-09cc4da250d2',b'Joe Satriani',8426,b'Super Colossal',308005,0.693272,b'SOSIYAD12A8C14097F',"b""It's So Good""",3473087,b'TRCCCMQ128F42AE752',...,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,b'',b'LANDA DANIEL',-1,b'Konec',162603,,b'SOHPHTP12A8C13BF53',b'A KDO TEDA VODDELAL TOHO FANDU?',1723114,b'TRCCCFZ128F4283A22',...,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309


In [100]:
# Semi vectorized


# Loadings of each Spotify audio feature on the five MUSIC factors.
feature_MUSIC_dict = dict(
    danceability=np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    energy=np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    instrumentalness=np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    liveness=np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    loudness=np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    valence=np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
)

# One row of loadings per feature, in the dict's insertion order, so a
# feature vector can be projected with a single dot product.
feature_MUSIC_matrix = list(feature_MUSIC_dict.values())

def get_MUSIC(song_ids):
    """Compute the mean MUSIC vector, one dot product per song.

    Still loops over ``song_ids``, but each song's feature vector (first
    matching row of ``msd``, columns in ``feature_MUSIC_dict`` order) is
    projected onto the MUSIC factors with a single ``np.dot`` against
    ``feature_MUSIC_matrix``. The summed projections are divided by
    ``len(song_ids)``.

    Raises ``IndexError`` when an id has no row in ``msd``.
    """
    feature_columns = list(feature_MUSIC_dict.keys())
    running_total = np.zeros(5)  # one slot per MUSIC factor
    for current_id in song_ids:
        rows_for_song = msd.loc[msd['song_id'] == current_id, feature_columns]
        first_vector = rows_for_song.values[0]
        running_total += np.dot(first_vector, feature_MUSIC_matrix)

    return running_total / len(song_ids)

# Time the semi-vectorized implementation on the first 10 songs.
song_ids = list(msd.head(10).song_id)
# perf_counter() is the high-resolution clock intended for measuring
# durations; time.time() is a wall clock and can jump if the system time changes.
start = time.perf_counter()
music_vector = get_MUSIC(song_ids)
elapsed = time.perf_counter() - start  # stop the clock before any printing
print(music_vector.shape)
print(elapsed)

(5,)
0.5955979824066162


In [66]:
a = np.array([[1], [2]])
# NOTE(review): numpy arrays have no list-style .append method, so the next
# line raises AttributeError. np.append(arr, values) is the function form and
# returns a new array instead of modifying `a` -- confirm the intended shape
# before switching to it.
a.append([1,2])
a

AttributeError: 'numpy.ndarray' object has no attribute 'append'