In [1]:
import pandas as pd
import numpy as np
import time
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from multiprocessing import Pool

In [2]:
# Audio-feature columns that are read from the MSD table (next to song_id)
# and that make up each song's vector in get_cosine_list_dissimilarity.
embedding_cols = [
#     'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]

# Song-level table: keep only the song ID plus the embedding columns.
msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')[['song_id'] + embedding_cols]
msd['song_id'] = msd['song_id'].astype(str)

# triplets = pd.read_hdf('data/triplets.h5')

# Both listening-history files are tab-separated; column names are supplied
# here because they are passed explicitly via `names`.
triplet_columns = ['user_id', 'song_id', 'play_count']
train_partial_data = pd.read_csv('data/train_triplets.txt', sep='\t', names=triplet_columns)
train_full_data = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_visible.txt', sep='\t', names=triplet_columns)
# test_partial_data = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_hidden.txt', sep='\t', names=['user_id', 'song_id', 'play_count'])

# Tag each source so the rows can still be told apart after stacking.
train_partial_data['is_test'] = False
train_full_data['is_test'] = True

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True stacks the two frames the same way.
triplets = pd.concat([train_partial_data, train_full_data], ignore_index=True)
# Encode to bytes and cast back to str so every ID becomes the text "b'SO...'",
# which is the form the song_id values take in the msd preview (needed for the merge key).
triplets['song_id'] = triplets['song_id'].str.encode('utf-8')
triplets['song_id'] = triplets['song_id'].astype(str)

In [3]:
# Preview the first rows of the song feature table.
msd.head()

Unnamed: 0,song_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b'SOSIYAD12A8C14097F',0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,b'SOHPHTP12A8C13BF53',0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,b'SOFVVGL12A8C13C32F',0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,b'SOHXIRQ12AAA15CF81',0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,b'SOJHDEN12AB018B650',0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [4]:
# Preview the first rows of the combined (user, song, play count) table.
triplets.head()

Unnamed: 0,user_id,song_id,play_count,is_test
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOAKIMP12A8C130995',1,False
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOAPDEY12A81C210A9',1,False
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOBBMDR12A8C13253B',2,False
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOBFNSP12AF72A0E22',1,False
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOBFOVM12A58A7D494',1,False


In [5]:
# (rows, columns) of the combined triplets before joining with msd.
triplets.shape

(49693480, 4)

In [6]:
# Inner join on song_id: attach the audio features to every triplet and keep
# only the triplets whose song_id also appears in msd.
niplets = pd.merge(triplets, msd, on='song_id', how='inner')
niplets.head()

Unnamed: 0,user_id,song_id,play_count,is_test,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,b'SOAKIMP12A8C130995',1,False,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
1,7c86176941718984fed11b7c0674ff04c029b480,b'SOAKIMP12A8C130995',1,False,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,b'SOAKIMP12A8C130995',3,False,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,b'SOAKIMP12A8C130995',1,False,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
4,3f73f44560e822344b0fb7c6b463869743eb9860,b'SOAKIMP12A8C130995',6,False,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389


In [7]:
# (rows, columns) of the merged triplets-with-features table.
niplets.shape

(20107787, 17)

In [11]:
# Using multiprocessing.Pool.map and play_count weightings
# LET n = number of songs
# LET m = number of audio features

# Per-feature coefficients: each audio feature maps to a 5-component MUSIC vector.
feature_MUSIC_dict = {
    'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
}
# feature_MUSIC_matrix -> m x 5 ndarray, where m is the number of audio features in feature_MUSIC_dict.
# Built once as a real array (not a list of arrays) so that every matrix product
# against it does not have to re-convert the list. Rows follow the dict's key
# order, which is the same order used when selecting columns via
# list(feature_MUSIC_dict.keys()).
feature_MUSIC_matrix = np.array(list(feature_MUSIC_dict.values()))

# TODO: use play counts and scale song_vectors before calculating pdist
def get_cosine_list_dissimilarity(sub_df):
    """Return the mean pairwise cosine distance between the songs in ``sub_df``.

    Parameters
    ----------
    sub_df : DataFrame containing the ``embedding_cols`` columns, one row per song.

    Returns
    -------
    The mean of all pairwise cosine distances, or ``None`` when there are
    fewer than two rows (no pair exists to compare).
    """
    # song_vectors -> n x m matrix, where m is the number of audio features in the embedding_cols
    song_vectors = sub_df[embedding_cols].values
    # With 0 or 1 rows pdist has no pairs; averaging its empty result would
    # give NaN and a RuntimeWarning, so report "undefined" explicitly instead.
    if len(song_vectors) < 2:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(sub_df):
    """Return one user's MUSIC vector, averaged over songs and weighted by play count.

    ``sub_df`` must contain the audio-feature columns named in
    ``feature_MUSIC_dict`` and a ``play_count`` column. The result is a list
    with one entry per MUSIC component.
    """
    # n x m: one row per song, one column per feature in feature_MUSIC_dict
    feature_names = list(feature_MUSIC_dict)
    per_song_features = sub_df[feature_names].values

    # n x 5: project every song onto the MUSIC components
    per_song_MUSIC = np.matmul(per_song_features, feature_MUSIC_matrix)

    # collapse the song axis, weighting each song by how often it was played
    play_counts = sub_df['play_count'].values
    weighted_mean = np.average(per_song_MUSIC, axis=0, weights=play_counts)
    return list(weighted_mean)

def get_is_test(sub_df):
    """Return the ``is_test`` flag taken from the first row of ``sub_df``."""
    flags = sub_df['is_test'].values
    return flags[0]

def get_song_ids(sub_df):
    """Return the values of ``sub_df['song_id']`` as a list of plain song IDs.

    Each song_id is expected to look like this: "b'SOSIYAD12A8C14097F'", i.e.
    the text of a bytes literal. The ``b'`` prefix and the closing quote are
    removed. An ID that does not carry that wrapper is returned unchanged, so
    already-clean IDs are not truncated and the function is safe to apply twice.
    """
    def _unwrap(song_id_str):
        # Strip only when the wrapper is really present; blind [2:-1] slicing
        # would chop characters off an ID that is already clean.
        is_wrapped = (
            len(song_id_str) >= 3
            and song_id_str[:2] == "b'"
            and song_id_str[-1] == "'"
        )
        return song_id_str[2:-1] if is_wrapped else song_id_str

    return [_unwrap(song_id_str) for song_id_str in sub_df['song_id'].to_list()]

# Wall-clock start; only used by the (currently commented-out) timing print below.
start = time.time()
# One group per user: each item yielded by `gb` is a (user_id, sub_df) pair.
gb = niplets.groupby('user_id')

def get_row(gb_item):
    """Build the summary record for one ``(user_id, sub_df)`` groupby item.

    The returned dict holds the user's id, weighted MUSIC vector, number of
    rows, ``is_test`` flag and cleaned song IDs.
    """
    user_id, sub_df = gb_item
    row = {'user_id': user_id}
    row['MUSIC'] = get_MUSIC(sub_df)
    row['num_songs'] = sub_df.shape[0]
    row['is_test'] = get_is_test(sub_df)
    row['song_ids'] = get_song_ids(sub_df)
    return row
    
# Compute one record per user in parallel. The pool is used as a context
# manager so its worker processes are shut down once map() has returned,
# instead of lingering for the lifetime of the kernel.
with Pool() as pool:
    data = pool.map(func=get_row, iterable=list(gb), chunksize=625)
df = pd.DataFrame(data)
# print(time.time()-start)
df.head()



Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [9]:
# Show the (rows, columns) shape of `df`.
df.shape

(1107613, 4)

In [10]:
# Write `df` to an HDF5 file under key 'df'; mode='w' creates the file anew.
# NOTE(review): the PyTables warning shown for this cell says object-typed
# columns get pickled — confirm that is acceptable for this file.
df.to_hdf('data/user_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['MUSIC', 'user_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Garbage/Test Code

In [90]:
# using pandas groupby agg

def get_cosine_list_dissimilarity(song_ids):
    """Return the mean pairwise cosine distance between the given songs.

    The songs are looked up in ``msd`` by ``song_id`` and compared on the
    ``embedding_cols`` features. Returns ``None`` when fewer than two rows
    match, since no pair exists to compare.
    """
    song_vectors = msd.loc[msd['song_id'].isin(song_ids)][embedding_cols].values
    # The isin() filter can match zero rows as well as one; averaging an empty
    # pdist result would produce NaN and a "mean of empty slice" warning.
    if len(song_vectors) < 2:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(song_ids):
    """Return the unweighted mean MUSIC vector of the given songs as a list.

    Songs are looked up in ``msd`` by ``song_id``; only the feature columns
    named in ``feature_MUSIC_dict`` take part in the projection.
    """
    matching_rows = msd.loc[msd['song_id'].isin(song_ids)]
    song_vectors = matching_rows[list(feature_MUSIC_dict)].values
    MUSIC_per_song = np.dot(song_vectors, feature_MUSIC_matrix)
    return list(MUSIC_per_song.mean(axis=0))

def num_songs(song_ids):
    """Return how many entries ``song_ids`` contains."""
    count = len(song_ids)
    return count

# Time the groupby/agg approach on the first 50,000 triplet rows.
start = time.time()
gb = triplets.head(50000).groupby('user_id')
df = gb.agg([get_cosine_list_dissimilarity, get_MUSIC, 'count'])
df.columns = df.columns.droplevel(0)
df = (
    df.dropna()  # Removes all users who only listened to 1 song
      .reset_index()
      .rename(
          index=str,
          columns={
              "get_cosine_list_dissimilarity": "cosine_dissim",
              "get_MUSIC": "MUSIC",
              "count": "num_songs",
          },
      )
)
# Standardize the dissimilarity, then halve it and shift it by 0.5.
df['div_pref'] = StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1,1))/2 + 0.5
print(time.time()-start)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  ret, rcount, out=ret, casting='unsafe', subok=False)


62.2878041267395


In [21]:
# One-row scratch frame: column 'a' comes from the record, 'b' is a constant added afterwards.
yolo = pd.DataFrame([{'a': True}]).assign(b=False)
yolo

Unnamed: 0,a,b
0,True,False


In [55]:
# yolo = msd.head()
# yolo
# print(yolo['song_id'].to_list())
# print(yolo['song_id'].str[2:-1].to_list())
# yolo
# yolo['song_id']
# print([song_id_str[2:-1] for song_id_str in yolo['song_id'].to_list()])

Unnamed: 0,song_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,YAD12A8C1409,0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,HTP12A8C13BF,0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,VGL12A8C13C3,0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,IRQ12AAA15CF,0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,DEN12AB018B6,0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [44]:
# Re-read the song table (song_id plus the embedding columns) from the HDF5 file.
lol = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')[['song_id'] + embedding_cols]

# Drop the first two and the last character of every song_id.
# NOTE(review): this displays the entire list; consider slicing to a short preview.
[song_id_str[2:-1] for song_id_str in lol['song_id'].to_list()]

['SOSIYAD12A8C14097F',
 'SOHPHTP12A8C13BF53',
 'SOFVVGL12A8C13C32F',
 'SOHXIRQ12AAA15CF81',
 'SOJHDEN12AB018B650',
 'SONVBWO12AB0187B35',
 'SOGLAKB12AB017DF39',
 'SOTAQLI12A8C13CFD5',
 'SOANPML12AB017D645',
 'SOBYKSS12A58A7C17C',
 'SOSJYVH12A8C1424B5',
 'SOCMLQF12A8C145701',
 'SOQCFUK12A8C142F17',
 'SOJTODJ12AB018ADC2',
 'SOKOCYW12AB0184A65',
 'SOSHJEP12A6D4F5886',
 'SOVOLSW12A6D4F8BE9',
 'SODSWWL12AC9618263',
 'SOHRSRU12A8C136D7A',
 'SOFJOMT12A8C137393',
 'SOFENSY12A6D4FB766',
 'SOAZIKY12A6D4FB469',
 'SOAWYTV12AB018A6E8',
 'SOBGWYD12A6D4FD600',
 'SOJEIKW12A6D4F762B',
 'SOBMJHH12AB017DF32',
 'SOPLLWC12A8C138B25',
 'SOKIABX12A8C131124',
 'SONKTHE12AB018246E',
 'SOCNGSC12AB0187A7C',
 'SOKLZDQ12A8C143D9D',
 'SOPOTYN12AB0181CD4',
 'SOJLZCA12A8C133112',
 'SOGGERY12AB01837E9',
 'SORTHCE12AB0182A78',
 'SOWPNTP12AAF3B1D86',
 'SORRXBN12A58A7C684',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOIDWYM12AB0187B62',
 'SOAJJBD12A8C133D78',
 'SOLCUIC12

In [None]:
# TEST CODE

# triplets.head(120)['user_id'].nunique()
# Group only the first five triplet rows by user, to keep the experiment tiny.
gb = triplets.head(5).groupby('user_id')
def yolo(vals):
    """Debug aggregator: print ``vals`` and return how many items it holds."""
    print(vals)
    n_vals = len(vals)
    return n_vals
def polo(vals):
    """Aggregator that returns a two-item list: the length of ``vals`` and the constant 1."""
#     print(vals)
    size = len(vals)
    return [size, 1]

# Apply both helper functions as aggregations to the grouped rows.
df = gb.agg([yolo, polo])
# Drop level 0 of the resulting column index.
df.columns = df.columns.droplevel(0)
# Move the group key out of the index and back into a regular column.
df = df.reset_index()
# df.columns
df.head()