In [2]:
import pandas as pd
import numpy as np
import time
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count
from tqdm import tqdm
from numba import njit
from multiprocessing import Pool

In [3]:
# Audio-feature columns that form each song's embedding vector, in the order
# they are sliced out of the feature table. 'year' is not part of the embedding.
embedding_cols = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness',
    'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
]

# Song-level audio features, cut down to the id plus the embedding columns.
# astype(str) stores each id as its printed form; the previews show values
# such as "b'SOSIYAD12A8C14097F'".
msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')
msd = msd[['song_id'] + embedding_cols].assign(
    song_id=lambda frame: frame['song_id'].astype(str)
)

# (song, user) listening pairs. The ids are encoded to bytes and then
# stringified so they take the same "b'...'" form as msd['song_id'] --
# presumably so the later join on song_id matches; confirm against the raw files.
triplets = pd.read_hdf('data/triplets.h5')[['song_id', 'user_id']].assign(
    song_id=lambda frame: frame['song_id'].str.encode('utf-8').astype(str)
)

# One 5-component weight vector per audio feature. The name suggests these are
# loadings onto the five "MUSIC" preference factors -- TODO confirm the source.
feature_MUSIC_dict = {
    feature: np.array(weights)
    for feature, weights in (
        ('danceability', [-0.37, 0.05, -0.35, 0.08, 0.43]),
        ('energy', [-0.64, -0.46, -0.13, 0.66, -0.03]),
        ('instrumentalness', [0.20, -0.47, 0.28, 0.09, -0.01]),
        ('liveness', [-0.69, -0.12, -0.07, 0.43, 0.02]),
        ('loudness', [-0.58, -0.19, -0.44, 0.79, -0.21]),
        ('valence', [-0.04, 0.18, 0.24, -0.34, 0.18]),
    )
}
# Row i holds the weights of the i-th key of feature_MUSIC_dict, so feature
# columns must be selected in dict-key order before np.dot is applied.
feature_MUSIC_matrix = list(feature_MUSIC_dict.values())

In [4]:
# Preview the song-feature table (song_id plus the embedding columns).
msd.head()

Unnamed: 0,song_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b'SOSIYAD12A8C14097F',0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,b'SOHPHTP12A8C13BF53',0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,b'SOFVVGL12A8C13C32F',0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,b'SOHXIRQ12AAA15CF81',0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,b'SOJHDEN12AB018B650',0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [5]:
# Preview the (song_id, user_id) listening pairs.
triplets.head()

Unnamed: 0,song_id,user_id
0,b'SOAKIMP12A8C130995',b80344d063b5ccb3212f76538f3d9e43d87dca9e
1,b'SOAPDEY12A81C210A9',b80344d063b5ccb3212f76538f3d9e43d87dca9e
2,b'SOBBMDR12A8C13253B',b80344d063b5ccb3212f76538f3d9e43d87dca9e
3,b'SOBFNSP12AF72A0E22',b80344d063b5ccb3212f76538f3d9e43d87dca9e
4,b'SOBFOVM12A58A7D494',b80344d063b5ccb3212f76538f3d9e43d87dca9e


In [6]:
# Row/column count of the listening pairs before joining with the features.
triplets.shape

(48373586, 2)

In [7]:
# Attach each song's audio features to every (song, user) pair. merge() defaults
# to an inner join, so pairs whose song_id has no row in msd are dropped.
niplets = triplets.merge(msd, on='song_id')
niplets.head()

Unnamed: 0,song_id,user_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b'SOAKIMP12A8C130995',b80344d063b5ccb3212f76538f3d9e43d87dca9e,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
1,b'SOAKIMP12A8C130995',7c86176941718984fed11b7c0674ff04c029b480,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
2,b'SOAKIMP12A8C130995',76235885b32c4e8c82760c340dc54f9b608d7d7e,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
3,b'SOAKIMP12A8C130995',250c0fa2a77bc6695046e7c47882ecd85c42d748,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
4,b'SOAKIMP12A8C130995',3f73f44560e822344b0fb7c6b463869743eb9860,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389


In [8]:
# Size after the join; compare with triplets.shape to see how many pairs survived.
niplets.shape

(19593702, 15)

In [23]:
# Using multiprocessing.Pool.map

def get_cosine_list_dissimilarity(song_vectors):
    """Mean pairwise cosine distance between the rows of ``song_vectors``.

    Args:
        song_vectors: 2-D array-like with one row per song (the caller in this
            cell passes ``sub_df[embedding_cols].values``). Rows are used as
            given; no scaling is applied here.

    Returns:
        ``None`` when fewer than two rows are supplied, because no pair exists
        to compare; otherwise the mean of scipy's condensed cosine-distance
        vector over all row pairs.
    """
    # `< 2` also covers an empty input, which would otherwise reach np.mean of
    # an empty pdist result (NaN plus a RuntimeWarning).
    if len(song_vectors) < 2:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(song_vectors):
    """Average projection of a set of songs onto the MUSIC weight vectors.

    Args:
        song_vectors: 2-D array with one row per song whose columns follow the
            key order of ``feature_MUSIC_dict`` (that is how the caller in this
            cell slices them).

    Returns:
        A plain list with one value per column of ``feature_MUSIC_matrix``: the
        mean over songs of ``song_vectors . feature_MUSIC_matrix``.
    """
    projected = np.dot(song_vectors, feature_MUSIC_matrix)
    return list(projected.mean(axis=0))

# Start the wall-clock timer for the whole per-user computation in this cell.
start = time.time()
# One group per user over the merged (song, user, features) table.
gb = niplets.groupby('user_id')

def get_row(gb_item):
    """Build the result record for one ``(user_id, sub_df)`` groupby item.

    Args:
        gb_item: A 2-tuple as yielded by iterating a pandas groupby: the group
            key (user id) and that user's rows.

    Returns:
        A dict with keys ``user_id``, ``cosine_dissim``, ``MUSIC`` and
        ``num_songs``. For a user with exactly one row every value is ``None``,
        so the record can be removed later with ``dropna``.
    """
    user_id, sub_df = gb_item
    num_songs = len(sub_df)
    if num_songs != 1:
        music_features = list(feature_MUSIC_dict.keys())
        return {
            'user_id': user_id,
            "cosine_dissim": get_cosine_list_dissimilarity(sub_df[embedding_cols].values),
            "MUSIC": get_MUSIC(sub_df[music_features].values),
            "num_songs": num_songs,
        }
    # Single-song user: emit an all-None placeholder row.
    return dict.fromkeys(('user_id', "cosine_dissim", "MUSIC", "num_songs"))
    
# Fan the per-user groups out across worker processes. The `with` block shuts
# the pool down once map() has returned, so worker processes are not left
# running in the kernel. list(gb) materialises every (user_id, sub_df) pair up
# front; chunksize=625 is the batch size handed to each worker at a time.
with Pool() as pool:
    data = pool.map(func=get_row, iterable=list(gb), chunksize=625)
df = pd.DataFrame(data)
df = df.dropna() # Removes all users who only listened to 1 song
# div_pref: z-score of cosine_dissim, halved and shifted by 0.5 (mean 0.5,
# std 0.5). It is not clipped, so values can fall outside [0, 1].
# NOTE(review): the cosine_dissim values displayed for this frame are ~1e-8,
# which suggests the unscaled duration_ms column dominates the raw feature
# vectors -- consider standardising the embedding columns before pdist.
df['div_pref'] = StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1,1))/2 + 0.5
print(time.time()-start)


851.9124805927277


In [24]:
# Preview the per-user results (MUSIC vector, cosine_dissim, num_songs, div_pref).
df.head()

Unnamed: 0,MUSIC,cosine_dissim,num_songs,user_id,div_pref
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",1.89606e-08,3.0,00000b722001882066dff9d2da8a775658053ea0,0.39853
1,"[5.406152586, 1.7313612479, 4.5080888204, -7.8...",1.820415e-08,6.0,00001638d6189236866af9bbf309ae6c2347ffdc,0.397648
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",3.68889e-08,6.0,0000175652312d12576d9e6b84f600caa24c4715,0.419433
3,"[4.252966333333332, 1.4018970666666666, 3.5408...",2.811389e-10,3.0,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,0.376751
4,"[5.141352444444443, 1.3462904777777778, 4.3820...",1.043408e-07,9.0,0000267bde1b3a70ea75cf2b2d216cb828e3202b,0.498075


In [25]:
# Persist the per-user results; mode='w' replaces any existing file.
# The MUSIC (Python lists) and user_id columns are object-typed, so PyTables
# pickles them -- that is what the performance warning emitted by this cell is about.
df.to_hdf('data/user_div_MUSIC_divpref.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['MUSIC', 'user_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [22]:
# Cache the merged (song, user, features) table; mode='w' replaces any existing file.
# NOTE(review): this cell's execution count (22) is lower than that of the cells
# placed before it (23-25), i.e. it was run out of order -- re-check under
# Restart & Run All.
niplets.to_hdf('data/niplets.h5', key='df', mode='w')

In [26]:
# Read the saved per-user results back from disk and preview them; the rows
# shown match the earlier df.head() output, confirming the round trip.
haidao = pd.read_hdf('data/user_div_MUSIC_divpref.h5', key='df')
haidao.head()

Unnamed: 0,MUSIC,cosine_dissim,num_songs,user_id,div_pref
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",1.89606e-08,3.0,00000b722001882066dff9d2da8a775658053ea0,0.39853
1,"[5.406152586, 1.7313612479, 4.5080888204, -7.8...",1.820415e-08,6.0,00001638d6189236866af9bbf309ae6c2347ffdc,0.397648
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",3.68889e-08,6.0,0000175652312d12576d9e6b84f600caa24c4715,0.419433
3,"[4.252966333333332, 1.4018970666666666, 3.5408...",2.811389e-10,3.0,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,0.376751
4,"[5.141352444444443, 1.3462904777777778, 4.3820...",1.043408e-07,9.0,0000267bde1b3a70ea75cf2b2d216cb828e3202b,0.498075


In [27]:
# Number of users (rows) and columns in the reloaded results.
haidao.shape

(992073, 5)

In [None]:
# Using apply
def get_cosine_list_dissimilarity(df):
    """Mean pairwise cosine distance between the embedding rows of ``df``.

    Redefines the helper of the same name to take a DataFrame and slice
    ``embedding_cols`` itself.

    Args:
        df: DataFrame containing every column listed in ``embedding_cols``.

    Returns:
        ``None`` if ``df`` has exactly one row, otherwise the mean of the
        condensed cosine-distance vector over all row pairs.
    """
    vectors = df[embedding_cols].values
    return None if len(vectors) == 1 else np.mean(pdist(vectors, 'cosine'))

def get_MUSIC(df):
    """Average projection of the songs in ``df`` onto the MUSIC weight vectors.

    Redefines the helper of the same name to take a DataFrame; the feature
    columns are selected in ``feature_MUSIC_dict`` key order.

    Args:
        df: DataFrame containing every key of ``feature_MUSIC_dict`` as a column.

    Returns:
        A plain list with one value per column of ``feature_MUSIC_matrix``.
    """
    music_features = list(feature_MUSIC_dict.keys())
    projected = np.dot(df[music_features].values, feature_MUSIC_matrix)
    return list(projected.mean(axis=0))

# Start the timer, then group only the first 5 rows of the merged table by
# user -- a tiny sample for trying out the groupby-based variants.
start = time.time()
gb = niplets.head(5).groupby('user_id')

# df = gb.agg([get_cosine_list_dissimilarity, get_MUSIC, 'count'])
# df.columns = df.columns.droplevel(0)
# df = df.dropna() # Removes all users who only listened to 1 song
# df = df.reset_index()
# df = df.rename(index=str, columns={
#     "get_cosine_list_dissimilarity": "cosine_dissim",
#     "get_MUSIC": "MUSIC",
#     "count": "num_songs",
# })
# df['div_pref'] = StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1,1))/2 + 0.5
# print(time.time()-start)

# list(gb)[4][1]

# def get_row(gb_item):
#     user_id, sub_df = gb_item
#     return {
#         'user_id': user_id,
#         "cosine_dissim": get_cosine_list_dissimilarity(sub_),
#         "MUSIC": get_MUSIC(sub_),
#         "num_songs": sub_df.shape[0],
#     }
    
# data = Pool().map(func=get_row, iterable=list(gb), chunksize=625)
# df = pd.DataFrame(data)
# df.head()
# Elapsed seconds since `start`. With the experiments in this cell commented
# out, this only times the head(5).groupby setup.
print(time.time()-start)

# data = []
# for user_id, sub_df in gb:
#     data.append({
#         'user_id': user_id,
#         "cosine_dissim": get_cosine_list_dissimilarity(sub_df[embedding_cols].values),
#         "MUSIC": get_MUSIC(sub_df[list(feature_MUSIC_dict.keys())].values),
#         "num_songs": sub_df.shape[0],
#     })
# #     print(user_id)
# #     print(sub_df)
    


In [86]:
# attempt at using dask

def get_cosine_list_dissimilarity(song_ids):
    """Mean pairwise cosine distance between the embeddings of the given songs.

    Redefines the helper of the same name to take song ids; the embedding rows
    are looked up in the global ``msd`` table by ``song_id`` membership, so ids
    that are absent from ``msd`` contribute no vector.

    Args:
        song_ids: Collection of song ids accepted by ``Series.isin``.

    Returns:
        ``None`` when fewer than two rows of ``msd`` match (no pair to
        compare), otherwise the mean of the condensed cosine-distance vector.
    """
    # Single .loc with a row mask and column list; note that isin() scans all
    # of msd on every call.
    song_vectors = msd.loc[msd['song_id'].isin(song_ids), embedding_cols].values
    # `< 2` also covers the zero-match case, which would otherwise reach
    # np.mean of an empty pdist result (NaN plus a RuntimeWarning).
    if len(song_vectors) < 2:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(song_ids):
    """Average MUSIC projection of the given songs.

    Redefines the helper of the same name to take song ids; rows are looked up
    in the global ``msd`` table by ``song_id`` membership and the feature
    columns are taken in ``feature_MUSIC_dict`` key order.

    Args:
        song_ids: Collection of song ids accepted by ``Series.isin``.

    Returns:
        A plain list with one value per column of ``feature_MUSIC_matrix``.
    """
    matched = msd.loc[msd['song_id'].isin(song_ids)]
    song_vectors = matched[list(feature_MUSIC_dict.keys())].values
    projected = np.dot(song_vectors, feature_MUSIC_matrix)
    return list(projected.mean(axis=0))

def num_songs(song_ids):
    """Return how many entries ``song_ids`` holds (its ``len``)."""
    count = len(song_ids)
    return count

# Dask experiment on the first 1000 listening pairs, one partition per CPU core.
# NOTE(review): as written this cell fails with
# "ValueError: unknown aggregate get_cosine_list_dissimilarity" (see its output).
# Dask's groupby aggregate appears to accept only named aggregations or
# dd.Aggregation objects rather than arbitrary Python callables -- confirm
# against the dask docs; a groupby(...).apply(func, meta=...) formulation is the
# likely replacement. Everything after the aggregate call has never run here.
start = time.time()
trips = dd.from_pandas(triplets.head(1000), npartitions=cpu_count())
gb = trips.groupby('user_id')
# NOTE(review): the dict keys are the desired output names, not existing column
# names; a dict aggregation spec normally maps input columns to functions -- verify.
df = gb.aggregate({
    "cosine_dissim": get_cosine_list_dissimilarity,
    "MUSIC": get_MUSIC,
    "num_songs": "count",
})
df.columns = df.columns.droplevel(0)
df = df.dropna() # Removes all users who only listened to 1 song
df = df.reset_index()
# df = df.rename(index=str, columns={
#     "get_cosine_list_dissimilarity": "cosine_dissim",
#     "get_MUSIC": "MUSIC",
#     "count": "num_songs",
# })
# div_pref: z-score of cosine_dissim, halved and shifted by 0.5.
df['div_pref'] = StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1,1))/2 + 0.5
print(time.time()-start)

ValueError: unknown aggregate get_cosine_list_dissimilarity

In [90]:
# using pandas groupby agg

def get_cosine_list_dissimilarity(song_ids):
    """Mean pairwise cosine distance between the embeddings of the given songs.

    Redefines the helper of the same name to take song ids; the embedding rows
    are looked up in the global ``msd`` table by ``song_id`` membership, so ids
    that are absent from ``msd`` contribute no vector.

    Args:
        song_ids: Collection of song ids accepted by ``Series.isin`` (here the
            per-user ``song_id`` values handed over by ``groupby.agg``).

    Returns:
        ``None`` when fewer than two rows of ``msd`` match (no pair to
        compare), otherwise the mean of the condensed cosine-distance vector.
    """
    # Single .loc with a row mask and column list; note that isin() scans all
    # of msd on every call, which makes this variant slow per user.
    song_vectors = msd.loc[msd['song_id'].isin(song_ids), embedding_cols].values
    # `< 2` also covers users none of whose songs are in msd; that case would
    # otherwise reach np.mean of an empty pdist result (NaN plus a RuntimeWarning).
    if len(song_vectors) < 2:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(song_ids):
    """Average MUSIC projection of the given songs.

    Redefines the helper of the same name to take song ids; rows are looked up
    in the global ``msd`` table by ``song_id`` membership and the feature
    columns are taken in ``feature_MUSIC_dict`` key order.

    Args:
        song_ids: Collection of song ids accepted by ``Series.isin``.

    Returns:
        A plain list with one value per column of ``feature_MUSIC_matrix``.
    """
    matched = msd.loc[msd['song_id'].isin(song_ids)]
    song_vectors = matched[list(feature_MUSIC_dict.keys())].values
    projected = np.dot(song_vectors, feature_MUSIC_matrix)
    return list(projected.mean(axis=0))

def num_songs(song_ids):
    """Return how many entries ``song_ids`` holds (its ``len``)."""
    count = len(song_ids)
    return count

# Time the pure-pandas variant on the first 50000 listening pairs.
start = time.time()
gb = triplets.head(50000).groupby('user_id')

# One output column per aggregator, computed on each user's song_id values.
df = gb.agg([get_cosine_list_dissimilarity, get_MUSIC, 'count'])
# Drop the outer 'song_id' level so columns are named after the aggregators.
df.columns = df.columns.droplevel(0)

# dropna removes all users who only listened to 1 song (their dissimilarity is
# None), along with any other row containing a missing value.
df = (
    df.dropna()
    .reset_index()
    .rename(
        index=str,
        columns={
            "get_cosine_list_dissimilarity": "cosine_dissim",
            "get_MUSIC": "MUSIC",
            "count": "num_songs",
        },
    )
)

# div_pref: z-score of cosine_dissim, halved and shifted by 0.5; not clipped.
df['div_pref'] = (
    StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1, 1)) / 2
    + 0.5
)
print(time.time() - start)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  ret, rcount, out=ret, casting='unsafe', subok=False)


62.2878041267395


In [27]:
# Display the per-user result frame in full.
df

Unnamed: 0,user_id,cosine_dissim,MUSIC,num_songs,div_pref
0,17aa9f6dbdf753831da8f38c71b66b64373de613,7.611582e-08,"[4.199986510848485, 1.3041145548090916, 3.6276...",137,0.713546
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,3.248289e-08,"[1.9401426666666663, 0.6614746666666667, 1.947...",17,0.13488
2,5a905f000fc1ff3df7ca807d57edb608863db05d,7.470124e-08,"[4.332525952176469, 1.224742882483333, 3.74100...",442,0.694786
3,85c1f87fea955d09b4bec2e36aee110927aedf9a,6.337935e-08,"[3.4347712182857144, 1.1867674870285714, 3.139...",21,0.544633
4,8937134734f869debcab8f23d77465b4caaa85df,2.291377e-08,"[5.7117070000000005, 1.720231, 4.6471350000000...",13,0.007973
5,969cc6fb74e076a68e36a04409cb9d3765757508,2.027912e-08,"[3.4073237428571423, 1.1621857042857144, 3.108...",33,-0.026968
6,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,1.053757e-08,"[4.161702376333333, 1.2648024989500002, 3.5553...",11,-0.156162
7,b64cdd1a0bd907e5e00b39e345194768e330d652,1.114394e-07,"[5.250598792949999, 1.7013668240675, 4.4101130...",109,1.182013
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,9.314409e-08,"[4.228528145627906, 1.3972740461465116, 3.6467...",104,0.939377
9,bd4c6e843f00bd476847fb75c47b4fb430a06856,1.362466e-07,"[4.991013333333332, 1.3573476916666667, 4.1345...",11,1.511009


In [None]:
# TEST CODE

# triplets.head(120)['user_id'].nunique()
# Group only the first 5 listening pairs by user: a tiny input for probing
# what groupby.agg passes to, and accepts back from, custom functions.
gb = triplets.head(5).groupby('user_id')
def yolo(vals):
    """Probe aggregator: print the values received, then return their count."""
    print(vals)
    size = len(vals)
    return size
def polo(vals):
    """Probe aggregator returning a list: ``[len(vals), 1]``."""
    count = len(vals)
    return [count, 1]

# Run both probe aggregators over each user's values.
df = gb.agg([yolo, polo])
# Drop the outer column level so the columns are named after the aggregators.
df.columns = df.columns.droplevel(0)
# Turn the user_id group key back into a regular column.
df = df.reset_index()
# df.columns
df.head()