In [1]:
import pandas as pd
import numpy as np
import time
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from multiprocessing import Pool
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz
import os

In [2]:
embedding_cols = [
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]

msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')[['song_id'] + embedding_cols]
# this is because song_id is in the format:  "b'SOSIYAD12A8C14097F'"
msd['song_id'] = msd['song_id'].str.slice(start=2, stop=-1)
msd.head()

Unnamed: 0,song_id,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,SOSIYAD12A8C14097F,2006,0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,SOHPHTP12A8C13BF53,0,0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,SOFVVGL12A8C13C32F,1999,0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,SOHXIRQ12AAA15CF81,2008,0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,SOJHDEN12AB018B650,2006,0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [3]:
col_names = ['user_id', 'song_id', 'play_count']
train_triplets = pd.read_csv('data/train_triplets.txt', sep='\t', names=col_names)
test_visible_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_visible.txt', sep='\t', names=col_names)
test_hidden_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_hidden.txt', sep='\t', names=col_names)

train_triplets['is_test'] = False
test_visible_triplets['is_test'] = True
test_hidden_triplets['is_test'] = True

train_triplets['is_hidden'] = False
test_visible_triplets['is_hidden'] = False
test_hidden_triplets['is_hidden'] = True

In [4]:
all_triplets = train_triplets.append(test_visible_triplets, ignore_index=True).append(test_hidden_triplets, ignore_index=True)
all_niplets = all_triplets.merge(msd, left_on='song_id', right_on='song_id')

visible_niplets = all_niplets.loc[all_niplets['is_hidden'] == False]
hidden_niplets = all_niplets.loc[all_niplets['is_hidden'] == True]

In [5]:
all_niplets.shape

(20639742, 19)

In [6]:
visible_niplets.shape

(20107787, 19)

In [7]:
hidden_niplets.shape

(531955, 19)

In [8]:
#This is to remove the users that have test data but not hidden data
visible_users = visible_niplets['user_id'].drop_duplicates()
hidden_users = hidden_niplets['user_id'].drop_duplicates()
blacklisted_users = hidden_users[~hidden_users.isin(visible_users)].values

hidden_niplets = hidden_niplets.loc[~hidden_niplets['user_id'].isin(blacklisted_users)]
all_niplets = all_niplets.loc[~all_niplets['user_id'].isin(blacklisted_users)]

In [9]:
len(blacklisted_users)

4202

In [10]:
all_niplets.shape

(20629059, 19)

In [11]:
visible_niplets.shape

(20107787, 19)

In [12]:
hidden_niplets.shape

(521272, 19)

In [13]:
all_niplets.head()

Unnamed: 0,user_id,song_id,play_count,is_test,is_hidden,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
1,7c86176941718984fed11b7c0674ff04c029b480,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,SOAKIMP12A8C130995,3,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
4,3f73f44560e822344b0fb7c6b463869743eb9860,SOAKIMP12A8C130995,6,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389


In [14]:
# Get the full dataset since I will need to put this in COO matrix. It will be difficult matching up the song ids of test and train
# if they are treated independently. Putting them together and then splitting the train and test set when creating the COO matrix is the
# easiest way of ensuring that the indices remain correct.

# Build the train and test CSR matrices.

all_niplets['user_id'] = all_niplets['user_id'].astype("category")
all_niplets['song_id'] = all_niplets['song_id'].astype("category")

num_users = all_niplets['user_id'].cat.codes.unique().shape[0]
num_songs = all_niplets['song_id'].cat.codes.unique().shape[0]

train_data = all_niplets.loc[all_niplets['is_hidden'] == False]
test_data = all_niplets.loc[all_niplets['is_hidden'] == True]

rows_train = train_data['song_id'].cat.codes.copy()
cols_train = train_data['user_id'].cat.codes.copy()
data_train = train_data['play_count'].astype(np.float32)

rows_test = test_data['song_id'].cat.codes.copy()
cols_test = test_data['user_id'].cat.codes.copy()
data_test = test_data['play_count'].astype(np.float32)

train_plays = coo_matrix((data_train, (rows_train, cols_train)), shape=(num_songs, num_users)).tocsr()
test_plays = coo_matrix((data_test, (rows_test, cols_test)), shape=(num_songs, num_users)).tocsr()

user_mapping = np.vstack((all_niplets['user_id'].cat.codes.copy().values,
                          all_niplets['user_id'].values)).T
song_mapping = np.vstack((all_niplets['song_id'].cat.codes.copy().values,
                          all_niplets['song_id'].values)).T

user_id_to_user_index = pd.DataFrame(columns=['sparse_index', 'user_id'], data=user_mapping).drop_duplicates()
song_id_to_song_index = pd.DataFrame(columns=['sparse_index', 'song_id'], data=song_mapping).drop_duplicates()

In [15]:
user_id_to_user_index.head()

Unnamed: 0,sparse_index,user_id
0,796068,b80344d063b5ccb3212f76538f3d9e43d87dca9e
1,538494,7c86176941718984fed11b7c0674ff04c029b480
2,510903,76235885b32c4e8c82760c340dc54f9b608d7d7e
3,159634,250c0fa2a77bc6695046e7c47882ecd85c42d748
4,273980,3f73f44560e822344b0fb7c6b463869743eb9860


In [16]:
user_id_to_user_index.shape

(1107613, 2)

In [17]:
song_id_to_song_index.head()

Unnamed: 0,sparse_index,song_id
0,4785,SOAKIMP12A8C130995
2457,7052,SOAPDEY12A81C210A9
3238,14453,SOBFOVM12A58A7D494
3710,20354,SOBSUJE12A6D4F8CF5
4409,21408,SOBVFZR12A6D4F8AE3


In [18]:
song_id_to_song_index.shape

(168493, 2)

In [19]:
# Save to file
user_id_to_user_index.to_hdf('data/user_mapping.h5', key='df', mode='w')
song_id_to_song_index.to_hdf('data/song_mapping.h5', key='df', mode='w')

save_npz('data/train_sparse', train_plays)
save_npz('data/test_sparse', test_plays)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'user_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'song_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [20]:
song_df = song_id_to_song_index.merge(msd.drop_duplicates(), left_on='song_id', right_on='song_id')
song_df.set_index('sparse_index', inplace=True)

In [21]:
song_df.head()

Unnamed: 0_level_0,song_id,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4785,SOAKIMP12A8C130995,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
7052,SOAPDEY12A81C210A9,1974,0.588,0.809,156200,0.831,0.0135,9,0.197,-4.027,0,0.0288,102.83,4,0.92
14453,SOBFOVM12A58A7D494,0,0.000111,0.598,141440,0.76,0.244,7,0.101,-7.372,0,0.0293,143.948,4,0.833
20354,SOBSUJE12A6D4F8CF5,2006,0.119,0.607,246410,0.504,0.000412,0,0.102,-8.176,0,0.0327,126.051,4,0.0396
21408,SOBVFZR12A6D4F8AE3,2002,0.172,0.722,171173,0.501,0.534,1,0.119,-10.62,0,0.0314,123.195,4,0.402


In [22]:
song_df.shape

(168503, 15)

In [23]:
visible_niplets_with_sparse_index = visible_niplets.merge(song_id_to_song_index, left_on='song_id', right_on='song_id')

In [24]:
visible_niplets_with_sparse_index.head()

Unnamed: 0,user_id,song_id,play_count,is_test,is_hidden,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,sparse_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,4785
1,7c86176941718984fed11b7c0674ff04c029b480,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,4785
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,SOAKIMP12A8C130995,3,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,4785
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,4785
4,3f73f44560e822344b0fb7c6b463869743eb9860,SOAKIMP12A8C130995,6,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,4785


In [25]:
assert visible_niplets_with_sparse_index.shape[0] == visible_niplets.shape[0]

In [26]:
print('Grouping df...')
gb = visible_niplets_with_sparse_index.groupby('user_id')
gb_list = list(gb)

Grouping df...


In [27]:
# Using multiprocessing.Pool.map and play_count weightings
# LET n = number of songs
# LET m = number of audio features

feature_MUSIC_dict = {
    'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
}
# feature_MUSIC_matrix -> m x 5 matrix, where m is the number of audio features in feature_MUSIC_dict
feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

# TODO: use play counts and scale song_vectors before calculating pdist
def get_cosine_list_dissimilarity(sub_df):
    # song_vectors -> n x m matrix, where m is the number of audio features in the embedding_cols
    song_vectors = sub_df[embedding_cols].values
    if len(song_vectors) == 1:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(sub_df):
    # song_vectors -> n x m matrix, where m is the number of audio features in feature_MUSIC_dict
    song_vectors = sub_df[list(feature_MUSIC_dict.keys())].values
    
    # unweighted_MUSIC_vals -> n x 5 matrix
    unweighted_MUSIC_vals = song_vectors @ feature_MUSIC_matrix
    
    # returns the average MUSIC values weighted by their play_counts
    return list(np.average(unweighted_MUSIC_vals, weights=sub_df['play_count'].values, axis=0))

def get_is_test(sub_df):
    return sub_df['is_test'].values[0]

def get_song_ids(sub_df):
    return sub_df['song_id'].to_list()

def get_song_sparse_indices(sub_df):
    return sub_df['sparse_index'].to_list()

def get_row(gb_item):
    user_id, sub_df = gb_item
    return (user_id, get_MUSIC(sub_df), sub_df.shape[0], get_is_test(sub_df), get_song_sparse_indices(sub_df))

columns = ['user_id', 'MUSIC', 'num_songs', 'is_test', 'song_sparse_indices']
print('Starting pool.map...')
user_df_data = Pool(os.cpu_count()).map(func=get_row, iterable=gb_list, chunksize=625)
print('Creating dataframe...')
user_df = pd.DataFrame.from_records(user_df_data, columns=columns)
user_df.head()

Starting pool.map...
Creating dataframe...


Unnamed: 0,user_id,MUSIC,num_songs,is_test,song_sparse_indices
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...",3,False,"[20321, 34133, 31702]"
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...",6,False,"[63130, 56895, 107394, 14282, 6830, 48918]"
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...",6,False,"[22878, 165852, 164119, 78906, 70413, 130812]"
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...",3,False,"[13587, 41248, 113455]"
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...",9,False,"[17569, 90464, 111039, 130773, 131152, 134996,..."


In [33]:
user_df.shape

(1107613, 5)

In [34]:
user_df_with_sparse_index = user_df.merge(user_id_to_user_index, left_on='user_id', right_on='user_id')
user_df_with_sparse_index.set_index('sparse_index', inplace=True)

In [35]:
user_df_with_sparse_index.head()

Unnamed: 0_level_0,user_id,MUSIC,num_songs,is_test,song_sparse_indices
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...",3,False,"[20321, 34133, 31702]"
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...",6,False,"[63130, 56895, 107394, 14282, 6830, 48918]"
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...",6,False,"[22878, 165852, 164119, 78906, 70413, 130812]"
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...",3,False,"[13587, 41248, 113455]"
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...",9,False,"[17569, 90464, 111039, 130773, 131152, 134996,..."


In [36]:
user_df_with_sparse_index.shape

(1107613, 5)

In [37]:
user_df_with_sparse_index.to_hdf('data/user_df.h5', key='df', mode='w')
song_df.to_hdf('data/song_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['user_id', 'MUSIC', 'song_sparse_indices']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
