In [1]:
import pandas as pd
import numpy as np
import time
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from multiprocessing import Pool
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz
import os

In [2]:
msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')

# this is because song_id is in the format:  "b'SOSIYAD12A8C14097F'"
for messed_up_column in ['song_id', 'track_id', 'artist_mbid', 'artist_name', 'title']:
    msd[messed_up_column] = msd[messed_up_column].str.slice(start=2, stop=-1)

msd.head()

Unnamed: 0,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,track_id,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,29762c82-bb92-4acd-b1fb-09cc4da250d2,Joe Satriani,8426,b'Super Colossal',308005,0.693272,SOSIYAD12A8C14097F,It's So Good,3473087,TRCCCMQ128F42AE752,...,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,,LANDA DANIEL,-1,b'Konec',162603,,SOHPHTP12A8C13BF53,A KDO TEDA VODDELAL TOHO FANDU?,1723114,TRCCCFZ128F4283A22,...,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,e96c89d3-b013-48c2-96a6-da06d2eda534,Middle Of The Road,26778,b'MediaMarkt - Collection',302021,0.266955,SOFVVGL12A8C13C32F,Bonjour ca va,3412731,TRCCCJT128F429FFF6,...,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,ec475fb8-d454-406e-b473-7f0b0d815f9d,Nickodemus,-1,b'Turntables On The Hudson Lunar New Year 4707',479992,0.690676,SOHXIRQ12AAA15CF81,Endangered Species,5327529,TRCCCSK128F92EE3B2,...,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,975d94b4-7ca8-4eec-ae48-1aa4660995d3,Tyrese,7102,b'Alter Ego',308128,0.734211,SOJHDEN12AB018B650,Gotta Get You,3476419,TRCCCEW128F42AF457,...,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [3]:
msd.shape

(447499, 25)

In [4]:
# track_genres = pd.read_table('http://www.ifs.tuwien.ac.at/mir/msd/partitions/msd-MAGD-genreAssignment.cls',names=['TrackID','Genre'],sep='\t')
track_genres = pd.read_table(
    'data/msd-MAGD-genreAssignment.cls',
    names=['track_id', 'genre'],
    sep='\t')
track_genres.head()

  """


Unnamed: 0,track_id,genre
0,TRAAAAK128F9318786,Pop_Rock
1,TRAAAAV128F421A322,Pop_Rock
2,TRAAAAW128F429D538,Rap
3,TRAAABD128F429CF47,Pop_Rock
4,TRAAACV128F423E09E,Pop_Rock


In [5]:
msd = msd.merge(track_genres, how='left', left_on='track_id', right_on='track_id')
msd = msd.drop_duplicates(subset='song_id')
msd.head()

Unnamed: 0,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,track_id,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
0,29762c82-bb92-4acd-b1fb-09cc4da250d2,Joe Satriani,8426,b'Super Colossal',308005,0.693272,SOSIYAD12A8C14097F,It's So Good,3473087,TRCCCMQ128F42AE752,...,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282,Pop_Rock
1,,LANDA DANIEL,-1,b'Konec',162603,,SOHPHTP12A8C13BF53,A KDO TEDA VODDELAL TOHO FANDU?,1723114,TRCCCFZ128F4283A22,...,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309,
2,e96c89d3-b013-48c2-96a6-da06d2eda534,Middle Of The Road,26778,b'MediaMarkt - Collection',302021,0.266955,SOFVVGL12A8C13C32F,Bonjour ca va,3412731,TRCCCJT128F429FFF6,...,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969,
3,ec475fb8-d454-406e-b473-7f0b0d815f9d,Nickodemus,-1,b'Turntables On The Hudson Lunar New Year 4707',479992,0.690676,SOHXIRQ12AAA15CF81,Endangered Species,5327529,TRCCCSK128F92EE3B2,...,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527,
4,975d94b4-7ca8-4eec-ae48-1aa4660995d3,Tyrese,7102,b'Alter Ego',308128,0.734211,SOJHDEN12AB018B650,Gotta Get You,3476419,TRCCCEW128F42AF457,...,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399,RnB


In [6]:
msd.shape

(426545, 26)

In [7]:
msd.groupby('genre')['song_id'].nunique(dropna=False)

genre
Avant_Garde          462
Blues               2935
Children             220
Classical            305
Comedy_Spoken        851
Country             5256
Easy_Listening       802
Electronic         16097
Folk                2714
Holiday               82
International       7268
Jazz                7352
Latin               8924
New Age             1646
Pop_Rock          110879
Rap                 8806
Reggae              3028
Religious           3658
RnB                 5927
Stage               1073
Vocal               2614
Name: song_id, dtype: int64

In [8]:
col_names = ['user_id', 'song_id', 'play_count']
train_triplets = pd.read_csv('data/train_triplets.txt', sep='\t', names=col_names)
test_visible_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_visible.txt', sep='\t', names=col_names)
test_hidden_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_hidden.txt', sep='\t', names=col_names)

train_triplets['is_test'] = False
test_visible_triplets['is_test'] = True
test_hidden_triplets['is_test'] = True

train_triplets['is_hidden'] = False
test_visible_triplets['is_hidden'] = False
test_hidden_triplets['is_hidden'] = True

In [9]:
all_triplets = train_triplets.append(test_visible_triplets, ignore_index=True).append(test_hidden_triplets, ignore_index=True)
all_niplets = all_triplets.merge(msd, left_on='song_id', right_on='song_id')

visible_niplets = all_niplets.loc[all_niplets['is_hidden'] == False]
hidden_niplets = all_niplets.loc[all_niplets['is_hidden'] == True]

In [10]:
all_niplets.shape

(18141049, 30)

In [11]:
visible_niplets.shape

(17673404, 30)

In [12]:
hidden_niplets.shape

(467645, 30)

In [13]:
#This is to remove the users that have test data but not hidden data
visible_users = visible_niplets['user_id'].drop_duplicates()
hidden_users = hidden_niplets['user_id'].drop_duplicates()
blacklisted_users = hidden_users[~hidden_users.isin(visible_users)].values

hidden_niplets = hidden_niplets.loc[~hidden_niplets['user_id'].isin(blacklisted_users)]
all_niplets = all_niplets.loc[~all_niplets['user_id'].isin(blacklisted_users)]

In [14]:
len(blacklisted_users)

4202

In [15]:
all_niplets.shape

(18131796, 30)

In [16]:
visible_niplets.shape

(17673404, 30)

In [17]:
hidden_niplets.shape

(458392, 30)

In [18]:
all_niplets.head()

Unnamed: 0,user_id,song_id,play_count,is_test,is_hidden,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,
1,7c86176941718984fed11b7c0674ff04c029b480,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,SOAKIMP12A8C130995,3,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,
4,3f73f44560e822344b0fb7c6b463869743eb9860,SOAKIMP12A8C130995,6,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,


In [19]:
# Get the full dataset since I will need to put this in COO matrix. It will be difficult matching up the song ids of test and train
# if they are treated independently. Putting them together and then splitting the train and test set when creating the COO matrix is the
# easiest way of ensuring that the indices remain correct.

# Build the train and test CSR matrices.

all_niplets['user_id'] = all_niplets['user_id'].astype("category")
all_niplets['song_id'] = all_niplets['song_id'].astype("category")

num_users = all_niplets['user_id'].cat.codes.unique().shape[0]
num_songs = all_niplets['song_id'].cat.codes.unique().shape[0]

train_data = all_niplets.loc[all_niplets['is_hidden'] == False]
test_data = all_niplets.loc[all_niplets['is_hidden'] == True]

rows_train = train_data['song_id'].cat.codes.copy()
cols_train = train_data['user_id'].cat.codes.copy()
data_train = train_data['play_count'].astype(np.float32)

rows_test = test_data['song_id'].cat.codes.copy()
cols_test = test_data['user_id'].cat.codes.copy()
data_test = test_data['play_count'].astype(np.float32)

train_plays = coo_matrix((data_train, (rows_train, cols_train)), shape=(num_songs, num_users)).tocsr()
test_plays = coo_matrix((data_test, (rows_test, cols_test)), shape=(num_songs, num_users)).tocsr()

user_mapping = np.vstack((all_niplets['user_id'].cat.codes.copy().values,
                          all_niplets['user_id'].values)).T
song_mapping = np.vstack((all_niplets['song_id'].cat.codes.copy().values,
                          all_niplets['song_id'].values)).T

user_id_to_user_index = pd.DataFrame(columns=['sparse_index', 'user_id'], data=user_mapping).drop_duplicates()
song_id_to_song_index = pd.DataFrame(columns=['sparse_index', 'song_id'], data=song_mapping).drop_duplicates()

In [20]:
user_id_to_user_index.head()

Unnamed: 0,sparse_index,user_id
0,796068,b80344d063b5ccb3212f76538f3d9e43d87dca9e
1,538494,7c86176941718984fed11b7c0674ff04c029b480
2,510903,76235885b32c4e8c82760c340dc54f9b608d7d7e
3,159634,250c0fa2a77bc6695046e7c47882ecd85c42d748
4,273980,3f73f44560e822344b0fb7c6b463869743eb9860


In [21]:
user_id_to_user_index.shape

(1107613, 2)

In [22]:
song_id_to_song_index.head()

Unnamed: 0,sparse_index,song_id
0,4785,SOAKIMP12A8C130995
2457,7052,SOAPDEY12A81C210A9
3238,14453,SOBFOVM12A58A7D494
3710,20354,SOBSUJE12A6D4F8CF5
4409,21408,SOBVFZR12A6D4F8AE3


In [23]:
song_id_to_song_index.shape

(168493, 2)

In [24]:
# Save to file
user_id_to_user_index.to_hdf('data/user_mapping.h5', key='df', mode='w')
song_id_to_song_index.to_hdf('data/song_mapping.h5', key='df', mode='w')

save_npz('data/train_sparse', train_plays)
save_npz('data/test_sparse', test_plays)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'user_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'song_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [25]:
#this is for removing duplicates of song_id with different track_id
print(msd['song_id'].unique().shape)
print(msd['song_id'].shape)

(426545,)
(426545,)


In [26]:
song_df = song_id_to_song_index.merge(msd, left_on='song_id', right_on='song_id')
song_df.set_index('sparse_index', inplace=True)

In [27]:
song_df.head()

Unnamed: 0_level_0,song_id,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,song_hotttnesss,title,track_7digitalid,track_id,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4785,SOAKIMP12A8C130995,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,0.649006,The Cove,2093263,TRIQAUQ128F42435AD,...,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389,
7052,SOAPDEY12A81C210A9,8a7cf497-dc5c-4523-932d-3fcbc9a69d38,Billy Preston,2976,b'To Die For',275856,0.826375,Nothing from Nothing,3072847,TRIRLYL128F42539D1,...,0.0135,9,0.197,-4.027,0,0.0288,102.83,4,0.92,
14453,SOBFOVM12A58A7D494,87ebbe67-3910-4521-a418-4fe53eb912b7,The Dead 60s,3412,"b""Nick & Norah's Infinite Playlist - Original ...",512793,0.754628,Riot Radio (Soundtrack Version),5674853,TRAHZNE128F9341B86,...,0.244,7,0.101,-7.372,0,0.0293,143.948,4,0.833,
20354,SOBSUJE12A6D4F8CF5,abb91078-f7db-41f2-8f07-7f37bb739143,Jorge Drexler,4783,b'10 + Downloaded',168831,0.265861,12 segundos de oscuridad,1788507,TRPLAXZ128F4292406,...,0.000412,0,0.102,-8.176,0,0.0327,126.051,4,0.0396,
21408,SOBVFZR12A6D4F8AE3,023d64c9-93db-4a20-8c5c-2efa1a53481a,Josh Rouse,39051,b'Under Cold Blue Stars',38692,0.645846,Ears To The Ground (Album Version),413960,TREGAVI128F147C1CA,...,0.534,1,0.119,-10.62,0,0.0314,123.195,4,0.402,Pop_Rock


In [28]:
song_df.shape

(168493, 26)

In [29]:
song_df.to_hdf('data/song_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['song_id', 'artist_mbid', 'artist_name', 'release', 'title', 'track_id', 'spotify_id', 'genre']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [30]:
visible_niplets_with_sparse_index = visible_niplets.merge(song_id_to_song_index, left_on='song_id', right_on='song_id')

In [31]:
visible_niplets_with_sparse_index.head()

Unnamed: 0,user_id,song_id,play_count,is_test,is_hidden,artist_mbid,artist_name,artist_playmeid,release,release_7digitalid,...,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,sparse_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,7,0.128,-15.423,1,0.0445,123.685,4,0.389,,4785
1,7c86176941718984fed11b7c0674ff04c029b480,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,7,0.128,-15.423,1,0.0445,123.685,4,0.389,,4785
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,SOAKIMP12A8C130995,3,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,7,0.128,-15.423,1,0.0445,123.685,4,0.389,,4785
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,SOAKIMP12A8C130995,1,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,7,0.128,-15.423,1,0.0445,123.685,4,0.389,,4785
4,3f73f44560e822344b0fb7c6b463869743eb9860,SOAKIMP12A8C130995,6,False,False,ff6e677f-91dd-4986-a174-8db0474b1799,Jack Johnson,759,b'Thicker Than Water',192496,...,7,0.128,-15.423,1,0.0445,123.685,4,0.389,,4785


In [32]:
assert visible_niplets_with_sparse_index.shape[0] == visible_niplets.shape[0]

In [33]:
print('Grouping df...')
gb = visible_niplets_with_sparse_index.groupby('user_id')
gb_list = list(gb)

Grouping df...


In [34]:
# Using multiprocessing.Pool.map and play_count weightings
# LET n = number of songs
# LET m = number of audio features

# Values from this paper: https://projects.ori.org/lrg/PDFs_papers/RentfrowEtal2012MUSICReplicationMP.pdf
# spotify features from this endpoint: - https://developer.spotify.com/documentation/web-api/reference/tracks/get-several-audio-features/
feature_MUSIC_dict = {
    'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
}
# feature_MUSIC_matrix -> m x 5 matrix, where m is the number of audio features in feature_MUSIC_dict
feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

def get_MUSIC(sub_df):
    # song_vectors -> n x m matrix, where m is the number of audio features in feature_MUSIC_dict
    song_vectors = sub_df[list(feature_MUSIC_dict.keys())].values
    
    # unweighted_MUSIC_vals -> n x 5 matrix
    unweighted_MUSIC_vals = song_vectors @ feature_MUSIC_matrix
    
    # returns the average MUSIC values weighted by their play_counts
    return list(np.average(unweighted_MUSIC_vals, weights=sub_df['play_count'].values, axis=0))

def get_is_test(sub_df):
    return sub_df['is_test'].values[0]

def get_song_ids(sub_df):
    return sub_df['song_id'].to_list()

def get_song_sparse_indices(sub_df):
    return sub_df['sparse_index'].to_list()

def get_row(gb_item):
    user_id, sub_df = gb_item
    return (user_id, get_MUSIC(sub_df), sub_df.shape[0], get_is_test(sub_df), get_song_sparse_indices(sub_df))

columns = ['user_id', 'MUSIC', 'num_songs', 'is_test', 'song_sparse_indices']
print('Starting pool.map...')
user_df_data = Pool(os.cpu_count()).map(func=get_row, iterable=gb_list, chunksize=625)
print('Creating dataframe...')
user_df = pd.DataFrame.from_records(user_df_data, columns=columns)
user_df.head()

Starting pool.map...
Creating dataframe...


Unnamed: 0,user_id,MUSIC,num_songs,is_test,song_sparse_indices
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...",3,False,"[20321, 34133, 31702]"
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...",6,False,"[63130, 56895, 107394, 14282, 6830, 48918]"
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...",6,False,"[22878, 165852, 164119, 78906, 70413, 130812]"
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...",3,False,"[13587, 41248, 113455]"
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...",9,False,"[17569, 90464, 111039, 130773, 131152, 134996,..."


In [35]:
user_df.shape

(1107613, 5)

In [36]:
user_df_with_sparse_index = user_df.merge(user_id_to_user_index, left_on='user_id', right_on='user_id')
user_df_with_sparse_index.set_index('sparse_index', inplace=True)

In [37]:
user_df_with_sparse_index.head()

Unnamed: 0_level_0,user_id,MUSIC,num_songs,is_test,song_sparse_indices
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...",3,False,"[20321, 34133, 31702]"
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...",6,False,"[63130, 56895, 107394, 14282, 6830, 48918]"
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...",6,False,"[22878, 165852, 164119, 78906, 70413, 130812]"
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...",3,False,"[13587, 41248, 113455]"
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...",9,False,"[17569, 90464, 111039, 130773, 131152, 134996,..."


In [38]:
user_df_with_sparse_index.shape

(1107613, 5)

In [39]:
user_df_with_sparse_index.to_hdf('data/user_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['user_id', 'MUSIC', 'song_sparse_indices']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
