In [25]:
import os
os.environ['MKL_NUM_THREADS'] = '1'
import time
import tqdm
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from scipy.spatial import KDTree
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from random import shuffle, randint
from collections import Counter

In [84]:
def build_cf_model(train_plays, alpha=1):
    """Fit an implicit-feedback ALS model on log-scaled play counts.

    Every stored play count ``r`` is turned into the weight
    ``1 + log(alpha * r)`` before fitting.  The scaling is applied to a copy
    of the matrix, so the caller's ``train_plays`` is left untouched and
    calling this function repeatedly on the same matrix always fits on the
    same weights.

    Parameters
    ----------
    train_plays : scipy.sparse matrix
        Play counts, in whatever orientation ``AlternatingLeastSquares.fit``
        of the installed ``implicit`` version expects (this notebook loads it
        as num_songs x num_users).
    alpha : float, optional
        Multiplier applied to the counts inside the logarithm.  Defaults to 1
        because the paper being followed does not specify a value.

    Returns
    -------
    AlternatingLeastSquares
        The fitted model.
    """
    als_params = {'factors': 16,
                  'dtype': np.float32,
                  'iterations': 15,
                  'calculate_training_loss': True}
    model = AlternatingLeastSquares(**als_params)

    # Scale a copy: assigning to ``train_plays.data`` directly would rewrite
    # the caller's matrix and compound the log on every subsequent call.
    # NOTE(review): assumes every stored count is > 0; a stored 0 gives -inf.
    weighted_plays = train_plays.copy()
    weighted_plays.data = 1 + np.log(alpha * weighted_plays.data)

    model.fit(weighted_plays)
    return model

def build_kdtree(user_MUSIC_df):
    """Build a KD-tree over the vectors stored in the 'MUSIC' column.

    Points are handed to the tree in row order, so an index returned by a
    query on the tree is a positional row number of ``user_MUSIC_df``.
    """
    music_points = user_MUSIC_df['MUSIC'].tolist()
    tree = KDTree(music_points)
    return tree

def get_closest_MUSIC_user_ids(user_id, k, user_MUSIC_df, kdtree):
    """Return the user ids whose MUSIC vectors are nearest to ``user_id``'s.

    Parameters
    ----------
    user_id : hashable
        Id to look up in ``user_MUSIC_df['user_id']``.
    k : int
        Number of nearest neighbours to request from the tree.
    user_MUSIC_df : pandas.DataFrame
        Must provide 'user_id' and 'MUSIC' columns, with rows in the same
        order as the points the tree was built from.
    kdtree : scipy.spatial.KDTree
        Tree built over ``user_MUSIC_df['MUSIC']``.

    Returns
    -------
    list
        Up to ``k`` user ids ordered from nearest to farthest.  The queried
        user normally comes first, because its own vector is in the tree.

    Raises
    ------
    IndexError
        If ``user_id`` has no row in ``user_MUSIC_df``.
    """
    matches = user_MUSIC_df.loc[user_MUSIC_df['user_id'] == user_id, 'MUSIC']
    if matches.empty:
        raise IndexError('no MUSIC vector found for user_id {!r}'.format(user_id))
    user_MUSIC = matches.iloc[0]

    _, indices = kdtree.query(user_MUSIC, k)

    # query() returns a bare scalar for k=1, so normalise to a 1-d array.
    indices = np.atleast_1d(indices)
    # When k exceeds the number of points, the missing neighbours are
    # reported with an index equal to the point count; drop them.
    indices = indices[indices < len(user_MUSIC_df)]

    # Select the column by name: a hard-coded column position breaks as soon
    # as the frame's column layout differs from what was assumed.
    return user_MUSIC_df['user_id'].iloc[indices].tolist()


def get_knn_top_m_songs(user_id, k, m, kdtree, user_playlist_df, user_MUSIC_df):
    """Return the ``m`` songs most common among the ``k`` nearest users' playlists.

    Parameters
    ----------
    user_id : hashable
        User whose neighbourhood is inspected.
    k : int
        Number of nearest neighbours (by MUSIC vector) to consider.
    m : int
        Number of songs to return.
    kdtree : scipy.spatial.KDTree
        Tree built over ``user_MUSIC_df['MUSIC']``.
    user_playlist_df : pandas.DataFrame
        Must provide 'user_id' and 'playlist' columns; each playlist is an
        iterable of song ids.
    user_MUSIC_df : pandas.DataFrame
        Frame the tree was built from.

    Returns
    -------
    list
        Up to ``m`` song ids, most frequent first.  Ties keep the order in
        which the songs were first encountered (nearest neighbour first).
    """
    closest_userids = get_closest_MUSIC_user_ids(user_id, k, user_MUSIC_df, kdtree)

    # One pass over the playlist frame instead of one full scan per neighbour.
    neighbour_rows = user_playlist_df.loc[
        user_playlist_df['user_id'].isin(closest_userids), ['user_id', 'playlist']]
    playlist_by_user = {}
    for neighbour_id, playlist in zip(neighbour_rows['user_id'],
                                      neighbour_rows['playlist']):
        # Keep the first row per user, as a positional [0] lookup would.
        playlist_by_user.setdefault(neighbour_id, playlist)

    song_counts = Counter()
    for neighbour_id in closest_userids:
        # A neighbour without a playlist row contributes nothing, rather
        # than aborting the whole recommendation.
        song_counts.update(playlist_by_user.get(neighbour_id, ()))

    return [song for song, _ in song_counts.most_common(m)]

def get_recommendations(user_id, user_sparse_index, cf_model, kdtree, train_plays,
                        user_playlist_df, user_MUSIC_df, n, m, k):
    """Blend collaborative-filtering picks with nearest-neighbour picks.

    Parameters
    ----------
    user_id : hashable
        Id of the user as used in ``user_playlist_df`` / ``user_MUSIC_df``.
    user_sparse_index : int
        Index of the same user as passed to ``cf_model.recommend``.
    cf_model : fitted collaborative-filtering model
        Must expose ``recommend(userid=..., user_items=..., N=...)`` yielding
        tuples whose first element identifies the song.
    kdtree : scipy.spatial.KDTree
        Tree built over ``user_MUSIC_df['MUSIC']``.
    train_plays : scipy.sparse matrix
        Training matrix; its transpose is passed as ``user_items``.
    user_playlist_df, user_MUSIC_df : pandas.DataFrame
        Frames forwarded to ``get_knn_top_m_songs``.
    n : int
        Number of collaborative-filtering recommendations.
    m : int
        Number of nearest-neighbour recommendations.
    k : int
        Number of neighbours consulted for the nearest-neighbour picks.

    Returns
    -------
    list
        The ``n`` + ``m`` recommendations in random order.
    """
    cf_results = cf_model.recommend(userid=user_sparse_index,
                                    user_items=train_plays.transpose(),
                                    N=n)
    n_songs = [song_tuple[0] for song_tuple in cf_results]

    m_songs = get_knn_top_m_songs(user_id=user_id,
                                  k=k,
                                  m=m,
                                  kdtree=kdtree,
                                  user_playlist_df=user_playlist_df,
                                  user_MUSIC_df=user_MUSIC_df)

    # NOTE(review): n_songs are whatever identifiers cf_model.recommend
    # yields (presumably sparse-matrix indices) while m_songs are playlist
    # entries; confirm both halves use the same id space before comparing
    # or de-duplicating them.
    recommendations = n_songs + m_songs

    # random.shuffle reorders the list in place and returns None, so the
    # list itself has to be returned, not the result of the call.
    shuffle(recommendations)
    return recommendations

In [None]:
print('Loading data...')

# Sparse play matrices: num_songs x num_users, CSR format.
train_plays = load_npz('data/train_sparse.npz')
test_plays = load_npz('data/test_sparse.npz')

# Lookup tables between CSR positions and ids:
#   songs: CSR row index    -> song_id
#   users: CSR column index -> user_id
songs = pd.read_hdf('data/song_mapping.h5', key='df')
users = pd.read_hdf('data/user_mapping.h5', key='df')

# Per-user frames: playlists and MUSIC vectors.
user_playlist_df = pd.read_hdf('data/userid_playlist.h5', key='df')
user_MUSIC_df = pd.read_hdf('data/user_MUSIC_num_songs.h5', key='df')

print('Fitting CF model...')
cf_model = build_cf_model(train_plays)

print('Building KDTree...')
kdtree = build_kdtree(user_MUSIC_df=user_MUSIC_df)

In [96]:
# Generate recommendations for the users found in the first 1000 rows of the
# user mapping (n=10 collaborative-filtering picks, m=10 neighbour picks,
# k=100 neighbours).  Users that fail are skipped, but every failure is
# tallied by exception type so that real bugs are not silently hidden.
skipped = Counter()

# The mapping frame can list the same (user, sparse_index) pair on several
# rows; process each distinct pair once.
eval_users = users.head(1000)[['user', 'sparse_index']].drop_duplicates()

for user_id, user_sparse_index in eval_users.values:
    try:
        song_ids = get_recommendations(user_id=user_id,
                                       user_sparse_index=user_sparse_index,
                                       cf_model=cf_model,
                                       kdtree=kdtree,
                                       train_plays=train_plays,
                                       user_playlist_df=user_playlist_df,
                                       user_MUSIC_df=user_MUSIC_df,
                                       n=10,
                                       m=10,
                                       k=100)
    except Exception as exc:
        # Typically the user has no MUSIC value; keep going, but remember why.
        skipped[type(exc).__name__] += 1
        continue

    print(song_ids)

if skipped:
    print('Skipped users by error type:', dict(skipped))

In [9]:
ls data

EvalDataYear1MSDWebsite/         song_mapping.h5
audio_features.npy               spotify_audio_features.h5
full_msd.csv                     spotify_msd_id_pairs.h5
full_msd.h5                      test_sparse.npz*
full_msd_with_audio_features.h5  train_sparse.npz*
full_msd_with_genre.h5           triplet_subset.csv
full_msd_with_spotify.h5         triplets.h5
msd-MAGD-genreAssignment.cls     user_MUSIC_num_songs.h5
msd_subset_metadata.csv          user_div_MUSIC_divpref.h5
msd_subset_usage.csv             user_mapping.h5
niplets.h5                       userid_playlist.h5*


In [19]:
# # train_plays, test_plays -> num_songs x num_users CSR matrix      
# test_plays = load_npz('data/test_sparse.npz')
# # songs -> CSR_row_index: song_id
# songs = pd.read_hdf('data/song_mapping.h5', key='df')
# # users -> CSR_col_index: user_id
# users = pd.read_hdf('data/user_mapping.h5', key='df')



# # LET n = number of songs
# # LET m = number of audio features
# feature_MUSIC_dict = {
#     'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
#     'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
#     'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
#     'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
#     'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
#     'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
# }
# # feature_MUSIC_matrix -> m x 5 matrix, where m is the number of audio features in feature_MUSIC_dict
# feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

# def get_MUSIC(sub_df):
#     # song_vectors -> n x m matrix, where m is the number of audio features in feature_MUSIC_dict
#     song_vectors = sub_df[list(feature_MUSIC_dict.keys())].values
    
#     # unweighted_MUSIC_vals -> n x 5 matrix
#     unweighted_MUSIC_vals = song_vectors @ feature_MUSIC_matrix

#     return list(np.mean(unweighted_MUSIC_vals, axis=0))

# MUSIC = get_MUSIC(df)
# MUSIC

In [34]:
# songs -> CSR_row_index: song_id
songs = pd.read_hdf('data/song_mapping.h5', key='df')

# users -> CSR_col_index: user_id
users = pd.read_hdf('data/user_mapping.h5', key='df')

In [58]:
# Print the (user, sparse_index) pair of each of the first three rows of
# `users`, in the same column order the recommendation loop unpacks them.
for a,b in users.head(3)[['user', 'sparse_index']].values:
    print(a,b)

(1012631, 3)

In [6]:
# spotify['msd_id'] = spotify['msd_id'].str.encode('utf-8')
# spotify['msd_id'] = spotify['msd_id'].astype(str)
# msd['track_id'] = msd['track_id'].astype(str)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


6

In [59]:
# Preview the first rows of the user mapping frame.
users.head()

Unnamed: 0,sparse_index,user
0,9,00007a02388c208ea7176479f6ae06f8224355b3
1,9,00007a02388c208ea7176479f6ae06f8224355b3
2,9,00007a02388c208ea7176479f6ae06f8224355b3
3,9,00007a02388c208ea7176479f6ae06f8224355b3
4,9,00007a02388c208ea7176479f6ae06f8224355b3


In [64]:
# Preview the first rows of the per-user MUSIC frame.
user_MUSIC_df.head()

Unnamed: 0,MUSIC,num_songs,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",3,00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",6,00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",6,0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",9,0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [62]:
# Preview the last rows of the per-user playlist frame.
user_playlist_df.tail()

Unnamed: 0,user_id,playlist
1019313,a6c32fb03a210c135bd944b9b35849fd63a7765e,"[SOAKDLX12A67ADAFC5, SOBTAOG12A6D4F9359, SOEQA..."
1019314,8305c896f42308824da7d4386f4b9ee584281412,"[SOABJBU12A8C13F63F, SOACIPG12A8AE47E1C, SOADJ..."
1019315,ca99d94daa9d5231643a08aac2f3bfb645e73b09,"[SOBMAEL12A58A7B52F, SOBMZTG12A8C13AD44, SOCCL..."
1019316,cf8289419383259189afe6bb50c5115fd84f1064,"[SOADLEU12AB017C851, SOAIMFG12AB0189AC2, SOASB..."
1019317,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,"[SOABJBU12A8C13F63F, SODJQXO12A6D4F697D, SOEIS..."


In [25]:
# songs = songs.rename(index=str, columns={'track': 'sparse_index', 'sparse_index': 'track'})
# songs.head()
# songs.to_hdf('data/song_mapping.h5', key='df')

# users = users.rename(index=str, columns={'user': 'sparse_index', 'sparse_index': 'user'})
# users.head(100)
# users.to_hdf('data/user_mapping.h5', key='df')

In [75]:
# Show the 'user' value of the first row of the user mapping.
users.head(1)['user'].values[0]

'00007a02388c208ea7176479f6ae06f8224355b3'

In [76]:
# Show the 'user_id' value of the first row of the MUSIC frame, for
# comparison with the id format used in the user mapping.
user_MUSIC_df.head(1)['user_id'].values[0]

'00000b722001882066dff9d2da8a775658053ea0'

In [78]:
# Look up the first user of the mapping in the MUSIC frame, after calling
# .encode() on the id.
# NOTE(review): the ids displayed for both frames look like plain strings, so
# comparing against an encoded value presumably never matches - confirm the
# stored dtype.  The recorded result is an empty frame.
user_MUSIC_df.loc[user_MUSIC_df['user_id'] == users.head(1)['user'].values[0].encode()]

Unnamed: 0,MUSIC,num_songs,user_id


In [93]:
# Look up the user on the 100th row of the mapping in the MUSIC frame, this
# time without encoding the id.  The recorded result is again an empty frame.
user_MUSIC_df.loc[user_MUSIC_df['user_id'] == users.head(100)['user'].values[99]]

Unnamed: 0,MUSIC,num_songs,user_id
