In [2]:
import os
os.environ['MKL_NUM_THREADS'] = '1'
import time
import tqdm
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from scipy.spatial import KDTree
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from random import shuffle, randint
from collections import Counter

In [3]:
def build_cf_model(train_plays, alpha=1):
    """Fit an implicit-feedback ALS model on log-scaled play counts.

    Parameters
    ----------
    train_plays : scipy.sparse matrix
        Raw play counts to train on. The matrix is NOT modified; the
        weighting is applied to a copy so the function can be called
        repeatedly (e.g. when a notebook cell is re-run) with the same result.
    alpha : float, optional
        Multiplier applied to the raw counts before taking the log.
        The paper does not specify a value, so the default is a guess of 1.

    Returns
    -------
    AlternatingLeastSquares
        The fitted model (16 factors, 15 iterations, float32).
    """
    als_params = {'factors': 16,
                  'dtype': np.float32,
                  'iterations': 15,
                  'calculate_training_loss': True}
    model = AlternatingLeastSquares(**als_params)

    # Turn raw counts into confidence weights: 1 + log(alpha * count).
    # Work on a copy: assigning to train_plays.data would silently rewrite the
    # caller's matrix and compound the transform on every subsequent call.
    weighted_plays = train_plays.copy()
    weighted_plays.data = 1 + np.log(alpha * weighted_plays.data)
    model.fit(weighted_plays)
    return model

def build_kdtree(user_MUSIC_df):
    """Index every row's MUSIC vector in a KD-tree for nearest-neighbour search.

    Points are inserted in row order, so tree index ``i`` refers to the
    ``i``-th positional row of ``user_MUSIC_df``.
    """
    music_column = user_MUSIC_df['MUSIC']
    points = music_column.values.tolist()
    tree = KDTree(points)
    return tree

def get_closest_MUSIC_user_ids(user_id, k, user_MUSIC_df, kdtree):
    """Return the ids of the (up to) ``k`` users nearest to ``user_id`` in MUSIC space.

    Parameters
    ----------
    user_id
        Value to look up in ``user_MUSIC_df['user_id']``.
    k : int
        Number of neighbours requested from the tree.
    user_MUSIC_df : pandas.DataFrame
        Needs ``'user_id'`` and ``'MUSIC'`` columns; the returned ids are read
        from the column at position 3.
    kdtree : scipy.spatial.KDTree
        Tree whose point order matches the row order of ``user_MUSIC_df``.

    Returns
    -------
    list
        Neighbour ids ordered nearest first. If the tree holds the query
        user's own vector, that user is normally among the results.

    Raises
    ------
    IndexError
        If ``user_id`` has no row in ``user_MUSIC_df``.
    """
    matches = user_MUSIC_df.loc[user_MUSIC_df['user_id'] == user_id, 'MUSIC']
    if matches.empty:
        # Same exception type as the original ``.tolist()[0]``, but with a
        # message that says what actually went wrong.
        raise IndexError('user_id {!r} has no MUSIC vector'.format(user_id))
    user_MUSIC = matches.iloc[0]

    distances, indices = kdtree.query(user_MUSIC, k)
    # query() returns bare scalars for k == 1; normalise to 1-d arrays.
    distances = np.atleast_1d(distances)
    indices = np.atleast_1d(indices)
    # When k exceeds the number of points the tree pads the result with
    # infinite distances and an out-of-range index; drop those slots.
    found = indices[np.isfinite(distances)]

    # NOTE(review): column position 3 is assumed to hold the user id --
    # confirm against the frame's schema and switch to a name-based lookup.
    return user_MUSIC_df.iloc[found, 3].tolist()


def get_knn_top_m_songs(user_id, k, m, kdtree, user_playlist_df, user_MUSIC_df):
    """Return the ``m`` songs that occur most often in the nearest users' playlists.

    Parameters
    ----------
    user_id
        User whose MUSIC neighbours are looked up.
    k : int
        Number of neighbours to request.
    m : int
        Maximum number of songs to return.
    kdtree, user_MUSIC_df
        Passed through to ``get_closest_MUSIC_user_ids``.
    user_playlist_df : pandas.DataFrame
        Needs ``'user_id'`` and ``'playlist'`` columns; each ``playlist`` cell
        is an iterable of songs.

    Returns
    -------
    list
        Songs ordered by descending count; ties keep first-seen order
        (nearest neighbour first). Neighbours without a playlist row are
        skipped instead of aborting the whole lookup.
    """
    closest_userids = get_closest_MUSIC_user_ids(user_id, k, user_MUSIC_df, kdtree)

    # Scan the playlist table once rather than once per neighbour.
    neighbour_rows = user_playlist_df.loc[user_playlist_df['user_id'].isin(closest_userids)]
    first_playlist = {}
    for owner, playlist in zip(neighbour_rows['user_id'], neighbour_rows['playlist']):
        # Keep only the first playlist per user, as the original [0] lookup did.
        first_playlist.setdefault(owner, playlist)

    # Count in neighbour order so tie-breaking is unchanged.
    song_counts = Counter()
    for neighbour_id in closest_userids:
        song_counts.update(first_playlist.get(neighbour_id, ()))

    return [song for song, _ in song_counts.most_common(m)]

def get_recommendations(user_id, user_sparse_index, cf_model, kdtree, train_plays,
                        user_playlist_df, user_MUSIC_df, n, m, k):
    """Blend collaborative-filtering and MUSIC-neighbour recommendations.

    Parameters
    ----------
    user_id
        Id used to look the user up in the MUSIC / playlist frames.
    user_sparse_index
        The user's index in the play matrix, passed to ``cf_model.recommend``.
    cf_model
        Fitted model exposing ``recommend(userid=, user_items=, N=)``.
    kdtree, user_playlist_df, user_MUSIC_df
        Passed through to ``get_knn_top_m_songs``.
    train_plays : scipy.sparse matrix
        Play matrix; its transpose is handed to the model as ``user_items``.
    n : int
        Number of songs requested from the collaborative-filtering model.
    m : int
        Number of songs requested from the nearest-neighbour lookup.
    k : int
        Number of neighbours considered by the lookup.

    Returns
    -------
    list
        The ``n`` model songs followed by the ``m`` neighbour songs, in
        random order.
    """
    # Each result is indexed with [0] to keep the item and drop the rest.
    # NOTE(review): transpose() of a CSR matrix yields CSC; check whether the
    # installed implicit version expects CSR here (``.tocsr()``).
    cf_results = cf_model.recommend(userid=user_sparse_index,
                                    user_items=train_plays.transpose(),
                                    N=n)
    n_songs = [song_tuple[0] for song_tuple in cf_results]

    m_songs = get_knn_top_m_songs(user_id=user_id,
                                  k=k,
                                  m=m,
                                  kdtree=kdtree,
                                  user_playlist_df=user_playlist_df,
                                  user_MUSIC_df=user_MUSIC_df)

    # random.shuffle reorders in place and returns None, so shuffle a named
    # list and return the list itself (the original returned None every time).
    # NOTE(review): n_songs come from the model and m_songs from playlists --
    # confirm both use the same song identifier space.
    recommendations = n_songs + m_songs
    shuffle(recommendations)
    return recommendations

In [4]:
print('Loading data...')
# Per-user tables used by the nearest-neighbour half of the recommender.
user_playlist_df = pd.read_hdf('data/userid_playlist.h5', 'df')
user_MUSIC_df = pd.read_hdf('data/user_MUSIC_num_songs.h5', 'df')

# Play-count matrices: num_songs x num_users, stored as CSR.
train_plays, test_plays = (load_npz('data/train_sparse.npz'),
                           load_npz('data/test_sparse.npz'))

# Mapping tables between sparse indices and ids:
#   songs -> CSR row index    : song_id
songs = pd.read_hdf('data/song_mapping.h5', 'df')
#   users -> CSR column index : user_id
users = pd.read_hdf('data/user_mapping.h5', 'df')

print('Fitting CF model...')
cf_model = build_cf_model(train_plays)

print('Building KDTree...')
kdtree = build_kdtree(user_MUSIC_df)

Loading data...
Fitting CF model...


100%|██████████████████████████████████████████████████████████████████████| 15.0/15 [01:52<00:00,  7.41s/it, loss=0.000166]


Building KDTree...


In [5]:
# Generate blended recommendations for the first 1000 users in the mapping.
# Failures are still tolerated (best effort), but they are recorded and
# summarised instead of being discarded, so a systematic problem -- e.g. no
# user id matching a MUSIC row -- is visible in the cell output.
skipped_users = []
for user_id, user_sparse_index in users.head(1000)[['user', 'sparse_index']].values:
    try:
        song_ids = get_recommendations(user_id=user_id,
                                       user_sparse_index=user_sparse_index,
                                       cf_model=cf_model,
                                       kdtree=kdtree,
                                       train_plays=train_plays,
                                       user_playlist_df=user_playlist_df,
                                       user_MUSIC_df=user_MUSIC_df,
                                       n=10,
                                       m=10,
                                       k=100)
    except Exception as error:
        # Remember who failed and why, then move on to the next user.
        skipped_users.append((user_id, repr(error)))
        continue
    print(song_ids)

print('Skipped {} user(s) because of errors'.format(len(skipped_users)))
if skipped_users:
    print('First failure: {} -> {}'.format(*skipped_users[0]))

In [6]:
ls data

 Volume in drive C has no label.
 Volume Serial Number is CA13-F640

 Directory of C:\Users\Ian\Documents\GitHub\rec-sys\data

03/02/2019  08:05 PM    <DIR>          .
03/02/2019  08:05 PM    <DIR>          ..
03/01/2019  12:49 PM       904,351,168 song_mapping.h5
03/01/2019  12:49 PM         5,296,379 test_sparse.npz
03/01/2019  12:49 PM       166,764,672 train_sparse.npz
03/01/2019  12:48 PM       956,995,552 user_mapping.h5
03/02/2019  07:56 PM       161,080,024 user_MUSIC_num_songs.h5
03/02/2019  08:04 PM       527,753,856 userid_playlist.h5
               6 File(s)  2,722,241,651 bytes
               2 Dir(s)   6,927,454,208 bytes free


In [None]:
# # train_plays, test_plays -> num_songs x num_users CSR matrix      
# test_plays = load_npz('data/test_sparse.npz')
# # songs -> CSR_row_index: song_id
# songs = pd.read_hdf('data/song_mapping.h5', key='df')
# # users -> CSR_col_index: user_id
# users = pd.read_hdf('data/user_mapping.h5', key='df')



# # LET n = number of songs
# # LET m = number of audio features
# feature_MUSIC_dict = {
#     'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
#     'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
#     'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
#     'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
#     'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
#     'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
# }
# # feature_MUSIC_matrix -> m x 5 matrix, where m is the number of audio features in feature_MUSIC_dict
# feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

# def get_MUSIC(sub_df):
#     # song_vectors -> n x m matrix, where m is the number of audio features in feature_MUSIC_dict
#     song_vectors = sub_df[list(feature_MUSIC_dict.keys())].values
    
#     # unweighted_MUSIC_vals -> n x 5 matrix
#     unweighted_MUSIC_vals = song_vectors @ feature_MUSIC_matrix

#     return list(np.mean(unweighted_MUSIC_vals, axis=0))

# MUSIC = get_MUSIC(df)
# MUSIC

In [None]:
# Reload the two index-mapping tables from disk.
#   songs: CSR row index    -> song_id
#   users: CSR column index -> user_id
songs = pd.read_hdf('data/song_mapping.h5', 'df')
users = pd.read_hdf('data/user_mapping.h5', 'df')

In [None]:
# Print the ('user', 'sparse_index') pair of the first three mapping rows.
for a,b in users.head(3)[['user', 'sparse_index']].values:
    print(a,b)

In [None]:
# spotify['msd_id'] = spotify['msd_id'].str.encode('utf-8')
# spotify['msd_id'] = spotify['msd_id'].astype(str)
# msd['track_id'] = msd['track_id'].astype(str)

In [None]:
# Preview the first rows of the users mapping table.
users.head()

In [None]:
# Preview the first rows of the per-user MUSIC table.
user_MUSIC_df.head()

In [None]:
# Preview the last rows of the per-user playlist table.
user_playlist_df.tail()

In [None]:
# songs = songs.rename(index=str, columns={'track': 'sparse_index', 'sparse_index': 'track'})
# songs.head()
# songs.to_hdf('data/song_mapping.h5', key='df')

# users = users.rename(index=str, columns={'user': 'sparse_index', 'sparse_index': 'user'})
# users.head(100)
# users.to_hdf('data/user_mapping.h5', key='df')

In [None]:
# Show the 'user' value of the first row of the users mapping table.
users.head(1)['user'].values[0]

In [None]:
# Show the 'user_id' value of the first row of the MUSIC table, for
# comparison with the 'user' values of the users mapping table.
user_MUSIC_df.head(1)['user_id'].values[0]

In [None]:
# Select the MUSIC rows for the first mapped user, after encoding that id
# with str.encode().
# NOTE(review): the .encode() suggests user_MUSIC_df['user_id'] holds bytes
# while users['user'] holds str -- confirm. If so, ids taken straight from
# `users` will never match a MUSIC row without the same conversion.
user_MUSIC_df.loc[user_MUSIC_df['user_id'] == users.head(1)['user'].values[0].encode()]

In [None]:
# Select the MUSIC rows for the 100th mapped user, comparing the id as-is
# (no encoding step).
# NOTE(review): an empty result here may simply mean the two id columns use
# different types (bytes vs str) -- verify before concluding the user is absent.
user_MUSIC_df.loc[user_MUSIC_df['user_id'] == users.head(100)['user'].values[99]]