In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import itertools
# import ALSpkNN
from implicit.evaluation import mean_average_precision_at_k

In [2]:
user_df = pd.read_hdf('data/user_df.h5', key='df')[['user_id', 'sparse_index', 'MUSIC', 'song_ids']]
user_df.set_index('sparse_index', inplace=True)

song_df = pd.read_hdf('data/song_df.h5', key='df')[['song_id', 'sparse_index']]
song_df.set_index('song_id', inplace=True)

# train_plays, test_plays -> num_songs x num_users CSR matrix
train_plays = load_npz('data/train_sparse.npz')
test_plays = load_npz('data/test_sparse.npz')

# # songs -> CSR_row_index: song_id
# songs_mapping = pd.read_hdf('data/song_mapping.h5', key='df')
# songs_mapping.set_index('song_id', inplace=True)

# # users -> CSR_col_index: user_id
# users_mapping = pd.read_hdf('data/user_mapping.h5', key='df')
# users_mapping.set_index('sparse_index', inplace=True)

In [3]:
user_df.head()

Unnamed: 0_level_0,user_id,MUSIC,song_ids
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...","[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX..."
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...","[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD..."
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...","[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP..."
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...","[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ..."
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...","[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH..."


In [4]:
song_df.head()

Unnamed: 0_level_0,sparse_index
song_id,Unnamed: 1_level_1
SOAKIMP12A8C130995,4785
SOAPDEY12A81C210A9,7052
SOBFOVM12A58A7D494,14453
SOBSUJE12A6D4F8CF5,20354
SOBVFZR12A6D4F8AE3,21408


In [5]:
user_df.shape

(1107613, 3)

In [6]:
song_df.shape

(168503, 1)

In [12]:
from implicit.als import AlternatingLeastSquares
import os
os.environ['MKL_NUM_THREADS'] = '1'
from scipy.spatial import KDTree
import numpy as np
from collections import Counter
import utilities
import time
from random import shuffle

def get_baseline_cf_model():
    als_params = {
        'factors': 16,
        'dtype': np.float32,
        'iterations': 2,
        'calculate_training_loss': True
    }
    cf_model = AlternatingLeastSquares(**als_params)
    return cf_model

def weight_cf_matrix(csr_mat, alpha):
    #don't want to modify original incase it gets put into other models
    weighted_csr_mat = csr_mat.copy()
    weighted_csr_mat.data = 1 + np.log(alpha * csr_mat.data)
    return weighted_csr_mat
    
class ALSpkNN():
    '''
    knn_frac = % of KNN recommendations
    k = # of neighbours for KNN
    '''
    def __init__(self, user_df, song_df, k=100, knn_frac=0.5, cf_weighting_alpha=1):        
        self.user_df = user_df
        self.song_df = song_df
        self.cf_weighting_alpha = cf_weighting_alpha
        self.knn_frac = knn_frac
        self.k = k
        self.kdtree = KDTree(user_df['MUSIC'].tolist())
        
        #build the collaborative filtering model with params hardcoded
        als_params = {
            'factors': 16,
            'dtype': np.float32,
            'iterations': 2,
            'calculate_training_loss': True
        }
        self.cf_model = AlternatingLeastSquares(**als_params)

    def fit(self, train_csr):
        #don't want to modify original incase it gets put into other models
        weighted_train_csr = weight_cf_matrix(train_csr, self.cf_weighting_alpha)
        self.cf_model.fit(weighted_train_csr)

    # Returns list of song_ids
    def get_knn_top_m_song_ids(self, user_id, m):
        user_MUSIC = self.user_df.loc[self.user_df['user_id'] == user_id]['MUSIC'].values[0]
        distances, indices = self.kdtree.query(user_MUSIC, self.k, p=1)
        # TODO: maybe sort closest_user_ids by distance if they are not already sorted?
        
        closest_user_ids = self.user_df.iloc[indices]['user_id'].to_list()
        
        # closest_user_songs -> list of lists of song_ids, len(closest_user_songs) == k
        closest_user_songs = self.user_df.loc[self.user_df['user_id'].isin(closest_user_ids)]['song_ids'].values
        
        # closest_user_songs_flat -> list of song_ids
        closest_user_songs_flat = itertools.chain.from_iterable(closest_user_songs)
        
        top_m_songs = [i[0] for i in Counter(closest_user_songs_flat).most_common(m)]
        return top_m_songs

    # Returns [(song_sparse_index, confidence)]
    def recommend(self, user_sparse_index, train_plays, N):
        # m -> number of songs from KNN recs
        m = int(np.round(self.knn_frac*N))
        # n -> number of songs from CF recs
        n = N - m
        
        n_songs = self.cf_model.recommend(userid=user_sparse_index, user_items=train_plays.transpose(), N=n)
        
        user_id = self.user_df.loc[user_sparse_index]['user_id']
        m_song_ids = self.get_knn_top_m_song_ids(user_id=user_id, m=m)
        m_songs = self.song_df.loc[m_song_ids]['sparse_index'].tolist()

        #I don't think score/confidence is used in MAP@k function, so it doesn't matter what value is filled
        hopefully_unimportant_val = 0.69

        m_songs = [(song, hopefully_unimportant_val) for song in m_songs]
        rec_list = utilities.concat_shuffle(n_songs, m_songs)
        return rec_list[:N]
    
#things that can be optimized:
# - The alpha value for confidence of the matrix factorization algorithm
# - The number of iterations in the als algorithm
# - the joining of n and m in the recommendation algirhtm
print("Building model...")
model = ALSpkNN(user_df, song_df, k=100, knn_frac=0.5, cf_weighting_alpha=1)
print("Fitting model...")
model.fit(train_plays)
recs = model.recommend(user_sparse_index=21, train_plays=train_plays, N=5)
print(recs)

Building model...
Fitting model...



  0%|          | 0/2 [00:00<?, ?it/s][A
 25%|██▌       | 0.5/2 [00:00<00:02,  1.37s/it][A
 50%|█████     | 1.0/2 [00:01<00:01,  1.53s/it][A
 50%|█████     | 1.0/2 [00:01<00:01,  1.53s/it, loss=0.000146][A
 75%|███████▌  | 1.5/2 [00:02<00:00,  1.67s/it, loss=0.000146][A
100%|██████████| 2.0/2 [00:03<00:00,  1.79s/it, loss=0.000146][A
100%|██████████| 2.0/2 [00:03<00:00,  1.79s/it, loss=0.00014] [A
[A

[(88905, 0.69), (47158, 0.2676571), (146882, 0.40411413), (146882, 0.69), (78979, 0.27775383)]


In [None]:
print("Evaluating ALSpKNN")
start = time.time()
MAPk = mean_average_precision_at_k(
    model,
    train_plays.transpose(),
    test_plays.transpose(),
    K=5,
    show_progress=False,
    num_threads=0)

print("MAPK for ALSpKNN is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')

Evaluating ALSpKNN


In [15]:
print("Building and fitting the baseline CF model")
baseline_cf_model = get_baseline_cf_model()
weighted_train_csr = weight_cf_matrix(train_plays, alpha=1)
baseline_cf_model.fit(weighted_train_csr)

print("Evaluating the baseline CF model")
start = time.time()

# setting num_threads = 0 yields a 3x speedup on Nolan's MBP
MAPk = mean_average_precision_at_k(
    model=baseline_cf_model,
    train_user_items=train_plays.transpose(),
    test_user_items=test_plays.transpose(),
    K=5,
    show_progress=False,
    num_threads=0)

print("MAPK for baseline CF is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')

# MAPK for baseline CF is: 0.02496744369967655
# Calculation took 93.62211418151855s

Building and fitting the baseline CF model




  0%|          | 0/2 [00:00<?, ?it/s][A[A

 25%|██▌       | 0.5/2 [00:00<00:02,  1.41s/it][A[A

 50%|█████     | 1.0/2 [00:01<00:01,  1.58s/it][A[A

 50%|█████     | 1.0/2 [00:01<00:01,  1.58s/it, loss=0.000146][A[A

 75%|███████▌  | 1.5/2 [00:02<00:00,  1.72s/it, loss=0.000146][A[A

100%|██████████| 2.0/2 [00:03<00:00,  1.89s/it, loss=0.000146][A[A

100%|██████████| 2.0/2 [00:04<00:00,  1.89s/it, loss=0.000139][A[A

[A[A

Evaluating the baseline CF model
MAPK for baseline CF is: 0.02496744369967655
Calculation took 93.62211418151855s


# Test Code

In [39]:
knn_frac = 0.5
N = 7
m = int(np.round(knn_frac*N))
n = N - m
print(m)
print(type(n))

4
<class 'int'>


In [43]:
user_df.loc[user_df['user_id'] == '00000b722001882066dff9d2da8a775658053ea0']['MUSIC'].values[0]

[4.229812033333332,
 1.4042373883333334,
 3.718447513333333,
 -6.4863809183333325,
 2.166463065]

In [46]:
user_df.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [61]:
user_df.iloc[[0,1,2]]['user_id']

0    00000b722001882066dff9d2da8a775658053ea0
1    00001638d6189236866af9bbf309ae6c2347ffdc
2    0000175652312d12576d9e6b84f600caa24c4715
Name: user_id, dtype: object

In [51]:
sample_user_ids = ['00000b722001882066dff9d2da8a775658053ea0', '00001638d6189236866af9bbf309ae6c2347ffdc']
user_df.loc[user_df['user_id'].isin(sample_user_ids)]['song_ids'].values

array([list(['SOBSSGK12A6D4F9EF1', 'SOCZQCY12AC468E40F', 'SOCTXQW12A6D4F70AD']),
       list(['SOFXSRW12A6D4F3B77', 'SOFFWTH12A6310D9E8', 'SOLODPO12AB017F217', 'SOBFEDK12A8C13BB25', 'SOAORYL12A67AD8187', 'SOEKYTM12A8C13CBF4'])],
      dtype=object)

In [56]:
# a = [1,1,2,3,3,4]
a = [[1,1],[2,3,3,4]]
Counter(itertools.chain.from_iterable(a)).most_common(2)

[(1, 2), (3, 2)]

In [83]:
lol = users_mapping
lol.set_index('sparse_index', inplace=True)
lol.index.name = 'sparse_index'
lol.head()

Unnamed: 0_level_0,user
sparse_index,Unnamed: 1_level_1
9,00007a02388c208ea7176479f6ae06f8224355b3
21,00014a76ed063e1a749171a253bca9d9a0ff1782
22,00015189668691680bb1a2e58afde1541ec92ced
29,0001ff7aa2667c8d8b945317b88adaed1c0b9dc2
31,00020fcd8b01986a6a85b896ccde6c49f35142ad


In [86]:
lol.loc[21]['user']

'00014a76ed063e1a749171a253bca9d9a0ff1782'

In [82]:
yolo = songs_mapping
# yolo.head()
yolo.set_index('track', inplace=True)
yolo.head()

Unnamed: 0_level_0,sparse_index
track,Unnamed: 1_level_1
SOOFKYO12AF72A2640,221367
SOIHOIQ12A8C138593,132143
SOYIZSN12A6701E0BB,363912
SODYZAD12A58A7A525,63740
SOXLWPN12A8C143667,351788


In [99]:
yolo.loc[['SOOFKYO12AF72A2640', 'SOIHOIQ12A8C138593']]['sparse_index'].values

array([221367, 132143], dtype=object)