In [55]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import itertools
# import ALSpkNN
from implicit.evaluation import mean_average_precision_at_k

In [101]:
print("Loading data...")
user_df = pd.read_hdf('data/user_df.h5', key='df')

# train_plays, test_plays -> num_songs x num_users CSR matrix
train_plays = load_npz('data/train_sparse.npz')
test_plays = load_npz('data/test_sparse.npz')

# songs -> CSR_row_index: song_id
songs_mapping = pd.read_hdf('data/song_mapping.h5', key='df')
songs_mapping.set_index('song_id', inplace=True)

# users -> CSR_col_index: user_id
users_mapping = pd.read_hdf('data/user_mapping.h5', key='df')
users_mapping.set_index('sparse_index', inplace=True)

Loading data...


In [102]:
user_df.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [103]:
songs_mapping.head()

Unnamed: 0_level_0,sparse_index
song_id,Unnamed: 1_level_1
SOAKIMP12A8C130995,4785
SOAPDEY12A81C210A9,7052
SOBFOVM12A58A7D494,14453
SOBSUJE12A6D4F8CF5,20354
SOBVFZR12A6D4F8AE3,21408


In [104]:
users_mapping.head()

Unnamed: 0_level_0,user_id
sparse_index,Unnamed: 1_level_1
796068,b80344d063b5ccb3212f76538f3d9e43d87dca9e
538494,7c86176941718984fed11b7c0674ff04c029b480
510903,76235885b32c4e8c82760c340dc54f9b608d7d7e
159634,250c0fa2a77bc6695046e7c47882ecd85c42d748
273980,3f73f44560e822344b0fb7c6b463869743eb9860


In [105]:
users_mapping.shape

(1107613, 1)

In [106]:
songs_mapping.shape

(168493, 1)

In [107]:
from implicit.als import AlternatingLeastSquares
import os
os.environ['MKL_NUM_THREADS'] = '1'
from scipy.spatial import KDTree
import numpy as np
from collections import Counter
import utilities
import time
from random import shuffle

class ALSpkNN():
    
    '''
    knn_frac = % of KNN recommendations
    k = # of neighbours for KNN
    '''
    def __init__(self, user_df, user_mapping, song_mapping, k=100, knn_frac=0.5, cf_weighting_alpha=1):        
        self.user_mapping = user_mapping
        self.song_mapping = song_mapping
        self.user_df = user_df
        self.cf_weighting_alpha = cf_weighting_alpha
        self.knn_frac = knn_frac
        self.k = k
        self.kdtree = KDTree(user_df['MUSIC'].tolist())
        
        #build the collaborative filtering model with params hardcoded
        als_params = {
            'factors': 16,
            'dtype': np.float32,
            'iterations': 2,
            'calculate_training_loss': True
        }
        self.cf_model = AlternatingLeastSquares(**als_params)

    def fit(self, train_csr):
        #don't want to modify original incase it gets put into other models
        weighted_train_csr = train_csr.copy()
        weighted_train_csr.data = 1 + np.log(self.cf_weighting_alpha * train_csr.data)
        self.cf_model.fit(weighted_train_csr)

    # Returns list of song_ids
    def get_knn_top_m_song_ids(self, user_id, m):

        user_MUSIC = self.user_df.loc[self.user_df['user_id'] == user_id]['MUSIC'].values[0]
        distances, indices = self.kdtree.query(user_MUSIC, self.k, p=1)
        # TODO: maybe sort closest_user_ids by distance if they are not already sorted?
        
        closest_user_ids = self.user_df.iloc[indices]['user_id'].to_list()
        
        # closest_user_songs -> list of lists of song_ids, len(closest_user_songs) == k
        closest_user_songs = self.user_df.loc[self.user_df['user_id'].isin(closest_user_ids)]['song_ids'].values
        
        # closest_user_songs_flat -> list of song_ids
        closest_user_songs_flat = itertools.chain.from_iterable(closest_user_songs)
        
        top_m_songs = [i[0] for i in Counter(closest_user_songs_flat).most_common(m)]
        return top_m_songs

    # Returns [(song_sparse_index, confidence)]
    def recommend(self, user_sparse_index, train_plays, N):
        # m -> number of songs from KNN recs
        m = int(np.round(knn_frac*N))
        # n -> number of songs from CF recs
        n = N - m
        
        n_songs = self.cf_model.recommend(userid=user_sparse_index, user_items=train_plays.transpose(), N=n)
        
        user_id = self.user_mapping.loc[user_sparse_index]['user']

        m_song_ids = self.get_knn_top_m_song_ids(user_id=user_id, m=m)
        m_songs = self.song_mapping[m_song_ids]['sparse_index'].tolist()

        #I don't think score/confidence is used in MAP@k function, so it doesn't matter what value is filled
        hopefully_unimportant_val = 0.69

        m_songs = [(song, hopefully_unimportant_val) for song in m_songs]
        rec_list = utilities.concat_shuffle(n_songs, m_songs)
        return rec_list[:N]
    
#things that can be optimized:
# - The alpha value for confidence of the matrix factorization algorithm
# - The number of iterations in the als algorithm
# - the joining of n and m in the recommendation algirhtm
print("Building model...")
model = ALSpkNN(user_df, users_mapping, songs_mapping, k=100, knn_frac=0.5, cf_weighting_alpha=1)
print("Fitting model...")
model.fit(train_plays)
recs = model.recommend(user_sparse_index=21, train_plays=train_plays, N=5)
print(recs)

Building model...
Fitting model...





  0%|          | 0/2 [00:00<?, ?it/s][A[A[A


 25%|██▌       | 0.5/2 [00:00<00:02,  1.42s/it][A[A[A


 50%|█████     | 1.0/2 [00:01<00:01,  1.60s/it][A[A[A


 50%|█████     | 1.0/2 [00:01<00:01,  1.60s/it, loss=0.000146][A[A[A


 75%|███████▌  | 1.5/2 [00:02<00:00,  1.66s/it, loss=0.000146][A[A[A


100%|██████████| 2.0/2 [00:03<00:00,  1.73s/it, loss=0.000146][A[A[A


100%|██████████| 2.0/2 [00:03<00:00,  1.73s/it, loss=0.00014] [A[A[A


[A[A[A

KeyError: 'user'

In [65]:
#example_sparse_user_id_in_all_df = 1119317
print("Evaluating the Model")

MAPk = mean_average_precision_at_k(
    model,
    train_plays.transpose(),
    test_plays.transpose(),
    K=5)
#     show_progress=True,
#     num_threads=0)

print("MAPK is: " + str(MAPk))

Evaluating the Model




  0%|          | 0/1119318 [00:00<?, ?it/s][A[A

  0%|          | 10/1119318 [00:00<9:36:50, 32.34it/s][A[A

  0%|          | 22/1119318 [00:00<8:54:34, 34.90it/s][A[A

  0%|          | 24/1119318 [00:00<18:56:02, 16.42it/s][A[A

  0%|          | 30/1119318 [00:01<17:26:56, 17.82it/s][A[A

  0%|          | 32/1119318 [00:01<24:38:16, 12.62it/s][A[A

  0%|          | 48/1119318 [00:01<18:49:48, 16.51it/s][A[A

  0%|          | 54/1119318 [00:01<17:17:49, 17.97it/s][A[A

  0%|          | 66/1119318 [00:02<14:12:21, 21.89it/s][A[A

  0%|          | 70/1119318 [00:02<16:19:51, 19.04it/s][A[A

  0%|          | 85/1119318 [00:02<13:07:28, 23.69it/s][A[A

  0%|          | 92/1119318 [00:03<12:44:39, 24.40it/s][A[A

  0%|          | 96/1119318 [00:03<17:23:39, 17.87it/s][A[A

  0%|          | 104/1119318 [00:03<18:25:15, 16.88it/s][A[A

  0%|          | 107/1119318 [00:04<21:33:29, 14.42it/s][A[A

  0%|          | 109/1119318 [00:04<30:05:47, 10.33it/s][A[A

 

IndexError: index 0 is out of bounds for axis 0 with size 0

In [39]:
knn_frac = 0.5
N = 7
m = int(np.round(knn_frac*N))
n = N - m
print(m)
print(type(n))

4
<class 'int'>


In [43]:
user_df.loc[user_df['user_id'] == '00000b722001882066dff9d2da8a775658053ea0']['MUSIC'].values[0]

[4.229812033333332,
 1.4042373883333334,
 3.718447513333333,
 -6.4863809183333325,
 2.166463065]

In [46]:
user_df.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [61]:
user_df.iloc[[0,1,2]]['user_id']

0    00000b722001882066dff9d2da8a775658053ea0
1    00001638d6189236866af9bbf309ae6c2347ffdc
2    0000175652312d12576d9e6b84f600caa24c4715
Name: user_id, dtype: object

In [51]:
sample_user_ids = ['00000b722001882066dff9d2da8a775658053ea0', '00001638d6189236866af9bbf309ae6c2347ffdc']
user_df.loc[user_df['user_id'].isin(sample_user_ids)]['song_ids'].values

array([list(['SOBSSGK12A6D4F9EF1', 'SOCZQCY12AC468E40F', 'SOCTXQW12A6D4F70AD']),
       list(['SOFXSRW12A6D4F3B77', 'SOFFWTH12A6310D9E8', 'SOLODPO12AB017F217', 'SOBFEDK12A8C13BB25', 'SOAORYL12A67AD8187', 'SOEKYTM12A8C13CBF4'])],
      dtype=object)

In [56]:
# a = [1,1,2,3,3,4]
a = [[1,1],[2,3,3,4]]
Counter(itertools.chain.from_iterable(a)).most_common(2)

[(1, 2), (3, 2)]

In [83]:
lol = users_mapping
lol.set_index('sparse_index', inplace=True)
lol.index.name = 'sparse_index'
lol.head()

Unnamed: 0_level_0,user
sparse_index,Unnamed: 1_level_1
9,00007a02388c208ea7176479f6ae06f8224355b3
21,00014a76ed063e1a749171a253bca9d9a0ff1782
22,00015189668691680bb1a2e58afde1541ec92ced
29,0001ff7aa2667c8d8b945317b88adaed1c0b9dc2
31,00020fcd8b01986a6a85b896ccde6c49f35142ad


In [86]:
lol.loc[21]['user']

'00014a76ed063e1a749171a253bca9d9a0ff1782'

In [82]:
yolo = songs_mapping
# yolo.head()
yolo.set_index('track', inplace=True)
yolo.head()

Unnamed: 0_level_0,sparse_index
track,Unnamed: 1_level_1
SOOFKYO12AF72A2640,221367
SOIHOIQ12A8C138593,132143
SOYIZSN12A6701E0BB,363912
SODYZAD12A58A7A525,63740
SOXLWPN12A8C143667,351788


In [99]:
yolo.loc[['SOOFKYO12AF72A2640', 'SOIHOIQ12A8C138593']]['sparse_index'].values

array([221367, 132143], dtype=object)