In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import itertools
# import ALSpkNN
import time
# from implicit.evaluation import mean_average_precision_at_k

In [2]:
user_df = pd.read_hdf('data/user_df.h5', key='df')[['user_id', 'sparse_index', 'MUSIC', 'song_ids']]
user_df.set_index('sparse_index', inplace=True)

song_df = pd.read_hdf('data/song_df.h5', key='df')[['song_id', 'sparse_index']]
song_df.set_index('song_id', inplace=True)

# train_plays, test_plays -> num_songs x num_users CSR matrix
train_plays = load_npz('data/train_sparse.npz')
test_plays = load_npz('data/test_sparse.npz')

# # songs -> CSR_row_index: song_id
# songs_mapping = pd.read_hdf('data/song_mapping.h5', key='df')
# songs_mapping.set_index('song_id', inplace=True)

# # users -> CSR_col_index: user_id
# users_mapping = pd.read_hdf('data/user_mapping.h5', key='df')
# users_mapping.set_index('sparse_index', inplace=True)

In [26]:
test_plays.nnz

458392

In [35]:
coo = test_plays.tocoo()
print(f'len rows: {len(coo.row)}')
print(f'len col: {len(coo.col)}')

len rows: 458392
len col: 458392


In [3]:
user_df.head()

Unnamed: 0_level_0,user_id,MUSIC,song_ids
sparse_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,00000b722001882066dff9d2da8a775658053ea0,"[4.229812033333332, 1.4042373883333334, 3.7184...","[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX..."
1,00001638d6189236866af9bbf309ae6c2347ffdc,"[4.910766939999999, 1.6201183160000001, 4.2151...","[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD..."
2,0000175652312d12576d9e6b84f600caa24c4715,"[3.9929606913333338, 1.3756423253666668, 3.569...","[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP..."
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,"[4.160212249999999, 1.38550505, 3.48416005, -6...","[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ..."
4,0000267bde1b3a70ea75cf2b2d216cb828e3202b,"[5.020851199999999, 1.2992664299999999, 4.2901...","[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH..."


In [4]:
song_df.head()

Unnamed: 0_level_0,sparse_index
song_id,Unnamed: 1_level_1
SOAKIMP12A8C130995,4785
SOAPDEY12A81C210A9,7052
SOBFOVM12A58A7D494,14453
SOBSUJE12A6D4F8CF5,20354
SOBVFZR12A6D4F8AE3,21408


In [5]:
user_df.shape

(1107613, 3)

In [6]:
song_df.shape

(168503, 1)

In [7]:
test_plays.shape

(168493, 1107613)

In [8]:
len(test_plays.indices)

458392

In [9]:
test_plays.indices[:100]

array([  49143,  787772,  109151,   66449,   83772,  111555,  198281,
        287061,  376151,  384957,  398788,  408648,  416998,  434386,
        586799,  622743,  698878,  726211,  769562,  810818,  863604,
        896862,  965170, 1086690,  758223,  875592,  926358, 1000342,
       1076076,  605153,  563467,   13436,  108695,  538736,  877772,
        952790,  327273,  869363,    8502,  416624,   48417,  849814,
        260213,  676714,  850119,  887777, 1063869,  212451,  900438,
         28039,   83610,  140538,  199479,  216648,  218345,  236168,
        257469,  278395,  285993,  348678,  355035,  359660,  363542,
        369556,  384475,  396037,  399130,  400412,  424980,  464854,
        466695,  473425,  527962,  541112,  555926,  621073,  624586,
        669853,  711671,  739013,  743164,  755922,  801258,  808653,
        840179,  840866,  847948,  855896,  879143,  942893,  969308,
        986141, 1023296, 1024617, 1029356, 1046200, 1048407, 1053872,
         12738,   18

In [10]:
len(test_plays.indptr)

168494

In [11]:
test_plays.transpose().indptr[:100]

array([  0,   0,   0,   0,   2,   2,   2,   3,   3,  24,  24,  24,  25,
        25,  29,  29,  29,  29,  29,  30,  30,  31,  31,  31,  31,  31,
        35,  36,  38,  38,  40,  40,  40,  42,  47,  49,  49,  49,  98,
        98,  98, 125, 125, 125, 125, 125, 125, 125, 126, 127, 127, 128,
       128, 128, 128, 128, 129, 130, 131, 136, 136, 136, 136, 136, 136,
       136, 139, 144, 151, 151, 151, 151, 151, 152, 154, 156, 156, 156,
       167, 169, 169, 170, 170, 171, 173, 173, 173, 173, 173, 173, 173,
       228, 229, 230, 230, 230, 232, 233, 233, 233], dtype=int32)

In [12]:
test_user_items = test_plays.transpose()

In [13]:
test_user_items.shape

(1107613, 168493)

In [14]:
test_user_items.indptr[:100]

array([  0,   0,   0,   0,   2,   2,   2,   3,   3,  24,  24,  24,  25,
        25,  29,  29,  29,  29,  29,  30,  30,  31,  31,  31,  31,  31,
        35,  36,  38,  38,  40,  40,  40,  42,  47,  49,  49,  49,  98,
        98,  98, 125, 125, 125, 125, 125, 125, 125, 126, 127, 127, 128,
       128, 128, 128, 128, 129, 130, 131, 136, 136, 136, 136, 136, 136,
       136, 139, 144, 151, 151, 151, 151, 151, 152, 154, 156, 156, 156,
       167, 169, 169, 170, 170, 171, 173, 173, 173, 173, 173, 173, 173,
       228, 229, 230, 230, 230, 232, 233, 233, 233], dtype=int32)

In [15]:
{2,3} & {2,4}

{2}

In [16]:
[i for i in range(1,2)]

[1]

In [17]:
from numpy import array
from scipy.sparse import csr_matrix
A = array([[0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0], [0, 0, 2, 0, 0, 1], [0, 0, 0, 2, 0, 0]])
print(A)

S = csr_matrix(A)
print(S)
print(f'shape: {S.shape}')
print(f'indices: {S.indices}')
print(f'indptr: {S.indptr}')

S_T = S.transpose()
print(S_T)
print(f'shape: {S_T.shape}')
print(f'indices: {S_T.indices}')
print(f'indptr: {S_T.indptr}')


[[0 0 0 0 0 0]
 [1 0 0 1 0 0]
 [0 0 2 0 0 1]
 [0 0 0 2 0 0]]
  (1, 0)	1
  (1, 3)	1
  (2, 2)	2
  (2, 5)	1
  (3, 3)	2
shape: (4, 6)
indices: [0 3 2 5 3]
indptr: [0 0 2 4 5]
  (0, 1)	1
  (3, 1)	1
  (2, 2)	2
  (5, 2)	1
  (3, 3)	2
shape: (6, 4)
indices: [0 3 2 5 3]
indptr: [0 0 2 4 5]


In [18]:
from numpy import array
from scipy.sparse import csr_matrix
A = array([[0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0], [0, 0, 2, 0, 0, 1], [0, 0, 0, 2, 0, 0]])
print(A.T)
print(A.T.shape)

S = csr_matrix(A.T)
print(S)
print(f'shape: {S.shape}')
print(f'indices: {S.indices}')
print(f'indptr: {S.indptr}')

S_T = S.transpose()
print(S_T)
print(f'shape: {S_T.shape}')
print(f'indices: {S_T.indices}')
print(f'indptr: {S_T.indptr}')

[[0 1 0 0]
 [0 0 0 0]
 [0 0 2 0]
 [0 1 0 2]
 [0 0 0 0]
 [0 0 1 0]]
(6, 4)
  (0, 1)	1
  (2, 2)	2
  (3, 1)	1
  (3, 3)	2
  (5, 2)	1
shape: (6, 4)
indices: [1 2 1 3 2]
indptr: [0 1 1 2 4 4 5]
  (1, 0)	1
  (2, 2)	2
  (1, 3)	1
  (3, 3)	2
  (2, 5)	1
shape: (4, 6)
indices: [1 2 1 3 2]
indptr: [0 1 1 2 4 4 5]


In [19]:
from ALSpkNN import get_baseline_cf_model, weight_cf_matrix

# print("Building and fitting the baseline CF model")
baseline_cf_model = get_baseline_cf_model()
# weighted_train_csr = weight_cf_matrix(train_plays, alpha=1)
baseline_cf_model.fit(train_plays)

# print("Evaluating the baseline CF model")
start = time.time()

# setting num_threads = 0 yields a 3x speedup on Nolan's MBP
MAPk = mean_average_precision_at_k(
    model=baseline_cf_model,
    train_user_items=train_plays.transpose(),
    test_user_items=test_plays.transpose(),
    K=5,
    show_progress=True,
    num_threads=0)

print("MAPK for baseline CF is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')

# MAPK for baseline CF is: 0.021571399459982602
# Calculation took 103.20778608322144s

# MAPK for baseline CF is: 0.02496744369967655
# Calculation took 93.62211418151855s

# MAPK for baseline CF is: 0.01910300963831448
# Calculation took 103.18802404403687s

100%|██████████| 2.0/2 [00:04<00:00,  2.10s/it, loss=0.000249]
100%|██████████| 1107613/1107613 [01:41<00:00, 10916.69it/s]

MAPK for baseline CF is: 0.01910300963831448
Calculation took 103.18802404403687s





In [37]:
ls

ALSpKNN.ipynb                    evaluate_models.py
ALSpkNN.py                       evaluation.py
Map_spotify_user_to_MUSIC.ipynb  gen_user_recs.py
README.md                        [34mold_code[m[m/
Spotify_CF.ipynb                 test_ALSpKNN.py
[34m__pycache__[m[m/                     utilities.py
create_all_data.ipynb            [34mweb_prototype[m[m/
[34mdata[m[m/                            yolo.pyx
[34mdata_processing[m[m/


In [36]:
from yolo import mean_average_precision_at_k
from ALSpkNN import get_baseline_cf_model, weight_cf_matrix
# from evaluation import py_mean_average_precision_at_k

# print("Building and fitting the baseline CF model")
baseline_cf_model = get_baseline_cf_model()
# weighted_train_csr = weight_cf_matrix(train_plays, alpha=1)
baseline_cf_model.fit(train_plays)

# print("Evaluating the baseline CF model")
start = time.time()

# setting num_threads = 0 yields a 3x speedup on Nolan's MBP
MAPk = mean_average_precision_at_k(
    model=baseline_cf_model,
    train_user_items=train_plays.transpose(),
    test_user_items=test_plays.transpose(),
    K=5,
    show_progress=True,
    num_threads=0)

print("MAPK for baseline CF is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')


ModuleNotFoundError: No module named 'yolo'

In [21]:
from ALSpkNN import get_baseline_cf_model, weight_cf_matrix
from evaluation import py_mean_average_precision_at_k

# print("Building and fitting the baseline CF model")
baseline_cf_model = get_baseline_cf_model()
# weighted_train_csr = weight_cf_matrix(train_plays, alpha=1)
baseline_cf_model.fit(train_plays)

# print("Evaluating the baseline CF model")
start = time.time()

# setting num_threads = 0 yields a 3x speedup on Nolan's MBP
MAPk = py_mean_average_precision_at_k(
    model=baseline_cf_model,
    train_user_items=train_plays.transpose(),
    test_user_items=test_plays.transpose(),
    K=5,
    show_progress=True,
    num_threads=0)

print("MAPK for baseline CF is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')

100%|██████████| 2.0/2 [00:04<00:00,  1.99s/it, loss=0.000249]
  0%|          | 0/1107613 [00:00<?, ?it/s]


UnboundLocalError: local variable 'precisions_sum' referenced before assignment

In [61]:
from ALSpkNN import ALSpkNN

print("Building model...")
model = ALSpkNN(user_df, song_df, k=100, knn_frac=0.5, cf_weighting_alpha=1)
print("Fitting model...")
model.fit(train_plays)
recs = model.recommend(user_sparse_index=12345, train_plays_transpose=train_plays, N=5)
print(recs)

Building model...


TypeError: __init__() missing 1 required positional argument: 'song_mapping'

In [None]:
start = time.time()
MAPk = mean_average_precision_at_k(
    model,
    train_plays.transpose(),
    test_plays.transpose(),
    K=5)
#     show_progress=False,
#     num_threads=0)

print("MAPK for ALSpKNN is: " + str(MAPk))
print(f'Calculation took {time.time() - start}s')

 73%|███████▎  | 808721/1107613 [3:03:36<5082:43:41, 61.22s/it]

# Test Code

In [39]:
knn_frac = 0.5
N = 7
m = int(np.round(knn_frac*N))
n = N - m
print(m)
print(type(n))

4
<class 'int'>


In [43]:
user_df.loc[user_df['user_id'] == '00000b722001882066dff9d2da8a775658053ea0']['MUSIC'].values[0]

[4.229812033333332,
 1.4042373883333334,
 3.718447513333333,
 -6.4863809183333325,
 2.166463065]

In [46]:
user_df.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [61]:
user_df.iloc[[0,1,2]]['user_id']

0    00000b722001882066dff9d2da8a775658053ea0
1    00001638d6189236866af9bbf309ae6c2347ffdc
2    0000175652312d12576d9e6b84f600caa24c4715
Name: user_id, dtype: object

In [51]:
sample_user_ids = ['00000b722001882066dff9d2da8a775658053ea0', '00001638d6189236866af9bbf309ae6c2347ffdc']
user_df.loc[user_df['user_id'].isin(sample_user_ids)]['song_ids'].values

array([list(['SOBSSGK12A6D4F9EF1', 'SOCZQCY12AC468E40F', 'SOCTXQW12A6D4F70AD']),
       list(['SOFXSRW12A6D4F3B77', 'SOFFWTH12A6310D9E8', 'SOLODPO12AB017F217', 'SOBFEDK12A8C13BB25', 'SOAORYL12A67AD8187', 'SOEKYTM12A8C13CBF4'])],
      dtype=object)

In [56]:
# a = [1,1,2,3,3,4]
a = [[1,1],[2,3,3,4]]
Counter(itertools.chain.from_iterable(a)).most_common(2)

[(1, 2), (3, 2)]

In [83]:
lol = users_mapping
lol.set_index('sparse_index', inplace=True)
lol.index.name = 'sparse_index'
lol.head()

Unnamed: 0_level_0,user
sparse_index,Unnamed: 1_level_1
9,00007a02388c208ea7176479f6ae06f8224355b3
21,00014a76ed063e1a749171a253bca9d9a0ff1782
22,00015189668691680bb1a2e58afde1541ec92ced
29,0001ff7aa2667c8d8b945317b88adaed1c0b9dc2
31,00020fcd8b01986a6a85b896ccde6c49f35142ad


In [86]:
lol.loc[21]['user']

'00014a76ed063e1a749171a253bca9d9a0ff1782'

In [82]:
yolo = songs_mapping
# yolo.head()
yolo.set_index('track', inplace=True)
yolo.head()

Unnamed: 0_level_0,sparse_index
track,Unnamed: 1_level_1
SOOFKYO12AF72A2640,221367
SOIHOIQ12A8C138593,132143
SOYIZSN12A6701E0BB,363912
SODYZAD12A58A7A525,63740
SOXLWPN12A8C143667,351788


In [99]:
yolo.loc[['SOOFKYO12AF72A2640', 'SOIHOIQ12A8C138593']]['sparse_index'].values

array([221367, 132143], dtype=object)

In [42]:
%load_ext cython

In [48]:
%%cython?

In [47]:
%%cython –-cplus

import tqdm
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
import cython
from cython.operator import dereference
from cython.parallel import parallel, prange
from libc.stdlib cimport malloc, free
from libc.string cimport memset
from libc.math cimport fmin

from libcpp.unordered_set cimport unordered_set

@cython.boundscheck(False)
def mean_average_precision_at_k(model, train_user_items, test_user_items, int K=10,
                                show_progress=True, int num_threads=1):
    """ Calculates MAP@K for a given trained model
    Parameters
    ----------
    model : RecommenderBase
        The fitted recommendation model to test
    train_user_items : csr_matrix
        Sparse matrix of user by item that contains elements that were used in training the model
    test_user_items : csr_matrix
        Sparse matrix of user by item that contains withheld elements to test on
    K : int
        Number of items to test on
    show_progress : bool, optional
        Whether to show a progress bar
    num_threads : int, optional
        The number of threads to use for testing. Specifying 0 means to default
        to the number of cores on the machine. Note: aside from the ALS and BPR
        models, setting this to more than 1 will likely hurt performance rather than
        help.
    Returns
    -------
    float
        the calculated MAP@k
    """
    # TODO: there is a fair amount of boilerplate here that is cut and paste
    # from precision_at_k. refactor it out.
    if not isinstance(train_user_items, csr_matrix):
        train_user_items = train_user_items.tocsr()

    if not isinstance(test_user_items, csr_matrix):
        test_user_items = test_user_items.tocsr()

    cdef int users = test_user_items.shape[0], u, i, total = 0
    cdef double mean_ap = 0, ap = 0, relevant = 0
    cdef int[:] test_indptr = test_user_items.indptr
    cdef int[:] test_indices = test_user_items.indices

    cdef int * ids
    cdef unordered_set[int] * likes

    progress = tqdm.tqdm(total=users, disable=not show_progress)

    with nogil, parallel(num_threads=num_threads):
        ids = <int *> malloc(sizeof(int) * K)
        likes = new unordered_set[int]()
        try:
            for u in prange(1000, schedule='guided'):
                # if we don't have any test items, skip this user
                if test_indptr[u] == test_indptr[u+1]:
                    with gil:
                        progress.update(1)
                    continue
                memset(ids, 0, sizeof(int) * K)

                with gil:
                    recs = model.recommend(u, train_user_items, N=K)
                    for i in range(len(recs)):
                        ids[i] = recs[i][0]
                    progress.update(1)

                # mostly we're going to be blocked on the gil here,
                # so try to do actual scoring without it
                likes.clear()
                for i in range(test_indptr[u], test_indptr[u+1]):
                    likes.insert(test_indices[i])

                ap = 0
                relevant = 0
                for i in range(K):
                    if likes.find(ids[i]) != likes.end():
                        relevant = relevant + 1
                        ap = ap + relevant / (i + 1)
                mean_ap += ap / fmin(K, likes.size())
                total += 1
        finally:
            free(ids)
            del likes

    progress.close()
    return mean_ap / total


UsageError: unrecognized arguments: –-cplus
