In [None]:
import os

if not os.getcwd().endswith("src"):
    %cd ..
%pwd

/home/xqz-u/master/FACT/FACT/src


'/home/xqz-u/master/FACT/FACT/src'

In [None]:
import implicit
import numpy as np
import pandas as pd
import scipy

import config



In [None]:
# Read the tab-separated Last.fm user/artist table and rename the id columns to
# the generic "user"/"item" names used by the following cells.
user_artist_df = pd.read_csv(
    config.LASTFM_DIR / "user_artists.dat",
    sep="\t",
).rename(columns={"userID": "user", "artistID": "item"})
user_artist_df

Unnamed: 0,user,item,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983
...,...,...,...
92829,2100,18726,337
92830,2100,18727,297
92831,2100,18728,281
92832,2100,18729,280


In [None]:
# steps:
#     1. keep only top-2500 most listened artists DONE
#     2. pre-process raw counts with log transforms (is it just taking the log?) DONE
#     3. transform into full user-item preference matrix DONE
#     4. split into 70/10/20 train/val/test sets, save the seeds used
#     5. use the Implicit library to fit a matrix factorization, using
#        grid-search on the hyperparameters defined in appendix C.2
#     6. generalize to MovieLens dataset, gpu etc.

In [None]:
# Keep only the k artists with the largest total listening weight.
# NOTE(review): this cell overwrites `user_artist_df`, so it is not idempotent;
# re-running it without re-loading the data applies the log transform again.
k = 2500
top_k_artists = (
    user_artist_df.groupby("item")["weight"]
    .sum()
    .sort_values(ascending=False)
    .index[:k]
    .to_numpy()
)
user_artist_df = user_artist_df.loc[user_artist_df["item"].isin(top_k_artists)]
# sanity check: exactly the selected artists survive the filter
assert set(user_artist_df["item"]) == set(top_k_artists)

# Log-transform the weights. `assign` returns a new frame with a fresh float
# column, which avoids both the SettingWithCopy warning and the in-place
# int -> float upcast that `.loc[:, "weight"] = ...` would perform (deprecated
# by pandas, see PDEP-6).
# NOTE(review): log(1) == 0, so any entry with weight 1 becomes 0 and is
# indistinguishable from "never listened" after the later `fillna(0)` pivot —
# confirm whether the paper uses log or log1p.
user_artist_df = user_artist_df.assign(weight=np.log(user_artist_df["weight"]))
user_artist_df

Unnamed: 0,user,item,weight
0,2,51,9.538420
1,2,52,9.366489
2,2,53,9.337061
3,2,54,9.239899
4,2,55,9.103089
...,...,...,...
92795,2100,1276,7.032624
92796,2100,1281,6.350886
92797,2100,2749,6.276643
92798,2100,2765,6.124683


In [None]:
# Reshape the long (user, item, weight) table into a dense user x item matrix;
# user/item pairs that do not occur in the data are filled with 0.
user_artist_df = (
    user_artist_df
    .pivot(index="user", columns="item", values="weight")
    .fillna(0)
)
user_artist_df

item,2,6,7,8,9,10,12,15,18,19,...,18125,18126,18127,18205,18206,18434,18435,18558,18559,18575
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2096,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2099,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# implicit expects a sparse matrix laid out as (user, item). Its docs still say
# (item, user), but they are outdated — check the library source code instead.
sparse_df = scipy.sparse.csr_matrix(user_artist_df.to_numpy())
sparse_df

<1880x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 67009 stored elements in Compressed Sparse Row format>

In [None]:
import time

# initialize a model
# NOTE hyperparameters mentioned in paper:
#     regularization -> "regularization"
#     confidence weight -> is it "alpha"? check Hu, Koren and Volinsky (2008)
#                          or is it bm25 weights as in here
#                          https://github.com/benfred/implicit/blob/871e0c7229b012108131b6211cd617e23a3b24bf/examples/tutorial_lastfm.ipynb
#                          (I don't think so, it should be mentioned in the paper)
model = implicit.als.AlternatingLeastSquares(
    factors=50,
    calculate_training_loss=True,
    # fixed seed for the factor initialisation so the fit is reproducible
    random_state=42,
    # regularization=0.1,
    # alpha=1.0
)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump
start = time.perf_counter()
# train the model on the sparse (user, item) matrix of log-transformed weights
model.fit(sparse_df)
print(f"elapsed time: {time.perf_counter() - start}")

  0%|          | 0/15 [00:00<?, ?it/s]

elapsed time: 6.913877010345459


In [None]:
# shapes of the learned latent factors: (n_users, factors) and (n_items, factors)
print(model.user_factors.shape, model.item_factors.shape)
# NOTE multiplying the user factors by the transposed item factors yields a dense
# score for every (user, item) pair, i.e. this is basically matrix completion
np.matmul(model.user_factors, model.item_factors.T)

(1880, 50) (2500, 50)


array([[-0.10829641, -0.11628836, -0.46695313, ..., -0.01436631,
        -0.01338313,  0.05232564],
       [ 0.03210636, -0.01173768, -0.07893225, ...,  0.00226942,
         0.00231467, -0.00813876],
       [ 0.11879817,  0.06920086, -0.17183183, ..., -0.05037339,
        -0.04876595,  0.01020042],
       ...,
       [-0.06132386,  0.0859365 ,  0.24173555, ..., -0.01015856,
        -0.009132  , -0.01473843],
       [ 0.02174152, -0.0274522 , -0.1082935 , ...,  0.00801721,
         0.00801332, -0.02827725],
       [ 0.01453499,  0.36707014,  0.06106527, ...,  0.00793931,
         0.00827053, -0.01064318]], dtype=float32)

In [None]:
# Module-level list; presumably meant to collect the per-iteration training loss
# handed to `als_loss_callback` — TODO confirm once the callback is implemented.
loss = []

def als_loss_callback(iteration, rel_time, iter_loss: float):
    """Placeholder for a per-iteration ALS training callback (not implemented yet).

    The body is currently just `...`, so calling this does nothing and returns None.

    NOTE(review): the parameter names suggest an iteration index, an elapsed
    time and the loss of that iteration — presumably this is meant to be passed
    as the `callback` of `model.fit`; verify against the implicit API before
    relying on it.
    """
    ...

def fit_ground_truth(user_item: scipy.sparse.csr_matrix, model: implicit.als.AlternatingLeastSquares):
    """Placeholder (not implemented yet): the body is just `...` and returns None.

    NOTE(review): judging by the name and annotations this is presumably meant
    to fit `model` on the sparse `user_item` matrix to obtain "ground truth"
    preferences — confirm the intended contract when implementing.
    """
    ...