In [None]:
import os

# Make sure the notebook runs from a directory whose path ends in "src"
# (presumably so that project modules such as `config` can be imported — confirm).
if not os.getcwd().endswith("src"):
    # IPython magic: move one level up from the current working directory.
    %cd ..
# IPython magic: display the working directory that is now in effect.
%pwd

/home/xqz-u/master/FACT/FACT/src


'/home/xqz-u/master/FACT/FACT/src'

In [None]:
import implicit
import numpy as np
import pandas as pd
import scipy

import config



In [None]:
# Read the tab-separated user/artist listening table and normalise the id
# column names to the generic "user" / "item" used in the rest of the notebook.
user_artist_df = (
    pd.read_csv(config.LASTFM_DIR / "user_artists.dat", sep="\t")
    .rename(columns={"userID": "user", "artistID": "item"})
)
user_artist_df

Unnamed: 0,user,item,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983
...,...,...,...
92829,2100,18726,337
92830,2100,18727,297
92831,2100,18728,281
92832,2100,18729,280


In [None]:
# steps:
#     1. keep only top-2500 most listened artists DONE
#     2. pre-process raw counts with log transforms (is it just taking the log?) DONE
#     3. transform into full user-item preference matrix DONE
#     4. split into 70/10/20 train/val/test sets, save the seeds used
#     5. use Implicit library to fit a matrix factorization, using
#        grid-search on hyperparameters defined in appendix C.2
#     6. generalize to MovieLens dataset, gpu etc.

In [None]:
# filter only top k artists
k = 2500
# Rank artists by their total listening weight summed over all users and keep
# the ids of the k heaviest ones.
top_k_artists = (
    user_artist_df.groupby("item")["weight"]
    .sum()
    .sort_values(ascending=False)
    .index[:k]
    .to_numpy()
)
user_artist_df = user_artist_df.loc[user_artist_df["item"].isin(top_k_artists)]
# Sanity check: every selected artist still has at least one interaction and
# no other artist survived the filter.
assert set(user_artist_df["item"]) == set(top_k_artists)
# log-transform
# `assign` returns a new frame with a freshly built float column, so there is
# no need for a defensive `.copy()` and no float-into-int in-place write
# (which recent pandas versions deprecate as silent upcasting).
# NOTE(review): this cell overwrites `user_artist_df`, so running it twice
# applies the log twice — re-run the loading cell first when iterating.
# NOTE(review): a weight of exactly 1 becomes log(1) = 0, which the later
# `fillna(0)` pivot cannot tell apart from "no interaction" — confirm intended.
user_artist_df = user_artist_df.assign(weight=np.log(user_artist_df["weight"]))
user_artist_df

Unnamed: 0,user,item,weight
0,2,51,9.538420
1,2,52,9.366489
2,2,53,9.337061
3,2,54,9.239899
4,2,55,9.103089
...,...,...,...
92795,2100,1276,7.032624
92796,2100,1281,6.350886
92797,2100,2749,6.276643
92798,2100,2765,6.124683


In [None]:
# Reshape the long (user, item, weight) table into a wide matrix with one row
# per user and one column per item, then treat missing pairs as weight 0.
user_item_df = user_artist_df.pivot(index="user", columns="item", values="weight")
user_item_df = user_item_df.fillna(0)
user_item_df

item,2,6,7,8,9,10,12,15,18,19,...,18125,18126,18127,18205,18206,18434,18435,18558,18559,18575
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2096,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2099,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# implicit expects sparse matrices laid out as (user, item). Its docs still
# say (item, user) but they are outdated — check the library source instead.
user_item_csr = scipy.sparse.csr_matrix(user_item_df.to_numpy())
user_item_csr

<1880x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 67009 stored elements in Compressed Sparse Row format>

In [None]:
# split into 0.7 train 0.2 val 0.1 test
from implicit import evaluation
import numpy as np

# One seed drives both splits below and the `rng` generator, so the partition
# can be reproduced.
seed = 42
rng = np.random.default_rng(seed=seed)

# First keep roughly 70% of the stored interactions for training; the remaining
# ~30% is then divided 2/3 : 1/3, i.e. about 20% validation and 10% test overall.
# NOTE(review): the step list earlier in this notebook says 70/10/20
# train/val/test, which is the opposite val/test ratio — confirm which is intended.
train_csr, tmp_csr = evaluation.train_test_split(user_item_csr, train_percentage=0.7, random_state=seed)
val_csr, test_csr = evaluation.train_test_split(tmp_csr, train_percentage=2/3, random_state=seed)
train_csr, val_csr

(<1880x2500 sparse matrix of type '<class 'numpy.float64'>'
 	with 47004 stored elements in Compressed Sparse Row format>,
 <1880x2500 sparse matrix of type '<class 'numpy.float64'>'
 	with 13362 stored elements in Compressed Sparse Row format>)

In [None]:
from typing import Dict, Sequence
from scipy import sparse
import itertools as it
import pprint



# The ranking metric used for model selection is configurable via `metric`.
def grid_search(
    train_mat: sparse.csr_matrix,
    valid_mat: sparse.csr_matrix,
    hyperparams: Dict[str, Sequence],
    best_model_path: str,
    best_model_hyperp_path: str,
    metric: str = "map",
) -> "implicit.als.AlternatingLeastSquares":
    """Exhaustively search an ALS hyperparameter grid and persist the winner.

    One `implicit.als.AlternatingLeastSquares` model is fitted on `train_mat`
    for every combination in the Cartesian product of the value lists in
    `hyperparams`. Each model is scored with
    `evaluation.ranking_metrics_at_k(model, train_mat, valid_mat)[metric]`
    and the highest-scoring one is kept.

    Args:
        train_mat: sparse user-item matrix the models are fitted on.
        valid_mat: sparse user-item matrix the models are scored against.
        hyperparams: maps each constructor keyword name to the values to try.
        best_model_path: where the best model is saved via its `save` method.
        best_model_hyperp_path: text file receiving the best hyperparameters
            as two comma-separated lines (names, then values). Anything
            accepted by `open` works.
        metric: key of the metrics dict used for model selection.

    Returns:
        The fitted model with the highest validation score.

    Raises:
        ValueError: if no combination could be selected, e.g. because one of
            the value lists in `hyperparams` is empty.
    """
    # The return annotation is quoted so it is not evaluated when the function
    # is defined.
    print("Hyperparameters in grid search:")
    pprint.pprint(hyperparams)

    names = list(hyperparams.keys())
    combinations = [dict(zip(names, values)) for values in it.product(*hyperparams.values())]

    best_model_score, best_model, best_model_hyperp = -1.0, None, None

    for candidate in combinations:
        model = implicit.als.AlternatingLeastSquares(**candidate)
        model.fit(train_mat)
        score = evaluation.ranking_metrics_at_k(model, train_mat, valid_mat)[metric]

        if score > best_model_score:
            print(f"Best model found! old score: {best_model_score} new {metric} {score} hyperp: {candidate}")
            best_model_score = score
            # Every candidate is a freshly constructed object that is never
            # touched again, so keeping the reference is enough (no copy).
            best_model = model
            best_model_hyperp = candidate

    if best_model is None:
        raise ValueError(
            "grid_search did not select any model: the hyperparameter grid is "
            "empty or no candidate produced a comparable score"
        )

    best_model.save(best_model_path)
    print(f"Saved best model to {best_model_path}")
    with open(best_model_hyperp_path, "w") as fd:
        # Header is derived from the actual keys so it always matches the
        # value row written underneath it.
        fd.write(",".join(best_model_hyperp.keys()) + "\n")
        fd.write(",".join(map(str, best_model_hyperp.values())) + "\n")
    print(f"Saved best model hyperparams to {best_model_hyperp_path}")
    return best_model

In [None]:
# Hyperparameter grid for the first ("gt", presumably ground-truth — confirm)
# ALS model: each key is a model constructor keyword and each list holds the
# values to try; grid_search fits one model per combination.
gt_hyperp = {
    "factors": [16, 32, 64, 128],
    "regularization": [0.01, 0.1, 1.0, 10.0],
    "alpha": [0.1, 1.0, 10.0, 100.0]
}

# Output locations for the selected model and for its hyperparameters.
best_gt_path = config.MODELS_DIR / "lastfm_gt_best.npz"
best_gt_hyperp_path = config.MODELS_DIR / "lastfm_gt_best_hyperp.txt"

In [None]:
# Fit one model per combination in `gt_hyperp` on the training split, pick the
# best one on the validation split, and save it together with its hyperparameters.
best_model = grid_search(train_csr, val_csr, gt_hyperp, best_gt_path, best_gt_hyperp_path)

Hyperparameters in grid search:
{'alpha': [100.0], 'factors': [16, 32], 'regularization': [0.1]}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1839 [00:00<?, ?it/s]

Best model found! old score: 0.03246602210154813 new map 0.03246602210154813 hyperp: {'factors': 16, 'regularization': 0.1, 'alpha': 100.0}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1839 [00:00<?, ?it/s]

Best model found! old score: 0.04987967763284003 new map 0.04987967763284003 hyperp: {'factors': 32, 'regularization': 0.1, 'alpha': 100.0}
Saved best model to ../models/lastfm_gt_best.npz
Saved best model hyperparams to ../models/lastfm_gt_best_hyperp.txt


In [None]:
# low-rank matrix completion
# Multiply the learned user factors by the transposed item factors to obtain a
# score for every (user, item) pair, including pairs never observed in the data.
ground_truth = best_model.user_factors @ best_model.item_factors.T

In [None]:
# we mask 80% of the ground truth data because in section 5.1 they say:
# the simulated recommender system estimates relevance scores using low-rank
# matrix completion (Bell and Sejnowski 1995) on a training sample of 20% of
# the ground truth preferences
n_kept = int(0.2 * ground_truth.size)
# Sample flat (row-major) cell positions without replacement and convert them
# to (row, col) pairs. This avoids building a Python list with one tuple per
# matrix cell and filling the mask with a Python-level loop.
kept_flat = rng.choice(ground_truth.size, size=n_kept, replace=False)
kept_rows, kept_cols = np.unravel_index(kept_flat, ground_truth.shape)
# Same layout as before: one (row, col) pair per kept preference.
kept_preferences = np.column_stack((kept_rows, kept_cols))
print(len(kept_preferences))

# Copy only the sampled cells; every other cell stays 0 and is therefore not
# stored in the sparse matrix built below.
ground_truth_masked = np.zeros_like(ground_truth)
ground_truth_masked[kept_rows, kept_cols] = ground_truth[kept_rows, kept_cols]
ground_truth_masked_sparse = scipy.sparse.csr_matrix(ground_truth_masked)

# Same two-stage split as for the raw data: ~70% train, then the remainder
# divided 2/3 : 1/3 between validation and test.
rec_train_csr, tmp_csr = evaluation.train_test_split(ground_truth_masked_sparse, train_percentage=0.7, random_state=seed)
rec_val_csr, rec_test_csr = evaluation.train_test_split(tmp_csr, train_percentage=2/3, random_state=seed)
rec_train_csr, rec_val_csr

940000


(<1880x2500 sparse matrix of type '<class 'numpy.float32'>'
 	with 636319 stored elements in Compressed Sparse Row format>,
 <1880x2500 sparse matrix of type '<class 'numpy.float32'>'
 	with 110590 stored elements in Compressed Sparse Row format>)

In [None]:
# Hyperparameter grid for the recommender fitted on the masked ground truth:
# factors are the powers of two 1..256, regularization the powers of ten
# 0.001..1, and alpha the four values listed explicitly.
recommender_hyperp = {
    "factors": [2**i for i in range(9)],
    "regularization": [10**(i-3) for i in range(4)],
    "alpha": [0.1, 1.0, 10.0, 100.0]
}

# Output locations for the selected recommender and for its hyperparameters.
best_recommender_path = config.MODELS_DIR / "lastfm_best.npz"
best_recommender_hyperp_path = config.MODELS_DIR / "lastfm_best_hyperp.txt"

In [None]:
# Grid-search the recommender on the masked ground-truth splits; this fits one
# model per combination in `recommender_hyperp` (9 * 4 * 4 = 144 fits).
best_recommender = grid_search(rec_train_csr, rec_val_csr, recommender_hyperp, best_recommender_path, best_recommender_hyperp_path)

Hyperparameters in grid search:
{'alpha': [0.1, 1.0, 10.0, 100.0],
 'factors': [1, 2, 4, 8, 16, 32, 64, 128, 256],
 'regularization': [0.001, 0.01, 0.1, 1]}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1867 [00:00<?, ?it/s]

Best model found! old score: 0.015075411703692419 new map 0.015075411703692419 hyperp: {'factors': 1, 'regularization': 0.001, 'alpha': 0.1}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1867 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1867 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1867 [00:00<?, ?it/s]

Best model found! old score: 0.015089439810918186 new map 0.015089439810918186 hyperp: {'factors': 1, 'regularization': 0.001, 'alpha': 100.0}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1867 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

KeyboardInterrupt: 