In [1]:
import os
import sys
from pathlib import Path

# setting proper working directory
PROJECT_DIRECTORY = Path(os.path.abspath('')).resolve().parents[0]
sys.path.extend([str(PROJECT_DIRECTORY)])

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.9.19 (main, Jul  7 2024, 08:52:44) 
[Clang 15.0.0 (clang-1500.3.9.4)] on darwin
Project directory:  /Users/markus/Documents/privat/Studium/Diplomarbeit/ResearchSeminarMusicRecommender2024


In [2]:
import numpy as np
import pandas as pd
from implicit.evaluation import leave_k_out_split
from src.utilities.Helper import load_data, create_sparse_matrix
from src.utilities.MfAlgorithms import MFAlgorithms, MatrixFactorizationRecommender
from src.utilities.Metrics import Evaluation, Metrics

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
FILENAME = PROJECT_DIRECTORY / "data/processed/user_item_interaction_FILTERED_ANONYMIZED.txt"
DATASET = "real"
ROWS = 100000
TRAIN_TEST_SPLIT_STRATEGY = 42
FOLDS = 5

In [4]:
db_interaction = load_data(FILENAME, rows = ROWS, dataset=DATASET)
print(db_interaction.shape)
print(db_interaction.keys())

if ROWS is not None:
    # ONLY FOR SUBSETS: drop users below median interactions
    threshold = np.median(db_interaction['userID'].value_counts())
    print("Median user interactions: ", threshold)
    # for manual values to remove
    threshold = 20
    filter_users = db_interaction['userID'].value_counts() >= threshold
    filter_users = filter_users[filter_users].index.tolist()
    db_interaction = db_interaction[db_interaction['userID'].isin(filter_users)].reset_index(drop=True)
    filter_items = db_interaction['itemID'].value_counts() >= threshold
    filter_items = filter_items[filter_items].index.tolist()
    db_interaction = db_interaction[db_interaction['itemID'].isin(filter_items)].reset_index(drop=True)

    print("The new size is: ", db_interaction.shape)

sparse_user_item_interaction, user_index, item_index = create_sparse_matrix(db_interaction, dataset=DATASET)

print("Number of (users, items): ", sparse_user_item_interaction.shape)

print(sparse_user_item_interaction.getnnz())

n_total = sparse_user_item_interaction.shape[0]*sparse_user_item_interaction.shape[1]
n_ratings = sparse_user_item_interaction.nnz
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

(100000, 2)
Index(['userID', 'itemID'], dtype='object')
Median user interactions:  8.0
The new size is:  (24395, 2)
Number of (users, items):  (1005, 519)
24395
Matrix sparsity: 4.68%


In [5]:
#train_set, test_set = train_test_split(sparse_user_item_interaction, user_index, item_index, train_percentage=0.8, k=FOLDS, split_strategy=TRAIN_TEST_SPLIT_STRATEGY)
train_set, test_set = leave_k_out_split(sparse_user_item_interaction, K=10, random_state=42)

In [6]:
np.random.seed(3)
#NUMBER_USERS = 10
NUMB_EVAL_USERS = sparse_user_item_interaction.shape[0]
TOP_N = 10
EVAL_USERS = np.random.choice(user_index.cat.categories, NUMB_EVAL_USERS, replace=False)
EVAL_USERS_IDX = [user_index.cat.codes[user_index==user].unique()[0] for user in EVAL_USERS]
# print(f'CustomerIDs: {EVAL_USERS}')
if TRAIN_TEST_SPLIT_STRATEGY == "cross-fold":
    print(f'Total downloads per customer: {sparse_user_item_interaction[EVAL_USERS_IDX].getnnz(axis=1)}')
    for fold in range(FOLDS):
        print(f'Total downloads per customer in train: {train_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
        print(f'Total downloads per customer in test: {test_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
else:
    print(f'Total downloads per customer: {sparse_user_item_interaction[EVAL_USERS_IDX].getnnz(axis=1)}')
    print(f'Total downloads per customer in train: {train_set[EVAL_USERS_IDX].getnnz(axis=1)}')
    print(f'Total downloads per customer in test: {test_set[EVAL_USERS_IDX].getnnz(axis=1)}')

Total downloads per customer: [40  1 14 ...  9  3 56]
Total downloads per customer in train: [30  1  4 ...  9  3 46]
Total downloads per customer in test: [10  0 10 ...  0  0 30]


In [7]:
bpr_topN = MatrixFactorizationRecommender(MFAlgorithms)
bpr_topN.add_algorithm('bpr_algorithm')

In [11]:
bpr_results_list = []

In [12]:
# list of metrics to apply
metrics_list = ['MatchCount', 'Precision', 'MR', 'MRR', 'MAP', 'NDCG','Coverage', 'APLT', 'ARP']

# Instantiate the Evaluation class
evaluator = Evaluation(Metrics, sparse_user_item_interaction)

# Add metrics from the Metrics class
for metric in metrics_list:
    evaluator.add_metric(metric)

def evaluate(evaluators_predictions):# Evaluate metrics for each evaluator and store results
    results = []
    for evaluator_name, recommendations in evaluators_predictions.items():
        result = {
            'Evaluator': evaluator_name,
            'MatchCount': evaluator.evaluate('MatchCount', recommendations, test_set, user_index, item_index),
            'Precision': evaluator.evaluate('Precision', recommendations, test_set, user_index, item_index),
            'MR': evaluator.evaluate('MR', recommendations, test_set, user_index, item_index),
            'MRR': evaluator.evaluate('MRR', recommendations, test_set, user_index, item_index),
            'MAP': evaluator.evaluate('MAP', recommendations, test_set, user_index, item_index),
            'NDCG': evaluator.evaluate('NDCG', recommendations, test_set, user_index, item_index),
            'Coverage': evaluator.evaluate('Coverage', recommendations, test_set, user_index, item_index),
            'APLT': evaluator.evaluate('APLT', recommendations, test_set, user_index, item_index, threshold=0.2),
            'ARP': evaluator.evaluate('ARP', recommendations, test_set, user_index, item_index)
        }
        results.append(result)

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    #df.set_index('Evaluator', inplace=True)
    return df

In [63]:
factors = [75]
regularization = [0.5]
alpha = [1]
iterations = [200]
learning_rate = 0.00001

In [64]:
steps = len(factors) * len(regularization) * len(alpha) * len(iterations)
i = 1

for factor in factors:
    print(f"Progress: {i/steps *100:.2f}%")
    for reg in regularization:
        for a in alpha:
            for iter in iterations:
                bpr_topN.fit(user_item_matrix=train_set, learning_rate=learning_rate, factors=factor, regularization=reg, alpha=a, iterations=iter, verify_negative_samples=True, random_state=42)
                BPRRecoms = bpr_topN.recommend(EVAL_USERS, train_set, user_index, item_index, TOP_N)
                evaluators_predictions = {'BPRMatrixFactorization': BPRRecoms}
                temp_df = evaluate(evaluators_predictions)
                temp_df['a'] = a
                temp_df['regularization'] = reg
                temp_df['factors'] = factor
                temp_df['iterations'] = iter
                bpr_results_list.append(temp_df)
                i += 1
print(f"Progress: {i/steps *100:.2f}%")     
bpr_results_df = pd.concat(bpr_results_list)

Progress: 100.00%


100%|██████████| 200/200 [00:01<00:00, 177.02it/s, train_auc=52.16%, skipped=15.77%]


Progress: 200.00%


In [65]:
print(bpr_results_df)

                Evaluator  MatchCount  Precision        MR       MRR  \
0  BPRMatrixFactorization         264   0.026269  0.218905  0.076018   
0  BPRMatrixFactorization         173   0.017214  0.155224  0.040683   
0  BPRMatrixFactorization         382   0.038010  0.290547  0.115109   
0  BPRMatrixFactorization         297   0.029552  0.236816  0.087195   
0  BPRMatrixFactorization         355   0.035323  0.270647  0.089696   
0  BPRMatrixFactorization         395   0.039303  0.287562  0.082103   
0  BPRMatrixFactorization         643   0.063980  0.366169  0.195646   
0  BPRMatrixFactorization         672   0.066866  0.370149  0.180252   
0  BPRMatrixFactorization         516   0.051343  0.343284  0.181434   
0  BPRMatrixFactorization         681   0.067761  0.363184  0.201070   
0  BPRMatrixFactorization         682   0.067861  0.375124  0.201392   
0  BPRMatrixFactorization         679   0.067562  0.359204  0.199679   
0  BPRMatrixFactorization         679   0.067562  0.359204  0.19

In [None]:
bpr_results_df = bpr_results_df.reset_index(drop=True)
# als_results_df.to_csv("../data/evaluation/parameter_tuning/als-mf-top{TOP_N}-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)