## Parameter Search for Alternating Least Squares Algorithm

- Conduct a grid search over a given set of ALS parameters (alpha, factors, iterations, regularization)
- Export each parameter setting together with the achieved metrics to a .txt file

## 0. Import Modules

In [1]:
import os
import sys
from pathlib import Path

# Use the parent of the current working directory as the project root and put
# it on the import path, so the `src.*` imports further down can be resolved.
PROJECT_DIRECTORY = Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIRECTORY))

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:38:46) [MSC v.1929 64 bit (AMD64)] on win32
Project directory:  C:\Users\s8347434\Documents\RecSys2024


In [2]:
import numpy as np
import pandas as pd
from implicit.evaluation import leave_k_out_split
from src.utilities.Helper import load_data, create_sparse_matrix
from src.utilities.MfAlgorithms import MFAlgorithms, MatrixFactorizationRecommender
from src.utilities.Metrics import Evaluation, Metrics

  from .autonotebook import tqdm as notebook_tqdm


## 1. Read the Dataset

In [3]:
# Interaction file, addressed relative to the project root
FILENAME = PROJECT_DIRECTORY / "data" / "processed" / "user_item_interaction_FILTERED_ANONYMIZED.txt"
# Passed to load_data() and create_sparse_matrix() as `dataset`
DATASET = "real"
# Passed to load_data() as `rows`; the subset-only user filter runs whenever this is not None
ROWS = 5_000_000
# NOTE(review): 42 looks like a random seed rather than a strategy name. The value is only
# compared against the string "cross-fold", so that branch can never be taken — confirm intent.
TRAIN_TEST_SPLIT_STRATEGY = 42
# Number of folds iterated over in the "cross-fold" reporting branch
FOLDS = 5

In [4]:
# Load the raw interactions and report their size and columns
db_interaction = load_data(FILENAME, rows=ROWS, dataset=DATASET)
print(db_interaction.shape)
print(db_interaction.keys())

if ROWS is not None:
    # ONLY FOR SUBSETS: drop users with fewer than `threshold` interactions.
    # Count the interactions per user once and reuse the result below.
    interactions_per_user = db_interaction['userID'].value_counts()
    # The median is printed for orientation only; it is NOT used as the cut-off.
    print("Median user interactions: ", np.median(interactions_per_user))
    # Manual cut-off: keep users with at least this many interactions
    threshold = 20
    filter_users = interactions_per_user[interactions_per_user >= threshold].index.tolist()

    db_interaction = db_interaction[db_interaction['userID'].isin(filter_users)].reset_index(drop=True)
    print("The new size is: ", db_interaction.shape)

sparse_user_item_interaction, user_index, item_index = create_sparse_matrix(db_interaction, dataset=DATASET)

print("Number of (users, items): ", sparse_user_item_interaction.shape)

print(sparse_user_item_interaction.getnnz())

# Density is the share of filled cells; sparsity is the share of empty cells.
# The ratio nnz / total is the density, so it is reported under that name.
n_total = sparse_user_item_interaction.shape[0]*sparse_user_item_interaction.shape[1]
n_ratings = sparse_user_item_interaction.nnz
density = n_ratings/n_total
sparsity = 1 - density
print(f"Matrix density: {density*100:.2f}% (sparsity: {sparsity*100:.2f}%)")

(5000000, 2)
Index(['userID', 'itemID'], dtype='object')
Median user interactions:  78.0
The new size is:  (4968819, 2)
Number of (users, items):  (26585, 31445)
4968819
Matrix sparsity: 0.59%


## 1.1 Train/Test Split

In [5]:
# Alternative split via the project's own helper, kept for reference:
# train_set, test_set = train_test_split(sparse_user_item_interaction, user_index, item_index, train_percentage=0.8, k=FOLDS, split_strategy=TRAIN_TEST_SPLIT_STRATEGY)

# Leave-k-out split from `implicit` with K=10 and a fixed seed for reproducibility
train_set, test_set = leave_k_out_split(
    sparse_user_item_interaction,
    K=10,
    random_state=42,
)

In [6]:
# Fixed seed so the same evaluation users are sampled on every run
np.random.seed(3)
# NOTE(review): NUMBER_USERS is not referenced in the visible cells — confirm it is still needed
NUMBER_USERS = 10
NUMB_EVAL_USERS = 100
TOP_N = 10
# Sample the evaluation users (without replacement) from the known user IDs
EVAL_USERS = np.random.choice(user_index.cat.categories, NUMB_EVAL_USERS, replace=False)
# A categorical code is the position of the value inside `categories`, so the codes
# can be looked up directly instead of scanning all of `user_index` once per user.
EVAL_USERS_IDX = user_index.cat.categories.get_indexer(EVAL_USERS).tolist()
# print(f'CustomerIDs: {EVAL_USERS}')

# The overall count is reported the same way for both split strategies
print(f'Total downloads per customer: {sparse_user_item_interaction[EVAL_USERS_IDX].getnnz(axis=1)}')
if TRAIN_TEST_SPLIT_STRATEGY == "cross-fold":
    # With cross-fold splitting, train_set/test_set are indexed per fold
    for fold in range(FOLDS):
        print(f'Total downloads per customer in train: {train_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
        print(f'Total downloads per customer in test: {test_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
else:
    print(f'Total downloads per customer in train: {train_set[EVAL_USERS_IDX].getnnz(axis=1)}')
    print(f'Total downloads per customer in test: {test_set[EVAL_USERS_IDX].getnnz(axis=1)}')

Total downloads per customer: [  58  345   34   96   41  251  256  410   30   95  261  385  116   66
   29   64   77   66   32  192  200   87   46   25  101   65  148  251
  428  116  320   64  548   51   36   76   68  911   26  113   71  557
  523  579  108   45  111  125  396   67  426  132  160   32  117  167
   25  303  201   64  603   91 2634   51  366   29   55  119  163   26
  227   34  109   51   72  252  169  151  905   42  112  658  132   85
   52   22  133   20  107   32   73   78   74  141   41   98  141  124
   95   31]
Total downloads per customer in train: [  48  335   24   86   31  241  246  400   20   85  251  375  106   56
   19   54   67   56   22  182  190   77   36   15   91   55  138  241
  418  106  310   54  538   41   26   66   58  901   16  103   61  547
  513  569   98   35  101  115  386   57  416  122  150   22  107  157
   15  293  191   54  593   81 2624   41  356   19   45  109  153   16
  217   24   99   41   62  242  159  141  895   32  102  648  122  

## 1.2 Prepare the Algorithm and Metrics

In [7]:
# Wrap the project's MFAlgorithms in the recommender and register the ALS algorithm on it.
# NOTE(review): presumably the later fit()/recommend() calls dispatch to 'als_algorithm' — confirm in MfAlgorithms.
als_topN = MatrixFactorizationRecommender(MFAlgorithms)
als_topN.add_algorithm('als_algorithm')

In [8]:
# Names of the metrics to register on the evaluator
metrics_list = [
    'MatchCount', 'Precision', 'MR', 'MRR', 'MAP',
    'NDCG', 'Coverage', 'APLT', 'ARP',
]

# The evaluator is built from the Metrics class and the full interaction matrix
evaluator = Evaluation(Metrics, sparse_user_item_interaction)

# Register every metric by name
for metric in metrics_list:
    evaluator.add_metric(metric)

def evaluate(evaluators_predictions):
    """Score every set of recommendations with all metrics.

    Parameters
    ----------
    evaluators_predictions : dict
        Maps an evaluator name to the recommendations it produced.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``evaluators_predictions``: an ``'Evaluator'``
        column (kept as a regular column, not the index) followed by one
        column per metric.

    Notes
    -----
    Reads the notebook-level ``evaluator``, ``test_set``, ``user_index`` and
    ``item_index``.
    """
    metric_names = ('MatchCount', 'Precision', 'MR', 'MRR', 'MAP', 'NDCG', 'Coverage', 'APLT', 'ARP')
    # APLT is the only metric that is called with an extra keyword argument
    extra_kwargs = {'APLT': {'threshold': 0.2}}

    rows = []
    for name, recs in evaluators_predictions.items():
        row = {'Evaluator': name}
        for metric_name in metric_names:
            row[metric_name] = evaluator.evaluate(
                metric_name, recs, test_set, user_index, item_index,
                **extra_kwargs.get(metric_name, {})
            )
        rows.append(row)

    return pd.DataFrame(rows)

In [10]:
# Accumulator for the search: the grid-search loop appends one result DataFrame per parameter combination
als_results_list = []

## 2. Start the Parameter Search

In [9]:
# Search grid: every combination of the four lists below is evaluated, and each
# value is passed to als_topN.fit() under the keyword named in the trailing comment.
factors = [25, 30, 35, 40, 45, 50, 55, 60, 65, 120, 200, 300]  # fit(factors=...)
regularization = [0.3, 0.7, 1, 1.3, 1.7]  # fit(regularization=...)
alpha = [0.8, 1, 1.2]  # fit(alpha=...)
iterations = [10, 20, 40]  # fit(iterations=...)

In [None]:
# Exhaustive grid search: fit ALS for every parameter combination, evaluate the
# top-N recommendations for the sampled users and keep one result frame per combination.
# NOTE: results are appended to `als_results_list`, which is created in a separate
# cell — re-run that cell first to avoid mixing in results from an earlier run.
steps = len(factors) * len(regularization) * len(alpha) * len(iterations)
# Number of combinations finished so far. Counting from 0 keeps the reported
# progress within 0-100% (counting from 1 ended above 100%).
completed = 0

for factor in factors:
    print(f"Progress: {completed/steps *100:.2f}%")
    for reg in regularization:
        for a in alpha:
            # Named `n_iter` rather than `iter` so the built-in iter() is not shadowed
            for n_iter in iterations:
                als_topN.fit(user_item_matrix=train_set, factors=factor, regularization=reg, alpha=a, iterations=n_iter, random_state=42)
                ALSRecoms = als_topN.recommend(EVAL_USERS, train_set, user_index, item_index, TOP_N)
                evaluators_predictions = {'ALSMatrixFactorization': ALSRecoms}
                temp_df = evaluate(evaluators_predictions)
                # Tag the metrics with the parameter setting that produced them
                temp_df['a'] = a
                temp_df['regularization'] = reg
                temp_df['factors'] = factor
                temp_df['iterations'] = n_iter
                als_results_list.append(temp_df)
                completed += 1
print(f"Progress: {completed/steps *100:.2f}%")
als_results_df = pd.concat(als_results_list)

In [12]:
# Replace the repeated per-frame index left by pd.concat with a fresh 0..n-1 index
als_results_df = als_results_df.reset_index(drop=True)
# Export is currently disabled. The path must be an f-string, otherwise the {...} placeholders are written literally.
# als_results_df.to_csv(f"../data/evaluation/parameter_tuning/als-mf-top{TOP_N}-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)