## Parameter Search for User-Based Neighborhood Algorithm

- Conduct a grid search over a given set of parameters (alpha, beta, neighborhood size, q).
- Export each parameter setting, together with the metrics it achieved, to a .txt file.

## 0. Import Modules

In [1]:
import os
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory
# (assumes the notebook is started from a sub-folder of the repository).
# It is put on sys.path so that the `src.*` imports in the next cell resolve.
notebook_dir = Path(os.path.abspath('')).resolve()
PROJECT_DIRECTORY = notebook_dir.parents[0]
sys.path.append(str(PROJECT_DIRECTORY))

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:38:46) [MSC v.1929 64 bit (AMD64)] on win32
Project directory:  C:\Users\s8347434\Documents\RecSys2024


In [2]:
import numpy as np
import pandas as pd
from implicit.evaluation import leave_k_out_split
from src.utilities.Helper import load_data, create_sparse_matrix
from src.utilities.NeighborAlgorithms import NeighborhoodAlgorithms, NeighborhoodRecommender
from src.utilities.Metrics import Evaluation, Metrics

  from .autonotebook import tqdm as notebook_tqdm


## 1. Read the Dataset

In [3]:
# Interaction file, located relative to the project root.
FILENAME = PROJECT_DIRECTORY / "data" / "processed" / "user_item_interaction_FILTERED_ANONYMIZED.txt"
# Forwarded as `dataset=` to load_data() and create_sparse_matrix().
DATASET = "real"
# Forwarded as `rows=` to load_data(); the user filter in the next cell only runs when this is not None.
ROWS = 500_000
# NOTE(review): this value is later compared with the string "cross-fold", so an
# integer always selects the non-cross-fold branch. 42 looks like a seed that
# ended up in the wrong constant - confirm the intended strategy name.
TRAIN_TEST_SPLIT_STRATEGY = 42
# Only read by the "cross-fold" reporting branch in the visible cells.
FOLDS = 5

In [4]:
# Load the interaction table (ROWS is forwarded as `rows=`; presumably a row limit - confirm in load_data).
db_interaction = load_data(FILENAME, rows = ROWS, dataset=DATASET)
print(db_interaction.shape)
print(db_interaction.keys())

if ROWS is not None:
    # ONLY FOR SUBSETS: drop users with fewer than `threshold` interactions.
    interactions_per_user = db_interaction['userID'].value_counts()
    # The median is printed for orientation only; it is NOT the cut-off that is applied.
    print("Median user interactions: ", np.median(interactions_per_user))
    # Manual cut-off actually used for filtering.
    threshold = 20
    filter_users = interactions_per_user[interactions_per_user >= threshold].index.tolist()

    db_interaction = db_interaction[db_interaction['userID'].isin(filter_users)].reset_index(drop=True)
    print("The new size is: ", db_interaction.shape)

sparse_user_item_interaction, user_index, item_index = create_sparse_matrix(db_interaction, dataset=DATASET)

print("Number of (users, items): ", sparse_user_item_interaction.shape)

# Number of stored interactions in the matrix.
n_ratings = sparse_user_item_interaction.nnz
print(n_ratings)

n_total = sparse_user_item_interaction.shape[0]*sparse_user_item_interaction.shape[1]
# Share of filled cells is the density; sparsity is its complement.
# (The previous version reported the density under the label "sparsity".)
density = n_ratings/n_total
sparsity = 1 - density
print(f"Matrix density: {round(density*100,2)}% (sparsity: {round(sparsity*100,2)}%)")

(500000, 2)
Index(['userID', 'itemID'], dtype='object')
Median user interactions:  16.0
The new size is:  (451034, 2)
Number of (users, items):  (5183, 28081)
451034
Matrix sparsity: 0.31%


## 1.1 Train/Test Split

In [5]:
# Alternative kept for reference: the project's own splitter, driven by
# TRAIN_TEST_SPLIT_STRATEGY and FOLDS.
# train_set, test_set = train_test_split(sparse_user_item_interaction, user_index, item_index, train_percentage=0.8, k=FOLDS, split_strategy=TRAIN_TEST_SPLIT_STRATEGY)

# Leave-K-out split from `implicit` with a fixed random_state. With K=10 the
# recorded output of the next cell shows exactly 10 fewer interactions per user
# in the training matrix than in the full matrix.
train_set, test_set = leave_k_out_split(
    sparse_user_item_interaction,
    K=10,
    random_state=42,
)

In [6]:
# Global NumPy seed so the sampled evaluation users are the same on every run.
np.random.seed(2)
NUMBER_USERS = 10  # NOTE(review): not referenced by any visible cell
NUMB_EVAL_USERS = 100
TOP_N = 10
# Sample evaluation users (without replacement) from the known user IDs.
EVAL_USERS = np.random.choice(user_index.cat.categories, NUMB_EVAL_USERS, replace=False)
# A category's integer code is its position in `cat.categories`, so the codes
# can be looked up directly instead of scanning every interaction row once per
# sampled user.
EVAL_USERS_IDX = user_index.cat.categories.get_indexer(EVAL_USERS).tolist()
# print(f'CustomerIDs: {EVAL_USERS}')

# The total count is reported for every split strategy, so it is printed once up front.
print(f'Total downloads per customer: {sparse_user_item_interaction[EVAL_USERS_IDX].getnnz(axis=1)}')
if TRAIN_TEST_SPLIT_STRATEGY == "cross-fold":
    # In this branch train_set / test_set are indexed per fold.
    for fold in range(FOLDS):
        print(f'Total downloads per customer in train: {train_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
        print(f'Total downloads per customer in test: {test_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
else:
    print(f'Total downloads per customer in train: {train_set[EVAL_USERS_IDX].getnnz(axis=1)}')
    print(f'Total downloads per customer in test: {test_set[EVAL_USERS_IDX].getnnz(axis=1)}')

Total downloads per customer: [  97   23  143   42   35  139   21  385   24   45   33   23   23   82
   20   56   23   21   35   41   27   56   20   28   43   76   24   25
   35   77   37   65  102   97  297  364   70   31  175   47   34   25
  240  120   53  468   62   81   38   28   22   41   60   26   23   41
   58   32   27   38   41 1150   26   48  199  107   20  122   20   27
   39   20   58   43   41   26   20   39   43   91   50   78  120   29
   65  119   22   27   42   23   37  125   20   30   30   20   82   22
   85  110]
Total downloads per customer in train: [  87   13  133   32   25  129   11  375   14   35   23   13   13   72
   10   46   13   11   25   31   17   46   10   18   33   66   14   15
   25   67   27   55   92   87  287  354   60   21  165   37   24   15
  230  110   43  458   52   71   28   18   12   31   50   16   13   31
   48   22   17   28   31 1140   16   38  189   97   10  112   10   17
   29   10   48   33   31   16   10   29   33   81   40   68  110  

## 1.2 Prepare the Algorithms and Metrics

In [7]:
# Recommender for the 'user_based_neighborhood' algorithm; used by the first
# parameter search (fit with alpha/q, recommend with a neighborhood size).
user_based_topN = NeighborhoodRecommender(NeighborhoodAlgorithms)
user_based_topN.add_algorithm('user_based_neighborhood')
# Separate recommender instance for the 'user_based_iterative_asym_neighborhood'
# algorithm; its recommend() calls additionally receive `beta`.
user_based_iterative_async_topN = NeighborhoodRecommender(NeighborhoodAlgorithms)
user_based_iterative_async_topN.add_algorithm('user_based_iterative_asym_neighborhood')

In [8]:
# Names of the metrics that get registered on the evaluator
metrics_list = [
    'MatchCount', 'Precision', 'MR', 'MRR', 'MAP',
    'NDCG', 'Coverage', 'APLT', 'ARP',
]

# The evaluator is built from the Metrics class and the full interaction matrix
evaluator = Evaluation(Metrics, sparse_user_item_interaction)

# Register each metric by name
for metric in metrics_list:
    evaluator.add_metric(metric)

def evaluate(evaluators_predictions):
    """Score every set of recommendations with all metrics and tabulate the result.

    Parameters
    ----------
    evaluators_predictions : dict
        Maps a label to the recommendations produced under that label.

    Returns
    -------
    pandas.DataFrame
        One row per label: an 'Evaluator' column holding the label, followed by
        one column per metric.

    Notes
    -----
    Reads the module-level ``evaluator``, ``test_set``, ``user_index`` and
    ``item_index``.
    """
    # (metric name, extra keyword arguments) in output-column order
    metric_specs = (
        ('MatchCount', {}),
        ('Precision', {}),
        ('MR', {}),
        ('MRR', {}),
        ('MAP', {}),
        ('NDCG', {}),
        ('Coverage', {}),
        ('APLT', {'threshold': 0.2}),
        ('ARP', {}),
    )

    rows = []
    for label, recs in evaluators_predictions.items():
        row = {'Evaluator': label}
        for metric_name, extra in metric_specs:
            row[metric_name] = evaluator.evaluate(
                metric_name, recs, test_set, user_index, item_index, **extra
            )
        rows.append(row)

    # One row per label; the default integer index is kept
    return pd.DataFrame(rows)

In [9]:
# Accumulators for the per-setting result frames: the first for the plain
# user-based search, the second for the iterative asymmetric searches.
results_list, results_iterative_asym_list = [], []

## 2. Start the Parameter Search

In [10]:
# Grid for the plain user-based search.
# Values forwarded to fit() as `alpha`.
alpha = [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
# Values forwarded to fit() as `q`.
locality = [1, 2, 3, 4, 5, 6]
# Values forwarded to recommend() as `neighborhood_size`.
neighborhood_size = [5, 10, 20, 50, 100, 200]

In [None]:
# Grid search for the plain user-based recommender.
# Start from an empty accumulator so re-running this cell does not append
# duplicate rows to results from an earlier run.
results_list = []
steps = len(alpha) * len(locality) * len(neighborhood_size)
# Number of parameter combinations evaluated so far. Counting from 0 makes the
# final progress line read exactly 100%.
completed = 0

for a in alpha:
    for q in locality:
        print(f"Progress: {completed/steps *100:.2f}%")
        # fit() only receives (alpha, q), so one fit serves all neighborhood sizes.
        user_based_topN.fit(user_item_matrix=train_set, alpha=a, q=q)
        for n in neighborhood_size:
            UserBasedRecoms = user_based_topN.recommend(EVAL_USERS, train_set, user_index, item_index, TOP_N, neighborhood_size=n, already_interacted=[])
            evaluators_predictions = {'UserKNN': UserBasedRecoms}
            temp_df = evaluate(evaluators_predictions)
            # Tag the metric row with the parameter setting that produced it.
            temp_df['alpha'] = a
            temp_df['q'] = q
            temp_df['neighborhood_size'] = n
            results_list.append(temp_df)
            completed += 1
print(f"Progress: {completed/steps *100:.2f}%")
# One row per parameter setting; row labels still repeat (each piece starts at 0).
results_df = pd.concat(results_list)

In [12]:
# Each concatenated piece carried its own index starting at 0; renumber the rows 0..n-1.
results_df = results_df.reset_index(drop=True)
# Optional export of the search results (disabled):
# results_df.to_csv(f"../data/evaluation/parameter_tuning/user-based-top{TOP_N}-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)

In [14]:
# Grid for the iterative asymmetric search (fixed neighborhood sizes).
# Values forwarded to fit() as `alpha`.
alpha = [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
# Values forwarded to recommend() as `beta`.
beta = [0.3, 0.5, 0.7,  0.9, 1, 1.2, 1.4]
# Values forwarded to fit() as `q`.
locality = [1, 2, 3, 4, 5, 6]
# Values forwarded to recommend() as `neighborhood_size`.
neighborhood_size = [10, 50, 100, 200, 300]

In [None]:
# Grid search for the iterative asymmetric recommender (fixed neighborhood sizes).
# Start from an empty accumulator so re-running this cell does not append
# duplicate rows to results from an earlier run.
results_iterative_asym_list = []
steps = len(alpha) * len(beta) * len(locality) * len(neighborhood_size)
# Number of parameter combinations evaluated so far (counted from 0 so the
# reported percentage is the completed share).
completed = 0
for a in alpha:
    for q in locality:
        # fit() only receives (alpha, q), so one fit serves every (beta, neighborhood size) pair.
        user_based_iterative_async_topN.fit(user_item_matrix=train_set, alpha=a, q=q)
        print(f"Progress: {completed/steps *100:.2f}%")
        for b in beta:
            for n in neighborhood_size:
                UserBasedAsymRecoms = user_based_iterative_async_topN.recommend(EVAL_USERS, train_set, user_index, item_index, TOP_N, neighborhood_size=n, beta=b)
                evaluators_predictions = {'UserIterativeAsymKNN': UserBasedAsymRecoms}
                temp_df = evaluate(evaluators_predictions)
                # Tag the metric row with the parameter setting that produced it.
                temp_df['alpha'] = a
                temp_df['q'] = q
                temp_df['beta'] = b
                temp_df['neighborhood_size'] = n
                results_iterative_asym_list.append(temp_df)
                completed += 1
print(f"Progress: {completed/steps *100:.2f}%")

# One row per parameter setting; row labels still repeat (each piece starts at 0).
results_iterative_asym_df = pd.concat(results_iterative_asym_list)

In [None]:
# Each concatenated piece carried its own index starting at 0; renumber the rows 0..n-1.
results_iterative_asym_df = results_iterative_asym_df.reset_index(drop=True)
# Optional export of the search results (disabled):
# results_iterative_asym_df.to_csv(f"../data/evaluation/parameter_tuning/user-based-iterative-asym-top{TOP_N}-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)

In [16]:
# Grid for the iterative asymmetric search with `neighborhood_size=None`.
# Values forwarded to fit() as `alpha`.
alpha = [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
# Values forwarded to recommend() as `beta`.
beta = [0.3, 0.5, 0.7,  0.9, 1, 1.2, 1.4]
# Values forwarded to fit() as `q`.
locality = [1, 2, 3, 4, 5, 6]
# None is forwarded to recommend() as `neighborhood_size`; judging by the export
# file name ("full_neighborhood") this presumably means "no neighborhood limit" -
# TODO confirm in NeighborhoodRecommender.recommend.
neighborhood_size = [None]

In [None]:
# Grid search for the iterative asymmetric recommender with neighborhood_size=None.
# The rows are collected in their own list: appending to
# `results_iterative_asym_list` would mix them with the rows of the previous
# (fixed neighborhood size) search and duplicate rows on every re-run.
full_neighborhood_results = []
steps = len(alpha) * len(beta) * len(locality) * len(neighborhood_size)
# Number of parameter combinations evaluated so far (counted from 0 so the
# reported percentage is the completed share).
completed = 0
for a in alpha:
    for q in locality:
        # fit() only receives (alpha, q), so one fit serves every beta value.
        user_based_iterative_async_topN.fit(user_item_matrix=train_set, alpha=a, q=q)
        print(f"Progress: {completed/steps *100:.2f}%")
        for b in beta:
            for n in neighborhood_size:
                UserBasedAsymRecoms = user_based_iterative_async_topN.recommend(EVAL_USERS, train_set, user_index, item_index, TOP_N, neighborhood_size=n, beta=b)
                evaluators_predictions = {'UserIterativeAsymKNN': UserBasedAsymRecoms}
                temp_df = evaluate(evaluators_predictions)
                # Tag the metric row with the parameter setting that produced it.
                temp_df['alpha'] = a
                temp_df['q'] = q
                temp_df['beta'] = b
                temp_df['neighborhood_size'] = n
                full_neighborhood_results.append(temp_df)
                completed += 1
print(f"Progress: {completed/steps *100:.2f}%")

# Only the rows of this search; row labels still repeat (each piece starts at 0).
results_iterative_asym_df = pd.concat(full_neighborhood_results)

In [18]:
# Each concatenated piece carried its own index starting at 0; renumber the rows 0..n-1.
results_iterative_asym_df = results_iterative_asym_df.reset_index(drop=True)
# Optional export of the search results (disabled):
# results_iterative_asym_df.to_csv(f"../data/evaluation/parameter_tuning/user-based-iterative-asym-top{TOP_N}-full_neighborhood-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)