## Parameter Search for Item-Based Neighborhood Algorithm

- conduct a grid search over a given set of parameters (alpha, beta, neighborhood size, q)
- export each parameter setting together with the achieved metrics to a .txt file

## 0. Import Modules

In [5]:
import os
import sys
from pathlib import Path

# Resolve the project root as the parent of the current working directory and
# put it on sys.path so that the `src` package imported below can be found.
PROJECT_DIRECTORY = Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIRECTORY))

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:38:46) [MSC v.1929 64 bit (AMD64)] on win32
Project directory:  C:\Users\s8347434\Documents\RecSys2024


In [6]:
import numpy as np
import pandas as pd
from implicit.evaluation import leave_k_out_split
from src.utilities.Helper import load_data, create_sparse_matrix
from src.utilities.NeighborAlgorithms import NeighborhoodAlgorithms, NeighborhoodRecommender
from src.utilities.Metrics import Evaluation, Metrics

## 1. Read the Dataset

In [7]:
# Input file and experiment configuration used by the cells below.
FILENAME = PROJECT_DIRECTORY / "data" / "processed" / "user_item_interaction_FILTERED_ANONYMIZED.txt"
# Dataset flavour handed to load_data() and create_sparse_matrix().
DATASET = "real"
# Row limit handed to load_data(); the user filter below only runs when this is not None.
ROWS = 100_000
# NOTE(review): this value is only ever compared against the string "cross-fold"
# further down, so with 42 the non-cross-fold branch is always taken.
# 42 looks like a seed rather than a strategy name — confirm the intended value.
TRAIN_TEST_SPLIT_STRATEGY = 42
# Number of folds iterated in the "cross-fold" branch.
FOLDS = 5

In [8]:
# Load the raw user/item interactions and show their shape and columns.
db_interaction = load_data(FILENAME, rows=ROWS, dataset=DATASET)
print(db_interaction.shape)
print(db_interaction.keys())

if ROWS is not None:
    # ONLY FOR SUBSETS: drop users with too few interactions.
    interactions_per_user = db_interaction['userID'].value_counts()
    # The median is reported for orientation only; the cut-off itself is set manually.
    print("Median user interactions: ", np.median(interactions_per_user))
    threshold = 20
    filter_users = interactions_per_user[interactions_per_user >= threshold].index.tolist()

    keep_rows = db_interaction['userID'].isin(filter_users)
    db_interaction = db_interaction[keep_rows].reset_index(drop=True)
    print("The new size is: ", db_interaction.shape)

# Build the sparse user-item matrix together with the user and item index objects.
sparse_user_item_interaction, user_index, item_index = create_sparse_matrix(
    db_interaction,
    dataset=DATASET,
)

print("Number of (users, items): ", sparse_user_item_interaction.shape)

print(sparse_user_item_interaction.getnnz())

# Share of matrix cells that hold an interaction (stored entries / all cells).
n_users, n_items = sparse_user_item_interaction.shape
n_total = n_users * n_items
n_ratings = sparse_user_item_interaction.nnz
sparsity = n_ratings / n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

(100000, 2)
Index(['userID', 'itemID'], dtype='object')
Median user interactions:  8.0
The new size is:  (83220, 2)
Number of (users, items):  (1051, 19684)
83220
Matrix sparsity: 0.4%


## 1.1 Train/Test Split

In [9]:
# Earlier split variant, kept for reference only (train_test_split is not imported in this notebook):
#train_set, test_set = train_test_split(sparse_user_item_interaction, user_index, item_index, train_percentage=0.8, k=FOLDS, split_strategy=TRAIN_TEST_SPLIT_STRATEGY)
# Split with implicit's leave_k_out_split (K=10) and a fixed random_state so the
# split is reproducible; the per-user counts printed in the next cell show 10
# fewer interactions in train than in the full matrix for the sampled users.
train_set, test_set = leave_k_out_split(sparse_user_item_interaction, K=10, random_state=42)

In [10]:
# Fixed seed so the same evaluation users are drawn on every run.
np.random.seed(3)
NUMBER_USERS = 10
NUMB_EVAL_USERS = 100
TOP_N = 10

# Sample distinct user IDs from the categories of the user index.
EVAL_USERS = np.random.choice(user_index.cat.categories, NUMB_EVAL_USERS, replace=False)
# Translate every sampled user ID into its categorical code; the codes are used
# below as row positions in the interaction matrices.
user_codes = user_index.cat.codes
EVAL_USERS_IDX = [user_codes[user_index == user].unique()[0] for user in EVAL_USERS]

# The overall count is reported regardless of the split strategy.
print(f'Total downloads per customer: {sparse_user_item_interaction[EVAL_USERS_IDX].getnnz(axis=1)}')
if TRAIN_TEST_SPLIT_STRATEGY == "cross-fold":
    # With cross-fold splitting train_set / test_set are indexed per fold.
    for fold in range(FOLDS):
        print(f'Total downloads per customer in train: {train_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
        print(f'Total downloads per customer in test: {test_set[fold][EVAL_USERS_IDX].getnnz(axis=1)}')
else:
    print(f'Total downloads per customer in train: {train_set[EVAL_USERS_IDX].getnnz(axis=1)}')
    print(f'Total downloads per customer in test: {test_set[EVAL_USERS_IDX].getnnz(axis=1)}')

Total downloads per customer: [ 454   61  141   74   37   25   32   20  101   36   32   53   62   98
   38   23   24   37   39   25 1220   42   26   55   22   62   20   23
   90   42  143   70  133   24   41   35   35  169   75   89  115   30
   45  158   43  142   21   31   56   26   37   25  446   50  141   55
   34   23   23   29   24   21   93   47   23  258  104   56   24  184
  102   88   46   45  367  196   21   34   20   54  105   31   21  146
   35  142   39   94   56   99   23   83  451   22  165   21   92   27
  194  165]
Total downloads per customer in train: [ 444   51  131   64   27   15   22   10   91   26   22   43   52   88
   28   13   14   27   29   15 1210   32   16   45   12   52   10   13
   80   32  133   60  123   14   31   25   25  159   65   79  105   20
   35  148   33  132   11   21   46   16   27   15  436   40  131   45
   24   13   13   19   14   11   83   37   13  248   94   46   14  174
   92   78   36   35  357  186   11   24   10   44   95   21   11  

## 1.2 Prepare the Algorithms and Metrics

In [11]:
# Build a NeighborhoodRecommender on top of NeighborhoodAlgorithms and register
# the single algorithm that is tuned in this notebook.
item_based_iterative_async_topN = NeighborhoodRecommender(NeighborhoodAlgorithms)
item_based_iterative_async_topN.add_algorithm('item_based_iterative_asym_neighborhood')

In [12]:
# Names of the metrics that are registered on the evaluator.
metrics_list = [
    'MatchCount',
    'Precision',
    'MR',
    'MRR',
    'MAP',
    'NDCG',
    'Coverage',
    'APLT',
    'ARP',
]

# The evaluator is constructed from the Metrics class and the full interaction matrix.
evaluator = Evaluation(Metrics, sparse_user_item_interaction)

# Register every metric by name.
for metric_name in metrics_list:
    evaluator.add_metric(metric_name)

def evaluate(evaluators_predictions):
    """Score every set of recommendations with all metrics.

    Relies on the notebook-level ``evaluator``, ``test_set``, ``user_index``
    and ``item_index``.

    Parameters
    ----------
    evaluators_predictions : dict
        Maps a display name to the recommendations produced under that name.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``evaluators_predictions`` with the column
        ``Evaluator`` followed by one column per metric.
    """
    # Metric name -> extra keyword arguments for evaluator.evaluate().
    # The order fixes both the evaluation order and the column order.
    metric_options = {
        'MatchCount': {},
        'Precision': {},
        'MR': {},
        'MRR': {},
        'MAP': {},
        'NDCG': {},
        'Coverage': {},
        'APLT': {'threshold': 0.2},
        'ARP': {},
    }

    rows = []
    for evaluator_name, recommendations in evaluators_predictions.items():
        row = {'Evaluator': evaluator_name}
        for metric_name, extra_kwargs in metric_options.items():
            row[metric_name] = evaluator.evaluate(
                metric_name, recommendations, test_set, user_index, item_index, **extra_kwargs
            )
        rows.append(row)

    return pd.DataFrame(rows)

In [20]:
# Collects one result DataFrame per evaluated parameter setting; the search
# cells below append to this list and concatenate it afterwards.
results_iterative_asym_list = []

## 2. Start the Parameter Search

In [21]:
# Parameter grid for the first search.
# alpha: passed to fit() as `alpha`.
alpha = [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
# beta: passed to recommend() as `beta`.
beta = [0.3, 0.5, 0.7,  0.9, 1, 1.2, 1.4]
# locality: values for the fit() parameter `q`.
locality = [1, 2, 3, 4, 5, 6]
# neighborhood_size: passed to recommend() as `neighborhood_size`.
neighborhood_size = [10, 50, 100, 200, 300]

In [None]:
# Grid search over (alpha, q, beta, neighborhood_size).
# alpha and q are arguments of fit(), so the model is fitted once per (alpha, q)
# pair and re-used for every (beta, neighborhood_size) combination.

# (Re)create the result list here so that re-running this cell does not append
# duplicate rows to results left over from an earlier execution.
results_iterative_asym_list = []

steps = len(alpha) * len(beta) * len(locality) * len(neighborhood_size)
completed_steps = 0
for a in alpha:
    for q in locality:
        # Share of parameter settings that have already been evaluated.
        print(f"Progress: {completed_steps / steps * 100:.2f}%")
        item_based_iterative_async_topN.fit(user_item_matrix=train_set, alpha=a, q=q)
        for b in beta:
            for n in neighborhood_size:
                ItemBasedIterativeAsymRecoms = item_based_iterative_async_topN.recommend(
                    EVAL_USERS, train_set, user_index, item_index, TOP_N,
                    neighborhood_size=n, beta=b,
                )
                evaluators_predictions = {'ItemIterativeAsymKNN': ItemBasedIterativeAsymRecoms}
                temp_df = evaluate(evaluators_predictions)
                # Attach the parameter setting to its metric row.
                temp_df['alpha'] = a
                temp_df['q'] = q
                temp_df['beta'] = b
                temp_df['neighborhood_size'] = n
                results_iterative_asym_list.append(temp_df)
                completed_steps += 1

# ignore_index gives the combined frame a running index instead of repeating
# the per-setting index of every temp_df.
results_iterative_asym_df = pd.concat(results_iterative_asym_list, ignore_index=True)

In [23]:
# Replace the index with a running one (drop=True discards the old index
# instead of keeping it as a column).
results_iterative_asym_df = results_iterative_asym_df.reset_index(drop=True)
# Export is currently disabled; un-comment to write the results as a tab-separated, UTF-16 encoded file.
# results_iterative_asym_df.to_csv(f"../data/evaluation/parameter_tuning/item-based-iterative-asym-top{TOP_N}-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)

In [None]:
# Parameter grid for the second search: same alpha / beta / q values as before.
alpha = [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
beta = [0.3, 0.5, 0.7,  0.9, 1, 1.2, 1.4]
locality = [1, 2, 3, 4, 5, 6]
# NOTE(review): None is passed to recommend() as `neighborhood_size`; presumably
# this means "use the full neighborhood" (cf. the export file name below) — confirm.
neighborhood_size = [None]

In [None]:
# Grid search over (alpha, q, beta) with the neighborhood_size values of the
# grid defined directly above.
# alpha and q are arguments of fit(), so the model is fitted once per (alpha, q)
# pair and re-used for every (beta, neighborhood_size) combination.

# Start from an empty list: otherwise the rows of any search that ran earlier
# in the session (or of a previous execution of this cell) would still be in
# the list and end up mixed into this search's result frame.
results_iterative_asym_list = []

steps = len(alpha) * len(beta) * len(locality) * len(neighborhood_size)
completed_steps = 0
for a in alpha:
    for q in locality:
        # Share of parameter settings that have already been evaluated.
        print(f"Progress: {completed_steps / steps * 100:.2f}%")
        item_based_iterative_async_topN.fit(user_item_matrix=train_set, alpha=a, q=q)
        for b in beta:
            for n in neighborhood_size:
                ItemBasedIterativeAsymRecoms = item_based_iterative_async_topN.recommend(
                    EVAL_USERS, train_set, user_index, item_index, TOP_N,
                    neighborhood_size=n, beta=b,
                )
                evaluators_predictions = {'ItemIterativeAsymKNN': ItemBasedIterativeAsymRecoms}
                temp_df = evaluate(evaluators_predictions)
                # Attach the parameter setting to its metric row.
                temp_df['alpha'] = a
                temp_df['q'] = q
                temp_df['beta'] = b
                temp_df['neighborhood_size'] = n
                results_iterative_asym_list.append(temp_df)
                completed_steps += 1

# ignore_index gives the combined frame a running index instead of repeating
# the per-setting index of every temp_df.
results_iterative_asym_df = pd.concat(results_iterative_asym_list, ignore_index=True)

In [None]:
# Replace the index with a running one (drop=True discards the old index
# instead of keeping it as a column).
results_iterative_asym_df = results_iterative_asym_df.reset_index(drop=True)
# Export is currently disabled; un-comment to write the results as a tab-separated, UTF-16 encoded file.
# results_iterative_asym_df.to_csv(f"../data/evaluation/parameter_tuning/item-based-iterative-asym-top{TOP_N}-full_neighborhood-{NUMB_EVAL_USERS}user-leaveK-{ROWS}Rows.txt", sep="\t", encoding='utf-16', index=False)