In [1]:
import os, sys
# abspath('') expands to the current working directory, i.e. wherever the
# kernel is running when this cell executes.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

# Register the parent directory on the import path, but only once so that
# re-running the cell does not pile up duplicate entries.
if dir1 not in sys.path:
    sys.path.append(dir1)

# Climb two levels so that the relative paths used later in the notebook
# (e.g. 'results/predicts') resolve from there.
# NOTE: the target is relative to the *current* working directory, so
# executing this cell a second time climbs two more levels.
os.chdir('../..')

In [2]:
from typing import List, Union
import numpy as np
from tqdm.auto import tqdm
import os 
from pathlib import Path
from src.utils.metrics import run_all_metrics_nfold

In [3]:
def get_nfold_metrics(
    predicts_path: Path,
    methods_with_subfolders: List[str] = ["implicit", "recbole", "replay"],
    k: Union[int, List[int]] = [5, 10, 20, 100],
    n: int = 100,
    fold_method: str = "random",
    excluded_percentage: float = 0.2
) -> None:
    """
    Generate and save n-fold evaluation metrics for every method found under a predictions directory.

    Each sub-directory of ``predicts_path`` is treated as one method. Methods listed in
    ``methods_with_subfolders`` contain one more directory level (one folder per sub-method);
    every other method directory is processed directly. The actual metric computation and
    saving is delegated to ``save_nfold_metrics``; all remaining arguments are forwarded unchanged.

    Entries that are not directories (stray files such as notes or OS metadata) are skipped,
    and directories are visited in sorted order so that repeated runs process them identically.

    Parameters:
        predicts_path (Path or str): Path to the directory containing prediction results.
        methods_with_subfolders (List[str], optional): Names of method directories that hold
            one sub-directory per sub-method. Default is ['implicit', 'recbole', 'replay'].
        k (int or List[int], optional): Cutoff value(s) for evaluating the top-k recommendations.
            Default is [5, 10, 20, 100].
        n (int, optional): Number of folds for n-fold cross-validation. Default is 100.
        fold_method (str, optional): Method for splitting data into folds.
            Options are "interaction-wise" and "random". Default is "random".
        excluded_percentage (float, optional): Percentage of users to exclude from each fold
            when using "random" fold method. Default is 0.2 (20%).

    Raises:
        FileNotFoundError: If ``predicts_path`` does not exist.
    """
    # Accept plain strings as well as Path objects.
    predicts_root = Path(predicts_path)

    for method_dir in sorted(predicts_root.iterdir()):
        # A file at this level cannot hold datasets; passing it on would make
        # the directory listing inside save_nfold_metrics fail.
        if not method_dir.is_dir():
            continue

        if method_dir.name in methods_with_subfolders:
            # One extra nesting level: every sub-method folder is its own target.
            targets = [
                submethod_dir
                for submethod_dir in sorted(method_dir.iterdir())
                if submethod_dir.is_dir()
            ]
        else:
            targets = [method_dir]

        for target in targets:
            save_nfold_metrics(
                path=target,
                k=k,
                n=n,
                fold_method=fold_method,
                excluded_percentage=excluded_percentage,
            )


# Dataset directory names that save_nfold_metrics skips when the caller does
# not pass ``skip_datasets``.
# NOTE(review): why these datasets are excluded is not visible in this
# notebook (presumably they were already processed) — confirm before editing.
_DEFAULT_SKIPPED_DATASETS = frozenset({
    'foursquare',
    'kuairec_small',
    'douban_music',
    'douban_books',
    'reddit',
    'movielens_1m',
    'douban_movies',
    'kuairec_full',
    'rekko',
    'brightkite',
    'amazon_finefoods',
    'beeradvocate',
    'ratebeer',
    'epinions',
    'tafeng',
    'movielens_10m',
})


def save_nfold_metrics(
    path: Path,
    k: Union[int, List[int]],
    n: int,
    fold_method: str = "random",
    excluded_percentage: float = 0.2,
    skip_datasets: Union[List[str], None] = None,
) -> None:
    """
    Generate and save n-fold cross-validation evaluation metrics for every dataset folder under ``path``.

    For each dataset directory, every file whose name contains "ranks" is loaded with
    ``np.load`` and passed to ``run_all_metrics_nfold``. The result is written as CSV into the
    mirror directory obtained by replacing "predicts" with "metrics" in the dataset path; that
    directory is created if it does not exist yet. The output file is named
    ``results_nfold_wasOptimized_True.csv`` / ``..._False.csv`` when the ranks file name
    contains "True" / "False" ("True" is checked first), and ``results_nfold.csv`` otherwise.

    Entries of ``path`` that are not directories are ignored.

    Parameters:
        path (Path or str): Path to the directory containing one sub-directory per dataset.
        k (int or List[int]): Cutoff value(s) for evaluating the top-k recommendations.
            If int, use a single cutoff value. If List[int], provide multiple values.
        n (int): Number of folds for n-fold cross-validation.
        fold_method (str, optional): Method for splitting data into folds.
            Options are "interaction-wise" and "random". Default is "random".
        excluded_percentage (float, optional): Percentage of users to exclude from each fold
            when using "random" fold method. Default is 0.2 (20%).
        skip_datasets (List[str] or None, optional): Dataset directory names to skip.
            ``None`` (the default) uses the built-in exclusion list; pass an empty list to
            process every dataset.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    skipped = _DEFAULT_SKIPPED_DATASETS if skip_datasets is None else set(skip_datasets)

    for dataset_path in sorted(Path(path).iterdir()):
        # Only real dataset folders that are not excluded are processed.
        if not dataset_path.is_dir() or dataset_path.name in skipped:
            continue

        # Replace "predicts" with "metrics".
        # NOTE: this is a plain substring replacement over the whole path, so
        # every occurrence of "predicts" (not just one folder name) is rewritten.
        metrics_dir = Path(dataset_path.as_posix().replace("predicts", "metrics"))

        for file_path in sorted(dataset_path.iterdir()):
            file_name = file_path.name
            if "ranks" not in file_name or not file_path.is_file():
                continue

            # The optimisation flag is encoded in the file name; "True" wins
            # if both markers happen to be present.
            if "True" in file_name:
                output_name = "results_nfold_wasOptimized_True.csv"
            elif "False" in file_name:
                output_name = "results_nfold_wasOptimized_False.csv"
            else:
                output_name = "results_nfold.csv"

            ranks = np.load(file_path)
            # The returned object only needs a ``to_csv`` method here
            # (presumably a pandas DataFrame — confirm in src.utils.metrics).
            n_fold_metrics = run_all_metrics_nfold(
                ranks=ranks,
                k=k,
                n=n,
                fold_method=fold_method,
                excluded_percentage=excluded_percentage,
            )

            # The mirrored metrics tree may not exist yet on a fresh run.
            metrics_dir.mkdir(parents=True, exist_ok=True)
            n_fold_metrics.to_csv(metrics_dir / output_name)


In [4]:
# Compute n-fold metrics for everything under the predictions folder, using
# the function's default cutoffs, fold count and fold method. The path is
# relative to the working directory set in the first cell.
get_nfold_metrics(predicts_path=Path('results/predicts'))