In [31]:
import sys
import os
from google.colab import drive
# Mount Google Drive so the project folder (datasets, results, local modules) is reachable.
drive.mount('/content/drive')
project_path = '/content/drive/My Drive/CARMS_MF_NEW'
# Work from the project root: the relative paths used later in the notebook
# ('./database/...', './results/...') are resolved against it.
os.chdir(project_path)
# Make the folder importable for the project-local modules imported below.
# The membership check keeps re-runs of this cell from stacking duplicate entries.
if project_path not in sys.path:
    sys.path.append(project_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import pandas as pd
import numpy as np
import random
import torch

In [33]:
from loo_data_split import get_loo_split

In [34]:
from ranking_evaluation import ndcg_at_k, hit_rate_at_k

In [35]:
from carm_signal_generator import SignalGeneratorClusterARM_v15

In [36]:
from bias_mf_carms import CARMS_Bias_MF
from bias_mf_standard import Standard_Bias_MF

In [37]:
import time
from datetime import datetime

In [38]:
# Wall-clock start of the notebook run; the last cell subtracts it from the end time.
start_time = time.time()

In [39]:
# Report whether PyTorch can see a CUDA device (the value is shown as the cell output).
torch.cuda.is_available()

True

In [40]:
# Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.
# The tuning cells sample hyperparameters with `random.choice`, so the sampled
# configurations depend on this seed and on the order in which those cells are run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7dca57bd70d0>

# Data

In [41]:
# Load the five cleaned Amazon review datasets, in the order used by the
# dataset lists further down (lux beauty, industry, pantry, music, instruments).
df1, df2, df3, df4, df5 = map(pd.read_csv, (
    './database/dataset_amazon_lux_beauty_5_cleaned.csv',
    './database/dataset_amazon_industry_5_cleaned.csv',
    './database/dataset_amazon_pantry_5_cleaned.csv',
    './database/dataset_amazon_music_5_cleaned.csv',
    './database/dataset_amazon_instruments_5_cleaned.csv',
))

In [42]:
# Run the leave-one-out split on every dataset with the same sparse threshold (0.2).
data1, data2, data3, data4, data5 = (
    get_loo_split(frame, sparse_threshold=0.2)
    for frame in (df1, df2, df3, df4, df5)
)

# Each split result is read by position: elements 0-2 are used by the tuning cells
# (train / validation / sparse set), elements 3-5 by the retrain cells
# (train+validation / test / sparse set). The exact contents are defined by
# get_loo_split, which is not part of this notebook.
train1, val1, sparse_test1, train_val1, test1, sparse_train_val1 = (data1[j] for j in range(6))
train2, val2, sparse_test2, train_val2, test2, sparse_train_val2 = (data2[j] for j in range(6))
train3, val3, sparse_test3, train_val3, test3, sparse_train_val3 = (data3[j] for j in range(6))
train4, val4, sparse_test4, train_val4, test4, sparse_train_val4 = (data4[j] for j in range(6))
train5, val5, sparse_test5, train_val5, test5, sparse_train_val5 = (data5[j] for j in range(6))

In [43]:
# Labels and per-dataset objects, index-aligned: position i in every list below
# belongs to the dataset named at position i of dataset_list.
dataset_list = [
    '01 amz_lux_beauty_5',
    '02 amz_industry_5',
    '03 amz_pantry_5',
    '04 amz_music_5',
    '05 amz_instruments_5',
]

# One tuple per dataset, transposed into one list per role.
# Note: sparse_train_list is filled from the `sparse_test*` variables.
(train_list, val_list, sparse_train_list,
 train_val_list, test_list, sparse_train_val_list) = map(list, zip(
    (train1, val1, sparse_test1, train_val1, test1, sparse_train_val1),
    (train2, val2, sparse_test2, train_val2, test2, sparse_train_val2),
    (train3, val3, sparse_test3, train_val3, test3, sparse_train_val3),
    (train4, val4, sparse_test4, train_val4, test4, sparse_train_val4),
    (train5, val5, sparse_test5, train_val5, test5, sparse_train_val5),
))

In [44]:
# Restrict this run to the instruments dataset only. This rebinds the same list
# names as the five-dataset cell above, so every later cell sees one dataset;
# skip this cell to run on all five.
dataset_list = ['05 amz_instruments_5']

train_list, val_list, sparse_train_list = [train5], [val5], [sparse_test5]
train_val_list, test_list, sparse_train_val_list = [train_val5], [test5], [sparse_train_val5]

# Experiment Setup

In [45]:
# Number of random-search rounds per dataset for each model family.
N_ITER_STANDARD, N_ITER_CARMS = 200, 200

In [46]:
# Candidate values sampled by both tuning loops (one value per key and round).
param_space_common = dict(
    latent=[20, 30, 50, 70, 100],
    learning_rate=[0.001, 0.005, 0.01, 0.05, 0.1],
    lambda_rate=[0.001, 0.005, 0.01, 0.05, 0.1],
    epoch=[20, 30, 50, 70, 100],
)

# Candidate values for the signal generator; sampled only by the CARMS tuning loop.
param_space_signal = dict(
    k_user=[1, 3, 5, 7, 10],
    min_support=[0.0, 0.00001, 0.0001, 0.001, 0.01],
    min_confidence=[0.0, 0.0001, 0.001, 0.01, 0.1],
)

# Candidate values for the CARMS model's `gamma` argument.
param_space_carms_specific = dict(
    gamma_rate=[0.0, 0.1, 1, 10, 100],
)

# Tuning

Standard Bias MF

In [47]:
# Filled by the Standard Bias MF tuning loop: dataset name -> best score, params and results.
best_params_standard = {}

In [None]:
# Random search for Standard Bias MF, run independently for every dataset.
# Each round samples one configuration, fits on the train matrix, scores the
# predictions against the validation matrix and keeps the configuration with the
# highest ndcg@10 over all evaluated users. The winner is stored in
# best_params_standard[dataset_name].
for i, dataset_name in enumerate(dataset_list):
    print(f"\n========== [MF-VALID-RANDOM] Dataset: {dataset_name} ==========")

    train_mat = train_list[i]
    val_mat = val_list[i]
    sparse_train_mat = sparse_train_list[i]
    user_count, item_count = train_mat.shape

    # Step 1) Print total searching rounds
    print(f"Total searching rounds = {N_ITER_STANDARD}")

    best_ndcg10 = -np.inf
    best_params = None
    best_results = None
    failed_rounds = 0  # rounds that raised instead of producing a score

    for iter_i in range(N_ITER_STANDARD):
        round_no = iter_i + 1

        # Step 2) Sample hyperparameters and print current round
        model_params = {
            "latent": random.choice(param_space_common["latent"]),
            "learning_rate": random.choice(param_space_common["learning_rate"]),
            "lambda_rate": random.choice(param_space_common["lambda_rate"]),
            "epoch": random.choice(param_space_common["epoch"]),

            # Placeholders: these CARMS-only settings are not passed to
            # Standard_Bias_MF; they only keep the key set identical to the
            # CARMS search.
            "k_user": 999,
            "min_support": 999,
            "min_confidence": 999,
            "gamma": 999,
        }
        print(f"[Round {round_no}/{N_ITER_STANDARD}] params = {model_params}")

        try:
            model = Standard_Bias_MF(
                user_count=user_count,
                item_count=item_count,
                K=model_params["latent"],
                learning_rate=model_params["learning_rate"],
                lambda_rate=model_params["lambda_rate"],
            )
            model.fit(Y=train_mat, epochs=model_params["epoch"])

            pred_mat = model.predict()["predictions"]

            results_ndcg = ndcg_at_k(
                pred_mat=pred_mat,
                train_mat=train_mat,
                test_mat=val_mat,
                cold_user_indices=sparse_train_mat,
                ks=(5, 10, 20, 50, 100),
                mask_train=True,
            )

            # Selection metric: ndcg@10 over all users. A missing or non-finite
            # value is mapped to -inf so it can never be selected as the best.
            ndcg10 = results_ndcg["all"].get(10, np.nan)
            if not np.isfinite(ndcg10):
                ndcg10 = -np.inf

            err_msg = None

        except Exception as e:
            # Best effort: one broken configuration must not abort the search,
            # but it is reported so failures are not mistaken for low scores.
            ndcg10 = -np.inf
            results_ndcg = None
            err_msg = str(e)
            failed_rounds += 1
            print(f"  !!! Round {round_no} failed ({type(e).__name__}): {err_msg}")

        # Step 3) Update best if improved
        if ndcg10 > best_ndcg10:
            old_best = best_ndcg10
            best_ndcg10 = ndcg10
            best_params = dict(model_params)
            best_results = results_ndcg
            print(f"  >>> NEW BEST! ndcg@10(all): {old_best:.4f} -> {best_ndcg10:.4f}")

    if failed_rounds:
        print(f"  {failed_rounds}/{N_ITER_STANDARD} rounds failed for {dataset_name}")
    if best_params is None:
        # Every round failed or scored non-finite; the retrain cell reads
        # best_params and cannot work with None, so make this visible here.
        print(f"WARNING: no usable configuration found for {dataset_name}; best_params is None")

    best_params_standard[dataset_name] = {
        "best_ndcg10_all": float(best_ndcg10) if np.isfinite(best_ndcg10) else best_ndcg10,
        "best_params": best_params,
        "best_results": best_results,
        "user_count": int(user_count),
        "item_count": int(item_count),
    }

    print(f"\n>>> FINAL BEST for {dataset_name}: ndcg@10(all)={best_ndcg10:.4f} | params={best_params}")


Total searching rounds = 200
[Round 1/200] params = {'latent': 20, 'learning_rate': 0.001, 'lambda_rate': 0.01, 'epoch': 30, 'k_user': 999, 'min_support': 999, 'min_confidence': 999, 'gamma': 999}
Epoch 1/30, SSE(obs): 153637.977295 (n_obs=164103)
Epoch 10/30, SSE(obs): 103622.548340 (n_obs=164103)
Epoch 20/30, SSE(obs): 45215.454285 (n_obs=164103)
Epoch 30/30, SSE(obs): 18606.932373 (n_obs=164103)
  >>> NEW BEST! ndcg@10(all): -inf -> 0.0229
[Round 2/200] params = {'latent': 30, 'learning_rate': 0.005, 'lambda_rate': 0.001, 'epoch': 100, 'k_user': 999, 'min_support': 999, 'min_confidence': 999, 'gamma': 999}
Epoch 1/100, SSE(obs): 152360.765381 (n_obs=164103)
Epoch 10/100, SSE(obs): 3185.643799 (n_obs=164103)
Epoch 20/100, SSE(obs): 1559.131451 (n_obs=164103)
Epoch 30/100, SSE(obs): 1714.886301 (n_obs=164103)
Epoch 40/100, SSE(obs): 1807.729984 (n_obs=164103)
Epoch 50/100, SSE(obs): 1872.270842 (n_obs=164103)
Epoch 60/100, SSE(obs): 1858.831625 (n_obs=164103)
Epoch 70/100, SSE(obs): 

In [None]:
# One summary line per dataset: best validation ndcg@10 and the configuration that produced it.
for ds, info in best_params_standard.items():
    score = info['best_ndcg10_all']
    chosen = info['best_params']
    print(f"- {ds}: ndcg@10(all)={score:.4f} | {chosen}")

- 05 amz_instruments_5: ndcg@10(all)=0.0478 | {'latent': 100, 'learning_rate': 0.001, 'lambda_rate': 0.005, 'epoch': 20, 'k_user': 999, 'min_support': 999, 'min_confidence': 999, 'gamma': 999}


CARMS Bias MF

In [None]:
# Filled by the CARMS Bias MF tuning loop: dataset name -> best score, params and results.
best_params_carms = {}

In [None]:
# Random search for CARMS Bias MF, run independently for every dataset.
# Each round samples model and signal-generator settings, builds the signal
# matrix from the train matrix, fits the model with it, scores the predictions
# against the validation matrix and keeps the configuration with the highest
# ndcg@10 over all evaluated users. The winner is stored in
# best_params_carms[dataset_name].
for i, dataset_name in enumerate(dataset_list):
    print(f"\n========== [MF-VALID-RANDOM] Dataset: {dataset_name} ==========")

    train_mat = train_list[i]
    val_mat = val_list[i]
    sparse_train_mat = sparse_train_list[i]
    user_count, item_count = train_mat.shape

    # Step 1) Print total searching rounds
    print(f"Total searching rounds = {N_ITER_CARMS}")

    best_ndcg10 = -np.inf
    best_params = None
    best_results = None
    failed_rounds = 0  # rounds that raised instead of producing a score

    for iter_i in range(N_ITER_CARMS):
        round_no = iter_i + 1

        # Step 2) Sample hyperparameters and print current round
        model_params = {
            "latent": random.choice(param_space_common["latent"]),
            "learning_rate": random.choice(param_space_common["learning_rate"]),
            "lambda_rate": random.choice(param_space_common["lambda_rate"]),
            "epoch": random.choice(param_space_common["epoch"]),

            "k_user": random.choice(param_space_signal["k_user"]),
            "min_support": random.choice(param_space_signal["min_support"]),
            "min_confidence": random.choice(param_space_signal["min_confidence"]),

            # Stored under "gamma" although it is sampled from the "gamma_rate" space.
            "gamma": random.choice(param_space_carms_specific["gamma_rate"]),
        }
        print(f"[Round {round_no}/{N_ITER_CARMS}] params = {model_params}")

        try:
            # The signal matrix depends on the sampled signal settings, so it is
            # rebuilt from the train matrix in every round.
            sg = SignalGeneratorClusterARM_v15(
                k_user=model_params["k_user"],
                min_support=model_params["min_support"],
                min_confidence=model_params["min_confidence"],
                remove_seen=True,
                random_state=42,
            )
            S_mat = sg.fit_transform(train_mat)

            model = CARMS_Bias_MF(
                user_count=user_count,
                item_count=item_count,
                K=model_params["latent"],
                learning_rate=model_params["learning_rate"],
                lambda_rate=model_params["lambda_rate"],
                gamma=model_params["gamma"],
            )
            model.fit(Y=train_mat, S=S_mat, epochs=model_params["epoch"])

            pred_mat = model.predict()["predictions"]

            results_ndcg = ndcg_at_k(
                pred_mat=pred_mat,
                train_mat=train_mat,
                test_mat=val_mat,
                cold_user_indices=sparse_train_mat,
                ks=(5, 10, 20, 50, 100),
                mask_train=True,
            )

            # Selection metric: ndcg@10 over all users. A missing or non-finite
            # value is mapped to -inf so it can never be selected as the best.
            ndcg10 = results_ndcg["all"].get(10, np.nan)
            if not np.isfinite(ndcg10):
                ndcg10 = -np.inf

            err_msg = None

        except Exception as e:
            # Best effort: one broken configuration must not abort the search,
            # but it is reported so failures are not mistaken for low scores.
            ndcg10 = -np.inf
            results_ndcg = None
            err_msg = str(e)
            failed_rounds += 1
            print(f"  !!! Round {round_no} failed ({type(e).__name__}): {err_msg}")

        # Step 3) Update best if improved
        if ndcg10 > best_ndcg10:
            old_best = best_ndcg10
            best_ndcg10 = ndcg10
            best_params = dict(model_params)
            best_results = results_ndcg
            print(f"  >>> NEW BEST! ndcg@10(all): {old_best:.4f} -> {best_ndcg10:.4f}")

    if failed_rounds:
        print(f"  {failed_rounds}/{N_ITER_CARMS} rounds failed for {dataset_name}")
    if best_params is None:
        # Every round failed or scored non-finite; the retrain cell reads
        # best_params and cannot work with None, so make this visible here.
        print(f"WARNING: no usable configuration found for {dataset_name}; best_params is None")

    best_params_carms[dataset_name] = {
        "best_ndcg10_all": float(best_ndcg10) if np.isfinite(best_ndcg10) else best_ndcg10,
        "best_params": best_params,
        "best_results": best_results,
        "user_count": int(user_count),
        "item_count": int(item_count),
    }

    print(f"\n>>> FINAL BEST for {dataset_name}: ndcg@10(all)={best_ndcg10:.4f} | params={best_params}")


Total searching rounds = 200
[Round 1/200] params = {'latent': 20, 'learning_rate': 0.1, 'lambda_rate': 0.01, 'epoch': 70, 'k_user': 7, 'min_support': 0.001, 'min_confidence': 0.001, 'gamma': 0.1}
Epoch 1/70, SSE(obs): 157604.472412 (n_obs=164103), BPR(sum): 719.330228 (n_pairs=27518), Total: 157676.405435
Epoch 10/70, SSE(obs): 144030.156982 (n_obs=164103), BPR(sum): 925.052548 (n_pairs=27518), Total: 144122.662237
Epoch 20/70, SSE(obs): 59094.887634 (n_obs=164103), BPR(sum): 793.634987 (n_pairs=27518), Total: 59174.251133
Epoch 30/70, SSE(obs): 43238024.250000 (n_obs=164103), BPR(sum): 6466.860188 (n_pairs=27518), Total: 43238670.936019
Epoch 40/70, SSE(obs): 65024.516541 (n_obs=164103), BPR(sum): 1673.336123 (n_pairs=27518), Total: 65191.850153
Epoch 50/70, SSE(obs): 25223.234955 (n_obs=164103), BPR(sum): 1533.597141 (n_pairs=27518), Total: 25376.594669
Epoch 60/70, SSE(obs): 20571.452316 (n_obs=164103), BPR(sum): 1351.381010 (n_pairs=27518), Total: 20706.590417
Epoch 70/70, SSE(ob

In [None]:
# One summary line per dataset: best validation ndcg@10 and the configuration that produced it.
for ds, info in best_params_carms.items():
    score = info['best_ndcg10_all']
    chosen = info['best_params']
    print(f"- {ds}: ndcg@10(all)={score:.4f} | {chosen}")

- 05 amz_instruments_5: ndcg@10(all)=0.0793 | {'latent': 100, 'learning_rate': 0.01, 'lambda_rate': 0.001, 'epoch': 100, 'k_user': 7, 'min_support': 0.0001, 'min_confidence': 0.1, 'gamma': 100}


# Retrain

In [None]:
# Collects one result dict per (dataset, model); the retrain cells append to it
# and the Results section turns it into a DataFrame.
rows = []

Standard Bias MF

In [None]:
import time

# Final evaluation of Standard Bias MF: refit on the train+validation matrix with
# the tuned hyperparameters, score against the test matrix, and store one result
# row per dataset in `rows`.
for i, dataset_name in enumerate(dataset_list):
    print(f"\n========== [MF-TEST] Dataset: {dataset_name} ==========")

    train_val_mat = train_val_list[i]
    test_mat = test_list[i]
    sparse_train_val_mat = sparse_train_val_list[i]
    user_count, item_count = train_val_mat.shape

    model_label = "01 Standard Bias MF"
    tuned = best_params_standard[dataset_name]['best_params']
    model_params = {
        "latent": tuned['latent'],
        "learning_rate": tuned['learning_rate'],
        "lambda_rate": tuned['lambda_rate'],
        "epoch": tuned['epoch'],
        # Placeholders for the CARMS-only settings, so both models' result rows
        # carry the same hyperparameter columns.
        "k_user": 999,
        "min_support": 999,
        "min_confidence": 999,
        "gamma": 999,
    }
    print(f"[Best hyperparameter] params = {model_params}")

    # The timed section covers model fitting, prediction and both metric computations.
    t0 = time.perf_counter()

    model = Standard_Bias_MF(
        user_count=user_count,
        item_count=item_count,
        K=model_params["latent"],
        learning_rate=model_params["learning_rate"],
        lambda_rate=model_params["lambda_rate"],
    )
    model.fit(Y=train_val_mat, epochs=model_params["epoch"])
    pred_mat = model.predict()["predictions"]

    # Both metrics are computed from the same predictions with identical arguments.
    eval_kwargs = dict(
        pred_mat=pred_mat,
        train_mat=train_val_mat,
        test_mat=test_mat,
        cold_user_indices=sparse_train_val_mat,
        ks=(5, 10, 20, 50, 100),
        mask_train=True,
    )
    results_ndcg = ndcg_at_k(**eval_kwargs)
    results_hitrate = hit_rate_at_k(**eval_kwargs)

    elapsed_sec = time.perf_counter() - t0
    elapsed_min = elapsed_sec / 60.0

    row = {
        "dataset_name": dataset_name,
        "model": model_label,
        "runtime_min": float(elapsed_min),
    }

    row.update(model_params)

    # Flatten the per-split, per-k metric dicts into columns such as
    # "ndcg_all@10" and "hit_rate_cold@5" (all ndcg columns first, then hit rate).
    for metric, results in (("ndcg", results_ndcg), ("hit_rate", results_hitrate)):
        for split in ("all", "cold", "warm"):
            if split in results and isinstance(results[split], dict):
                for k, v in results[split].items():
                    row[f"{metric}_{split}@{int(k)}"] = float(v)

    for key in ("n_users_eval", "n_cold_eval", "n_warm_eval"):
        if key in results_ndcg:
            row[key] = int(results_ndcg[key])

    # Keep the cell idempotent: re-running it replaces an earlier row for the same
    # (dataset, model) instead of adding a duplicate. The list is edited in place
    # so other references to `rows` stay valid.
    rows[:] = [
        r for r in rows
        if (r.get("dataset_name"), r.get("model")) != (dataset_name, model_label)
    ]
    rows.append(row)


[Best hyperparameter] params = {'latent': 100, 'learning_rate': 0.001, 'lambda_rate': 0.005, 'epoch': 20, 'k_user': 999, 'min_support': 999, 'min_confidence': 999, 'gamma': 999}
Epoch 1/20, SSE(obs): 180490.613525 (n_obs=191626)
Epoch 10/20, SSE(obs): 48711.617615 (n_obs=191626)
Epoch 20/20, SSE(obs): 2912.160812 (n_obs=191626)


CARMS Bias MF

In [None]:
import time

# Final evaluation of CARMS Bias MF: rebuild the signal matrix and refit on the
# train+validation matrix with the tuned hyperparameters, score against the test
# matrix, and store one result row per dataset in `rows`.
for i, dataset_name in enumerate(dataset_list):
    print(f"\n========== [MF-TEST] Dataset: {dataset_name} ==========")

    train_val_mat = train_val_list[i]
    test_mat = test_list[i]
    sparse_train_val_mat = sparse_train_val_list[i]
    user_count, item_count = train_val_mat.shape

    model_label = "02 CARMS Bias MF"
    tuned = best_params_carms[dataset_name]['best_params']
    model_params = {
        "latent": tuned['latent'],
        "learning_rate": tuned['learning_rate'],
        "lambda_rate": tuned['lambda_rate'],
        "epoch": tuned['epoch'],
        "k_user": tuned['k_user'],
        "min_support": tuned['min_support'],
        "min_confidence": tuned['min_confidence'],
        "gamma": tuned['gamma'],
    }
    print(f"[Best hyperparameter] params = {model_params}")

    # The timed section covers signal generation, model fitting, prediction and
    # both metric computations.
    t0 = time.perf_counter()

    sg = SignalGeneratorClusterARM_v15(
        k_user=model_params["k_user"],
        min_support=model_params["min_support"],
        min_confidence=model_params["min_confidence"],
        remove_seen=True,
        random_state=42,
    )
    S_mat = sg.fit_transform(train_val_mat)

    model = CARMS_Bias_MF(
        user_count=user_count,
        item_count=item_count,
        K=model_params["latent"],
        learning_rate=model_params["learning_rate"],
        lambda_rate=model_params["lambda_rate"],
        gamma=model_params["gamma"],
    )
    model.fit(Y=train_val_mat, S=S_mat, epochs=model_params["epoch"])

    pred_mat = model.predict()["predictions"]

    # Both metrics are computed from the same predictions with identical arguments.
    eval_kwargs = dict(
        pred_mat=pred_mat,
        train_mat=train_val_mat,
        test_mat=test_mat,
        cold_user_indices=sparse_train_val_mat,
        ks=(5, 10, 20, 50, 100),
        mask_train=True,
    )
    results_ndcg = ndcg_at_k(**eval_kwargs)
    results_hitrate = hit_rate_at_k(**eval_kwargs)

    elapsed_sec = time.perf_counter() - t0
    elapsed_min = elapsed_sec / 60.0

    row = {
        "dataset_name": dataset_name,
        "model": model_label,
        "runtime_min": float(elapsed_min),
    }

    row.update(model_params)

    # Flatten the per-split, per-k metric dicts into columns such as
    # "ndcg_all@10" and "hit_rate_cold@5" (all ndcg columns first, then hit rate).
    for metric, results in (("ndcg", results_ndcg), ("hit_rate", results_hitrate)):
        for split in ("all", "cold", "warm"):
            if split in results and isinstance(results[split], dict):
                for k, v in results[split].items():
                    row[f"{metric}_{split}@{int(k)}"] = float(v)

    for key in ("n_users_eval", "n_cold_eval", "n_warm_eval"):
        if key in results_ndcg:
            row[key] = int(results_ndcg[key])

    # Keep the cell idempotent: re-running it replaces an earlier row for the same
    # (dataset, model) instead of adding a duplicate. The list is edited in place
    # so other references to `rows` stay valid.
    rows[:] = [
        r for r in rows
        if (r.get("dataset_name"), r.get("model")) != (dataset_name, model_label)
    ]
    rows.append(row)


[Best hyperparameter] params = {'latent': 100, 'learning_rate': 0.01, 'lambda_rate': 0.001, 'epoch': 100, 'k_user': 7, 'min_support': 0.0001, 'min_confidence': 0.1, 'gamma': 100}
Epoch 1/100, SSE(obs): 179748.817627 (n_obs=191626), BPR(sum): 10495.496674 (n_pairs=26534), Total: 1229298.484985
Epoch 10/100, SSE(obs): 35814.225067 (n_obs=191626), BPR(sum): 837.261294 (n_pairs=26534), Total: 119540.354504
Epoch 20/100, SSE(obs): 34113.261444 (n_obs=191626), BPR(sum): 327.139032 (n_pairs=26534), Total: 66827.164680
Epoch 30/100, SSE(obs): 40750.491089 (n_obs=191626), BPR(sum): 211.084084 (n_pairs=26534), Total: 61858.899457
Epoch 40/100, SSE(obs): 41549.227173 (n_obs=191626), BPR(sum): 159.658599 (n_pairs=26534), Total: 57515.087027
Epoch 50/100, SSE(obs): 44364.528931 (n_obs=191626), BPR(sum): 159.071823 (n_pairs=26534), Total: 60271.711267
Epoch 60/100, SSE(obs): 43547.649231 (n_obs=191626), BPR(sum): 129.254818 (n_pairs=26534), Total: 56473.131075
Epoch 70/100, SSE(obs): 43787.285706 (

# Results

In [None]:
# Turn the collected result rows into a table with the identifying and
# hyperparameter columns first and every remaining (metric) column after them.
df_results = pd.DataFrame(rows)

# The hyperparameter columns are spelled out instead of being read from the
# `model_params` variable left behind by the retrain loops, so this cell does not
# depend on which retrain cell happened to run last.
param_cols = [
    "latent", "learning_rate", "lambda_rate", "epoch",
    "k_user", "min_support", "min_confidence", "gamma",
]
# Only keep leading columns that actually exist (e.g. `rows` may be empty).
front_cols = [c for c in ["dataset_name", "model"] + param_cols if c in df_results.columns]
other_cols = [c for c in df_results.columns if c not in front_cols]
df_results = df_results[front_cols + other_cols]

In [None]:
# Order rows by dataset, then model. Rebinding instead of `inplace=True` keeps the
# cell a plain "new value from old value" step; the original index labels are kept.
df_results = df_results.sort_values(by=["dataset_name", "model"])

In [None]:
# Headline comparison: ranking quality at k=10, number of evaluated users and runtime.
df_results.loc[:, [
    'dataset_name',
    'model',
    'ndcg_all@10',
    'hit_rate_all@10',
    'n_users_eval',
    'runtime_min',
]]

Unnamed: 0,dataset_name,model,ndcg_all@10,hit_rate_all@10,n_users_eval,runtime_min
0,05 amz_instruments_5,01 Standard Bias MF,0.041573,0.054777,27530,0.462623
1,05 amz_instruments_5,02 CARMS Bias MF,0.070044,0.099927,27530,3.669551


# Save Results

In [None]:
# Write the results table to a timestamped Excel file so earlier runs are not overwritten.
timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = './results'
# Create the output folder if it is missing; otherwise the save would fail at the
# very end of a run that takes hours.
os.makedirs(results_dir, exist_ok=True)
file_name = f'{results_dir}/final_results_{timestamp_str}.xlsx'
df_results.to_excel(file_name)

In [None]:
# Total wall-clock duration of the run in seconds, measured from `start_time`
# (the value is shown as the cell output).
end_time = time.time()
end_time - start_time

38179.16510510445