# Per-User Wilcoxon Signed-Rank Test: BCE vs Focal Loss

This notebook runs a proper Wilcoxon signed-rank test using per-user metrics (n=943 users).

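**Purpose:** Obtain enough paired observations for a meaningful significance test. With only n=3 aggregate observations, the smallest two-sided Wilcoxon p-value attainable is 0.25, so significance at α=0.05 was impossible regardless of the effect.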

**Estimated Runtime:** ~15-30 minutes (model training + evaluation)

## Cell 1: Install Dependencies

Run this cell, then **RESTART** the runtime (Runtime -> Restart session)

In [None]:
# Install dependencies with %pip so they land in the running kernel's environment.
# RecBole is pinned to an exact version.
%pip install -q recbole==1.2.0
# NOTE(review): kmeans-pytorch is not imported anywhere in this notebook; presumably
# RecBole needs it at import time — confirm, and pin a version if so.
%pip install -q kmeans-pytorch
# Remove the preinstalled NumPy and install a 1.x release instead.
# NOTE(review): presumably RecBole 1.2.0 is incompatible with NumPy 2 — confirm.
%pip uninstall -y numpy
%pip install -q "numpy<2"

# Remind the user to restart the runtime before running the next cell.
print("\n" + "="*60)
print("RESTART REQUIRED")
print("="*60)
print("Go to: Runtime -> Restart session")
print("Then run Cell 2 to continue.")

## Cell 2: Imports and Setup

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch
from scipy import stats
import pandas as pd

# RecBole imports
from recbole.quick_start import run_recbole
from recbole.model.general_recommender import NeuMF
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

# Report the library versions and accelerator this session is running with.
_cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    # Only query the device name when a GPU is actually present.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"NumPy version: {np.__version__}")

## Cell 3: Focal Loss Model Definition

In [None]:
import torch.nn as nn

class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    Per element the loss is ``alpha_t * (1 - p_t) ** gamma * BCE(logit, target)``,
    where ``BCE`` is binary cross-entropy with logits, ``p_t = exp(-BCE)`` and
    ``alpha_t = alpha`` for target 1 and ``1 - alpha`` for target 0.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``.
        alpha: Weight given to positive targets; negatives get ``1 - alpha``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-element loss.
    """
    def __init__(self, gamma=2.0, alpha=0.25, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against 0/1 ``targets``."""
        # binary_cross_entropy_with_logits rejects integer/bool targets, so bring
        # the labels to the logits' dtype first (a no-op for float labels).
        targets = targets.to(dtype=inputs.dtype)

        bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        # For hard 0/1 labels BCE = -log(p_t), hence p_t = exp(-BCE).
        pt = torch.exp(-bce)
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal_loss = alpha_t * ((1 - pt) ** self.gamma) * bce

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


class NeuMF_FocalLoss(NeuMF):
    """NeuMF whose ``calculate_loss`` returns focal loss on the forward output."""

    def __init__(self, config, dataset, gamma=2.0, alpha=0.25):
        super().__init__(config, dataset)
        # Keep the hyper-parameters on the model alongside the loss module.
        self.gamma, self.alpha = gamma, alpha
        self.focal_loss = FocalLoss(gamma=gamma, alpha=alpha)

    def calculate_loss(self, interaction):
        """Score the batch's (user, item) pairs and return their focal loss."""
        users, items, targets = (
            interaction[field]
            for field in (self.USER_ID, self.ITEM_ID, self.LABEL)
        )
        return self.focal_loss(self.forward(users, items), targets)


print("Focal Loss model defined.")

## Cell 4: Per-User Metric Extraction

In [None]:
def _score_all_items(model, interaction, dataset, n_items):
    """Return the model's scores for every (batch user, item) pair.

    Uses ``model.full_sort_predict`` when the model implements it; otherwise
    pairs each user row with every item and scores the pairs with
    ``model.predict``.
    """
    try:
        return model.full_sort_predict(interaction)
    except NotImplementedError:
        # Same fallback RecBole's Trainer applies to models that only define
        # predict(): repeat each user row once per item, then attach the items.
        n_rows = len(interaction)
        device = interaction[model.USER_ID].device
        pairs = interaction.repeat_interleave(n_items)
        pairs.update(dataset.get_item_feature().to(device).repeat(n_rows))
        return model.predict(pairs)


def get_per_user_metrics(model, test_data, config, k_list=(5, 10, 20)):
    """
    Extract per-user NDCG and Hit Rate metrics from a full-sort evaluation loader.

    Args:
        model: Recommender exposing ``eval()``, ``USER_ID`` and either
            ``full_sort_predict`` or ``predict``.
        test_data: Iterable of ``(interaction, history_index, positive_u,
            positive_i)`` batches with a ``dataset`` attribute providing
            ``item_num``. ``positive_u`` holds batch-row indices and
            ``positive_i`` the matching held-out item ids.
        config: Mapping whose ``'device'`` entry is where batches are moved
            before scoring.
        k_list: Ranking cutoffs to evaluate.

    Returns dict: {user_id: {'ndcg@k': value, 'hit@k': value, ...}}
    Users without any positive item in their batch are omitted.
    """
    model.eval()
    device = config['device']
    dataset = test_data.dataset
    n_items = dataset.item_num
    k_list = list(k_list)
    # Positional DCG discounts 1/log2(rank + 1) for ranks 1..max(k), computed once.
    discounts = 1.0 / np.log2(np.arange(2, max(k_list, default=0) + 2))
    user_metrics = {}

    with torch.no_grad():
        for interaction, history_index, positive_u, positive_i in test_data:
            # The model lives on `device`; score the batch there.
            interaction = interaction.to(device)
            scores = _score_all_items(model, interaction, dataset, n_items)
            scores = scores.view(-1, n_items)

            # Item id 0 is RecBole's padding token and must never be recommended.
            scores[:, 0] = -np.inf
            # history_index can be None; indexing with None would wipe every score.
            if history_index is not None:
                scores[history_index] = -np.inf

            scores_np = scores.cpu().numpy()
            user_ids = interaction[model.USER_ID].cpu().numpy()
            pos_u_np = positive_u.cpu().numpy()
            pos_i_np = positive_i.cpu().numpy()

            for idx, user_id in enumerate(user_ids):
                if user_id in user_metrics:
                    continue

                # Held-out items belonging to this batch row.
                pos_items = pos_i_np[pos_u_np == idx]
                if len(pos_items) == 0:
                    continue

                ranked_items = np.argsort(-scores_np[idx])

                metrics = {}
                for k in k_list:
                    hits = np.isin(ranked_items[:k], pos_items)
                    dcg = float(np.sum(discounts[:len(hits)][hits]))
                    # Ideal ranking places all positives (up to k) at the top.
                    idcg = float(np.sum(discounts[:min(len(pos_items), k)]))

                    metrics[f'hit@{k}'] = 1.0 if hits.any() else 0.0
                    metrics[f'ndcg@{k}'] = dcg / idcg if idcg > 0 else 0.0

                user_metrics[user_id] = metrics

    return user_metrics


print("Per-user metric extraction function defined.")

## Cell 5: Wilcoxon Test Functions

In [None]:
def run_wilcoxon_test(bce_metrics, fl_metrics, metric='ndcg@10'):
    """Run a paired Wilcoxon signed-rank test of Focal Loss vs BCE per-user scores.

    Args:
        bce_metrics: ``{user_id: {metric_name: value}}`` for the BCE model.
        fl_metrics: Same structure for the Focal Loss model.
        metric: Key of the per-user metric to compare.

    Returns:
        Dict with sample size, means, win/loss/tie counts, the two-sided W
        statistic, two-sided and one-sided (FL > BCE) p-values, a signed
        matched-pairs rank-biserial r (positive when FL ranks higher) and
        Cohen's d of the differences (mean / population std).
        When every paired difference is zero the test is undefined; the
        statistic is reported as NaN, both p-values as 1.0 and r as 0.

    Raises:
        ValueError: If the two inputs share no users.
    """
    # Sort once so both score arrays are aligned on the same user order.
    common_users = sorted(set(bce_metrics.keys()) & set(fl_metrics.keys()))
    n_users = len(common_users)
    if n_users == 0:
        raise ValueError("No users in common between bce_metrics and fl_metrics.")

    bce_scores = np.array([bce_metrics[u][metric] for u in common_users], dtype=float)
    fl_scores = np.array([fl_metrics[u][metric] for u in common_users], dtype=float)
    differences = fl_scores - bce_scores

    # Statistics
    mean_diff = np.mean(differences)
    std_diff = np.std(differences)
    fl_wins = int(np.sum(differences > 0))
    bce_wins = int(np.sum(differences < 0))
    ties = int(np.sum(differences == 0))
    n_nonzero = int(np.count_nonzero(differences))

    if n_nonzero == 0:
        # SciPy cannot rank an all-zero difference vector; report "no evidence".
        stat, p_two, p_greater, r_effect = float('nan'), 1.0, 1.0, 0.0
    else:
        # Two-sided statistic is min(W+, W-); the one-sided statistic is W+.
        stat, p_two = stats.wilcoxon(differences, alternative='two-sided')
        w_plus, p_greater = stats.wilcoxon(differences, alternative='greater')

        # Signed rank-biserial r = (W+ - W-) / (W+ + W-), with W+ + W- = n(n+1)/2
        # over the non-zero differences. Using W+ keeps the direction of the effect.
        total_rank = n_nonzero * (n_nonzero + 1) / 2
        r_effect = (2 * w_plus - total_rank) / total_rank

    cohens_d = mean_diff / std_diff if std_diff > 0 else 0

    return {
        'n_users': n_users,
        'mean_bce': np.mean(bce_scores),
        'mean_fl': np.mean(fl_scores),
        'mean_diff': mean_diff,
        'std_diff': std_diff,
        'fl_wins': fl_wins,
        'bce_wins': bce_wins,
        'ties': ties,
        'wilcoxon_stat': stat,
        'p_two_sided': p_two,
        'p_one_sided': p_greater,
        'rank_biserial_r': r_effect,
        'cohens_d': cohens_d,
    }


def print_results(results, metric, alpha=0.05):
    """Pretty-print one metric's result dict from ``run_wilcoxon_test``.

    ``alpha`` is the threshold the one-sided p-value is compared against.
    """

    def share_of_users(count):
        # Percentage of the paired users that `count` represents.
        return count / results['n_users'] * 100

    def magnitude(value, small, medium, large):
        # Verbal label for an absolute effect size given three cutoffs.
        if value < small:
            return "negligible"
        if value < medium:
            return "small"
        if value < large:
            return "medium"
        return "large"

    rule = "=" * 70

    print(rule)
    print(f"WILCOXON SIGNED-RANK TEST: {metric.upper()}")
    print(rule)

    print(f"\nSample Size: n = {results['n_users']} users")

    print("\nDescriptive Statistics:")
    print(f"  BCE mean:  {results['mean_bce']:.4f}")
    print(f"  FL mean:   {results['mean_fl']:.4f}")
    # Relative change is only meaningful against a positive baseline.
    if results['mean_bce'] > 0:
        relative_change = results['mean_diff'] / results['mean_bce'] * 100
    else:
        relative_change = 0
    print(f"  Difference: {results['mean_diff']:+.4f} ({relative_change:+.1f}%)")

    print("\nWin/Loss/Tie:")
    print(f"  FL wins:  {results['fl_wins']} ({share_of_users(results['fl_wins']):.1f}%)")
    print(f"  BCE wins: {results['bce_wins']} ({share_of_users(results['bce_wins']):.1f}%)")
    print(f"  Ties:     {results['ties']}")

    print("\nStatistical Test:")
    print(f"  Wilcoxon W: {results['wilcoxon_stat']:.2f}")
    print(f"  p-value (two-sided): {results['p_two_sided']:.2e}")
    print(f"  p-value (one-sided): {results['p_one_sided']:.2e}")

    if results['p_one_sided'] < alpha:
        verdict = "YES ✓"
    else:
        verdict = "NO"
    print(f"  Significant at α={alpha}: {verdict}")

    print("\nEffect Sizes:")
    d_label = magnitude(abs(results['cohens_d']), 0.2, 0.5, 0.8)
    print(f"  Cohen's d: {results['cohens_d']:.3f} ({d_label})")

    r_label = magnitude(abs(results['rank_biserial_r']), 0.1, 0.3, 0.5)
    print(f"  Rank-biserial r: {results['rank_biserial_r']:.3f} ({r_label})")

    print(rule)


print("Wilcoxon test functions defined.")

## Cell 6: Configuration

In [None]:
# Experiment configuration: constants a reader is expected to tune, followed by the
# dictionary that is later passed to RecBole's Config as `config_dict`.
SEED = 42
DATASET = 'ml-100k'
SAMPLING_RATIO = 10  # Negatives per positive during training; change to 4 or 50 for other ratios

# Focal Loss parameters (passed to NeuMF_FocalLoss when the FL model is built)
FL_GAMMA = 2.0
FL_ALPHA = 0.25

# Base configuration
config_dict = {
    'seed': SEED,
    'reproducibility': True,
    'data_path': 'dataset/',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'RATING_FIELD': 'rating',
    'TIME_FIELD': 'timestamp',
    'load_col': {'inter': ['user_id', 'item_id', 'rating', 'timestamp']},
    # NOTE(review): presumably val_interval keeps only interactions with rating >= 3 and
    # threshold then derives a 0/1 label from the same field — confirm against RecBole docs.
    # NOTE(review): threshold is set together with train_neg_sample_args below; verify that
    # RecBole accepts label-by-threshold and negative sampling at the same time.
    'val_interval': {'rating': '[3,inf)'},
    'threshold': {'rating': 3},
    
    # Training
    'epochs': 100,
    'train_batch_size': 256,
    'eval_batch_size': 4096,
    'learning_rate': 0.001,
    'train_neg_sample_args': {
        'distribution': 'uniform',
        'sample_num': SAMPLING_RATIO,
        'dynamic': False,
    },
    
    # Model
    'embedding_size': 64,
    'mlp_hidden_size': [128, 64, 32],
    'dropout_prob': 0.1,
    
    # Evaluation
    # NOTE(review): 'LS' + 'TO' presumably means a time-ordered leave-one-out split with
    # one validation and one test item per user — confirm.
    'eval_args': {
        'split': {'LS': 'valid_and_test'},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
    },
    'metrics': ['Hit', 'NDCG'],
    'topk': [5, 10, 20],
    'valid_metric': 'NDCG@10',
    
    # Misc
    'stopping_step': 10,
    # Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'show_progress': True,
}

# Echo the settings that distinguish this run.
print(f"Configuration set:")
print(f"  Dataset: {DATASET}")
print(f"  Sampling ratio: 1:{SAMPLING_RATIO}")
print(f"  Focal Loss: gamma={FL_GAMMA}, alpha={FL_ALPHA}")
print(f"  Device: {config_dict['device']}")

## Cell 7: Prepare Data

In [None]:
# Initialize: seed the RNGs first, then build the RecBole config and logger.
# The statement order matters because later steps consume random state.
init_seed(SEED, reproducibility=True)
config = Config(model='NeuMF', dataset=DATASET, config_dict=config_dict)
init_logger(config)

# Create dataset and split it into the train / validation / test loaders that
# both models share.
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# NOTE(review): user_num / item_num presumably include RecBole's padding ID, so the
# counts (and the sparsity derived from them) may be off by one — confirm.
print(f"\nDataset statistics:")
print(f"  Users: {dataset.user_num}")
print(f"  Items: {dataset.item_num}")
print(f"  Interactions: {dataset.inter_num}")
print(f"  Sparsity: {1 - dataset.inter_num / (dataset.user_num * dataset.item_num):.4%}")

## Cell 8: Train BCE Model

In [None]:
print("="*70)
print("Training BCE Baseline")
print("="*70)

# Re-seed immediately before building the model so the BCE and Focal Loss models
# start from the same RNG state (the Focal Loss cell re-seeds at the same point)
# and so re-running this cell on its own is reproducible.
init_seed(SEED, reproducibility=True)

# Train BCE model
model_bce = NeuMF(config, dataset).to(config['device'])
trainer_bce = Trainer(config, model_bce)
best_valid_score_bce, best_valid_result_bce = trainer_bce.fit(train_data, valid_data)

# fit() leaves the model at its final-epoch weights (after the early-stopping
# patience has run out). Restore the best-validation checkpoint the trainer saved
# so the per-user evaluation uses the selected model. The file was written by this
# session's own trainer, so full unpickling (weights_only=False) is acceptable.
_ckpt_bce = torch.load(
    trainer_bce.saved_model_file,
    map_location=config['device'],
    weights_only=False,
)
model_bce.load_state_dict(_ckpt_bce['state_dict'])
model_bce.load_other_parameter(_ckpt_bce.get('other_parameter'))

print(f"\nBCE Best validation NDCG@10: {best_valid_score_bce:.4f}")

## Cell 9: Train Focal Loss Model

In [None]:
print("="*70)
print(f"Training Focal Loss (gamma={FL_GAMMA}, alpha={FL_ALPHA})")
print("="*70)

# Reset seed for fair comparison
init_seed(SEED, reproducibility=True)

# Train FL model
model_fl = NeuMF_FocalLoss(config, dataset, gamma=FL_GAMMA, alpha=FL_ALPHA).to(config['device'])
trainer_fl = Trainer(config, model_fl)
best_valid_score_fl, best_valid_result_fl = trainer_fl.fit(train_data, valid_data)

# fit() leaves the model at its final-epoch weights (after the early-stopping
# patience has run out). Restore the best-validation checkpoint the trainer saved
# so the per-user evaluation uses the selected model. The file was written by this
# session's own trainer, so full unpickling (weights_only=False) is acceptable.
_ckpt_fl = torch.load(
    trainer_fl.saved_model_file,
    map_location=config['device'],
    weights_only=False,
)
model_fl.load_state_dict(_ckpt_fl['state_dict'])
model_fl.load_other_parameter(_ckpt_fl.get('other_parameter'))

print(f"\nFL Best validation NDCG@10: {best_valid_score_fl:.4f}")

## Cell 10: Extract Per-User Metrics

In [None]:
def _extract_and_report(title, trained_model):
    """Compute per-user test metrics for one model and print how many users it covered."""
    print(f"\n{title}:")
    per_user = get_per_user_metrics(trained_model, test_data, config)
    print(f"  Extracted metrics for {len(per_user)} users")
    return per_user


print("Extracting per-user metrics...")
bce_user_metrics = _extract_and_report("BCE model", model_bce)
fl_user_metrics = _extract_and_report("Focal Loss model", model_fl)

## Cell 11: Run Wilcoxon Test

In [None]:
# Paired test on NDCG@10 first ...
results_ndcg = run_wilcoxon_test(bce_user_metrics, fl_user_metrics, metric='ndcg@10')
print_results(results_ndcg, metric='ndcg@10')

# ... a visual gap between the two reports ...
print("\n")

# ... then the same test on Hit@10.
results_hit = run_wilcoxon_test(bce_user_metrics, fl_user_metrics, metric='hit@10')
print_results(results_hit, metric='hit@10')

## Cell 12: Summary Table

In [None]:
def _summary_row(label, res):
    """Format one metric's test results as a row of display strings."""
    n = res['n_users']
    # Relative change is only reported against a positive BCE baseline.
    if res['mean_bce'] > 0:
        rel_change = res['mean_diff'] / res['mean_bce'] * 100
    else:
        rel_change = 0
    return {
        'Metric': label,
        'BCE': f"{res['mean_bce']:.4f}",
        'FL': f"{res['mean_fl']:.4f}",
        'Δ': f"{res['mean_diff']:+.4f}",
        '% Δ': f"{rel_change:+.1f}%",
        'FL Wins': f"{res['fl_wins']}/{n} ({res['fl_wins']/n*100:.1f}%)",
        'p-value': f"{res['p_one_sided']:.2e}",
        "Cohen's d": f"{res['cohens_d']:.3f}",
        'Significant': '✓' if res['p_one_sided'] < 0.05 else '✗',
    }


# One row per metric, in the order they were tested.
summary_data = [
    _summary_row(label, res)
    for label, res in (('NDCG@10', results_ndcg), ('Hit@10', results_hit))
]
df_summary = pd.DataFrame(summary_data)

_rule = "=" * 70
print("\n" + _rule)
print(f"SUMMARY: Per-User Wilcoxon Test (1:{SAMPLING_RATIO} sampling, n={results_ndcg['n_users']} users)")
print(_rule)
print(df_summary.to_string(index=False))
print(_rule)

## Cell 13: Save Results

In [None]:
# Collect the headline results in a single dict.
# NOTE(review): despite the cell title, nothing in this cell writes `results_dict`
# to disk — it is only built in memory and is not read again in this cell.
results_dict = {
    'sampling_ratio': SAMPLING_RATIO,
    'n_users': results_ndcg['n_users'],
    'fl_gamma': FL_GAMMA,
    'fl_alpha': FL_ALPHA,
    'ndcg@10': results_ndcg,
    'hit@10': results_hit,
}

# Print a Python dict literal for manual copy-paste into saved_results_ml100k.py.
# Doubled braces ({{ }}) are literal braces in the f-string; p-values and the
# significance flags use the one-sided test at the 0.05 level.
print("\n" + "="*70)
print("RESULTS FOR PAPER / saved_results_ml100k.py")
print("="*70)
print(f"""
WILCOXON_RESULTS_{SAMPLING_RATIO} = {{
    'sampling_ratio': {SAMPLING_RATIO},
    'n_users': {results_ndcg['n_users']},
    'fl_params': {{'gamma': {FL_GAMMA}, 'alpha': {FL_ALPHA}}},
    'ndcg@10': {{
        'bce_mean': {results_ndcg['mean_bce']:.4f},
        'fl_mean': {results_ndcg['mean_fl']:.4f},
        'mean_diff': {results_ndcg['mean_diff']:.4f},
        'fl_wins': {results_ndcg['fl_wins']},
        'p_value': {results_ndcg['p_one_sided']:.6f},
        'cohens_d': {results_ndcg['cohens_d']:.4f},
        'significant': {results_ndcg['p_one_sided'] < 0.05},
    }},
    'hit@10': {{
        'bce_mean': {results_hit['mean_bce']:.4f},
        'fl_mean': {results_hit['mean_fl']:.4f},
        'mean_diff': {results_hit['mean_diff']:.4f},
        'fl_wins': {results_hit['fl_wins']},
        'p_value': {results_hit['p_one_sided']:.6f},
        'cohens_d': {results_hit['cohens_d']:.4f},
        'significant': {results_hit['p_one_sided'] < 0.05},
    }},
}}
""")
print("="*70)

## Cell 14 (Optional): Run for Multiple Sampling Ratios

Uncomment and run to test all three sampling ratios (takes longer).

In [None]:
# # Uncomment to run for all sampling ratios
# all_results = {}
# 
# for ratio in [4, 10, 50]:
#     print(f"\n{'#'*70}")
#     print(f"# SAMPLING RATIO 1:{ratio}")
#     print(f"{'#'*70}\n")
#     
#     # Update config
#     config_dict['train_neg_sample_args']['sample_num'] = ratio
#     config = Config(model='NeuMF', dataset=DATASET, config_dict=config_dict)
#     dataset = create_dataset(config)
#     train_data, valid_data, test_data = data_preparation(config, dataset)
#     
#     # Train BCE
#     init_seed(SEED, reproducibility=True)
#     model_bce = NeuMF(config, dataset).to(config['device'])
#     trainer_bce = Trainer(config, model_bce)
#     trainer_bce.fit(train_data, valid_data)
#     
#     # Train FL
#     init_seed(SEED, reproducibility=True)
#     model_fl = NeuMF_FocalLoss(config, dataset, gamma=FL_GAMMA, alpha=FL_ALPHA).to(config['device'])
#     trainer_fl = Trainer(config, model_fl)
#     trainer_fl.fit(train_data, valid_data)
#     
#     # Extract metrics
#     bce_metrics = get_per_user_metrics(model_bce, test_data, config)
#     fl_metrics = get_per_user_metrics(model_fl, test_data, config)
#     
#     # Run test
#     results = run_wilcoxon_test(bce_metrics, fl_metrics, 'ndcg@10')
#     print_results(results, f'ndcg@10 (1:{ratio})')
#     all_results[ratio] = results
# 
# # Summary across ratios
# print("\n" + "="*70)
# print("SUMMARY ACROSS ALL SAMPLING RATIOS")
# print("="*70)
# for ratio, res in all_results.items():
#     sig = "✓" if res['p_one_sided'] < 0.05 else "✗"
#     print(f"1:{ratio}: FL {'+' if res['mean_diff'] > 0 else ''}{res['mean_diff']/res['mean_bce']*100:.1f}%, p={res['p_one_sided']:.2e}, d={res['cohens_d']:.3f} {sig}")