# 01 data exploration


# Data Exploration for CausalShapGNN

This notebook explores the benchmark datasets used for evaluating CausalShapGNN.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

from data import DataDownloader, DataPreprocessor

%matplotlib inline
plt.style.use('seaborn-whitegrid')

## 1. Download and Load Data

In [None]:
# Download dataset (e.g., MovieLens-100K for quick exploration)
# The downloader is constructed with '../data' (a path relative to this
# notebook); presumably that is the directory the files are written to —
# confirm in DataDownloader.
downloader = DataDownloader('../data')
downloader.download('movielens-100k')

In [None]:
# Load and preprocess
preprocessor = DataPreprocessor('../data', 'movielens-100k')
graph_data = preprocessor.load_data()

# Report the basic size of the loaded interaction data.
summary_lines = [
    f"Users: {graph_data.n_users}",
    f"Items: {graph_data.n_items}",
    f"Training interactions: {len(graph_data.train_interactions)}",
]
print("\n".join(summary_lines))

## 2. Analyze Interaction Distributions

In [None]:
# Degree distributions: number of training interactions per user and per item
user_degrees = defaultdict(int)
item_degrees = defaultdict(int)

for user, item in graph_data.train_interactions:
    user_degrees[user] += 1
    item_degrees[item] += 1

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One histogram panel per entity type; only the item panel overrides the
# default bar colour.
panels = [
    (user_degrees, 'Number of Users', 'User Interaction Distribution', {}),
    (item_degrees, 'Number of Items', 'Item Popularity Distribution', {'color': 'orange'}),
]
for ax, (degrees, y_label, panel_title, extra_kwargs) in zip(axes, panels):
    ax.hist(list(degrees.values()), bins=50, alpha=0.7, **extra_kwargs)
    ax.set_xlabel('Number of Interactions')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    # Log-scaled counts so sparsely populated bins stay visible.
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

## 3. Compute Statistics

In [None]:
stats = preprocessor.compute_statistics(graph_data)

# Floats are printed with four decimals; any other value uses its default
# string form.
for stat_name, stat_value in stats.items():
    rendered = f"{stat_value:.4f}" if isinstance(stat_value, float) else f"{stat_value}"
    print(f"{stat_name}: {rendered}")

## 4. Popularity Bias Analysis

In [None]:
# Lorenz curve for item popularity
# item_degrees only holds items that occur in the training interactions, so
# the curve describes inequality among those items.
item_pops = sorted(item_degrees.values())
if not item_pops:
    raise ValueError("No training interactions found; cannot draw a Lorenz curve.")

# A Lorenz curve runs from (0, 0) to (1, 1): after the k least popular of n
# items, x = k / n and y = their cumulative share of interactions. Prepending
# the origin gives n + 1 points that line up with an evenly spaced x-axis.
cumsum = np.concatenate(([0.0], np.cumsum(item_pops, dtype=float)))
cumsum = cumsum / cumsum[-1]
item_share = np.linspace(0, 1, len(cumsum))

plt.figure(figsize=(8, 8))
plt.plot(item_share, cumsum, label='Lorenz Curve')
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Equality')
plt.xlabel('Cumulative Share of Items')
plt.ylabel('Cumulative Share of Interactions')
# The Gini value in the title is taken from the statistics computed earlier.
plt.title(f'Item Popularity Inequality (Gini = {stats["item_gini"]:.3f})')
plt.legend()
plt.show()

# 02 training demo


# CausalShapGNN Training Demo

This notebook demonstrates how to train CausalShapGNN on a benchmark dataset.

In [None]:
import sys
sys.path.insert(0, '..')

import torch
import numpy as np
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

from config import get_default_config
from data import DataPreprocessor, BipartiteGraphProcessor, RecommendationDataset, collate_fn
from models import CausalShapGNN
from trainers import Trainer
from utils import set_seed

# Seed through the project's set_seed helper before any model or loader is built.
set_seed(42)

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 1. Load Data

In [None]:
# Load the MovieLens-100K data through the project's DataPreprocessor.
# NOTE(review): assumes the dataset is already present under ../data (the
# exploration notebook downloads it) — confirm whether load_data() fetches
# it when missing.
preprocessor = DataPreprocessor('../data', 'movielens-100k')
graph_data = preprocessor.load_data()

## 2. Configure Model

In [None]:
config = get_default_config()

# Dataset-dependent sizes first, then the architecture settings for this demo.
for option, setting in (
    ('n_users', graph_data.n_users),
    ('n_items', graph_data.n_items),
    ('embed_dim', 64),
    ('n_factors', 4),
    ('n_layers', 3),
):
    config[option] = setting

# Optimisation settings; this replaces any 'training' entry from the defaults.
config['training'] = dict(lr=0.001, batch_size=1024, n_epochs=50)

print("Configuration:")
for name, value in config.items():
    print(f"  {name}: {value}")

## 3. Initialize Model and Trainer

In [None]:
# Process graph
graph_processor = BipartiteGraphProcessor(
    graph_data.n_users,
    graph_data.n_items,
    graph_data.train_interactions,
    device,
)

# Create data loader
train_dataset = RecommendationDataset(graph_processor, graph_data.train_interactions)
loader_kwargs = dict(
    batch_size=config['training']['batch_size'],
    shuffle=True,
    collate_fn=collate_fn,
)
train_loader = DataLoader(train_dataset, **loader_kwargs)

# Initialize model
model = CausalShapGNN(config, device)
n_params = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {n_params:,}")

# Initialize trainer
trainer = Trainer(model, graph_processor, config, device)

## 4. Training Loop

In [None]:
# Per-epoch training loss and the validation recall from each evaluation.
train_losses = []
val_recalls = []

n_epochs = config['training']['n_epochs']

for epoch in range(n_epochs):
    # Train
    losses = trainer.train_epoch(train_loader, graph_processor.norm_adj)
    train_losses.append(losses['total'])

    # Evaluate every 5 epochs; other epochs only record the training loss.
    epoch_number = epoch + 1
    if epoch_number % 5 != 0:
        continue

    val_metrics = trainer.evaluate(graph_processor.norm_adj, graph_data.val_interactions)
    val_recalls.append(val_metrics['recall@20'])

    print(f"Epoch {epoch_number}/{n_epochs}")
    print(f"  Loss: {losses['total']:.4f}")
    print(f"  Val R@20: {val_metrics['recall@20']:.4f}")
    print(f"  Val N@20: {val_metrics['ndcg@20']:.4f}")

## 5. Plot Training Curves

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Must match the evaluation interval used by the training loop.
eval_every = 5

# Epochs are reported 1-based in the training log, so both panels use 1-based
# epoch numbers. The x-values are derived from what was actually recorded, so
# the plot still works if training was stopped before n_epochs.
loss_epochs = range(1, len(train_losses) + 1)
eval_epochs = [eval_every * (k + 1) for k in range(len(val_recalls))]

axes[0].plot(loss_epochs, train_losses)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training Loss')

axes[1].plot(eval_epochs, val_recalls, marker='o')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Recall@20')
axes[1].set_title('Validation Recall')

plt.tight_layout()
plt.show()

## 6. Final Evaluation

In [None]:
# Score the model on the held-out test interactions.
test_metrics = trainer.evaluate(graph_processor.norm_adj, graph_data.test_interactions)

# Metrics are listed alphabetically by name, four decimals each.
print("\nTest Set Results:")
for metric_name, metric_value in sorted(test_metrics.items()):
    print(f"  {metric_name}: {metric_value:.4f}")

# 03 explanation demo


# CausalShapGNN Explanation Demo

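This notebook demonstrates the multi-granularity explanations generated by CausalShapGNN.

**Note:** the model in this demo is freshly initialized rather than loaded from a trained checkpoint, so the recommendations and Shapley values shown are illustrative only. Load a trained checkpoint to obtain meaningful explanations.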

In [None]:
import sys
sys.path.insert(0, '..')

import torch
import numpy as np
import matplotlib.pyplot as plt

from config import get_default_config
from data import DataPreprocessor, BipartiteGraphProcessor
from models import CausalShapGNN
from explainers import FeatureShapley, PathShapley, UserProfileShapley
from explainers import ExplanationReport, ExplanationVisualizer
from utils import set_seed

# Seed through the project's set_seed helper before the model is instantiated below.
set_seed(42)
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. Load Model and Data

In [None]:
# Load data
preprocessor = DataPreprocessor('../data', 'movielens-100k')
graph_data = preprocessor.load_data()

# Setup config: dataset sizes plus the architecture settings used in this demo
config = get_default_config()
demo_settings = {
    'n_users': graph_data.n_users,
    'n_items': graph_data.n_items,
    'embed_dim': 64,
    'n_factors': 4,
    'n_layers': 3,
}
for option, setting in demo_settings.items():
    config[option] = setting

# Process graph
graph_processor = BipartiteGraphProcessor(
    graph_data.n_users,
    graph_data.n_items,
    graph_data.train_interactions,
    device,
)

# Initialize model (in practice, load trained checkpoint)
# No checkpoint is loaded here, so the model keeps its freshly initialised weights.
model = CausalShapGNN(config, device)
model.eval()

## 2. Generate Recommendations

In [None]:
user_id = 42

# Compute embeddings without tracking gradients; the third model output is unused.
with torch.no_grad():
    user_emb, item_emb, _ = model(graph_processor.norm_adj, use_causal_only=True)

# Score every item for this user via a dot product with the user embedding.
scores = torch.matmul(user_emb[user_id], item_emb.t())

# Mask training items
train_items = list(graph_processor.train_user_items[user_id])
if len(train_items) > 0:
    scores[train_items] = float('-inf')

# Keep the indices of the 10 highest-scoring items as a plain Python list.
top_items = torch.topk(scores, 10).indices.cpu().numpy().tolist()

print(f"Top 10 recommendations for User {user_id}:")
for rank, item in enumerate(top_items, start=1):
    print(f"  {rank}. Item {item}")

## 3. Feature-Level Explanations

In [None]:
# Explain the top-ranked recommendation at the level of the model's factors.
feature_explainer = FeatureShapley(model, device)
# NOTE(review): this calls an underscore-prefixed (private) method directly;
# presumably it prepares baseline statistics that compute() relies on —
# confirm whether FeatureShapley exposes a public setup method instead.
feature_explainer._compute_population_means(user_emb, item_emb)

item_idx = top_items[0]
shapley = feature_explainer.compute(user_id, item_idx, user_emb, item_emb)

# Display labels for the factors. NOTE(review): these names are assigned by
# hand; nothing in this notebook ties each factor to these concepts. The list
# length should match the number of values returned by compute(), because the
# zip() below silently truncates to the shorter sequence.
factor_names = ['Genre', 'Recency', 'Quality', 'Social']

print(f"\nFeature-level explanation for Item {item_idx}:")
for name, value in zip(factor_names, shapley):
    print(f"  {name}: {value:.4f}")

In [None]:
# Visualize
# NOTE(review): `visualizer` is constructed but not used by this cell.
visualizer = ExplanationVisualizer(factor_names)

# Non-negative contributions are drawn green, negative ones red.
colors = []
for contribution in shapley:
    colors.append('green' if contribution >= 0 else 'red')

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(factor_names, shapley, color=colors)
ax.set_xlabel('Shapley Value')
ax.set_title(f'Factor Contributions for Item {item_idx}')
# Thin vertical line at zero separating positive from negative contributions.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

## 4. User Profile Analysis

In [None]:
# Build a user-level profile from the feature-level explainer over all of the
# user's top recommendations (presumably an aggregate of the per-item
# explanations — confirm in UserProfileShapley).
profile_explainer = UserProfileShapley(feature_explainer)
user_profile = profile_explainer.compute(user_id, top_items, user_emb, item_emb)

# Turn the profile into a report using the hand-assigned factor labels, then print it.
report_generator = ExplanationReport(model, device, factor_names)
report = report_generator.generate_user_profile_report(user_id, user_profile, top_items)
print(report)

## 5. Compare Explanations Across Items

In [None]:
# Get explanations for top 5 items
# The loop uses its own variable names so it does not overwrite the
# notebook-level `item_idx` / `shapley` that earlier cells read; re-running
# those cells after this one would otherwise show the wrong item.
explanations = []
for rec_item in top_items[:5]:
    rec_shapley = feature_explainer.compute(user_id, rec_item, user_emb, item_emb)
    explanations.append({'item_idx': rec_item, 'feature_shapley': rec_shapley})

# Create heatmap: one row per recommended item, one column per factor
shapley_matrix = np.array([e['feature_shapley'] for e in explanations])

# Symmetric colour limits so that a Shapley value of 0 sits at the neutral
# midpoint of the diverging colormap (red = positive, blue = negative).
# Falls back to 1.0 when every value is exactly zero.
color_limit = float(np.abs(shapley_matrix).max()) or 1.0

plt.figure(figsize=(10, 6))
plt.imshow(
    shapley_matrix,
    cmap='RdBu_r',
    aspect='auto',
    vmin=-color_limit,
    vmax=color_limit,
)
plt.colorbar(label='Shapley Value')
plt.xticks(range(len(factor_names)), factor_names)
plt.yticks(range(len(explanations)), [f"Item {e['item_idx']}" for e in explanations])
plt.title('Factor Contributions Across Recommendations')
plt.tight_layout()
plt.show()