# DyHuCoG Model Analysis

This notebook provides in-depth analysis of the DyHuCoG model components and behavior.

In [None]:
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import networkx as nx

# Add parent directory to path
sys.path.append('..')

from src.models.dyhucog import DyHuCoG
from src.models.lightgcn import LightGCN
from src.models.ngcf import NGCF
from src.data.dataset import RecommenderDataset
from src.utils.graph_builder import GraphBuilder
from config.model_config import get_model_config

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## 1. Load Dataset and Initialize Models

In [None]:
# Dataset: ml-100k read from ../data/.
# test_size / val_size are presumably split fractions -- confirm against RecommenderDataset.
dataset_kwargs = dict(name='ml-100k', path='../data/', test_size=0.2, val_size=0.1)
dataset = RecommenderDataset(**dataset_kwargs)

# Configuration: parsed from the YAML file and flattened to a plain dict
config = get_model_config('../config/config.yaml').to_dict()

# Quick sanity print of the sizes and the two hyper-parameters used below
latent_dim = config['model']['latent_dim']
n_layers = config['model']['n_layers']
print(f"Dataset loaded: {dataset.n_users} users, {dataset.n_items} items")
print(f"Model config: latent_dim={latent_dim}, n_layers={n_layers}")

In [None]:
# Build both models from the 'model' section of the config and move them to `device`.
# NOTE(review): no checkpoint is loaded anywhere in the visible notebook, so unless
# the constructors restore weights themselves, the analyses below run on freshly
# initialised models -- confirm this is intended.
model_cfg = config['model']

# DyHuCoG receives the whole model config plus the genre count
dyhucog = DyHuCoG(
    n_users=dataset.n_users,
    n_items=dataset.n_items,
    n_genres=dataset.n_genres,
    config=model_cfg,
)
dyhucog = dyhucog.to(device)

# LightGCN baseline only needs the embedding size and the number of layers
lightgcn = LightGCN(
    n_users=dataset.n_users,
    n_items=dataset.n_items,
    latent_dim=model_cfg['latent_dim'],
    n_layers=model_cfg['n_layers'],
)
lightgcn = lightgcn.to(device)

print("Models initialized successfully!")

## 2. Cooperative Game Analysis

In [None]:
# Analyze DAE reconstruction
from src.data.dataloader import get_dataloader

# Pull a single batch from the training loader
train_loader = get_dataloader(dataset, 'train', config)
batch = next(iter(train_loader))

# Interaction vectors of (at most) the first 10 users in the batch
user_items = batch['user_items'][:10].to(device)

# Run the DAE in eval mode and without gradient tracking
dyhucog.eval()
with torch.no_grad():
    reconstructed = dyhucog.dae(user_items)

# Visualize reconstruction: one panel per user on a 2x5 grid
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.flatten()

# A batch may hold fewer than 10 users; only index rows that actually exist
n_shown = min(len(axes), user_items.shape[0])
width = 0.35

for i in range(n_shown):
    ax = axes[i]

    # Only the first 20 item columns are drawn (not the 20 highest-scoring ones)
    original = user_items[i].cpu().numpy()[:20]
    recon = reconstructed[i].cpu().numpy()[:20]

    x = np.arange(len(original))

    # Side-by-side bars: input on the left, reconstruction on the right
    ax.bar(x - width/2, original, width, label='Original', alpha=0.7)
    ax.bar(x + width/2, recon, width, label='Reconstructed', alpha=0.7)

    ax.set_title(f'User {i+1}')
    ax.set_ylim(0, 1.1)

    # One legend is enough for the whole grid
    if i == 0:
        ax.legend()

# Hide panels that have no user to show
for unused_ax in axes[n_shown:]:
    unused_ax.set_visible(False)

plt.suptitle('DAE Reconstruction Quality (First 20 Items)', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Shapley-value inspection for a handful of hand-picked users
sample_users = [1, 10, 50, 100, 500]
shapley_analysis = []

with torch.no_grad():
    for user_id in sample_users:
        # NOTE(review): the training matrix row is indexed with the raw user id here,
        # while the attention cell further down indexes embeddings with `id - 1`.
        # Confirm which convention train_mat follows.
        user_items = dataset.train_mat[user_id].to(device)
        if not user_items.sum() > 0:
            continue  # nothing to analyse for a user without interactions

        # The network is fed a batch of one; squeeze() drops the batch axis again
        shapley_values = dyhucog.shapley_net(user_items.unsqueeze(0)).squeeze()

        # (item id, Shapley value) for every interacted item, best first.
        # Column positions are shifted by one to obtain the reported item id.
        item_indices = torch.where(user_items > 0)[0]
        item_shapley = sorted(
            ((idx.item() + 1, shapley_values[idx].item()) for idx in item_indices),
            key=lambda pair: pair[1],
            reverse=True,
        )
        scores = [score for _, score in item_shapley]

        shapley_analysis.append({
            'user_id': user_id,
            'n_items': len(item_indices),
            'top_items': item_shapley[:5],
            'mean_shapley': np.mean(scores),
            'std_shapley': np.std(scores)
        })

# Report: per-user summary followed by the five highest-valued items
for entry in shapley_analysis:
    print(f"\nUser {entry['user_id']} ({entry['n_items']} items):")
    print(f"  Mean Shapley value: {entry['mean_shapley']:.4f} ± {entry['std_shapley']:.4f}")
    print("  Top 5 items by Shapley value:")
    for top_item_id, top_value in entry['top_items']:
        print(f"    Item {top_item_id}: {top_value:.4f}")

## 3. Graph Structure Analysis

In [None]:
# Build the plain user-item graph and the hypergraph from the same dataset.
# Each builder returns (adjacency, node count).
ui_adj, ui_nodes = GraphBuilder.build_user_item_graph(dataset)
hyper_adj, hyper_nodes = GraphBuilder.build_hypergraph(dataset)

# Normalised copies of both adjacencies
ui_adj_norm, hyper_adj_norm = (
    GraphBuilder.normalize_adj(adj) for adj in (ui_adj, hyper_adj)
)

# _nnz() is the number of stored entries of each adjacency; it is reported as the edge count
ui_edges = ui_adj._nnz()
hyper_edges = hyper_adj._nnz()

print("Graph Statistics:")
print(f"User-Item Graph: {ui_nodes} nodes, {ui_edges} edges")
print(f"Hypergraph: {hyper_nodes} nodes, {hyper_edges} edges")
print(f"\nAdditional nodes in hypergraph: {hyper_nodes - ui_nodes} (genres)")
print(f"Additional edges in hypergraph: {hyper_edges - ui_edges} (item-genre connections)")

In [None]:
# Visualize a small subgraph
# Sample a small neighborhood for visualization: one user and up to five of its items
sample_user = 1
# torch.where(...)[0] gives the 1-D positions of the interacted items (the same idiom
# the Shapley cell uses); .tolist() turns the first five into plain Python ints.
# NOTE(review): the row is indexed with the raw user id, whereas the attention cell
# uses `id - 1` for embeddings -- confirm which convention train_mat follows.
sample_items = torch.where(dataset.train_mat[sample_user] > 0)[0][:5].tolist()

# Create NetworkX graph
G = nx.Graph()

# Add user node
G.add_node(f'U{sample_user}', node_type='user')

# Add item nodes and edges
for item_idx in sample_items:
    # item_idx is already a Python int; shift by one to get the item id
    item_id = item_idx + 1
    G.add_node(f'I{item_id}', node_type='item')
    G.add_edge(f'U{sample_user}', f'I{item_id}')

    # Add genre nodes (only for items that have a genre entry)
    if item_id in dataset.item_genres:
        for genre_idx in dataset.item_genres[item_id]:
            genre_name = dataset.genre_cols[genre_idx]
            G.add_node(f'G{genre_name}', node_type='genre')
            G.add_edge(f'I{item_id}', f'G{genre_name}')

# Plot the subgraph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, k=2, iterations=50)

# Draw nodes by type, each type with its own colour, size and marker
node_styles = [
    ('user', dict(node_color='red', node_size=1000, node_shape='s', label='User')),
    ('item', dict(node_color='lightblue', node_size=800, label='Items')),
    ('genre', dict(node_color='lightgreen', node_size=600, node_shape='^', label='Genres')),
]
for node_type, style in node_styles:
    nodelist = [n for n, d in G.nodes(data=True) if d['node_type'] == node_type]
    nx.draw_networkx_nodes(G, pos, nodelist=nodelist, **style)

nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos)

plt.title(f'Hypergraph Structure Example (User {sample_user})', fontsize=14)
plt.legend()
plt.axis('off')
plt.tight_layout()
plt.show()

## 4. Embedding Analysis

In [None]:
# Compare embeddings from different models
# Build weighted graph for DyHuCoG
# Order matters here: the model first produces edge weights from the training
# matrix, GraphBuilder turns them into an edge list, and that edge list (plus
# dataset.item_genres) is handed to build_hypergraph() before forward() is called.
edge_weights = dyhucog.compute_shapley_weights(dataset.train_mat.to(device))
edge_index, edge_weight = GraphBuilder.get_edge_list(dataset, edge_weights)
dyhucog.build_hypergraph(edge_index.to(device), edge_weight.to(device), dataset.item_genres)

# Get embeddings
with torch.no_grad():
    # DyHuCoG embeddings
    # forward() returns a single matrix; the first n_users rows are taken as users
    # and the next n_items rows as items. Any remaining rows are not used here
    # (presumably genre nodes -- confirm against the model).
    dyhucog_emb = dyhucog.forward()
    dyhucog_user_emb = dyhucog_emb[:dataset.n_users]
    dyhucog_item_emb = dyhucog_emb[dataset.n_users:dataset.n_users + dataset.n_items]
    
    # LightGCN embeddings
    # Uses the normalised user-item adjacency built in the graph-structure cell;
    # forward() returns user and item embeddings as a pair.
    lightgcn_user_emb, lightgcn_item_emb = lightgcn.forward(ui_adj_norm.to(device))

# Shapes of both models' embeddings, for a quick side-by-side check
print(f"DyHuCoG embeddings: User {dyhucog_user_emb.shape}, Item {dyhucog_item_emb.shape}")
print(f"LightGCN embeddings: User {lightgcn_user_emb.shape}, Item {lightgcn_item_emb.shape}")

In [None]:
# Visualize embedding distributions
def _overlay_hist(ax, dyhucog_values, lightgcn_values, xlabel, title):
    """Draw DyHuCoG (blue) and LightGCN (orange) histograms of two 1-D arrays on ``ax``."""
    ax.hist(dyhucog_values, bins=50, alpha=0.7, label='DyHuCoG', color='blue')
    ax.hist(lightgcn_values, bins=50, alpha=0.7, label='LightGCN', color='orange')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend()


fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# User embedding norms (L2 norm of every row)
dyhucog_user_norms = torch.norm(dyhucog_user_emb, dim=1).cpu().numpy()
lightgcn_user_norms = torch.norm(lightgcn_user_emb, dim=1).cpu().numpy()
_overlay_hist(axes[0, 0], dyhucog_user_norms, lightgcn_user_norms,
              'Embedding Norm', 'User Embedding Norm Distribution')

# Item embedding norms
dyhucog_item_norms = torch.norm(dyhucog_item_emb, dim=1).cpu().numpy()
lightgcn_item_norms = torch.norm(lightgcn_item_emb, dim=1).cpu().numpy()
_overlay_hist(axes[0, 1], dyhucog_item_norms, lightgcn_item_norms,
              'Embedding Norm', 'Item Embedding Norm Distribution')

# Embedding similarity distribution (sample)
# A locally seeded generator makes this cell give the same sample on every run;
# the sample size is capped so that sampling without replacement cannot fail.
rng = np.random.default_rng(42)
sample_size = min(100, dataset.n_users)
sample_indices = rng.choice(dataset.n_users, size=sample_size, replace=False)
# Strict upper triangle: every unordered user pair once, self-similarities excluded
pair_idx = np.triu_indices(sample_size, k=1)

# DyHuCoG similarities. Rows are L2-normalised first so that the Gram matrix holds
# cosine similarities, which is what the axis label below promises.
dyhucog_sample = torch.nn.functional.normalize(dyhucog_user_emb[sample_indices], dim=1)
dyhucog_sim = torch.matmul(dyhucog_sample, dyhucog_sample.t()).cpu().numpy()
dyhucog_sim_flat = dyhucog_sim[pair_idx]

# LightGCN similarities (same users, same normalisation)
lightgcn_sample = torch.nn.functional.normalize(lightgcn_user_emb[sample_indices], dim=1)
lightgcn_sim = torch.matmul(lightgcn_sample, lightgcn_sample.t()).cpu().numpy()
lightgcn_sim_flat = lightgcn_sim[pair_idx]

_overlay_hist(axes[1, 0], dyhucog_sim_flat, lightgcn_sim_flat,
              'Cosine Similarity', 'User-User Similarity Distribution')

# t-SNE visualization (small sample)
from sklearn.manifold import TSNE

tsne_sample = min(200, dataset.n_users)
tsne_indices = rng.choice(dataset.n_users, size=tsne_sample, replace=False)

# Combine embeddings: DyHuCoG rows first, LightGCN rows second, so both models
# are projected by one shared t-SNE fit
combined_emb = np.vstack([
    dyhucog_user_emb[tsne_indices].cpu().numpy(),
    lightgcn_user_emb[tsne_indices].cpu().numpy()
])

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(combined_emb)

# Plot: the first tsne_sample rows belong to DyHuCoG, the rest to LightGCN
axes[1, 1].scatter(tsne_results[:tsne_sample, 0], tsne_results[:tsne_sample, 1],
                  alpha=0.6, label='DyHuCoG', color='blue')
axes[1, 1].scatter(tsne_results[tsne_sample:, 0], tsne_results[tsne_sample:, 1],
                  alpha=0.6, label='LightGCN', color='orange')
axes[1, 1].set_xlabel('t-SNE 1')
axes[1, 1].set_ylabel('t-SNE 2')
axes[1, 1].set_title('User Embedding t-SNE Visualization')
axes[1, 1].legend()

plt.suptitle('Embedding Analysis: DyHuCoG vs LightGCN', fontsize=14)
plt.tight_layout()
plt.show()

## 5. Attention Mechanism Analysis

In [None]:
# Attention weight for every (user, item) pair drawn from two small id lists
sample_users = torch.tensor([1, 10, 50, 100, 500], device=device)
sample_items = torch.tensor([1, 10, 50, 100, 500], device=device)

with torch.no_grad():
    emb = dyhucog.forward()
    # Ids are shifted by one to obtain embedding rows; item rows start after the
    # n_users user rows.
    user_emb = emb[sample_users - 1]
    item_emb = emb[dataset.n_users + sample_items - 1]

    # NOTE(review): dyhucog.attention is used unconditionally here, while the
    # parameter-count cell checks dyhucog.use_attention first -- confirm the
    # attribute always exists.
    # Users vary slowest, items fastest; each pair is scored as a batch of one.
    attention_weights = [
        dyhucog.attention(torch.cat([user_vec, item_vec]).unsqueeze(0)).item()
        for user_vec in user_emb
        for item_vec in item_emb
    ]

# One row per user, one column per item
n_rows, n_cols = len(sample_users), len(sample_items)
attention_matrix = np.array(attention_weights).reshape(n_rows, n_cols)

# Annotated heatmap of the weight matrix
item_labels = [f'Item {i}' for i in sample_items.cpu()]
user_labels = [f'User {u}' for u in sample_users.cpu()]
plt.figure(figsize=(8, 6))
sns.heatmap(
    attention_matrix,
    xticklabels=item_labels,
    yticklabels=user_labels,
    cmap='YlOrRd',
    annot=True,
    fmt='.3f',
    cbar_kws={'label': 'Attention Weight'},
)
plt.title('Attention Weights for User-Item Pairs')
plt.tight_layout()
plt.show()

# Summary statistics over all pairs
print(f"Attention weight statistics:")
for stat_label, stat_fn in (('Mean', np.mean), ('Std', np.std), ('Min', np.min), ('Max', np.max)):
    print(f"  {stat_label}: {stat_fn(attention_weights):.4f}")

## 6. Model Complexity Analysis

In [None]:
# Count parameters
def count_parameters(model):
    """Return the total number of elements across ``model``'s trainable parameters.

    Parameters with ``requires_grad == False`` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Trainable-parameter totals of both models
dyhucog_params = count_parameters(dyhucog)
lightgcn_params = count_parameters(lightgcn)

# DyHuCoG components. The embedding count is the raw size of the weight tensor;
# the attention module only contributes when the model has it switched on.
embedding_params = dyhucog.embedding.weight.numel()
dae_params = count_parameters(dyhucog.dae)
shapley_params = count_parameters(dyhucog.shapley_net)
if dyhucog.use_attention:
    attention_params = count_parameters(dyhucog.attention)
else:
    attention_params = 0

# (label, count) pairs in plotting order: the two totals first, then the components
param_rows = [
    ('DyHuCoG', dyhucog_params),
    ('LightGCN', lightgcn_params),
    ('DyHuCoG-Embedding', embedding_params),
    ('DyHuCoG-DAE', dae_params),
    ('DyHuCoG-Shapley', shapley_params),
    ('DyHuCoG-Attention', attention_params),
]
param_data = {
    'Model': [label for label, _ in param_rows],
    'Parameters': [count for _, count in param_rows],
}
param_df = pd.DataFrame(param_data)

# Bar chart, one colour per bar
plt.figure(figsize=(10, 6))
colors = ['blue', 'orange', 'lightblue', 'lightgreen', 'lightcoral', 'lightyellow']
plt.bar(param_df['Model'], param_df['Parameters'], color=colors)
plt.xlabel('Model/Component')
plt.ylabel('Number of Parameters')
plt.title('Model Parameter Comparison')
plt.xticks(rotation=45)

# Value label above every bar (fixed offset of 1000 on the y axis)
for bar_pos, n_params in enumerate(param_df['Parameters']):
    plt.text(bar_pos, n_params + 1000, f'{n_params:,}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Text summary: totals, then each component with its share of the DyHuCoG total
print("Parameter Summary:")
print(f"DyHuCoG total: {dyhucog_params:,}")
print(f"LightGCN total: {lightgcn_params:,}")
print(f"\nDyHuCoG breakdown:")
component_counts = (
    ('Embeddings', embedding_params),
    ('DAE', dae_params),
    ('Shapley Network', shapley_params),
    ('Attention', attention_params),
)
for component, count in component_counts:
    print(f"  {component}: {count:,} ({count/dyhucog_params*100:.1f}%)")

## 7. Save Model Analysis Results

In [None]:
# Save key findings
import json

# Stored-entry counts of both adjacencies, read once and reused below
ui_edge_count = ui_adj._nnz()
hyper_edge_count = hyper_adj._nnz()

# Everything in this dict must be JSON-serialisable: percentages are stored as
# pre-formatted strings, NumPy scalars are converted with float(), and node
# counts are passed through int() in case the graph builder returns NumPy integers.
analysis_results = {
    'model_comparison': {
        'dyhucog_params': dyhucog_params,
        'lightgcn_params': lightgcn_params,
        'parameter_overhead': f"{(dyhucog_params - lightgcn_params) / lightgcn_params * 100:.1f}%"
    },
    'graph_structure': {
        'user_item_nodes': int(ui_nodes),
        'hypergraph_nodes': int(hyper_nodes),
        'genre_nodes': int(hyper_nodes - ui_nodes),
        'edge_increase': f"{(hyper_edge_count - ui_edge_count) / ui_edge_count * 100:.1f}%"
    },
    'embedding_analysis': {
        'dyhucog_user_norm_mean': float(np.mean(dyhucog_user_norms)),
        'lightgcn_user_norm_mean': float(np.mean(lightgcn_user_norms)),
        'dyhucog_similarity_mean': float(np.mean(dyhucog_sim_flat)),
        'lightgcn_similarity_mean': float(np.mean(lightgcn_sim_flat))
    },
    'attention_stats': {
        'mean': float(np.mean(attention_weights)),
        'std': float(np.std(attention_weights)),
        'range': [float(np.min(attention_weights)), float(np.max(attention_weights))]
    }
}

# Create the output directory if needed so a fresh checkout does not fail on open()
results_path = Path('../results/model_analysis.json')
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(analysis_results, f, indent=2)

print("Model analysis completed and saved!")
print("\nKey Findings:")
print(f"- DyHuCoG has {analysis_results['model_comparison']['parameter_overhead']} more parameters than LightGCN")
print(f"- Hypergraph adds {analysis_results['graph_structure']['genre_nodes']} genre nodes")
print(f"- Edge count increases by {analysis_results['graph_structure']['edge_increase']}")
print(f"- Attention weights range from {analysis_results['attention_stats']['range'][0]:.3f} to {analysis_results['attention_stats']['range'][1]:.3f}")