# Data Exploration for DyHuCoG

This notebook explores the MovieLens-100K dataset (user activity, item popularity, genres, sparsity, and graph structure) to provide insights for the DyHuCoG model.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add parent directory to path so the project-local `src` package below can be imported.
# NOTE(review): '..' is resolved against the kernel's working directory — presumably the
# notebook is run from a sub-folder of the repository root; confirm.
sys.path.append('..')

# Project-local dataset wrapper used by every cell below.
from src.data.dataset import RecommenderDataset

# Set style: matplotlib's dark-grid seaborn theme plus seaborn's 'husl' colour palette.
# The style must be applied before the palette so the palette is not reset by it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load Dataset

In [None]:
# Build the MovieLens-100K dataset object; test_size / val_size are passed as 0.2 / 0.1
dataset = RecommenderDataset(name='ml-100k', path='../data/', test_size=0.2, val_size=0.1)

print("Dataset loaded successfully!")

# Report the headline sizes exposed as attributes of the dataset object
for label, attr in (
    ("users", "n_users"),
    ("items", "n_items"),
    ("interactions", "n_interactions"),
    ("genres", "n_genres"),
):
    print(f"Number of {label}: {getattr(dataset, attr)}")

## 2. Basic Statistics

In [None]:
# Fetch the statistics mapping computed by the dataset object (reused by later cells)
stats = dataset.get_statistics()

print("Dataset Statistics:")
print("-" * 40)
for key, value in stats.items():
    # Floats are shown with four decimals; every other value is printed as-is
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

In [None]:
# Number of interactions in each split
print("\nData Splits:")
for split_label, split_attr in (
    ("Training", "train_ratings"),
    ("Validation", "val_ratings"),
    ("Test", "test_ratings"),
):
    print(f"{split_label} interactions: {len(getattr(dataset, split_attr))}")

# Number of users in each activity group.
# The thresholds in the labels are display text only; the grouping itself is done
# by the dataset object, not in this notebook.
print("\nUser Groups:")
for group_label, group_attr in (
    ("Cold users (< 5 interactions)", "cold_users"),
    ("Warm users (5-20 interactions)", "warm_users"),
    ("Hot users (> 20 interactions)", "hot_users"),
):
    print(f"{group_label}: {len(getattr(dataset, group_attr))}")

## 3. Distribution Analysis

In [None]:
# Interaction counts per user and per item in the training split
# (both Series are reused by the sparsity cell further down)
user_counts = dataset.train_ratings['user'].value_counts()
item_counts = dataset.train_ratings['item'].value_counts()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_users, ax_items), (ax_rank, ax_time) = axes

# Panel 1 (top-left): histogram of interactions per user, with the two
# activity thresholds drawn as dashed vertical lines
ax_users.hist(user_counts.values, bins=50, edgecolor='black', alpha=0.7)
ax_users.set(
    xlabel='Number of Interactions',
    ylabel='Number of Users',
    title='User Interaction Distribution',
)
for threshold, colour, legend_label in (
    (5, 'red', 'Cold threshold'),
    (20, 'orange', 'Warm threshold'),
):
    ax_users.axvline(x=threshold, color=colour, linestyle='--', label=legend_label)
ax_users.legend()

# Panel 2 (top-right): histogram of interactions per item
ax_items.hist(item_counts.values, bins=50, edgecolor='black', alpha=0.7, color='green')
ax_items.set(
    xlabel='Number of Interactions',
    ylabel='Number of Items',
    title='Item Popularity Distribution',
)

# Panel 3 (bottom-left): rank vs. activity on log-log axes
sorted_counts = sorted(user_counts.values, reverse=True)
user_ranks = range(1, len(sorted_counts) + 1)
ax_rank.loglog(user_ranks, sorted_counts, 'b-', linewidth=2)
ax_rank.set(
    xlabel='User Rank (log)',
    ylabel='Number of Interactions (log)',
    title='User Activity Power Law',
)
ax_rank.grid(True, which="both", alpha=0.3)

# Panel 4 (bottom-right): interactions over time; timestamps are converted
# with unit='s', i.e. they are interpreted as seconds since the Unix epoch
timestamps = dataset.train_ratings['timestamp']
interaction_times = pd.to_datetime(timestamps, unit='s')
ax_time.hist(interaction_times, bins=50, edgecolor='black', alpha=0.7, color='purple')
ax_time.set(
    xlabel='Time',
    ylabel='Number of Interactions',
    title='Temporal Distribution of Interactions',
)
ax_time.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Genre Analysis

In [None]:
# Count how many movies carry each genre label.
# dataset.item_genres maps an item to an iterable of indices into dataset.genre_cols.
genre_counts = dict.fromkeys(dataset.genre_cols, 0)
for genre_indices in dataset.item_genres.values():
    for genre_idx in genre_indices:
        genre_counts[dataset.genre_cols[genre_idx]] += 1

# Split the mapping into parallel label / height lists for the bar chart
genres = list(genre_counts)
counts = [genre_counts[genre_name] for genre_name in genres]

# Bar chart of movies per genre
plt.figure(figsize=(12, 6))
plt.bar(genres, counts, color='skyblue', edgecolor='black')
plt.xlabel('Genre')
plt.ylabel('Number of Movies')
plt.title('Movie Genre Distribution')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Genre co-occurrence matrix: entry (a, b) counts the movies tagged with both
# genre a and genre b, so the diagonal holds the per-genre movie counts
n_genres = len(dataset.genre_cols)
co_occurrence = np.zeros((n_genres, n_genres))

for genre_indices in dataset.item_genres.values():
    for row_genre in genre_indices:
        for col_genre in genre_indices:
            co_occurrence[row_genre, col_genre] += 1

# Divide every row by its own diagonal entry, turning counts into the share of
# the row genre's movies that also carry the column genre. Rows of genres with
# no movies are left untouched to avoid dividing by zero.
movies_per_genre = co_occurrence.diagonal().copy()
has_movies = movies_per_genre > 0
co_occurrence[has_movies] /= movies_per_genre[has_movies, np.newaxis]

# Annotated heatmap of the normalised matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    co_occurrence,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    xticklabels=dataset.genre_cols,
    yticklabels=dataset.genre_cols,
    cbar_kws={'label': 'Co-occurrence Probability'},
)
plt.title('Genre Co-occurrence Matrix')
plt.tight_layout()
plt.show()

## 5. User Preference Analysis

In [None]:
# Analyze genre preferences by user group
def _nonzero_columns(row):
    """Return the column indices of the non-zero entries of one matrix row.

    ``row.nonzero()`` follows two conventions depending on the array library:

    * NumPy / SciPy return a tuple with one index array per dimension; the
      last array holds the column indices (for a 1-D row it is the only one).
    * torch returns a single ``(nnz, ndim)`` index tensor; the last column
      holds the column indices.

    Taking ``nonzero()[0]`` is only correct for a 1-D NumPy row: with the
    torch layout it selects just the first non-zero entry (and raises
    ``IndexError`` when the row is empty), and with a SciPy 1xN row it
    selects row indices. Both layouts are handled here.

    Args:
        row: One row of the interaction matrix; must provide ``.nonzero()``.

    Returns:
        list[int]: Column indices of the non-zero entries, as Python ints.
    """
    nz = row.nonzero()
    if isinstance(nz, tuple):
        return [int(idx) for idx in nz[-1]]
    return [int(idx) for idx in nz[:, -1].tolist()]


def get_user_genre_preferences(user_list, dataset):
    """Average genre profile of the training items of a group of users.

    For every user in ``user_list`` the non-zero entries of
    ``dataset.train_mat[user]`` are treated as interacted items. Each item
    that has an entry in ``dataset.item_genres`` adds 1 to every genre it
    carries; the totals are divided by the number of such items.

    Args:
        user_list: Iterable of values used directly as row indices of
            ``dataset.train_mat``.
        dataset: Object exposing ``n_genres``, ``train_mat`` and
            ``item_genres`` (item id -> iterable of genre indices).

    Returns:
        np.ndarray: Vector of length ``dataset.n_genres``. Entry ``g`` is the
        fraction of counted items tagged with genre ``g``. Items can carry
        several genres, so entries need not sum to 1. All zeros when no item
        was counted.
    """
    genre_prefs = np.zeros(dataset.n_genres)
    total_items = 0

    for user in user_list:
        for item_idx in _nonzero_columns(dataset.train_mat[user]):
            # Column index -> item id uses a +1 offset.
            # NOTE(review): assumes train_mat columns are 0-based while
            # item_genres is keyed by 1-based ids — confirm, since the
            # sparsity cell indexes train_mat with raw ids and no offset.
            item_id = item_idx + 1
            if item_id in dataset.item_genres:
                for genre in dataset.item_genres[item_id]:
                    genre_prefs[genre] += 1
                # Only items with genre information count toward the average
                total_items += 1

    if total_items > 0:
        genre_prefs /= total_items

    return genre_prefs

# Genre profile of each activity group, computed on at most the first 50 users of the group
cold_prefs, warm_prefs, hot_prefs = (
    get_user_genre_preferences(group[:50], dataset)
    for group in (dataset.cold_users, dataset.warm_users, dataset.hot_users)
)

# Grouped bar chart: one bar per user group for every genre
x = np.arange(len(dataset.genre_cols))
width = 0.25

fig, ax = plt.subplots(figsize=(14, 6))
bar_specs = (
    (-width, cold_prefs, 'Cold Users', 'blue'),
    (0, warm_prefs, 'Warm Users', 'orange'),
    (width, hot_prefs, 'Hot Users', 'red'),
)
for offset, prefs, group_label, colour in bar_specs:
    # Shift each group's bars sideways so the three groups sit next to each other
    ax.bar(x + offset, prefs, width, label=group_label, color=colour, alpha=0.7)

ax.set(
    xlabel='Genre',
    ylabel='Preference Score',
    title='Genre Preferences by User Group',
)
ax.set_xticks(x)
ax.set_xticklabels(dataset.genre_cols, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Sparsity Analysis

In [None]:
# Visualize the sparsity pattern on a small, dense-as-possible corner of the data:
# the most active users against the most popular items
sample_users = 100
sample_items = 100

# value_counts() sorts by descending count, so the leading index entries
# are the most active users / most popular items
top_users = user_counts.index[:sample_users].tolist()
top_items = item_counts.index[:sample_items].tolist()

# Binary interaction matrix for the sampled users and items.
# NOTE(review): train_mat is indexed here with the raw user / item values from
# train_ratings, whereas get_user_genre_preferences adds 1 to a column index to
# obtain an item id — confirm which indexing convention train_mat uses.
interaction_matrix = np.zeros((sample_users, sample_items))
for row, user in enumerate(top_users):
    for col, item in enumerate(top_items):
        interaction_matrix[row, col] = 1 if dataset.train_mat[user, item] > 0 else 0

# Plot sparsity pattern
plt.figure(figsize=(10, 8))
plt.imshow(interaction_matrix, cmap='Blues', aspect='auto', interpolation='nearest')
plt.colorbar(label='Interaction')
plt.xlabel('Items (sorted by popularity)')
plt.ylabel('Users (sorted by activity)')
plt.title(f'Interaction Matrix Sparsity Pattern\n(Top {sample_users} users × Top {sample_items} items)')
plt.tight_layout()
plt.show()

# Share of empty cells in the displayed subset vs. in the full dataset
observed_cells = interaction_matrix.sum()
sparsity = 1 - (observed_cells / (sample_users * sample_items))
print(f"Sparsity of displayed subset: {sparsity:.2%}")
print(f"Overall dataset sparsity: {1 - stats['density']:.2%}")

## 7. Graph Structure Preview

In [None]:
# Analyze graph properties
from src.utils.graph_builder import GraphBuilder


def _report_graph(title, adjacency, node_count):
    """Print node count, stored-entry count and their ratio for one adjacency structure."""
    edge_count = adjacency._nnz()
    print(title)
    print(f"  Number of nodes: {node_count}")
    print(f"  Number of edges: {edge_count}")
    print(f"  Average degree: {edge_count / node_count:.2f}")


# User-item interaction graph
ui_adj, ui_nodes = GraphBuilder.build_user_item_graph(dataset)
_report_graph("User-Item Graph:", ui_adj, ui_nodes)

# Hypergraph variant (per its label, the one that includes genres)
hyper_adj, hyper_nodes = GraphBuilder.build_hypergraph(dataset)
_report_graph("\nHypergraph (with genres):", hyper_adj, hyper_nodes)

## 8. Summarize Processed Statistics

In [None]:
# Collect key statistics in one mapping for reference
# (the summary is only printed below; nothing is written to disk)
summary_stats = dict(
    dataset='MovieLens-100K',
    n_users=dataset.n_users,
    n_items=dataset.n_items,
    n_genres=dataset.n_genres,
    n_interactions=dataset.n_interactions,
    density=stats['density'],
    avg_user_interactions=stats['avg_interactions_per_user'],
    avg_item_interactions=stats['avg_interactions_per_item'],
    n_cold_users=len(dataset.cold_users),
    n_warm_users=len(dataset.warm_users),
    n_hot_users=len(dataset.hot_users),
    train_size=len(dataset.train_ratings),
    val_size=len(dataset.val_ratings),
    test_size=len(dataset.test_ratings),
)

# Display summary, one "key: value" line per entry in insertion order
print("Dataset Summary for DyHuCoG:")
print("=" * 50)
print("\n".join(f"{key}: {value}" for key, value in summary_stats.items()))