# Performance Optimization with SUBMARIT

This notebook covers performance optimization techniques for SUBMARIT, including:
- Profiling and benchmarking
- Memory optimization
- Parallel processing
- Sparse matrix handling
- Caching strategies
- Large-scale data handling

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import memory_profiler
from scipy import sparse
import multiprocessing as mp
from functools import lru_cache
import warnings
warnings.filterwarnings('ignore')

# Import SUBMARIT modules
from submarit.algorithms import KSMLocalSearch, KSMLocalSearch2
from submarit.evaluation import ClusterEvaluator
from submarit.validation import run_clusters, run_clusters_topk

# Reproducibility and plot appearance: seed NumPy's global random generator
# and select the matplotlib style used by every figure below.
np.random.seed(42)
plt.style.use('seaborn-v0_8-darkgrid')

available_cores = mp.cpu_count()
print(f"Number of CPU cores available: {available_cores}")
print("Libraries imported successfully!")

## 1. Benchmarking and Profiling

Let's start by understanding where time is spent in typical clustering operations.

In [None]:
# Create test datasets of different sizes
def create_test_matrix(n_products, n_clusters=4, sparsity=0.0):
    """Create a symmetric test substitution matrix with known block structure.

    Products are split into ``n_clusters`` contiguous groups (the last group
    absorbs any remainder). Pairs inside a group get values drawn from
    U(0.6, 0.9); every pair also gets background noise drawn from U(0, 0.3)
    and the element-wise maximum of the two is kept. The result is then
    symmetrized and its diagonal is set to zero.

    Random numbers come from NumPy's global generator, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_products: Number of products (rows and columns of the matrix).
        n_clusters: Number of diagonal blocks to create.
        sparsity: Probability with which each off-diagonal *pair* of entries
            is zeroed. Values <= 0 leave the matrix dense.

    Returns:
        A ``(n_products, n_products)`` float array that is symmetric and has
        a zero diagonal, for every value of ``sparsity``.
    """
    matrix = np.zeros((n_products, n_products))
    cluster_size = n_products // n_clusters

    # Create block structure
    for i in range(n_clusters):
        start = i * cluster_size
        # The last cluster takes whatever is left over after integer division
        end = (i + 1) * cluster_size if i < n_clusters - 1 else n_products

        # Within-cluster substitution
        block = np.random.uniform(0.6, 0.9, (end - start, end - start))
        matrix[start:end, start:end] = block

    # Between-cluster substitution
    noise = np.random.uniform(0, 0.3, (n_products, n_products))
    matrix = np.maximum(matrix, noise)

    # Make symmetric and remove diagonal
    matrix = (matrix + matrix.T) / 2
    np.fill_diagonal(matrix, 0)

    # Apply sparsity
    if sparsity > 0:
        keep = np.random.random((n_products, n_products)) > sparsity
        # Decide once per unordered pair: take the strict upper triangle and
        # mirror it, so zeroing entry (i, j) also zeroes (j, i). An
        # independent per-cell mask would undo the symmetrization above.
        keep = np.triu(keep, k=1)
        keep = keep | keep.T
        matrix *= keep

    return matrix

# Benchmark different matrix sizes
sizes = [50, 100, 200, 500, 1000]
benchmark_results = []

print("Benchmarking different matrix sizes...")
print("Size | Time (s) | Memory (MB) | Iterations")
print("-" * 45)

for size in sizes:
    # Create matrix
    matrix = create_test_matrix(size)

    # Initialize algorithm
    search = KSMLocalSearch(
        n_clusters=4,
        max_iterations=100,
        n_restarts=5,
        random_state=42
    )

    # Measure time with perf_counter(): it is monotonic and high resolution,
    # whereas time.time() follows the adjustable system clock.
    start_time = time.perf_counter()
    result = search.fit(matrix)
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time
    # Size of the input array only -- this is not the peak memory of fit().
    memory_usage = matrix.nbytes / 1024 / 1024  # MB

    benchmark_results.append({
        'size': size,
        'time': elapsed_time,
        'memory': memory_usage,
        'iterations': result.n_iterations
    })

    print(f"{size:4d} | {elapsed_time:8.3f} | {memory_usage:11.2f} | {result.n_iterations:10d}")

df_benchmark = pd.DataFrame(benchmark_results)

In [None]:
# Visualize scaling behavior of the benchmark collected in df_benchmark
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Time complexity
ax = axes[0, 0]
ax.plot(df_benchmark['size'], df_benchmark['time'], 'o-', color='blue', linewidth=2, markersize=8)
ax.set_xlabel('Number of Products')
ax.set_ylabel('Time (seconds)')
ax.set_title('Time Complexity')
ax.grid(True, alpha=0.3)

# Memory usage
ax = axes[0, 1]
ax.plot(df_benchmark['size'], df_benchmark['memory'], 'o-', color='red', linewidth=2, markersize=8)
ax.set_xlabel('Number of Products')
ax.set_ylabel('Memory (MB)')
ax.set_title('Memory Usage')
ax.grid(True, alpha=0.3)

# Time per iteration
ax = axes[1, 0]
time_per_iter = df_benchmark['time'] / df_benchmark['iterations']
ax.plot(df_benchmark['size'], time_per_iter, 'o-', color='green', linewidth=2, markersize=8)
ax.set_xlabel('Number of Products')
ax.set_ylabel('Time per Iteration (seconds)')
ax.set_title('Iteration Efficiency')
ax.grid(True, alpha=0.3)

# Scaling analysis (log-log plot)
ax = axes[1, 1]
ax.loglog(df_benchmark['size'], df_benchmark['time'], 'o-', color='purple', linewidth=2, markersize=8, label='Observed')
# Fit power law: a straight line in log-log space, log(t) = power * log(n) + c,
# so the slope of the fit is the estimated exponent.
coeffs = np.polyfit(np.log(df_benchmark['size']), np.log(df_benchmark['time']), 1)
power = coeffs[0]
fitted_times = np.exp(coeffs[1]) * df_benchmark['size'] ** coeffs[0]
# The label uses TeX braces, so it must be wrapped in $...$ for matplotlib's
# mathtext to render the exponent instead of printing the braces literally.
ax.loglog(df_benchmark['size'], fitted_times, '--', color='orange', linewidth=2, label=f'$O(n^{{{power:.2f}}})$')
ax.set_xlabel('Number of Products')
ax.set_ylabel('Time (seconds)')
ax.set_title('Computational Complexity (Log-Log Scale)')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nEstimated time complexity: O(n^{power:.2f})")

## 2. Memory Optimization with Sparse Matrices

For large-scale problems with sparse substitution patterns, using sparse matrices can significantly reduce memory usage.

In [None]:
# Compare dense vs sparse matrix performance
n_products = 1000
sparsity_levels = [0.0, 0.5, 0.7, 0.9, 0.95, 0.99]
sparse_results = []
# Sparsity levels at which fit() rejected the scipy sparse input
sparse_fallback_levels = []

print("Comparing dense vs sparse matrix performance...")
print("Sparsity | Dense Time | Sparse Time | Dense Mem | Sparse Mem | Speedup")
print("-" * 75)

for sparsity in sparsity_levels:
    # Create dense matrix
    dense_matrix = create_test_matrix(n_products, sparsity=sparsity)

    # Create sparse matrix
    sparse_matrix = sparse.csr_matrix(dense_matrix)

    # Dense clustering
    search_dense = KSMLocalSearch(n_clusters=4, max_iterations=50, n_restarts=3)
    start_time = time.perf_counter()
    result_dense = search_dense.fit(dense_matrix)
    dense_time = time.perf_counter() - start_time

    # Sparse clustering (if supported)
    search_sparse = KSMLocalSearch(n_clusters=4, max_iterations=50, n_restarts=3)
    start_time = time.perf_counter()
    try:
        result_sparse = search_sparse.fit(sparse_matrix)
    except Exception:
        # Sparse input is not accepted: measure what a caller would really
        # have to do (convert back to dense, then fit) instead of inventing
        # a number. The fallback is reported after the table.
        sparse_fallback_levels.append(sparsity)
        start_time = time.perf_counter()
        result_sparse = search_sparse.fit(sparse_matrix.toarray())
    sparse_time = time.perf_counter() - start_time

    # Memory usage: raw array bytes for dense; for CSR the three backing
    # arrays (values, column indices, row pointers).
    dense_mem = dense_matrix.nbytes / 1024 / 1024
    sparse_mem = (sparse_matrix.data.nbytes + sparse_matrix.indices.nbytes +
                  sparse_matrix.indptr.nbytes) / 1024 / 1024

    speedup = dense_time / sparse_time

    sparse_results.append({
        'sparsity': sparsity,
        'dense_time': dense_time,
        'sparse_time': sparse_time,
        'dense_memory': dense_mem,
        'sparse_memory': sparse_mem,
        'speedup': speedup,
        'memory_ratio': dense_mem / sparse_mem
    })

    print(f"{sparsity:8.0%} | {dense_time:10.3f} | {sparse_time:11.3f} | "
          f"{dense_mem:9.2f} | {sparse_mem:10.2f} | {speedup:7.2f}x")

if sparse_fallback_levels:
    print("\nNote: sparse input was not accepted by KSMLocalSearch.fit; "
          "'Sparse Time' is the time to convert back to dense and fit.")

df_sparse = pd.DataFrame(sparse_results)

In [None]:
# Visualize sparse matrix benefits
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Memory savings (dense bytes / sparse bytes), log scale because the ratio
# grows quickly as the matrix empties out
ax1.plot(df_sparse['sparsity'] * 100, df_sparse['memory_ratio'], 'o-', color='green', linewidth=2, markersize=8)
ax1.set_xlabel('Sparsity (%)')
ax1.set_ylabel('Memory Savings Factor')
ax1.set_title('Memory Efficiency of Sparse Matrices')
ax1.grid(True, alpha=0.3)
ax1.set_yscale('log')

# Time comparison. The sparsity levels are unevenly spaced (e.g. 90, 95, 99),
# so bars placed at the numeric percentages overlap each other. Use evenly
# spaced category positions and label the ticks with the percentages instead.
width = 0.4
x = np.arange(len(df_sparse))
ax2.bar(x - width/2, df_sparse['dense_time'], width, label='Dense', color='blue', alpha=0.7)
ax2.bar(x + width/2, df_sparse['sparse_time'], width, label='Sparse', color='orange', alpha=0.7)
ax2.set_xticks(x)
ax2.set_xticklabels([f"{level * 100:g}" for level in df_sparse['sparsity']])
ax2.set_xlabel('Sparsity (%)')
ax2.set_ylabel('Time (seconds)')
ax2.set_title('Computation Time: Dense vs Sparse')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Parallel Processing

SUBMARIT can leverage multiple CPU cores for parallel execution of restarts and cross-validation.

In [None]:
# Demonstrate parallel processing benefits
n_products = 500
matrix = create_test_matrix(n_products)

# Test different numbers of parallel workers. Build the list as a sorted set:
# on machines with 1, 2, 4 or 8 cores a plain [1, 2, 4, 8, cpu_count] list
# would benchmark (and plot) the same worker count twice. Counts above the
# number of available cores are dropped.
cpu_total = mp.cpu_count()
n_workers_list = sorted({w for w in (1, 2, 4, 8, cpu_total) if w <= cpu_total})
n_runs = 50  # Total number of runs for top-k analysis

parallel_results = []

print("Testing parallel processing efficiency...")
print("Workers | Time (s) | Speedup | Efficiency")
print("-" * 45)

# Time of the first (single-worker) configuration; speedups are relative to it
baseline_time = None

for n_workers in n_workers_list:
    start_time = time.perf_counter()

    # Run parallel top-k analysis
    topk_result = run_clusters_topk(
        matrix,
        n_clusters=4,
        k=10,
        n_runs=n_runs,
        n_jobs=n_workers,
        random_state=42
    )

    elapsed_time = time.perf_counter() - start_time

    if baseline_time is None:
        baseline_time = elapsed_time

    speedup = baseline_time / elapsed_time
    # Efficiency = achieved speedup as a fraction of the ideal (linear) one
    efficiency = speedup / n_workers

    parallel_results.append({
        'workers': n_workers,
        'time': elapsed_time,
        'speedup': speedup,
        'efficiency': efficiency
    })

    print(f"{n_workers:7d} | {elapsed_time:8.2f} | {speedup:7.2f} | {efficiency:10.0%}")

df_parallel = pd.DataFrame(parallel_results)

In [None]:
# Visualize parallel scaling
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
worker_counts = df_parallel['workers']

# Left panel: measured speedup next to the ideal linear speedup (y = x)
ax1.plot(worker_counts, df_parallel['speedup'], 'o-', color='blue', linewidth=2, markersize=8, label='Actual')
ax1.plot(worker_counts, worker_counts, '--', color='red', linewidth=2, label='Ideal')
ax1.set(xlabel='Number of Workers', ylabel='Speedup', title='Parallel Speedup')

# Right panel: efficiency in percent, with a reference line at 100%
efficiency_pct = df_parallel['efficiency'] * 100
ax2.plot(worker_counts, efficiency_pct, 'o-', color='green', linewidth=2, markersize=8)
ax2.axhline(y=100, color='red', linestyle='--', label='Perfect efficiency')
ax2.set(xlabel='Number of Workers', ylabel='Efficiency (%)', title='Parallel Efficiency')

# Shared decoration for both panels
for panel in (ax1, ax2):
    panel.legend()
    panel.grid(True, alpha=0.3)
ax2.set_ylim(0, 110)

plt.tight_layout()
plt.show()

## 4. Caching and Memoization

For repeated calculations, caching can significantly improve performance.

In [None]:
# Example: Cached distance calculations
class CachedDistanceCalculator:
    """Example of caching distance calculations for clustering.

    Keeps a bounded, per-instance least-recently-used (LRU) cache keyed by
    ``(i, j, matrix_hash)`` with ``i <= j``, and counts cache hits and misses.

    Args:
        cache_size: Maximum number of entries to keep. A value <= 0 disables
            storage: every lookup is then a miss.
    """

    def __init__(self, cache_size=128):
        self.cache_size = cache_size
        # Plain dicts preserve insertion order, so as long as a hit re-inserts
        # its entry, the first key is always the least recently used one.
        self._cache = {}
        self.cache_hits = 0
        self.cache_misses = 0

    def _compute_distance(self, i, j, matrix_hash):
        """Compute the distance for one pair (placeholder implementation).

        Deliberately not wrapped in ``functools.lru_cache``: on a method the
        cache keys on ``self`` and keeps every instance alive, and it would
        duplicate the per-instance cache managed by ``get_distance``.
        """
        # This would be the actual distance calculation
        return np.random.random()  # Placeholder

    def get_distance(self, i, j, matrix):
        """Return the distance between items ``i`` and ``j`` of ``matrix``.

        The pair is order-insensitive: ``(i, j)`` and ``(j, i)`` share one
        cache entry. A hit refreshes the entry's recency; a miss computes the
        value, evicts the least recently used entry if the cache is full, and
        stores the new value.
        """
        # Create a simple hash for the matrix (in practice, use better
        # hashing). NOTE: tobytes() copies the whole array on every call.
        matrix_hash = hash(matrix.tobytes())

        # Ensure i <= j for cache consistency
        if i > j:
            i, j = j, i

        key = (i, j, matrix_hash)

        try:
            value = self._cache.pop(key)
        except KeyError:
            self.cache_misses += 1
        else:
            # Re-insert at the end so this entry becomes the most recent one
            self.cache_hits += 1
            self._cache[key] = value
            return value

        value = self._compute_distance(i, j, matrix_hash)

        if self.cache_size > 0:
            # LRU eviction if cache is full: drop the first (oldest) key
            if len(self._cache) >= self.cache_size:
                del self._cache[next(iter(self._cache))]
            self._cache[key] = value

        return value

    def get_stats(self):
        """Return hits, misses, hit rate and current number of cached entries."""
        total = self.cache_hits + self.cache_misses
        hit_rate = self.cache_hits / total if total > 0 else 0
        return {
            'hits': self.cache_hits,
            'misses': self.cache_misses,
            'hit_rate': hit_rate,
            'cache_size': len(self._cache)
        }

# Demonstrate caching benefits on a 100-product test matrix
n_products = 100
matrix = create_test_matrix(n_products)
calculator = CachedDistanceCalculator()

# Simulate repeated distance calculations for uniformly random index pairs
print("Simulating distance calculations with caching...")
n_calculations = 10000

for _ in range(n_calculations):
    i, j = np.random.randint(0, n_products), np.random.randint(0, n_products)
    _ = calculator.get_distance(i, j, matrix)

# Summarise how often the cache could answer a lookup
stats = calculator.get_stats()
report_lines = [
    "\nCache Statistics:",
    f"  Cache hits: {stats['hits']:,}",
    f"  Cache misses: {stats['misses']:,}",
    f"  Hit rate: {stats['hit_rate']:.1%}",
    f"  Cache size: {stats['cache_size']}",
]
for line in report_lines:
    print(line)

## 5. Optimization Strategies for Different Scenarios

Let's explore optimization strategies for different use cases.

In [None]:
# Scenario 1: Many small matrices (batch processing)
print("=== Scenario 1: Batch Processing Many Small Matrices ===")

n_matrices = 100
matrix_size = 50
matrices = []
for _ in range(n_matrices):
    matrices.append(create_test_matrix(matrix_size))

# Settings shared by both variants below
ksm_settings = {'n_clusters': 3, 'max_iterations': 50, 'n_restarts': 3}

# Sequential processing: a new KSMLocalSearch object for every matrix
start_time = time.time()
sequential_results = []
for matrix in matrices:
    search = KSMLocalSearch(**ksm_settings)
    result = search.fit(matrix)
    sequential_results.append(result)
sequential_time = time.time() - start_time

# Batch processing: one KSMLocalSearch object reused for all matrices
start_time = time.time()
batch_results = []
shared_search = KSMLocalSearch(**ksm_settings)
for matrix in matrices:
    result = shared_search.fit(matrix)
    batch_results.append(result)
batch_time = time.time() - start_time

# Report both timings and their ratio
print(f"Sequential processing time: {sequential_time:.2f}s")
print(f"Batch processing time: {batch_time:.2f}s")
print(f"Speedup: {sequential_time/batch_time:.2f}x")

# Scenario 2: Very large sparse matrix
print("\n=== Scenario 2: Very Large Sparse Matrix ===")

n_products_large = 5000
sparsity = 0.95  # target sparsity of the random matrix before symmetrization

# Create large sparse matrix
large_sparse_matrix = sparse.random(n_products_large, n_products_large,
                                   density=1-sparsity, format='csr')
large_sparse_matrix = (large_sparse_matrix + large_sparse_matrix.T) / 2

# A + A.T has a stored entry wherever either A or A.T has one, so the
# symmetrized matrix is denser than the target. Report the sparsity that was
# actually obtained, measured from the number of stored entries.
actual_sparsity = 1 - large_sparse_matrix.nnz / (n_products_large ** 2)

# CSR storage = values + column indices + row pointers
sparse_mem_mb = (large_sparse_matrix.data.nbytes + large_sparse_matrix.indices.nbytes + large_sparse_matrix.indptr.nbytes) / 1024 / 1024
# A dense float64 matrix needs 8 bytes per entry
dense_mem_mb = n_products_large * n_products_large * 8 / 1024 / 1024

print(f"Matrix size: {n_products_large} x {n_products_large}")
print(f"Sparsity: {actual_sparsity:.1%} (target before symmetrization: {sparsity:.1%})")
print(f"Memory usage (sparse): {sparse_mem_mb:.2f} MB")
print(f"Memory usage (dense would be): {dense_mem_mb:.2f} MB")

In [None]:
# Scenario 3: Real-time/streaming data
print("=== Scenario 3: Incremental Updates for Streaming Data ===")

# Initial matrix
n_initial = 100
matrix = create_test_matrix(n_initial)

# Initial clustering
search = KSMLocalSearch(n_clusters=4, warm_start=True)
initial_result = search.fit(matrix)

# Simulate incremental updates
n_updates = 10
update_times = []

# Result whose labels seed the next fit; the first update starts from the
# initial clustering rather than from scratch.
result = initial_result

for i in range(n_updates):
    # Add new products (simulate streaming)
    n_new = 10
    n_old = matrix.shape[0]

    # Substitution between the new and the existing products. The same values
    # are used for the new rows and (transposed) for the new columns so the
    # expanded matrix stays symmetric, like the initial one.
    cross = np.random.uniform(0, 0.5, (n_new, n_old))

    # Substitution among the new products: symmetric with a zero diagonal
    new_block = np.random.uniform(0.7, 0.9, (n_new, n_new))
    new_block = (new_block + new_block.T) / 2
    np.fill_diagonal(new_block, 0)

    # Expand matrix to (n_old + n_new) x (n_old + n_new)
    matrix = np.block([[matrix, cross.T], [cross, new_block]])

    # Update clustering with warm start.
    # NOTE(review): the previous labels cover only the first n_old products;
    # confirm that fit() accepts initial_labels shorter than the matrix, or
    # extend the labels for the new products before passing them.
    start_time = time.time()
    result = search.fit(matrix, initial_labels=result.labels)
    update_time = time.time() - start_time
    update_times.append(update_time)

    print(f"Update {i+1}: Matrix size = {matrix.shape[0]}, Time = {update_time:.3f}s")

print(f"\nAverage update time: {np.mean(update_times):.3f}s")

## 6. Performance Tips Summary

Let's put together a performance comparison and a set of recommendations.

In [None]:
# Create performance recommendation matrix: one row of settings per scenario
scenarios = [
    "Small dense matrix (<100 products)",
    "Medium dense matrix (100-1000 products)",
    "Large dense matrix (>1000 products)",
    "Sparse matrix (>90% zeros)",
    "Many small matrices",
    "Real-time updates",
    "High accuracy required",
    "Quick approximation needed"
]

recommendations = [
    {"parallel": "No", "sparse": "No", "cache": "No", "algorithm": "KSMLocalSearch", "restarts": 10},
    {"parallel": "Yes", "sparse": "No", "cache": "Yes", "algorithm": "KSMLocalSearch2", "restarts": 20},
    {"parallel": "Yes", "sparse": "Consider", "cache": "Yes", "algorithm": "KSMLocalSearch2", "restarts": 5},
    {"parallel": "Yes", "sparse": "Yes", "cache": "Yes", "algorithm": "Sparse variant", "restarts": 10},
    {"parallel": "Yes", "sparse": "No", "cache": "Shared", "algorithm": "Batch processing", "restarts": 5},
    {"parallel": "No", "sparse": "No", "cache": "Yes", "algorithm": "Warm start", "restarts": 3},
    {"parallel": "Yes", "sparse": "No", "cache": "Yes", "algorithm": "KSMLocalSearch2", "restarts": 50},
    {"parallel": "No", "sparse": "No", "cache": "No", "algorithm": "KSMLocalSearch", "restarts": 1},
]

df_recommendations = pd.DataFrame(recommendations, index=scenarios)

# Display as formatted table. Column widths are derived from the content:
# fixed widths were shorter than some entries (e.g. the 39-character
# "Medium dense matrix ..." scenario), which pushed those rows out of line.
header_labels = ["Scenario", "Parallel", "Sparse", "Cache", "Algorithm", "Restarts"]
table_rows = [
    [scenario] + [str(row[column]) for column in df_recommendations.columns]
    for scenario, row in df_recommendations.iterrows()
]
col_widths = [
    max(len(cells[position]) for cells in [header_labels] + table_rows)
    for position in range(len(header_labels))
]
# Each of the separators between columns (" | ") adds three characters
table_width = sum(col_widths) + 3 * (len(col_widths) - 1)

print("Performance Optimization Recommendations:")
print("=" * table_width)
print(" | ".join(label.ljust(size) for label, size in zip(header_labels, col_widths)))
print("-" * table_width)

for cells in table_rows:
    print(" | ".join(cell.ljust(size) for cell, size in zip(cells, col_widths)))

In [None]:
# Create a visual performance guide
from matplotlib.patches import Patch

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# --- Top panel: performance vs accuracy trade-off (hard-coded example values) ---
accuracy_levels = np.array([0.8, 0.85, 0.9, 0.95, 0.98, 0.99])
time_required = np.array([0.1, 0.3, 1.0, 3.0, 10.0, 30.0])
restarts_needed = np.array([1, 3, 10, 20, 50, 100])
quality_pct = accuracy_levels * 100

# Time on the left y-axis, restarts on a second y-axis sharing the same x-axis
ax1.plot(quality_pct, time_required, 'o-', color='blue', linewidth=2, markersize=8, label='Time (s)')
ax1_twin = ax1.twinx()
ax1_twin.plot(quality_pct, restarts_needed, 's-', color='red', linewidth=2, markersize=8, label='Restarts')

ax1.set(xlabel='Desired Solution Quality (%)', title='Performance vs. Accuracy Trade-off')
ax1.set_ylabel('Time Required (seconds)', color='blue')
ax1_twin.set_ylabel('Restarts Needed', color='red')
ax1.grid(True, alpha=0.3)
# Colour each y-axis' tick labels like the curve it belongs to
for axis_obj, shade in ((ax1, 'blue'), (ax1_twin, 'red')):
    axis_obj.tick_params(axis='y', labelcolor=shade)

# --- Bottom panel: recommended technique per matrix-size bucket ---
matrix_sizes = ['<50', '50-100', '100-500', '500-1000', '1000-5000', '>5000']
techniques = ['Basic', 'Basic', 'Parallel', 'Parallel+Cache', 'Sparse+Parallel', 'Sparse+Distributed']
colors_map = {'Basic': 'green', 'Parallel': 'blue', 'Parallel+Cache': 'orange',
              'Sparse+Parallel': 'red', 'Sparse+Distributed': 'purple'}
colors = [colors_map[name] for name in techniques]

# One full-width horizontal bar per bucket, coloured by technique
y_pos = np.arange(len(matrix_sizes))
ax2.barh(y_pos, [1]*len(matrix_sizes), color=colors)
ax2.set_yticks(y_pos)
ax2.set_yticklabels(matrix_sizes)
ax2.set(
    xlabel='Recommended Optimization Technique',
    ylabel='Matrix Size (# products)',
    title='Optimization Recommendations by Problem Size',
)

# Write the technique name in the middle of each bar
for row_idx, technique in enumerate(techniques):
    ax2.text(0.5, row_idx, technique, ha='center', va='center', fontweight='bold')

# Legend with one colour patch per distinct technique, placed right of the axes
legend_elements = []
for technique, color in colors_map.items():
    legend_elements.append(Patch(facecolor=color, label=technique))
ax2.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.show()

## 7. Profiling Your Own Code

Here's how to profile your SUBMARIT code to identify bottlenecks:

In [None]:
# Example profiling code
import cProfile
import pstats
from io import StringIO

def profile_clustering():
    """Profiling target: cluster a freshly generated 200-product test matrix.

    Returns:
        Whatever ``KSMLocalSearch.fit`` returns for that matrix.
    """
    test_data = create_test_matrix(200)
    local_search = KSMLocalSearch(n_clusters=4, max_iterations=100, n_restarts=10)
    return local_search.fit(test_data)

# Run profiler. runcall() switches profiling on only for the duration of the
# call and switches it off again even if the function raises, so a failure
# cannot leave the profiler enabled for the rest of the session.
profiler = cProfile.Profile()
result = profiler.runcall(profile_clustering)

# Get statistics: write the report into an in-memory buffer, sorted by
# cumulative time (time spent in a function including its callees)
s = StringIO()
ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative')
ps.print_stats(10)  # Top 10 functions

print("Top 10 time-consuming functions:")
print(s.getvalue()[:2000])  # Print first 2000 characters

## Best Practices Summary

### 1. **Choose the Right Data Structure**
   - Use dense arrays for small and medium matrices (<1000 products)
   - Use sparse matrices when sparsity >90%
   - Consider memory-mapped arrays for very large datasets

### 2. **Optimize Algorithm Parameters**
   - Balance restarts vs. iterations based on time budget
   - Use warm starts for incremental updates
   - Choose initialization method based on data characteristics

### 3. **Leverage Parallelism**
   - Use parallel processing for multiple restarts
   - Batch process multiple small problems
   - Consider distributed computing for very large problems

### 4. **Implement Caching**
   - Cache distance calculations
   - Reuse preprocessing results
   - Share data structures across runs

### 5. **Profile Before Optimizing**
   - Identify actual bottlenecks
   - Measure improvement impact
   - Consider algorithm complexity

### 6. **Memory Management**
   - Use appropriate data types (float32 vs float64)
   - Clear unnecessary variables
   - Monitor memory usage for large problems

### 7. **Algorithm Selection**
   - KSMLocalSearch for general use
   - KSMLocalSearch2 for better convergence
   - Constrained variants when needed
   - Entropy-based for balanced clusters