In [None]:
import sys
import os

# Make the parent of the working directory importable so that the
# `src` package used by the imports below can be resolved.
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if root_dir not in sys.path:
    # Prepend so this checkout takes priority over other entries on the path
    sys.path.insert(0, root_dir)

from src.drp_fall_2025.topology import SimplicialComplex
from src.drp_fall_2025.analysis import compute_betti_numbers, compute_euler_characteristic, plot_betti_distributions

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm import trange

# Global seaborn theme for every figure in this notebook:
# white grid background with the "colorblind" palette.
sns.set_theme(
    style="whitegrid",
    palette="colorblind",
)

# Problem Set 2: Simplicial Homology

## Overview
This notebook demonstrates:
1. Creating random simplicial complexes
2. Computing Euler characteristics
3. Computing Betti numbers over $\mathbb{Z}/2\mathbb{Z}$
4. Statistical analysis of topological features

In [None]:
# Worked example: a single filled triangle.
#
#   vertices: 0, 1, 2
#   edges:    (0,1), (1,2), (0,2)
#   triangle: (0,1,2)
#
# Only the maximal simplex is listed; the constructor is expected to
# generate all of its faces from it.
triangle_complex = SimplicialComplex.from_maximal_simplices([[0, 1, 2]])

print(f"Complex: {triangle_complex}")
print(f"All simplices: {sorted(triangle_complex.simplices)}")
print(f"Euler characteristic: {compute_euler_characteristic(triangle_complex)}")
print(f"Betti numbers: {compute_betti_numbers(triangle_complex)}")

# Blank line followed by the human-readable interpretation of the result
print(
    "",
    "Interpretation:",
    "\tβ₀ = 1: One connected component",
    "\tβ₁ = 0: No holes (the triangle is filled in)",
    sep="\n",
)

## Assignment: Random Complex Generation and Betti Number Analysis

**Task**: Generate 100 random complexes on 10 vertices and analyze their Betti numbers.

We use a bottom-up probabilistic model: each candidate simplex of dimension $k \geq 1$ is added
with probability $p_k = 0.5$, provided all of its boundary faces are already present.

In [None]:
# --- Experiment parameters ---
NUM_VERTICES = 10     # vertices in every random complex
NUM_RUNS = 100        # number of complexes to sample
PROBABILITY = 0.5     # inclusion probability shared by all dimensions
MAX_DIMENSION = 10    # highest dimension that receives an entry in P_DICT
# NOTE(review): a simplex on 10 vertices has dimension at most 9, so the
# dimension-10 entry can never apply -- confirm the generator ignores it.

# Same probability for every dimension 1..MAX_DIMENSION (inclusive)
P_DICT = dict.fromkeys(range(1, MAX_DIMENSION + 1), PROBABILITY)

print(f"We will run this simulation with {NUM_RUNS} runs on {NUM_VERTICES} vertices.")
print(f"Probabilities: p_k = {PROBABILITY} for all dimensions k ∈ [1, {MAX_DIMENSION}]")

# Accumulator for the per-run Betti numbers
results = list()

In [None]:
import random

# Seed both generators so that re-running this cell reproduces the same sample.
# NOTE(review): which RNG from_bottom_up_process draws from is not visible
# here, so both `random` and NumPy's legacy global RNG are seeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Start from an empty list on every execution of this cell. Appending to a
# list created in another cell makes a second run of this cell double the
# sample size while NUM_RUNS still reports the configured value.
results = []

# Sample NUM_RUNS random complexes and record the Betti numbers of each one
for _run in trange(NUM_RUNS, desc="Generating complexes"):
    complex_k = SimplicialComplex.from_bottom_up_process(NUM_VERTICES, P_DICT)
    betti = compute_betti_numbers(complex_k)
    results.append(betti)

In [None]:
# Horizontal rule (60 characters) used to frame section banners
SEPARATOR = "=" * 60

# One row per entry of `results` (presumably one mapping of dimension -> Betti
# number per run -- TODO confirm). Entries missing from a run become 0 before
# the cast to int.
df = (
    pd.DataFrame(results)
    .fillna(0)
    .astype(int)
)

# Per-column statistics, computed once and reused by later cells
df_mean, df_std, df_median = df.mean(), df.std(), df.median()

print("\n" + SEPARATOR, "STATISTICAL SUMMARY", SEPARATOR, sep="\n")

print("\n--- Average Betti Numbers ---", df_mean.to_string(), sep="\n")

print("\n--- Full Statistical Summary ---", df.describe().to_string(), sep="\n")

print("\n--- Additional Statistics ---")
print(f"Standard Deviations: {df_std.to_dict()}")
print(f"Medians: {df_median.to_dict()}")

In [None]:
# Per-column narrative summary of the Betti-number sample.
print("\n--- Automated Insights ---")

for col in sorted(df.columns):
    # Reuse the precomputed per-column statistics rather than recomputing them
    mean_val = df_mean[col]
    std_val = df_std[col]
    median_val = df_median[col]
    nonzero_pct = (df[col] > 0).sum() / len(df) * 100

    # Report the observed minimum. A hard-coded lower bound of 0 is wrong
    # whenever a column never reaches zero in the sample (β₀ of a non-empty
    # complex, for instance, is at least 1).
    min_val = df[col].min()
    max_val = df[col].max()

    print(f"\nβ_{col}:")
    print(f"  • Mean: {mean_val:.2f}, Median: {median_val:.1f}, Std Dev: {std_val:.2f}")
    print(f"  • Range: [{min_val}, {max_val}]")
    print(f"  • Non-zero in {nonzero_pct:.1f}% of complexes")

    # Dimension-specific interpretation; columns other than 0, 1, 2 get none
    if col == 0:
        if mean_val > 1.5:
            print(f"  • Complexes are frequently disconnected (avg {mean_val:.2f} components)")
        elif mean_val < 1.2:
            print("  • Complexes are usually connected")
        else:
            print("  • Complexes sometimes fragment into multiple components")
    elif col == 1:
        if mean_val > 5:
            print(f"  • Many loops form (p={PROBABILITY} creates rich 1-dimensional structure)")
        else:
            print("  • Few loops form")
    elif col == 2:
        if mean_val > 0.5:
            print("  • Voids form occasionally (higher-dimensional structure)")
        else:
            print("  • Voids are rare (as expected with random generation)")

In [None]:
# Thresholds on |r| used to label a pairwise correlation
MODERATE_THRESHOLD = 0.3   # minimum |r| for a pair to be reported at all
STRONG_THRESHOLD = 0.7     # |r| above this is labelled "strong"

if len(df.columns) > 1:
    print("\n" + SEPARATOR)
    print("CORRELATION ANALYSIS")
    print(SEPARATOR)

    # Pairwise Pearson correlations between the Betti-number columns
    correlation_matrix = df.corr()
    print("\nCorrelation Matrix:")
    print(correlation_matrix.to_string())

    print("\n--- Key Correlations ---")

    # Keep only the strict upper triangle (k=1) so each pair appears once and
    # the diagonal of self-correlations is excluded; everything else is NaN.
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    )

    # Flatten to a Series indexed by (row, column) pairs. DataFrame.stack()
    # only discards NaN entries in its legacy implementation, so drop them
    # explicitly to get the same Series on every pandas version.
    strong_corrs = upper_triangle.stack().dropna()

    # Filter for meaningful correlations
    significant_corrs = strong_corrs[strong_corrs.abs() >= MODERATE_THRESHOLD]

    if significant_corrs.empty:
        print(f"No significant correlations (|r| ≥ {MODERATE_THRESHOLD}) found.")
    else:
        for (col1, col2), corr_val in significant_corrs.items():
            direction = "positive" if corr_val > 0 else "negative"
            strength = "strong" if abs(corr_val) > STRONG_THRESHOLD else "moderate"
            print(f"{col1} and {col2}: {strength} {direction} correlation ({corr_val:.3f})")

    # Correlation heatmap; the colour scale is pinned to [-1, 1] and centred
    # on 0 so colours are comparable between runs.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        ax=ax,
        fmt='.3f',
        square=True,
        cbar_kws={'label': 'Pearson Correlation Coefficient'},
        linewidths=0.5,
        linecolor='white',
        vmin=-1,
        vmax=1
    )
    ax.set_title('Correlation between Betti Numbers', fontsize=15, fontweight='bold', pad=20)

    # Format tick labels on the heatmap axes explicitly instead of relying on
    # pyplot's notion of the "current" axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
    plt.setp(ax.get_yticklabels(), rotation=0)

    ax.tick_params(left=False, bottom=False)  # Removes tick marks for a cleaner look

    plt.tight_layout(pad=1.5)
    plt.show()

else:
    print("\n(Only one Betti number column present - skipping correlation analysis)")

In [None]:
# Extreme Cases Analysis: for every Betti number that is non-zero in at least
# one run, show the runs where it attains its maximum and minimum, together
# with the other Betti numbers of those runs.
print("\n" + SEPARATOR)
print("EXTREME CASES")
print(SEPARATOR)

for col in sorted(df.columns):
    # Columns that are zero in every run carry no information here
    if df[col].max() <= 0:
        continue

    other_cols = [c for c in df.columns if c != col]

    # idxmax / idxmin return the label of the first run attaining the extreme
    extremes = (
        ("Maximum", df[col].idxmax()),
        ("Minimum", df[col].idxmin()),
    )

    print(f"\nβ_{col}:")
    for label, run_idx in extremes:
        print(f"  • {label}: {df.loc[run_idx, col]} (in run #{run_idx})")
        if other_cols:
            # Convert NumPy scalars to plain ints: inside a dict they are shown
            # via repr(), which under NumPy >= 2 reads "np.int64(3)", not "3".
            other_bettis = {f"β_{c}": int(df.loc[run_idx, c]) for c in other_cols}
            print(f"    Other Betti numbers: {other_bettis}")

In [None]:
# Plot the Betti-number distributions using the helper imported from
# src.drp_fall_2025.analysis.
# NOTE(review): NUM_RUNS is passed separately from df -- presumably the helper
# expects it to equal len(df); confirm against the helper's signature.
plot_betti_distributions(df, NUM_RUNS)