# Feature Extraction and Analysis

This notebook demonstrates MyceliumFractalNet's feature extraction capabilities, showing how to:
- Extract 18 standardized fractal features
- Analyze feature distributions
- Understand feature correlations
- Use features for downstream ML tasks

In [None]:
# Install required packages if running in Colab
import sys
if 'google.colab' in sys.modules:
    !pip install -q mycelium-fractal-net matplotlib numpy pandas seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mycelium_fractal_net import (
    make_simulation_config_demo,
    run_mycelium_simulation_with_history,
    compute_fractal_features,
)

print("✓ Imports successful")

## 1. Extract Features from a Simulation

Run a simulation and extract all 18 standardized features.

In [None]:
# Simulate once using the demo configuration.
config = make_simulation_config_demo()
result = run_mycelium_simulation_with_history(config)

# Turn the simulation result into a name -> value feature mapping.
features = compute_fractal_features(result)

# Report each feature on its own aligned row, followed by the total count.
separator = "=" * 60
print("Extracted Features:")
print(separator)
for feat_name, feat_value in features.items():
    print(f"  {feat_name:20s}: {feat_value:12.6f}")

print(f"\nTotal features: {len(features)}")

## 2. Feature Categories

The 18 features are organized into 4 categories:
- **Geometric** (4): D_box, f_active, edge_density, cluster_coeff
- **Statistical** (6): V_mean, V_std, V_skew, V_kurt, entropy, hurst
- **Temporal** (4): dV_dt_mean, dV_dt_std, autocorr_lag1, persistence
- **Structural** (4): gradient_mean, gradient_std, laplacian_mean, laplacian_std

In [None]:
# Group the feature names under the four category headings used in this notebook.
feature_categories = {
    'Geometric': ['D_box', 'f_active', 'edge_density', 'cluster_coeff'],
    'Statistical': ['V_mean', 'V_std', 'V_skew', 'V_kurt', 'entropy', 'hurst'],
    'Temporal': ['dV_dt_mean', 'dV_dt_std', 'autocorr_lag1', 'persistence'],
    'Structural': ['gradient_mean', 'gradient_std', 'laplacian_mean', 'laplacian_std']
}

# Print one section per category.
rule = "-" * 50
for group, members in feature_categories.items():
    print(f"\n{group} Features:")
    print(rule)
    for member in members:
        if member not in features:
            continue  # names absent from the extracted features are skipped silently
        print(f"  {member:20s}: {features[member]:12.6f}")

## 3. Feature Distribution Analysis

Generate multiple simulations to analyze feature distributions.

In [None]:
# Sweep over seeds, collecting one feature record per simulation run.
n_simulations = 50
all_features = []

print(f"Running {n_simulations} simulations...")
for seed in range(n_simulations):
    # Start from a fresh demo config each time; only seed and step count are overridden.
    config_temp = make_simulation_config_demo()
    config_temp.seed = seed
    config_temp.steps = 50  # Shorter for speed
    all_features.append(
        compute_fractal_features(run_mycelium_simulation_with_history(config_temp))
    )

    # Progress line after every tenth run.
    completed = seed + 1
    if completed % 10 == 0:
        print(f"  Completed {completed}/{n_simulations}")

# Tabulate the collected records for analysis.
df_features = pd.DataFrame(all_features)

print(f"\n✓ Generated {len(df_features)} feature vectors")
print(f"  Shape: {df_features.shape}")
print(f"\nFeature Statistics:")
print(df_features.describe())

## 4. Visualize Feature Distributions

In [None]:
# Histogram a handful of key features, one panel each, with the mean marked.
key_features = ['D_box', 'V_mean', 'entropy', 'gradient_mean']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, feature_name in zip(axes, key_features):
    if feature_name not in df_features.columns:
        continue  # leave the panel empty when the feature is missing
    column = df_features[feature_name]
    column_mean = column.mean()
    column.hist(bins=20, ax=ax, alpha=0.7, edgecolor='black')
    ax.axvline(column_mean, color='red', linestyle='--',
               label=f'Mean: {column_mean:.3f}')
    ax.set(xlabel=feature_name, ylabel='Frequency',
           title=f'Distribution of {feature_name}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Feature Correlation Analysis

Analyze correlations between different features.

In [None]:
# Pairwise correlations between all feature columns.
correlation_matrix = df_features.corr()

# Draw the matrix as a diverging heatmap centred on zero.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

# Collect each unordered pair (upper triangle only) whose |r| exceeds 0.7.
columns = correlation_matrix.columns
n_columns = len(columns)
high_corr_pairs = [
    (columns[row], columns[col], correlation_matrix.iloc[row, col])
    for row in range(n_columns)
    for col in range(row + 1, n_columns)
    if abs(correlation_matrix.iloc[row, col]) > 0.7  # Threshold for high correlation
]

# List the pairs from strongest to weakest absolute correlation.
print("\nHighly Correlated Feature Pairs (|r| > 0.7):")
ranked_pairs = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
for feat1, feat2, corr in ranked_pairs:
    print(f"  {feat1:20s} <-> {feat2:20s}: r = {corr:6.3f}")

## 6. Principal Component Analysis

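Use PCA to find the directions of greatest variance in the standardized feature space and to see how many components are needed to capture most of that variance.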

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize features so every column enters PCA on a comparable scale
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df_features)

# Apply PCA; with no n_components given, scikit-learn keeps
# min(n_samples, n_features) components
pca = PCA()
features_pca = pca.fit_transform(features_scaled)

explained = pca.explained_variance_ratio_
cumsum = np.cumsum(explained)
component_ids = range(1, len(explained) + 1)

# Plot explained variance
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: explained variance by component
axes[0].bar(component_ids, explained, alpha=0.7, edgecolor='black')
axes[0].set_xlabel('Principal Component')
axes[0].set_ylabel('Explained Variance Ratio')
axes[0].set_title('Explained Variance by Component')
axes[0].grid(True, alpha=0.3)

# Right panel: cumulative explained variance with a 95% reference line
axes[1].plot(component_ids, cumsum, marker='o', linewidth=2)
axes[1].axhline(y=0.95, color='r', linestyle='--', label='95% variance')
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Cumulative Explained Variance')
axes[1].set_title('Cumulative Explained Variance')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find number of components for 95% variance
# (argmax returns the first index where the cumulative ratio reaches 0.95)
n_components_95 = np.argmax(cumsum >= 0.95) + 1
print(f"\nNumber of components for 95% variance: {n_components_95}/{len(df_features.columns)}")

# Clamp the summary to the components that exist so a small feature set
# cannot raise IndexError; with three or more components this prints "Top 3".
n_top = min(3, len(cumsum))
print(f"Top {n_top} components explain {cumsum[n_top - 1]:.1%} of variance")

## 7. Feature Importance for Classification

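Use a simple classifier to identify the most discriminative features. The labels here are synthetic (a median split on `D_box`), so the importances show which of the other features best track fractal dimension.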

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Synthetic binary target: is a run's D_box above the median D_box?
# (High complexity vs Low complexity patterns)
threshold = df_features['D_box'].median()
labels = (df_features['D_box'] > threshold).astype(int)

print(f"Created binary labels based on D_box threshold: {threshold:.3f}")
print(f"  Class 0 (low complexity): {(labels == 0).sum()} samples")
print(f"  Class 1 (high complexity): {(labels == 1).sum()} samples")

# Predictors are every feature except D_box, since the labels are derived from it.
predictors = df_features.drop('D_box', axis=1)

# Fit the forest with a fixed random_state.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(predictors, labels)

# Pair each predictor with its importance score, highest first.
feature_importance = pd.DataFrame({
    'feature': predictors.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features, best at the top.
top_features = feature_importance.head(10)
positions = range(len(top_features))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(positions, top_features['importance'], alpha=0.7, edgecolor='black')
ax.set_yticks(positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 10 Features for Pattern Complexity Classification')
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("\nTop 5 Most Important Features:")
top_five = feature_importance.head(5)
for feat, score in zip(top_five['feature'], top_five['importance']):
    print(f"  {feat:20s}: {score:.4f}")

## 8. Feature Vector for ML Pipeline

Show how to prepare features for downstream machine learning tasks.

In [None]:
# Convert the feature table to a plain (n_samples, n_features) numpy array.
# These are still the raw values; standardization happens further down.
feature_array = df_features.to_numpy()
n_samples, n_features = feature_array.shape

print("Feature Array Shape:", feature_array.shape)
print(f"  {n_samples} samples")
print(f"  {n_features} features")

# Per-feature statistics of the raw values (first five columns shown).
print("\nFeature Vector Statistics:")
print(f"  Mean: {feature_array.mean(axis=0)[:5]}...")
print(f"  Std:  {feature_array.std(axis=0)[:5]}...")

# Example: Normalize for ML (zero mean, unit variance per column)
scaler = StandardScaler()
feature_array_normalized = scaler.fit_transform(feature_array)

print("\nNormalized Feature Vector:")
print(f"  Mean (should be ~0): {feature_array_normalized.mean(axis=0)[:5]}...")
print(f"  Std (should be ~1):  {feature_array_normalized.std(axis=0)[:5]}...")

print("\n✓ Features ready for ML pipeline")
print("  - Can be used for classification, regression, clustering")
# Report the real column count instead of a hard-coded 18 so the message
# stays accurate if the extracted feature set differs.
print(f"  - Standard shape: (n_samples, {n_features})")
print("  - All features are numerical and well-scaled")

## Summary

This notebook demonstrated:
- ✓ Extraction of 18 standardized fractal features
- ✓ Feature organization by category (Geometric, Statistical, Temporal, Structural)
- ✓ Distribution analysis across multiple simulations
- ✓ Feature correlation analysis
- ✓ Dimensionality reduction with PCA
- ✓ Feature importance for classification tasks
- ✓ Preparation for ML pipelines

For more exploration:
- `01_field_simulation.ipynb` - Field simulation basics
- `03_fractal_exploration.ipynb` - Deep dive into fractal dimensions