# Moran Process on Different Graphs - New Database Split


In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

# Set the style for nicer plots
sns.set_theme(style="whitegrid")

In [None]:
import sys
import os

# Import analysis utilities
from analysis_utils import setup_analysis_environment, load_all_data

# Setup environment and load data
# NOTE(review): both helpers come from the project-local analysis_utils module;
# what setup_analysis_environment() changes and what load_all_data() reads are
# not visible from this notebook — check that module before relying on them.
setup_analysis_environment()
data = load_all_data()

# Extract data for use in notebook
# `data` is indexed by string key below, so it is presumably a dict-like of
# DataFrames — TODO confirm against load_all_data().
df_graphs = data['graphs']
df_experiments = data['random_test']  # or change to 'all_experiments' for all data

# Try to import EXPERIMENTS_CSV from main.py
# Only ImportError is handled: if main.py cannot be imported, a hard-coded file
# name is used instead, so EXPERIMENTS_CSV is defined either way.
try:
    from main import EXPERIMENTS_CSV
    print(f"✓ Imported EXPERIMENTS_CSV: {EXPERIMENTS_CSV}")
except ImportError:
    EXPERIMENTS_CSV = 'respiratory_runs.csv'
    print(f"✗ Could not import from main.py, using fallback: {EXPERIMENTS_CSV}")

In [None]:

# Show which columns each table provides, as a quick schema check before the
# two tables are merged further down.
print("Columns of df_experiments: \n", df_experiments.columns)
print("Columns of df_graphs: \n", df_graphs.columns)


In [None]:
# List the distinct values found in the 'category' column of the graph table.
df_graphs['category'].unique()

In [None]:
# Attach the graph metadata to every experiment row. A left join keeps all
# experiment rows, including those whose wl_hash has no match in df_graphs
# (their graph columns are then NaN).
df = pd.merge(
    df_experiments,
    df_graphs,
    on='wl_hash',
    how='left',
    suffixes=('', '_db'),  # If column names clash (like 'n_nodes'), the DB one gets '_db'
)

# Drop exactly the duplicated '_db' copies created by the merge. Filtering on
# the substring '_db' would also discard any genuine column that merely has
# '_db' somewhere in its name.
clashing_columns = [
    f"{col}_db"
    for col in df_graphs.columns
    if col != 'wl_hash' and col in df_experiments.columns
]
df = df.drop(columns=clashing_columns)
df

This code block is kept only as a reference for parts of its design; it will be deleted later:

In [None]:
# # Get list of unique graphs
# unique_graphs = df['wl_hash'].unique()

# for graph_hash in unique_graphs:
#     # Get subset for this graph
#     graph_subset = df[df['wl_hash'] == graph_hash]
#     unique_r_values = graph_subset['r'].unique()
#     n_nodes = graph_subset['n_nodes'].iloc[0]
#     graph_name = graph_subset['graph_name'].iloc[0]
#     for r in unique_r_values:
#         # Filter data for this specific graph and r value
#         subset = graph_subset[graph_subset['r'] == r]
#         fixation_subset = subset[subset['fixation'] == True]
        
#         # Calculate statistics
#         p_fix = subset['fixation'].mean()
#         n_runs = len(subset)
#         n_success = len(fixation_subset)
        
#         # Create figure
#         fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        
#         # Main title with statistics
#         fig.suptitle(f"Graph: {graph_name} | n_nodes: {n_nodes} | r: {r} | Runs: {n_runs} | P(fix): {p_fix:.3f} ({n_success}/{n_runs})", 
#                     fontsize=14, fontweight='bold')
        
#         # Plot 1: All runs histogram
#         if not subset.empty:
#             bins = max(10, min(50, int(subset['steps'].max() - subset['steps'].min()) // 5))
            
#             # Separate data by fixation status
#             fixation_steps = subset[subset['fixation'] == True]['steps']
#             extinction_steps = subset[subset['fixation'] == False]['steps']
            
#             # Plot stacked histogram
#             axes[0].hist([extinction_steps, fixation_steps], 
#                         bins=bins, 
#                         label=['Extinction', 'Fixation'],
#                         color=['gray', 'green'],
#                         alpha=0.7,
#                         stacked=True)
#         median_steps = subset['steps'].median()
#         axes[0].axvline(median_steps, color='red', linestyle='--', alpha=0.8, label=f'Median: {median_steps:.0f}')
#         axes[0].set_title(f"Distribution of Steps (All Runs)")
#         axes[0].set_xlabel("Steps")
#         axes[0].set_ylabel("Frequency")
#         axes[0].legend()
#         axes[0].grid(True, alpha=0.3)
        
#         # Plot 2: Fixation-only histogram
#         if not fixation_subset.empty:
#             bins = max(5, min(30, len(fixation_subset) // 3))
#             axes[1].hist(fixation_subset['steps'], 
#                         bins=bins, 
#                         color='green', 
#                         alpha=0.7, 
#                         edgecolor='black', 
#                         linewidth=0.5)
            
#             median_steps_fixed = fixation_subset['steps'].median()
#             axes[1].axvline(median_steps_fixed, color='red', linestyle='--', alpha=0.8, 
#                            label=f'Median: {median_steps_fixed:.0f}')
#             axes[1].set_title(f"Distribution of Steps (Fixation Events Only)")
#             axes[1].set_xlabel("Steps to Fixation")
#             axes[1].set_ylabel("Frequency")
#             axes[1].legend()
#             axes[1].grid(True, alpha=0.3)
#         else:
#             axes[1].text(0.5, 0.5, "No Fixation Events Observed", 
#                         horizontalalignment='center', verticalalignment='center', 
#                         fontsize=14, transform=axes[1].transAxes)
#             axes[1].set_title("Distribution of Steps (Fixation Events Only)")
#             axes[1].set_xlabel("Steps to Fixation")
#             axes[1].set_ylabel("Frequency")

#         plt.tight_layout()
#         plt.show()

Histogram plots of the steps to fixation

In [None]:
# Get list of unique graphs (kept as a notebook-level name; the loop below
# iterates over groups instead of re-filtering df for every hash)
unique_graphs = df['wl_hash'].unique()

# For every (graph, r) combination draw one figure with two panels:
#   left  - stacked histogram of steps for all runs, split by outcome
#   right - histogram of steps for the runs that ended in fixation
# sort=False visits the graphs in order of first appearance, like unique().
for graph_hash, graph_subset in df.groupby('wl_hash', sort=False):
    # Graph metadata is assumed to be identical on every row of one graph,
    # so the first row is used as the representative value.
    n_nodes = graph_subset['n_nodes'].iloc[0]
    graph_name = graph_subset['graph_name'].iloc[0]

    # groupby sorts its keys by default, so r is visited in ascending order.
    for r, subset in graph_subset.groupby('r'):
        # Runs that ended in fixation
        fixation_subset = subset[subset['fixation'] == True]

        # Summary statistics shown in the figure title
        p_fix = subset['fixation'].mean()  # mean of the fixation flag = fraction of fixating runs
        n_runs = len(subset)
        n_success = len(fixation_subset)

        # --- PLOTTING ---
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Main Title (y > 1 lifts it above the panel titles)
        fig.suptitle(f"Graph: {graph_name} | N: {n_nodes} | r: {r} | Runs: {n_runs} | P(fix): {p_fix:.3f} ({n_success}/{n_runs})",
                     fontsize=14, fontweight='bold', y=1.05)

        # Plot 1: Stacked Histogram (Fixation vs Extinction)
        sns.histplot(data=subset, x='steps', hue='fixation', multiple="stack",
                     ax=axes[0], palette={True: 'green', False: 'gray'}, bins=30)
        median_all = subset['steps'].median()
        axes[0].axvline(median_all, color='red', linestyle='--', alpha=0.8, label=f'Median: {median_all:.0f}')
        axes[0].set_title(f"All Runs (Fixation + Extinction)\nMedian: {median_all:.0f} steps")
        axes[0].set_xlabel("Steps")
        axes[0].set_ylabel("Count")

        # Plot 2: Fixation Only
        if not fixation_subset.empty:
            sns.histplot(data=fixation_subset, x='steps', ax=axes[1], color='green', kde=True, bins=20)
            median_fix = fixation_subset['steps'].median()
            axes[1].axvline(median_fix, color='red', linestyle='--', alpha=0.8, label=f'Median: {median_fix:.0f}')
            axes[1].set_title(f"Successful Fixations Only\nMedian: {median_fix:.0f} steps")
            axes[1].set_xlabel("Steps to Fixation")
            axes[1].set_ylabel("Count")
        else:
            # No fixations: show a centred message instead of an empty plot.
            # transAxes pins (0.5, 0.5) to the middle of the panel regardless
            # of the data limits.
            axes[1].text(0.5, 0.5, "No Fixation Events",
                         horizontalalignment='center', verticalalignment='center',
                         fontsize=12, transform=axes[1].transAxes)
            axes[1].set_title("Successful Fixations Only")
            axes[1].axis('off')  # Hides the empty axes

        plt.tight_layout()
        plt.show()

Probabilities and median steps tables

In [None]:
def _median_steps_by_r(runs):
    """Return the median of 'steps' per graph (rows) and r value (columns).

    Rows are built per (graph_name, n_nodes, wl_hash) so that distinct graphs
    sharing a name stay separate, ordered by the ascending mean of each row,
    and finally shown without the wl_hash index level.
    """
    pivoted = pd.pivot_table(
        runs,
        values='steps',
        index=['graph_name', 'n_nodes', 'wl_hash'],
        columns='r',
        aggfunc='median',
    )
    ordered = (
        pivoted
        .assign(row_mean=pivoted.mean(axis=1))  # temporary sort key
        .sort_values('row_mean')
        .drop(columns='row_mean')
    )
    # The hash was only needed to keep the rows apart; hide it in the output.
    return ordered.droplevel('wl_hash')


# ---------------------------------------------------------
# TABLE 1: Median Steps (Only Successful Fixations)
# ---------------------------------------------------------
fixation_df = df.loc[df['fixation'] == True]
table_med_fixation = _median_steps_by_r(fixation_df)

print("--- Table 1: Median Steps (Only Successful Fixations) [Sorted] ---")
display(table_med_fixation)


# ---------------------------------------------------------
# TABLE 2: Median Steps (All Runs)
# ---------------------------------------------------------
table_med_steps_absorption = _median_steps_by_r(df)

print("\n--- Table 2: Median Steps (All Runs) [Sorted] ---")
display(table_med_steps_absorption)

In [None]:

# ---------------------------------------------------------
# TABLE 3: Fixation Probability (P_fix)
# ---------------------------------------------------------
# One row per distinct graph (the hash keeps same-named graphs apart), one
# column per r. Averaging the fixation flag gives the fraction of runs that
# reached fixation.
table_prob = pd.pivot_table(
    df,
    values='fixation',
    index=['graph_name', 'n_nodes', 'wl_hash'],
    columns='r',
    aggfunc='mean',
)

# Order rows from highest to lowest average probability, then hide the hash
# level, which was only needed to keep the rows separate.
table_prob = (
    table_prob
    .assign(row_mean=table_prob.mean(axis=1))
    .sort_values('row_mean', ascending=False)
    .drop(columns='row_mean')
    .droplevel('wl_hash')
)
print("--- Table 3: Fixation Probability (P_fix) [Sorted by High -> Low] ---")
display(table_prob)

In [None]:
# Aggregation
# Every metric is computed per (graph_name, wl_hash, r): including wl_hash
# keeps distinct graphs apart even when they share a name
# (e.g., multiple "Random" graphs).
group_keys = ['graph_name', 'wl_hash', 'r']

# Metric A: fixation probability, taken over all runs
prob_df = (
    df.groupby(group_keys)['fixation']
    .mean()
    .rename('prob_fixation')
    .reset_index()
)

# Metric B: median number of steps, taken over successful fixations only
success_only_df = df.loc[df['fixation'] == True]
time_df = (
    success_only_df.groupby(group_keys)['steps']
    .median()
    .rename('median_steps')
    .reset_index()
)

# Combine both metrics; matching on the full key pairs each probability with
# the time of the same graph and r.
plot_data = prob_df.merge(time_df, on=group_keys)

# Scatter plot
scatter_kwargs = dict(
    data=plot_data,
    x='median_steps',
    y='prob_fixation',
    hue='graph_name',  # colour encodes the graph name
    style='r',         # marker shape encodes r
    s=150,             # larger markers
    palette='deep',
    alpha=0.9,
)
plt.figure(figsize=(10, 7))
sns.scatterplot(**scatter_kwargs)

# Styling
plt.title('Trade-off: Fixation Probability vs Time', fontsize=15)
plt.xlabel('Median Steps to Fixation (Condition: Success)', fontsize=12)
plt.ylabel('Probability of Fixation', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)

# Legend goes outside the axes so it does not cover any points
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

plt.tight_layout()
plt.show()

# Probability for an Active Step
Here we check whether a larger product $N_{mut} \times N_{wt}$ (number of mutants times number of wild-type individuals) indeed gives a higher probability of an active step, i.e. a step in which the number of mutants changes.


In [None]:
# from population_graph import PopulationGraph
# from process_run import ProcessRun

# N = 100
# r=1
# sim = ProcessRun(PopulationGraph.complete_graph(N=N), selection_coefficient=r)
# sim.initialize_random_mutant(n_mutants=N//2)
# result = sim.run(track_history=True)
            
# history = result['history']
# n_mutants = history[:-1]
# n_wt = N - n_mutants
# interaction_product = n_mutants * n_wt

# is_active = (history[1:] != history[:-1]).astype(int)

# interaction_df = pd.DataFrame({'interaction_product': interaction_product,
#                                'is_active': is_active})

# stats = interaction_df.groupby('interaction_product')['is_active'].mean().reset_index()
# stats.columns = ['N_mutants x N_non_mutants', 'P(change)']

# plt.figure(figsize=(6,8))
# sns.regplot(data=stats, x=stats.columns[0], y=stats.columns[1])
# plt.title(f'Correlation: Group Product vs Activity (N={N}, r={r})')
# plt.ylabel('Probability of State Change P(t+1 ≠ t)')
# plt.xlabel('Interaction Product (N_mut × N_wt)')
# plt.grid(True, alpha=0.3)
# plt.show()

