# Analyse the Results of Running Moran Process Experiment on Different Graphs
This is the newest version of this analysis notebook; it can merge the CSV files produced by different jobs.

imports

In [None]:
import os
# As the last expression of the cell, os.environ is echoed in the cell output,
# i.e. every environment variable of the kernel process is displayed.
# NOTE(review): clear this output before sharing the notebook if the
# environment may contain credentials.
os.environ

In [None]:
%load_ext autoreload
%autoreload 2
%cd /home/labs/pilpel/matanyaw/moran-process 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import numpy as np
import seaborn as sns
import os
from pathlib import Path
from analysis.analysis_utils import plot_hybrid_density, aggregate_results_no_load, GRAPH_PROPERTY_DESCRIPTION, COLOR_DICT
# change this if on a different computer!
from population_graph import GRAPH_PROPS
# Shared plot styling ("publication-quality" defaults) applied to every figure.
sns.set_theme(style="whitegrid", context="notebook", font_scale=1.2)
plt.rcParams.update({
    'figure.figsize': (12, 7),
    'lines.linewidth': 2.5,
})

# Name of the batch folder analysed by this notebook.
BATCH_NAME = 'merged_batch_04'

In [None]:
# Project root is the current working directory of the kernel.
ROOT = Path.cwd()

# All batch paths are resolved relative to ROOT.
BATCH_DIR = ROOT.joinpath("simulation_data", BATCH_NAME)



In [None]:
import glob

# Count the per-job result CSVs (result_job_*.csv) under <batch>/tmp/results.
# All paths are built from BATCH_DIR, the batch directory this notebook defines;
# the lowercase name `batch_dir` is never assigned anywhere in the notebook.
output_file = os.path.join(BATCH_DIR, "temp_full_results.csv")
tmp_results_path = os.path.join(BATCH_DIR, "tmp", "results")
all_files = glob.glob(os.path.join(tmp_results_path, "result_job_*.csv"))
print(f"Found {len(all_files)} files in temp results directory: {tmp_results_path}.")


In [None]:
# Aggregate the per-job results of this batch. The return value is used as the
# path handed to pd.read_csv in the next cell.
# BATCH_DIR is passed because `batch_dir` is never defined in this notebook.
# NOTE(review): BATCH_DIR is a pathlib.Path — confirm aggregate_results_no_load
# accepts path-like objects (wrap in str(...) if it requires a plain string).
results_df_path = aggregate_results_no_load(batch_dir=BATCH_DIR, delete_temp=False)

In [None]:
# Load the aggregated results and report their layout.
results_df = pd.read_csv(results_df_path)
for label, value in (("columns: ", results_df.columns), ("shape: ", results_df.shape)):
    print(label, value)

In [None]:
# Keep the step count only for runs that reached fixation; every other run
# becomes NaN, which the aggregations below skip automatically.
reached_fixation = results_df['fixation'].eq(True)
results_df['steps_success'] = results_df['steps'].where(reached_fixation)

# One row per (wl_hash, r, graph_name): fixation probability plus summary
# statistics of the step count over successful runs only.
analysis_df = results_df.groupby(['wl_hash', 'r', 'graph_name']).agg(
    prob_fixation=('fixation', 'mean'),
    median_steps=('steps_success', 'median'),
    mean_steps=('steps_success', 'mean'),
    std_steps=('steps_success', 'std'),
    q25_steps=('steps_success', lambda x: x.quantile(0.25)),
    q75_steps=('steps_success', lambda x: x.quantile(0.75)),
    iqr_steps=('steps_success', lambda x: x.quantile(0.75) - x.quantile(0.25)),
    n_grouped=('fixation', 'size')
).reset_index()

print("Shape before merging: ", analysis_df.shape)

# Graph metadata for this batch. The path is built from BATCH_DIR: the
# lowercase `batch_dir` is never defined in this notebook.
df_graphs = pd.read_csv(os.path.join(BATCH_DIR, 'graph_props.csv'))

# Left join keeps every aggregated row even when no metadata row matches;
# clashing metadata columns get the '_db' suffix.
analysis_df = pd.merge(
    analysis_df,
    df_graphs,
    on=['wl_hash', 'graph_name'],
    how='left',
    suffixes=('', '_db')
)

# Put 'Random' rows first (z_order 0) and every other category after them.
analysis_df['z_order'] = (analysis_df['category'] != 'Random').astype(int)
analysis_df = analysis_df.sort_values('z_order').drop(columns='z_order')
analysis_df.to_csv(os.path.join(BATCH_DIR, 'graph_statistics.csv'), index=False)


print("Shape after merging: ", analysis_df.shape)
# Display sample
analysis_df.tail(20)

In [None]:
# Histogram of the per-group mean number of steps (50 bins).
mean_fig, mean_ax = plt.subplots(figsize=(10, 6))
mean_ax.hist(analysis_df['mean_steps'], bins=50, edgecolor='black', alpha=0.7)
mean_ax.set_xlabel('Mean Steps')
mean_ax.set_ylabel('Frequency')
mean_ax.set_title('Distribution of Mean Steps')
mean_ax.grid(axis='y', alpha=0.3)
plt.show()

In [None]:
# Attach graph metadata (df_graphs) to every individual run in results_df.
# Graphs whose category is 'Random' are removed before the join, and the inner
# join then drops any run without a matching metadata row.
non_random_graphs = df_graphs.loc[df_graphs['category'] != 'Random']
merged_df = results_df.merge(
    non_random_graphs,
    on=['wl_hash', 'graph_name'],
    how='inner',
)

print(f"Merged dataframe shape: {merged_df.shape}")
merged_df.columns


In [None]:
from analysis.analysis_utils import generate_robust_color_dict
# Sorted list of the distinct, non-missing categories in analysis_df.
categories = sorted(set(analysis_df['category'].dropna()))

# Category -> colour mapping used by every plot below; COLOR_DICT is handed to
# the helper together with the analysis table.
category_color_dict = generate_robust_color_dict(analysis_df, COLOR_DICT)

print("Final Color Dictionary:")
for category_name, category_color in category_color_dict.items():
    print(f"  {category_name:15}: {category_color}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 1. Set up the 3D figure
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')


# 2. Shared bin edges spanning the global min/max so categories are comparable
animal_results = merged_df.loc[merged_df['category'].isin(categories), 'steps_success'].dropna()
bins = np.linspace(animal_results.min(), animal_results.max(), 50)
bin_width = bins[1] - bins[0]
# ax.bar centres each bar on its x coordinate (align='center' by default), so
# the bars must be placed at the bin centres; using the left edges would draw
# every bar half a bin too far to the left.
bin_centers = (bins[:-1] + bins[1:]) / 2

spacing_factor = 2.5

# 3. Plot each category at its own depth along the y axis
for z_index, category in enumerate(categories):
    data = merged_df.loc[merged_df['category'] == category, 'steps_success'].dropna()

    # Raw histogram counts on the shared bins
    counts, _ = np.histogram(data, bins=bins)

    # Convert counts to a percentage of this category's runs; an empty
    # category keeps its (all-zero) counts to avoid dividing by zero.
    if len(data) > 0:
        hist_values = (counts / len(data)) * 100
    else:
        hist_values = counts

    current_depth = z_index * spacing_factor

    ax.bar(bin_centers,
           hist_values,
           zs=current_depth,
           zdir='y',
           width=bin_width * 0.9,
           color=category_color_dict[category],
           alpha=0.8,
           edgecolor='black',
           linewidth=0.5)

# 4. Clean up the axes
ax.set_xlabel('Steps to Fixation', labelpad=10)
ax.set_zlabel('Percentage (%)', labelpad=10)

# One tick per category, at the depth where its bars were drawn
ax.set_yticks([i * spacing_factor for i in range(len(categories))])
ax.set_yticklabels(categories, rotation=-15, ha='left', va='center')

ax.set_box_aspect(aspect=(1, 2, 1))
ax.view_init(elev=25, azim=-55)

plt.title('3D Distribution of Steps to Fixation (Normalized)', pad=20)
plt.subplots_adjust(left=0.05, right=0.85, bottom=0.1, top=0.95)

plt.show()

In [None]:
# Overlaid histograms of steps_success, one per category, on shared bins.
animal_categories = ["Mammalian", "Fish", "Avian"]

overlay_fig, overlay_ax = plt.subplots(figsize=(10, 6))

# Common bin edges (100 bins) covering every category in `categories`.
in_known_category = merged_df['category'].isin(categories)
animal_results = merged_df.loc[in_known_category, 'steps_success'].dropna()
bins = np.linspace(animal_results.min(), animal_results.max(), 101)

for category in categories:
    data = merged_df.loc[merged_df['category'] == category, 'steps_success'].dropna()
    overlay_ax.hist(
        data,
        bins=bins,
        alpha=0.4,
        edgecolor='black',
        label=category,
        color=category_color_dict[category],
    )

overlay_ax.set_xlabel('Steps')
overlay_ax.set_ylabel('Frequency')
overlay_ax.set_title('Distribution of Steps to Fixation by Category')
overlay_ax.legend()
overlay_ax.grid(axis='y', alpha=0.3)
overlay_fig.tight_layout()
plt.show()

In [None]:
# Hybrid density plot of 'std_steps' and 'mean_steps' over all graphs, with the
# animal categories highlighted.
plot_hybrid_density(
    analysis_df, 'std_steps', 'mean_steps',
    with_violin=False,
    color_dict=category_color_dict,
    highlight_categories=animal_categories,
)


In [None]:
# Hybrid density plot of 'max_degree' and 'prob_fixation', violin variant enabled.
plot_hybrid_density(
    analysis_df,
    'max_degree',
    'prob_fixation',
    with_violin=True,
    color_dict=category_color_dict,
    highlight_categories=animal_categories,
)


In [None]:
# Subset of the analysis table for a single value of r (1.1), used by the
# per-property plots further down.
# np.isclose replaces `== 1.1` so rows are still selected if r carries
# floating-point round-off (e.g. when it was produced by arithmetic).
df_to_plot = analysis_df[np.isclose(analysis_df['r'], 1.1)]

print(GRAPH_PROPS)

# NOTE(review): these two plots use the full analysis_df (all r values), not
# df_to_plot — confirm that is intended.
plot_hybrid_density(analysis_df, 'prob_fixation', 'mean_steps', with_violin=False, color_dict=category_color_dict, highlight_categories=animal_categories)
plot_hybrid_density(analysis_df, 'mean_steps', 'prob_fixation', with_violin=False, color_dict=category_color_dict, highlight_categories=animal_categories)


# Showing all plots on Mean Steps

In [None]:

# One hybrid density plot per graph property, each against 'mean_steps'.
mean_steps_plot_kwargs = dict(
    density_threshold=50,
    with_violin=True,
    color_dict=category_color_dict,
    highlight_categories=animal_categories,
)
for prop in GRAPH_PROPS:
    plot_hybrid_density(df_to_plot, prop, 'mean_steps', **mean_steps_plot_kwargs)


# Showing all plots on Probability

In [None]:

# One hybrid density plot per graph property, each against 'prob_fixation'.
prob_plot_kwargs = dict(
    density_threshold=50,
    with_violin=True,
    color_dict=category_color_dict,
    highlight_categories=animal_categories,
)
for prop in GRAPH_PROPS:
    plot_hybrid_density(df_to_plot, prop, 'prob_fixation', **prob_plot_kwargs)


In [None]:
# Hybrid density plot of degree_std and mean_steps (no highlighted categories).
plot_hybrid_density(
    df_to_plot, 'degree_std', 'mean_steps',
    density_threshold=50,
    with_violin=False,
    color_dict=category_color_dict,
)

# The same two columns as a hexbin count map; cells with no points stay blank
# because mincnt=1.
hex_fig, hex_ax = plt.subplots(figsize=(10, 8))
hex_cells = hex_ax.hexbin(
    df_to_plot['degree_std'],
    df_to_plot['mean_steps'],
    gridsize=20,
    cmap='YlOrRd',
    mincnt=1,
)
hex_ax.set_xlabel('degree_std')
hex_ax.set_ylabel('mean_steps')
hex_fig.colorbar(hex_cells, ax=hex_ax, label='count')
hex_ax.set_title('Hexbin plot: degree_std vs mean_steps')
plt.show()

# Let's compare the extreme graphs to the predictions of the LR and XGBoost models

In [None]:
import joblib
import pandas as pd
import numpy as np
from pathlib import Path

# --- Constants & Paths ---
# NOTE(review): absolute path into a different batch (batch_large_test_30_02),
# not BATCH_DIR — confirm these are the models meant for this analysis.
ML_MODELS_DIR = Path('/home/labs/pilpel/matanyaw/moran-process/simulation_data/batch_large_test_30_02/ml_models')

# Display labels, also used as keys of `models`.
TIME_LR_MODEL = "LR Fixation Time"
TIME_XGBOOST_MODEL = "XGBOOST Fixation Time"
PROB_LR_MODEL = "LR Fixation Probability"
PROB_XGBOOST_MODEL = "XGBOOST Fixation Probability"

# --- 1. Load Models & Features ---
# Label -> file name inside ML_MODELS_DIR. joblib.load unpickles the file, so
# only load model files from a trusted location.
model_files = {
    TIME_LR_MODEL: 'mean_steps_linear_regression_pipeline.joblib',
    TIME_XGBOOST_MODEL: 'mean_steps_xgboost_model.joblib',
    PROB_LR_MODEL: 'prob_fixation_linear_regression_pipeline.joblib',
    PROB_XGBOOST_MODEL: 'prob_fixation_xgboost_model.joblib',
}
models = {
    label: joblib.load(ML_MODELS_DIR / file_name)
    for label, file_name in model_files.items()
}

# Feature list is read from the LR time model only; this assumes all four
# models were fitted on the same features.
expected_features = list(models[TIME_LR_MODEL].feature_names_in_)


In [None]:

# --- 2. Load Graphs & Calculate Properties ---
graph_zoo_path = BATCH_DIR / "tmp" / "graph_zoo.joblib"
all_graphs = joblib.load(graph_zoo_path)

# Graphs whose category string contains "LR" / "XGBOOST".
LR_graphs = [graph for graph in all_graphs if "LR" in graph.category]
XGBOOST_graphs = [graph for graph in all_graphs if "XGBOOST" in graph.category]

print(f"Total graphs loaded: {len(all_graphs)}")

# One properties record per graph.
# Assumes calculate_graph_properties() returns a dict that includes 'wl_hash'
# (the later merge joins on that key).
all_graphs_props = [graph.calculate_graph_properties() for graph in all_graphs]

# LR_props_df = pd.DataFrame([g.calculate_graph_properties() for g in LR_graphs])
# XGBOOST_props_df = pd.DataFrame([g.calculate_graph_properties() for g in XGBOOST_graphs])


In [None]:

# --- 3. Extract True Targets ---
# Only the join key, the two targets and the category are taken from analysis_df.
target_columns = ['wl_hash', 'mean_steps', 'prob_fixation', 'category']
targets_df = analysis_df[target_columns]

# --- 4. Merge Features and Targets ---
# Inner join on wl_hash: graphs without simulation results are dropped.
# NOTE(review): analysis_df is grouped by (wl_hash, r, graph_name), so a graph
# with several r values or names contributes several rows here — confirm.
graph_props_df = pd.DataFrame(all_graphs_props)
all_graphs_merged = pd.merge(graph_props_df, targets_df, on='wl_hash', how='inner')

# LR_merged = LR_props_df.merge(targets_df, on='wl_hash', how='inner')
# XGBOOST_merged = XGBOOST_props_df.merge(targets_df, on='wl_hash', how='inner')



In [None]:


# --- 5. Clean Missing Values (Required for LR) ---
# A row is dropped when ANY model feature or either target is NaN.
cols_to_check = [*expected_features, 'mean_steps', 'prob_fixation']

all_graphs_clean = all_graphs_merged.dropna(subset=cols_to_check)

n_dropped = len(all_graphs_merged) - len(all_graphs_clean)
print(f"Total graphs ready for prediction: {len(all_graphs_clean)} (Dropped {n_dropped} NaNs)")
# print(f"LR graphs ready for prediction: {len(LR_clean)} (Dropped {len(LR_merged) - len(LR_clean)} NaNs)")
# print(f"XGBOOST graphs ready for prediction: {len(XGBOOST_clean)} (Dropped {len(XGBOOST_merged) - len(XGBOOST_clean)} NaNs)")


In [None]:

# --- 6. Split into X (Features) and y (True Targets) ---
X = all_graphs_clean.loc[:, expected_features]

y_time_true = all_graphs_clean['mean_steps']
y_prob_true = all_graphs_clean['prob_fixation']

# --- 7. Predict ---
# All four models are evaluated on the same feature matrix X.
lr_time_model, lr_prob_model = models[TIME_LR_MODEL], models[PROB_LR_MODEL]
LR_pred_time = lr_time_model.predict(X)
LR_pred_prob = lr_prob_model.predict(X)

xgb_time_model, xgb_prob_model = models[TIME_XGBOOST_MODEL], models[PROB_XGBOOST_MODEL]
XGBOOST_pred_time = xgb_time_model.predict(X)
XGBOOST_pred_prob = xgb_prob_model.predict(X)
# LR_pred_time = models[TIME_LR_MODEL].predict(LR_X)
# LR_pred_prob = models[PROB_LR_MODEL].predict(LR_X)

# XGBOOST_pred_time = models[TIME_XGBOOST_MODEL].predict(XGBOOST_X)
# XGBOOST_pred_prob = models[PROB_XGBOOST_MODEL].predict(XGBOOST_X)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import matplotlib.lines as mlines
import numpy as np

# --- 8. Prepare Data for Plotting ---
# Copy of the clean table with one extra column per model prediction.
plot_df = all_graphs_clean.assign(
    LR_pred_time=LR_pred_time,
    XGBOOST_pred_time=XGBOOST_pred_time,
    LR_pred_prob=LR_pred_prob,
    XGBOOST_pred_prob=XGBOOST_pred_prob,
)

# Take the category straight from the loaded graph objects, keyed by wl_hash,
# so the result does not depend on any category_x / category_y columns the
# merge may have produced.
hash_to_cat = {graph.wl_hash: graph.category for graph in all_graphs}
plot_df['category'] = plot_df['wl_hash'].map(hash_to_cat)

# --- 9. Plot Setup ---
# 2x2 grid: rows = target (time, probability), columns = model (LR, XGBoost).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 14))
fig.suptitle('Model Performance: Predicted vs. True Values', fontsize=20, y=0.97)

def plot_pred_vs_true(ax, df, true_col, pred_col, title, color_dict):
    """Draw one predicted-vs-true scatter panel on ``ax``.

    Args:
        ax: Matplotlib axes the panel is drawn on.
        df: DataFrame containing ``true_col``, ``pred_col`` and a
            ``'category'`` column used for colouring.
        true_col: Name of the column with the true values (x axis).
        pred_col: Name of the column with the predictions (y axis).
        title: Title of the panel.
        color_dict: Category -> colour mapping passed to seaborn as palette.

    Returns:
        None. The axes are modified in place.
    """
    # Points coloured by category; the per-panel legend is disabled.
    sns.scatterplot(
        data=df,
        x=true_col,
        y=pred_col,
        hue='category',
        palette=color_dict,
        alpha=0.7,
        edgecolor='w',
        s=60,
        ax=ax,
        legend=False,
    )

    observed = df[true_col]
    predicted = df[pred_col]
    r2 = r2_score(observed, predicted)

    # Dashed y = x reference line, extended 5% beyond the joint data range.
    lowest = min(observed.min(), predicted.min())
    highest = max(observed.max(), predicted.max())
    margin = (highest - lowest) * 0.05
    diagonal = [lowest - margin, highest + margin]
    ax.plot(diagonal, diagonal, color='black', linestyle='--', alpha=0.6)

    # Both axis labels are derived from the true column's name.
    quantity = true_col.replace("_", " ").title()
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(f'True {quantity}', fontsize=12)
    ax.set_ylabel(f'Predicted {quantity}', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.3)

    # R^2 annotation in the top-left corner (axes coordinates).
    box_style = dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.9, edgecolor='gray')
    ax.text(
        0.05, 0.95,
        f'$R^2$ = {r2:.3f}',
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='top',
        bbox=box_style,
    )

# --- 10. Generate Subplots ---
# (axes, true column, prediction column, panel title) for each of the 4 panels.
panel_specs = [
    (axes[0, 0], 'mean_steps', 'LR_pred_time', 'Linear Regression: Fixation Time'),
    (axes[0, 1], 'mean_steps', 'XGBOOST_pred_time', 'XGBoost: Fixation Time'),
    (axes[1, 0], 'prob_fixation', 'LR_pred_prob', 'Linear Regression: Fixation Probability'),
    (axes[1, 1], 'prob_fixation', 'XGBOOST_pred_prob', 'XGBoost: Fixation Probability'),
]
for panel_ax, panel_true, panel_pred, panel_title in panel_specs:
    plot_pred_vs_true(panel_ax, plot_df, panel_true, panel_pred, panel_title, category_color_dict)

# --- 11. Create a Single Master Legend ---
# Only categories that occur in plot_df and have a colour get a legend entry.
present_categories = sorted(plot_df['category'].dropna().unique())

legend_handles = []
for cat in present_categories:
    if cat not in category_color_dict:
        continue
    legend_handles.append(
        mlines.Line2D([], [], marker='o', color='w',
                      markerfacecolor=category_color_dict[cat],
                      markersize=10, label=cat)
    )

# Extra entry explaining the dashed diagonal.
legend_handles.append(
    mlines.Line2D([], [], color='black', linestyle='--', alpha=0.6, label='Ideal (y=x)')
)

# Figure-level legend, anchored to the right of the panels.
fig.legend(
    handles=legend_handles,
    title='Graph Category',
    loc='center left',
    bbox_to_anchor=(0.84, 0.5),
    fontsize=12,
    title_fontsize=14,
)

# --- 12. Adjust Layout Margins ---
# Panels stop at 82% of the figure width, leaving the right strip for the legend.
plt.subplots_adjust(left=0.06, right=0.82, top=0.92, bottom=0.08, wspace=0.15, hspace=0.25)

plt.show()