In [17]:
import os
from pathlib import Path

try:
    from IPython import get_ipython
    ipynb_path = Path(get_ipython().run_line_magic('pwd', '')).resolve()
except Exception:
    ipynb_path = Path.cwd().resolve()

os.chdir(ipynb_path)
print("Working directory set to:", os.getcwd())

Working directory set to: /Users/wynne/Dropbox/BioGeoFormer/scripts/step_8_evaluating_models


In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Base directory where your split_X folders live
base_dir = "../../filtered_test_set/test_filtered_all/"


# Define splits you want to process (adjust if needed)
splits = ["20", "30", "40", "50", "60", "70", "80", "90"]

# Define confidence bins
bins = np.arange(0, 1.05, 0.05)
labels = [f"{round(b,2)}-{round(b+0.05,2)}" for b in bins[:-1]]

def process_split(split):
    """Read all filtered CSVs for one split, compute precision, and plot."""
    split_dir = os.path.join(base_dir, f"split_{split}")
    if not os.path.exists(split_dir):
        print(f"Skipping {split} (folder not found)")
        return
    
    dfs = []
    for sim in [20, 30, 40, 50, 60, 70, 80, 90]:
        file = os.path.join(split_dir, f"test{split}_filtered_{sim}.csv")
        if os.path.exists(file):
            df = pd.read_csv(file)
            df["Filter_Similarity"] = sim
            dfs.append(df)
    if not dfs:
        print(f"No files found for split {split}")
        return
    
    # Combine
    all_df = pd.concat(dfs, ignore_index=True)

    # Assign confidence bins
    all_df["conf_bin"] = pd.cut(all_df["confidence"], bins=bins, labels=labels, include_lowest=True)

    # Compute precision per bin
    precision_df = (
        all_df
        .groupby(["Filter_Similarity", "conf_bin"], observed=True)
        .apply(lambda g: pd.Series({
            "n": len(g),
            "precision": (g["prediction"] == g["cycle"]).mean() if len(g) > 0 else np.nan
        }))
        .reset_index()
    )

    # Plot
    sns.set_style("whitegrid")
    sns.set_context("talk", font_scale=1.1)

    g = sns.FacetGrid(
        precision_df.dropna(subset=["precision"]),
        col="Filter_Similarity", col_wrap=3, sharey=True, height=4, aspect=1.3
    )
    g.map_dataframe(sns.lineplot, x="conf_bin", y="precision", marker="o")

    # Add n annotations
    for ax, (sim, subdf) in zip(g.axes.flat, precision_df.groupby("Filter_Similarity")):
        for _, row in subdf.iterrows():
            if not pd.isna(row["precision"]) and row["n"] > 0:
                ax.text(
                    row["conf_bin"], row["precision"] + 0.02,
                    f"n={int(row['n'])}",
                    ha="center", va="bottom", fontsize=8, rotation=90
                )
    
    # Rotate x labels
    for ax in g.axes.flat:
        for label in ax.get_xticklabels():
            label.set_rotation(90)

    g.set_axis_labels("Confidence bin", "Precision")
    g.set_titles("Filtered ≤ {col_name}%")
    plt.tight_layout()

    # Save figure
    fig_dir = "../../results/figures"
    out_file = os.path.join(fig_dir, f"precision_confidence_split{split}.png")
    plt.savefig(out_file, dpi=600, bbox_inches="tight")
    plt.close()
    print(f"Saved plot: {out_file}")

# Run for all splits
for split in splits:
    process_split(split)


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split20.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split30.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split40.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split50.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split60.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split70.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split80.png


  .apply(lambda g: pd.Series({


Saved plot: ../../results/figures/precision_confidence_split90.png


In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import auc

base_dir = "../../filtered_test_set/test_filtered_all/"

splits = ["20", "30", "40", "50", "60", "70", "80", "90"]

# Define confidence bins
bins = np.arange(0, 1.05, 0.05)
labels = [f"{round(b,2)}-{round(b+0.05,2)}" for b in bins[:-1]]
bin_midpoints = bins[:-1] + 0.025  # center points for AUC

# Collect AUCs here
all_auc_results = []

def process_split(split):
    """Read all filtered CSVs for one split, compute precision, AUC, and plot."""
    split_dir = os.path.join(base_dir, f"split_{split}")
    if not os.path.exists(split_dir):
        print(f"Skipping {split} (folder not found)")
        return
    
    dfs = []
    for sim in [20, 30, 40, 50, 60, 70, 80, 90]:
        file = os.path.join(split_dir, f"test{split}_filtered_{sim}.csv")
        if os.path.exists(file):
            df = pd.read_csv(file)
            df["Filter_Similarity"] = sim
            dfs.append(df)
    if not dfs:
        print(f"No files found for split {split}")
        return
    
    # Combine
    all_df = pd.concat(dfs, ignore_index=True)

    # Assign confidence bins
    all_df["conf_bin"] = pd.cut(all_df["confidence"], bins=bins, labels=labels, include_lowest=True)

    # Compute precision per bin
    precision_df = (
        all_df
        .groupby(["Filter_Similarity", "conf_bin"], observed=True)
        .apply(lambda g: pd.Series({
            "n": len(g),
            "precision": (g["prediction"] == g["cycle"]).mean() if len(g) > 0 else np.nan
        }))
        .reset_index()
    )

    # Compute AUCs
    auc_results = {}
    for sim, subdf in precision_df.groupby("Filter_Similarity"):
        subdf = subdf.dropna(subset=["precision"]).reset_index(drop=True)
        if not subdf.empty:
            # Map bins to midpoints
            x = [bin_midpoints[labels.index(b)] for b in subdf["conf_bin"]]
            y = subdf["precision"].values
            auc_val = auc(x, y)
            auc_results[sim] = auc_val
            # Store results globally
            all_auc_results.append({"Split": split, "Filter_Similarity": sim, "AUC": auc_val})



# Run for all splits
for split in splits:
    process_split(split)

# Print AUC summary table
auc_df = pd.DataFrame(all_auc_results)
if not auc_df.empty:
    auc_pivot = auc_df.pivot(index="Split", columns="Filter_Similarity", values="AUC")
    print("\n AUC Summary Table:")
    print(auc_pivot.round(3))
else:
    print("No AUC results computed.")



  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({



 AUC Summary Table:
Filter_Similarity     20     30     40     50     60     70     80     90
Split                                                                    
20                 0.197  0.153  0.339  0.425  0.446  0.459  0.425  0.416
30                 0.299  0.322  0.369  0.428  0.451  0.467  0.477  0.480
40                 0.184  0.174  0.274  0.356  0.390  0.374  0.377  0.379
50                 0.131  0.234  0.359  0.441  0.483  0.498  0.499  0.499
60                 0.138  0.267  0.286  0.356  0.429  0.457  0.458  0.461
70                 0.177  0.176  0.311  0.350  0.410  0.453  0.468  0.474
80                 0.159  0.203  0.327  0.401  0.454  0.474  0.488  0.484
90                 0.145  0.236  0.336  0.395  0.426  0.463  0.481  0.500


  .apply(lambda g: pd.Series({


In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Base directory
base_dir = "../../filtered_test_set/final_filtered_test/"

# Splits and bins
splits = ["20", "30", "40", "50", "60", "70", "80", "90"]
bins = np.arange(0, 1.05, 0.05)
labels = [f"{round(b,2)}-{round(b+0.05,2)}" for b in bins[:-1]]

all_precisions = []

for split in splits:
    split_dir = os.path.join(base_dir, f"test_{split}")
    if not os.path.exists(split_dir):
        print(f"⚠️ Skipping {split} (no folder {split_dir})")
        continue
    
    dfs = []
    for file in os.listdir(split_dir):
        if file.endswith(".csv") and file.startswith(f"merged_{split}_under"):
            try:
                sim = int(file.split("_under")[-1].replace(".csv", ""))
            except ValueError:
                print(f"⚠️ Could not parse similarity from filename: {file}")
                continue
            df = pd.read_csv(os.path.join(split_dir, file))
            df["Filter_Similarity"] = sim
            df["Test_Split"] = split
            dfs.append(df)
    
    if not dfs:
        print(f"⚠️ No CSV files found for test_{split}")
        continue
    
    combined = pd.concat(dfs, ignore_index=True)
    combined = combined[combined["confidence"] > 0.85]  # only high confidence

    if combined.empty:
        print(f"⚠️ No rows above confidence > 0.85 for test_{split}")
        continue
    
    combined["conf_bin"] = pd.cut(combined["confidence"], bins=bins, labels=labels, include_lowest=True)
    
    precision_df = (
        combined
        .groupby(["Test_Split", "Filter_Similarity", "conf_bin"], observed=True)
        .apply(lambda g: pd.Series({
            "precision": (g["prediction"] == g["cycle"]).mean() if len(g) > 0 else np.nan
        }))
        .reset_index()
    )
    
    all_precisions.append(precision_df)

# Combine all results
if all_precisions:
    all_precisions_df = pd.concat(all_precisions, ignore_index=True)

    # Filter only similarity 20 and 30
    subset_df = all_precisions_df[all_precisions_df["Filter_Similarity"].isin([20, 30])]

    if subset_df.empty:
        print("⚠️ No precision data found for Filter_Similarity 20 or 30")
    else:
        # --------------------------
        # Boxplot for each Test_Split
        # --------------------------
        plt.figure(figsize=(12, 7))
        sns.boxplot(
            data=subset_df.dropna(subset=["precision"]),
            x="Test_Split", y="precision",
            hue="Filter_Similarity"
        )

        plt.xticks(rotation=45)
        plt.xlabel("Test Split")
        plt.ylabel("Precision")
        plt.title("Precision Distribution (confidence > 0.85, Filter = 20 & 30)")
        plt.legend(title="Filter Similarity", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()

        figure_dir = "../../results/figures"
        out_file = os.path.join(figure_dir, "precision_boxplots_filter20_30_above085.png")
        plt.savefig(out_file, dpi=600, bbox_inches="tight")
        plt.close()

        print(f"✅ Saved boxplot figure: {out_file}")
else:
    print("❌ No precision data found — check file paths and naming scheme.")


  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({


✅ Saved boxplot figure: ../../results/figures/precision_boxplots_filter20_30_above085.png
