In [3]:
from pathlib import Path
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import qualitative

# 1) Prompt for the three comparison keywords (the original one plus two
#    additional ones); each answer is stripped and lower-cased.
keyword_1, keyword_2, keyword_3 = (
    input(prompt).strip().lower()
    for prompt in (
        "Enter keyword 1 (example: apatite, tray3, zircon): ",
        "Enter keyword 2: ",
        "Enter keyword 3: ",
    )
)

keywords = [keyword_1, keyword_2, keyword_3]
if not all(keywords):
    raise ValueError("All three keywords are required.")

# Repeated keywords collapse to a single entry; first-seen order is kept
keywords = sorted(set(keywords), key=keywords.index)
print(f"Keywords to compare: {keywords}")

# Every keyword gets one palette colour that is reused on all five plots;
# the palette wraps around if there are more keywords than colours.
color_palette = qualitative.Plotly
keyword_colors = dict(
    (name, color_palette[slot % len(color_palette)])
    for slot, name in enumerate(keywords)
)
print(f"Keyword colors: {keyword_colors}")

# Measurement run folders live in "sample_outputs" next to the working directory
base_dir = Path.cwd().parent.joinpath("sample_outputs")
if not base_dir.exists():
    raise FileNotFoundError(f"Could not find sample_outputs folder: {base_dir}")

# The five CSV groups to retrieve: file-name fragment -> display title
target_groups = dict(
    mininghighvoltage="Mining High Voltage",
    mininglowvoltage="Mining Low Voltage",
    soilhighvoltage="Soil High Voltage",
    soilmidvoltage="Soil Mid Voltage",
    soillowvoltage="Soil Low Voltage",
)


def infer_energy_intensity_columns(df: pd.DataFrame):
    """Guess which columns of a table hold the energy axis and the intensity.

    Columns are chosen by name: an energy-like name contains "energy", "kev"
    or "channel"; an intensity-like name contains "intensity", "counts",
    "count", "cps" or "rate" (case-insensitive). Only numeric columns are
    considered. When fewer than two columns are natively numeric, the
    non-numeric ones are coerced with ``pd.to_numeric(errors="coerce")`` on a
    copy, and coerced columns that end up entirely NaN are ignored.

    Args:
        df: Input table. It is never modified; coercion works on a copy.

    Returns:
        A ``(x_col, y_col, frame)`` tuple. ``frame`` is ``df`` itself, or the
        coerced copy when coercion was needed. ``x_col`` and ``y_col`` are
        both ``None`` when fewer than two usable numeric columns exist.
    """
    energy_words = ("energy", "kev", "channel")
    intensity_words = ("intensity", "counts", "count", "cps", "rate")

    cols_lower = {c: str(c).lower() for c in df.columns}
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]

    if len(numeric_cols) < 2:
        # Not enough natively numeric columns: a column read as text may still
        # contain numbers, so coerce the non-numeric ones. A single clean
        # numeric column is no reason to give up on its text-polluted sibling.
        already_numeric = set(numeric_cols)
        coerced = df.copy()
        usable = []
        for c in df.columns:
            if c not in already_numeric:
                coerced[c] = pd.to_numeric(coerced[c], errors="coerce")
                # A column with no parsable value at all is plain text.
                if not coerced[c].notna().any():
                    continue
            if pd.api.types.is_numeric_dtype(coerced[c]):
                usable.append(c)
        numeric_cols = usable
        df = coerced

    if len(numeric_cols) < 2:
        return None, None, df

    energy_candidates = [
        c for c in numeric_cols
        if any(word in cols_lower[c] for word in energy_words)
    ]
    intensity_candidates = [
        c for c in numeric_cols
        if any(word in cols_lower[c] for word in intensity_words)
    ]

    x_col = energy_candidates[0] if energy_candidates else numeric_cols[0]

    y_pool = intensity_candidates if intensity_candidates else numeric_cols
    y_col = next((c for c in y_pool if c != x_col), None)

    if y_col is None:
        # Only reachable when x_col is the sole intensity-named column.
        # Previously the table was rejected here although other numeric
        # columns were available.
        replacement_x = next((c for c in energy_candidates if c != x_col), None)
        if replacement_x is None and not energy_candidates:
            # x_col was picked by position only, yet its name says intensity:
            # use it as y and take the next numeric column as x.
            replacement_x = next((c for c in numeric_cols if c != x_col), None)

        if replacement_x is not None:
            x_col, y_col = replacement_x, x_col
        else:
            # x_col's name matches both categories and no other energy-named
            # column exists: keep it as x and fall back to any other column.
            y_col = next((c for c in numeric_cols if c != x_col), None)

    return x_col, y_col, df


def find_candidate_folders(keyword: str, root=None):
    """Return the run folders that match *keyword*, sorted by path.

    Folder-name matches are preferred because they are the most reliable way
    to pick the right run set. Only when no folder name contains the keyword
    does the search fall back to folders holding at least one ``*.csv`` file
    whose name contains it. Matching is case-insensitive.

    Args:
        keyword: Text to look for in folder names (or CSV file names).
        root: Directory whose immediate sub-folders are searched. Defaults to
            the module-level ``base_dir``.

    Returns:
        A sorted list of matching folder paths; empty when nothing matches.
    """
    search_root = base_dir if root is None else Path(root)
    # Names are lower-cased before comparison, so the keyword must be too;
    # otherwise a keyword with capitals could never match.
    needle = keyword.lower()

    # Scan and sort once; filtering a sorted list keeps it sorted.
    subfolders = sorted(entry for entry in search_root.iterdir() if entry.is_dir())

    name_matches = [folder for folder in subfolders if needle in folder.name.lower()]
    if name_matches:
        return name_matches

    # Fallback: folders where at least one CSV file name contains the keyword
    return [
        folder for folder in subfolders
        if any(needle in csv_path.name.lower() for csv_path in folder.glob("*.csv"))
    ]


def _load_energy_intensity_curve(file_path):
    """Read one CSV into a cleaned (energy, intensity) curve, or return None.

    A short reason is printed whenever a file is skipped, so missing data in
    the averages can be traced back to the offending file.
    """
    try:
        df_raw = pd.read_csv(file_path)
    except (OSError, ValueError) as exc:
        # pandas' EmptyDataError/ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers missing/unreadable files.
        print(f"  Skipped {file_path.name}: could not read CSV ({exc})")
        return None

    x_col, y_col, df = infer_energy_intensity_columns(df_raw)
    if x_col is None or y_col is None:
        print(f"  Skipped {file_path.name}: no energy/intensity columns found")
        return None

    curve = df[[x_col, y_col]].copy()
    curve.columns = ["energy", "intensity"]
    curve["energy"] = pd.to_numeric(curve["energy"], errors="coerce")
    curve["intensity"] = pd.to_numeric(curve["intensity"], errors="coerce")
    curve = curve.dropna().sort_values("energy")

    if curve.empty:
        print(f"  Skipped {file_path.name}: no numeric rows")
        return None

    # Round so that float noise does not split one energy bin into several
    # groups when curves from different files are averaged.
    curve["energy"] = curve["energy"].round(6)
    return curve


def build_avg_curves(candidate_folders):
    """Average the energy/intensity curves of every target group.

    Each ``*.csv`` directly inside the given folders is assigned to the first
    key of ``target_groups`` that occurs in its lower-cased file name; files
    matching no key are ignored. The per-group file count is printed. Within
    a group, the rows of all readable files are pooled and the intensity is
    averaged per (rounded) energy value.

    Args:
        candidate_folders: Iterable of folder paths to collect CSV files from.

    Returns:
        A dict mapping every ``target_groups`` key to a DataFrame with the
        columns ``energy`` and ``intensity``; the frame is empty when the
        group has no usable file.
    """
    files_by_group = {k: [] for k in target_groups}

    for folder in candidate_folders:
        # glob() order is arbitrary; sort so runs are reproducible
        for csv_path in sorted(folder.glob("*.csv")):
            name = csv_path.name.lower()
            group = next((key for key in target_groups if key in name), None)
            if group is not None:
                files_by_group[group].append(csv_path)

    for key, files in files_by_group.items():
        print(f"{target_groups[key]}: {len(files)} files")

    avg_curves = {}
    for key, file_list in files_by_group.items():
        curves = [
            curve
            for curve in map(_load_energy_intensity_curve, file_list)
            if curve is not None
        ]

        if curves:
            combined = pd.concat(curves, ignore_index=True)
            avg_curves[key] = combined.groupby("energy", as_index=False)["intensity"].mean()
        else:
            avg_curves[key] = pd.DataFrame(columns=["energy", "intensity"])

    return avg_curves


# Build average curves for each keyword.
# Result: keyword -> {group key -> averaged DataFrame}; a keyword without any
# matching folder gets an empty frame for every group so plotting can index it.
avg_curves_by_keyword = {}
for keyword in keywords:
    print("\n" + "=" * 60)
    print(f"Processing keyword: {keyword}")

    candidate_folders = find_candidate_folders(keyword)
    if not candidate_folders:
        print(f"No folders found in {base_dir} for keyword: '{keyword}'")
        avg_curves_by_keyword[keyword] = {k: pd.DataFrame(columns=["energy", "intensity"]) for k in target_groups}
        continue

    # Show the first matched folder in the file manager as a visual check.
    # os.startfile exists only on Windows; calling it unconditionally raised
    # AttributeError on other platforms, so opening is skipped there.
    folder_to_open = candidate_folders[0]
    if hasattr(os, "startfile"):
        os.startfile(str(folder_to_open))
        print(f"Opened folder: {folder_to_open}")
    else:
        print(f"First matched folder (not opened automatically on this platform): {folder_to_open}")
    print(f"Matched folders: {len(candidate_folders)}")

    avg_curves_by_keyword[keyword] = build_avg_curves(candidate_folders)

# Plot the five energy intensity graphs and superimpose all keyword curves.
# Layout: 3 x 2 grid, the sixth cell stays empty.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=[
        target_groups["mininghighvoltage"],
        target_groups["mininglowvoltage"],
        target_groups["soilhighvoltage"],
        target_groups["soilmidvoltage"],
        target_groups["soillowvoltage"],
        "",
    ],
)

# Group key -> (row, col) of its subplot
plot_positions = {
    "mininghighvoltage": (1, 1),
    "mininglowvoltage": (1, 2),
    "soilhighvoltage": (2, 1),
    "soilmidvoltage": (2, 2),
    "soillowvoltage": (3, 1),
}

# Keywords that already own a legend entry. The entry is attached to the
# first trace actually drawn for a keyword; tying it to one fixed group hid
# keywords that had no data in that particular group.
keywords_in_legend = set()

for group_key, (r, c) in plot_positions.items():
    for keyword in keywords:
        avg = avg_curves_by_keyword[keyword][group_key]
        if avg.empty:
            continue

        fig.add_trace(
            go.Scatter(
                x=avg["energy"],
                y=avg["intensity"],
                mode="lines",
                name=keyword,
                # Same legend group on every subplot, so one legend click
                # toggles the keyword everywhere.
                legendgroup=keyword,
                showlegend=keyword not in keywords_in_legend,
                line={"color": keyword_colors[keyword]},
            ),
            row=r,
            col=c,
        )
        keywords_in_legend.add(keyword)

    fig.update_xaxes(title_text="Energy", row=r, col=c)
    fig.update_yaxes(title_text="Intensity", row=r, col=c)

keywords_label = ", ".join(keywords)
fig.update_layout(
    height=1100,
    width=1500,
    title_text=f"Average Energy Intensity Dashboard - Keywords: {keywords_label}",
    showlegend=True,
)

# Keywords are free user input: keep letters, digits, "-", "_" and "." and
# turn everything else (spaces, path separators, ":" ...) into "_" so the
# result is always a valid single file name.
safe_keywords = "_".join(
    "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in k)
    for k in keywords
)
output_html = Path.cwd() / f"{safe_keywords}_energy_intensity_5_graphs_overlay.html"
fig.write_html(output_html, include_plotlyjs="cdn")
print(f"Saved dashboard HTML: {output_html}")

# Open the generated dashboard page. os.startfile is Windows-only, so fall
# back to the default web browser elsewhere.
if hasattr(os, "startfile"):
    os.startfile(str(output_html))
else:
    import webbrowser

    webbrowser.open(output_html.as_uri())
fig

Keywords to compare: ['tray', 'nephe_150', 'apatite']
Keyword colors: {'tray': '#636EFA', 'nephe_150': '#EF553B', 'apatite': '#00CC96'}

Processing keyword: tray
Opened folder: c:\Users\plancton\robotray\robotray\sample_outputs\003004_2026_02_10_tray1_1s_150
Matched folders: 2
Mining High Voltage: 299 files
Mining Low Voltage: 299 files
Soil High Voltage: 299 files
Soil Mid Voltage: 299 files
Soil Low Voltage: 299 files

Processing keyword: nephe_150
Opened folder: c:\Users\plancton\robotray\robotray\sample_outputs\003165_2026_02_11_nephe_150
Matched folders: 1
Mining High Voltage: 150 files
Mining Low Voltage: 150 files
Soil High Voltage: 150 files
Soil Mid Voltage: 150 files
Soil Low Voltage: 150 files

Processing keyword: apatite
Opened folder: c:\Users\plancton\robotray\robotray\sample_outputs\003632_2026_02_13_apatite_150
Matched folders: 1
Mining High Voltage: 150 files
Mining Low Voltage: 150 files
Soil High Voltage: 150 files
Soil Mid Voltage: 150 files
Soil Low Voltage: 150 fi