In [24]:
import pandas as pd
# Load the filled-in responses of the unnumbered-list run.
df = pd.read_csv('unnumbered_list/barriers_biased_unnumbered_filled.csv')

# Barrier catalogue: numeric id -> official label text.
BARRIERS = {
    1: "power losses, quality and safety issues",
    2: "reduced reliability in DC devices",
    3: "lack of use-cases in which DC is advantageous",
    4: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    5: "lack of pilot projects",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "incompatibility of DC systems components",
    8: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    9: "lack of enough trained personnel in DC systems",
    10: "uncertain regulatory roadmap",
    11: "high costs of DC solutions",
}

# Inverse lookup: official label text -> numeric id.
BARRIERS_REVERSED = dict(zip(BARRIERS.values(), BARRIERS.keys()))

# Translate each row's 'official_label' into its id. Series.map leaves NaN
# wherever the label is not an exact key of the lookup.
df['barrier_id'] = df['official_label'].map(BARRIERS_REVERSED)

# Persist the labelled table.
df.to_csv('unnumbered_list/barriers_biased_unnumbered_filled_labeled.csv', index=False)

In [28]:
import pandas as pd
import numpy as np
# Reload the labelled table and show its (rows, columns) as a sanity check.
df = pd.read_csv("unnumbered_list/barriers_biased_unnumbered_filled_labeled.csv")
df.shape

(5018, 17)

In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv("unnumbered_list/barriers_biased_unnumbered_filled_labeled.csv")

# ----- CONFIG -----
EXPECTED_PER_ITER = 5  # threshold used below for the over_rows / over_unique flags
OUT_ALL   = "unnumbered_list/unnumbered_barrier_counts_by_model_variant_bias_iter.csv"
OUT_OVER  = "unnumbered_list/unnumbered_barrier_counts_over_limit.csv"

# ----- Normalize / typing -----
# All-NaN stand-in for optional columns that are absent from the file
# (explicit dtype avoids relying on the default dtype of a data-less Series).
_absent = pd.Series(np.nan, index=df.index, dtype="object")

df["base_model"] = df["base_model"].astype(str).str.strip()
df["variant_id"] = df["variant_id"].astype(str).str.strip()

# astype(str) turns a missing value into the literal text "nan", which a later
# notna() test would accept as a real bias type. Record which cells were
# missing BEFORE casting and blank them (plus whitespace-only cells) to NaN.
_bias_raw = df.get("bias_type", _absent)
_bias_txt = _bias_raw.astype(str).str.strip()
df["bias_type"] = _bias_txt.mask(_bias_raw.isna() | _bias_txt.eq(""))

# Non-numeric values are coerced to <NA> in nullable integer columns.
df["iteration"]  = pd.to_numeric(df.get("iteration", _absent), errors="coerce").astype("Int64")
df["barrier_id"] = pd.to_numeric(df.get("barrier_id", _absent), errors="coerce").astype("Int64")

# Keep only the rows that actually carry a bias_type value.
df_anchor = df[df["bias_type"].notna()].copy()

# Per (base_model, variant_id, bias_type, iteration): how many rows were
# produced and how many distinct barriers they cover.
counts = (
    df_anchor
    .groupby(["base_model", "variant_id", "bias_type", "iteration"], dropna=False)
    .agg(
        # "size" counts every row of the group, so it does not need row_id
        # (which the duplicate-details section treats as optional).
        rows=("barrier_id", "size"),
        # rows that actually have a barrier_id (helper, dropped below)
        _rows_with_barrier=("barrier_id", "count"),
        # nunique ignores missing ids
        n_unique_barriers=("barrier_id", "nunique"),
    )
    .reset_index()
    .assign(
        # Repeated barriers only: rows without a barrier_id cannot be
        # duplicates of anything, so they are excluded from the difference.
        dupes=lambda d: d["_rows_with_barrier"] - d["n_unique_barriers"],
        over_rows=lambda d: d["rows"] > EXPECTED_PER_ITER,
        over_unique=lambda d: d["n_unique_barriers"] > EXPECTED_PER_ITER,
    )
    .drop(columns="_rows_with_barrier")  # keep the original output schema
    .sort_values(["base_model", "variant_id", "bias_type", "iteration"])
)

# Save full counts and only-over-limit views
counts.to_csv(OUT_ALL, index=False)
counts.loc[counts["over_rows"] | counts["over_unique"]].to_csv(OUT_OVER, index=False)

# ---------- DUPLICATE DETAILS (summaries + row-level) ----------
OUT_DUPES_SUMMARY = "unnumbered_list/duplicate_barriers_summary_unnumbered.csv"
OUT_DUPES_ROWS    = "unnumbered_list/duplicate_rows_unnumbered.csv"

# A duplicate is the same barrier_id occurring more than once in one combo.
grp = ["base_model", "variant_id", "bias_type", "iteration", "barrier_id"]

# Rows lacking a barrier_id are left out of the duplicate analysis.
df_anchor_nonnull = df_anchor.dropna(subset=["barrier_id"]).copy()

# Fall back to the frame index when the file carries no row_id column.
if "row_id" not in df_anchor_nonnull.columns:
    df_anchor_nonnull["row_id"] = df_anchor_nonnull.index.astype(int)

# One grouping, reused for both the occurrence count and the row-id list.
_by_barrier = df_anchor_nonnull.groupby(grp, dropna=False)
dupe_counts = _by_barrier.size().reset_index(name="rows_for_barrier")
row_ids_agg = _by_barrier["row_id"].apply(list).reset_index(name="row_ids_for_barrier")

dupe_summary = dupe_counts.merge(row_ids_agg, on=grp, how="left")

# Only barriers that occur more than once within their combo.
dupe_summary_only = dupe_summary[dupe_summary["rows_for_barrier"].gt(1)].copy()

# Row-level view: one line per duplicated row id, joined back to the full
# row content (clashing column names from the right side get "_orig").
_exploded = (
    dupe_summary_only
    .explode("row_ids_for_barrier")
    .rename(columns={"row_ids_for_barrier": "row_id"})
)
dupe_rows = (
    _exploded
    .merge(df_anchor_nonnull, on=["row_id"] + grp, how="left", suffixes=("", "_orig"))
    .sort_values(grp + ["row_id"])
)

# Save both views and report where they went.
dupe_summary_only.to_csv(OUT_DUPES_SUMMARY, index=False)
dupe_rows.to_csv(OUT_DUPES_ROWS, index=False)
for _written in (OUT_DUPES_SUMMARY, OUT_DUPES_ROWS):
    print(" -", _written)

 - unnumbered_list/duplicate_barriers_summary_unnumbered.csv
 - unnumbered_list/duplicate_rows_unnumbered.csv


In [43]:
import pandas as pd
# Load the copy under dummy_cleaning/ and show its (rows, columns).
df = pd.read_csv(
    "unnumbered_list/dummy_cleaning/barriers_biased_unnumbered_filled_labeled.csv"
)
df.shape

(5018, 17)

In [45]:
#removing duplicates rows per combination of (base_model, variant_id, bias_type, iteration, barrier_id)
import pandas as pd
import numpy as np

INPUT_FILE   = "unnumbered_list/dummy_cleaning/barriers_biased_unnumbered_filled_labeled.csv"
CLEAN_FILE   = "unnumbered_list/dummy_cleaning/barriers_clean.csv"
DUPES_FILE   = "unnumbered_list/dummy_cleaning/barriers_duplicates.csv"
SUMMARY_FILE = "unnumbered_list/dummy_cleaning/barriers_duplicates_summary.csv"

df = pd.read_csv(INPUT_FILE)

# --- Normalize/typing ---
# All-NaN stand-in for optional columns that are absent from the file.
_absent = pd.Series(np.nan, index=df.index, dtype="object")

for col in ["base_model", "variant_id"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# Missing bias_type must stay missing: astype(str) would turn NaN into the text
# "nan", so remember the missing cells before casting. Only the bias_type cell
# is blanked -- assigning NaN through df.loc[mask] without a column selector
# would wipe every column of the matching rows.
_bias_raw = df.get("bias_type", _absent)
_bias_txt = _bias_raw.astype(str).str.strip()
df["bias_type"] = _bias_txt.mask(_bias_raw.isna() | _bias_txt.eq(""))

# Non-numeric values are coerced to <NA> in nullable integer columns.
df["iteration"]  = pd.to_numeric(df.get("iteration", _absent), errors="coerce").astype("Int64")
df["barrier_id"] = pd.to_numeric(df.get("barrier_id", _absent), errors="coerce").astype("Int64")

# Ensure a stable row identifier (use existing if present, else create)
if "row_id" not in df.columns:
    df["row_id"] = df.index.astype(int)

# Work only on anchor (biased) rows; rows without a bias_type are set aside
# untouched and re-attached after de-duplication.
anchor_mask = df["bias_type"].notna()
anchor = df.loc[anchor_mask].copy()
non_anchor = df.loc[~anchor_mask].copy()

# A duplicate is a repeated barrier_id within one (model, variant, bias, iteration).
KEY = ["base_model","variant_id","bias_type","iteration","barrier_id"]

# Sort so "first" is meaningful; prefer earliest timestamp if available.
# Unparseable timestamps become NaT (errors="coerce").
if "timestamp" in anchor.columns:
    ts = pd.to_datetime(anchor["timestamp"], errors="coerce")
    anchor = anchor.assign(_ts=ts).sort_values(KEY + ["_ts", "row_id"])
else:
    anchor = anchor.sort_values(KEY + ["row_id"])

# NOTE(review): duplicated() treats missing barrier_id values as equal, so
# several rows without a barrier_id in one combo count as duplicates of each
# other -- confirm that is intended.
_is_extra = anchor.duplicated(subset=KEY, keep="first")

# Extra duplicates beyond the first occurrence in each KEY. The sort helper
# column is internal and must not leak into the saved file.
dupes = anchor[_is_extra].drop(columns="_ts", errors="ignore")

# Cleaned anchor = keep first per KEY
anchor_clean = anchor[~_is_extra].copy()

# Recombine with the untouched non-anchor rows.
clean = pd.concat(
    [anchor_clean.drop(columns="_ts", errors="ignore"), non_anchor],
    ignore_index=True,
)

# Optional: sort final output for readability
clean = clean.sort_values(KEY + ["row_id"], na_position="last")

# --- Save outputs ---
clean.to_csv(CLEAN_FILE, index=False)
dupes.to_csv(DUPES_FILE, index=False)

# Summary by combo + barrier (how many extras removed). Reset explicitly so a
# `summary` left over from an earlier execution of this cell is never reported.
summary = None
if not dupes.empty:
    summary = (dupes
               .groupby(KEY, dropna=False)
               .agg(n_extra_dupes=("row_id","size"),
                    row_ids=("row_id", lambda s: list(s)))
               .reset_index()
               .sort_values(KEY))
    summary.to_csv(SUMMARY_FILE, index=False)

print(f"✔ Clean file saved: {CLEAN_FILE}  (rows: {len(clean)})")
print(f"✔ Duplicate rows saved: {DUPES_FILE}  (rows: {len(dupes)})")
if summary is not None:
    print(f"✔ Duplicate summary saved: {SUMMARY_FILE}  (rows: {len(summary)})")
else:
    print("✔ No within-run duplicates found according to KEY.")

✔ Clean file saved: unnumbered_list/dummy_cleaning/barriers_clean.csv  (rows: 5007)
✔ Duplicate rows saved: unnumbered_list/dummy_cleaning/barriers_duplicates.csv  (rows: 11)
✔ Duplicate summary saved: unnumbered_list/dummy_cleaning/barriers_duplicates_summary.csv  (rows: 11)


  ts = pd.to_datetime(anchor["timestamp"], errors="coerce")


In [46]:
#eliminate extra exceeding 5

import pandas as pd

# Input and output paths
INPUT_FILE = "unnumbered_list/dummy_cleaning/barriers_clean.csv"
CLEAN_FILE = "unnumbered_list/unnumbered_barriers_max5_clean.csv"
EXTRA_FILE = "unnumbered_list/dummy_cleaning/unnumbered_barriers_extras.csv"

# Maximum number of rows kept per group.
MAX_PER_GROUP = 5

# Load data
df = pd.read_csv(INPUT_FILE)

# Define grouping columns (adjust as needed)
GROUP_COLS = ["base_model", "variant_id", "bias_type", "iteration"]

# Rank rows within each group, in file order (1 = first row of the group).
# dropna=False matters: with the default, rows whose key contains NaN get a
# NaN rank, fail both comparisons below and vanish from BOTH output files.
df["_rank"] = df.groupby(GROUP_COLS, dropna=False).cumcount() + 1

# Split into kept (rank <= limit) and eliminated (everything else); the two
# parts together always contain every input row.
_within_limit = df["_rank"] <= MAX_PER_GROUP
clean = df[_within_limit].drop(columns="_rank")
extras = df[~_within_limit].drop(columns="_rank")

# Save to CSV
clean.to_csv(CLEAN_FILE, index=False)
extras.to_csv(EXTRA_FILE, index=False)

print("✔ Clean file saved to:", CLEAN_FILE)
print("✔ Extra rows saved to:", EXTRA_FILE)


✔ Clean file saved to: unnumbered_list/unnumbered_barriers_max5_clean.csv
✔ Extra rows saved to: unnumbered_list/dummy_cleaning/unnumbered_barriers_extras.csv


In [47]:
import pandas as pd
# Reload the capped table and show its (rows, columns) as a sanity check.
df = pd.read_csv("unnumbered_list/unnumbered_barriers_max5_clean.csv")
df.shape

(4791, 17)

In [39]:
import pandas as pd
import numpy as np
df = pd.read_csv("barriers_biased_inverse_0253hrs_labeled.csv")

# --- Key columns / typing ---
COMBO = ["base_model", "variant_id", "bias_type", "iteration"]  # adjust if needed

# Optional: normalize strings
for c in ["base_model", "variant_id", "bias_type"]:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip()

# --- Choose sort priority so we know which duplicate to keep ---
# Within a combo, order by list_order, then timestamp, then row_id, using
# whichever of those columns the file actually has.
sort_cols = [name for name in ("list_order", "timestamp", "row_id") if name in df.columns]
sort_by = COMBO + sort_cols
df_sorted = df.sort_values(sort_by)

# --- Remove duplicates: keep first barrier_id per combo ---
dedup = df_sorted.drop_duplicates(subset=COMBO + ["barrier_id"], keep="first")

# --- Save deduped rows ---
dedup.to_csv("barriers_dedup.csv", index=False)
print("✅ Wrote barriers_dedup.csv")

# --- Counts after dedup, to inspect remaining sizes ---
# Rows per combo BEFORE de-duplication, needed to report how many were removed.
_rows_before = (
    df_sorted.groupby(COMBO, dropna=False)
             .size()
             .reset_index(name="rows_before")
)

after_counts = (
    dedup.groupby(COMBO, dropna=False)
         .agg(
             # "size" counts every row, so row_id (optional above) is not needed
             rows=("barrier_id", "size"),
             n_unique_barriers=("barrier_id", "nunique"),
         )
         .reset_index()
         .merge(_rows_before, on=COMBO, how="left")
         # rows dropped by the de-duplication step; computing rows - n_unique
         # on the de-duplicated frame could never reflect removed rows
         .assign(dupes_removed=lambda d: d["rows_before"] - d["rows"])
         .drop(columns="rows_before")
         .sort_values(COMBO)
)
after_counts.to_csv("barrier_counts_after_dedup.csv", index=False)
print("✅ Wrote barrier_counts_after_dedup.csv")

# Quick peek
display(after_counts.head())


✅ Wrote barriers_dedup.csv
✅ Wrote barrier_counts_after_dedup.csv


Unnamed: 0,base_model,variant_id,bias_type,iteration,rows,n_unique_barriers,dupes_removed
0,gemma3,gemma3_generalist,BIAS_EXAMPLE,0,5,5,0
1,gemma3,gemma3_generalist,BIAS_EXAMPLE,1,5,5,0
2,gemma3,gemma3_generalist,BIAS_EXAMPLE,2,5,5,0
3,gemma3,gemma3_generalist,BIAS_EXAMPLE,3,5,5,0
4,gemma3,gemma3_generalist,BIAS_EXAMPLE,4,5,5,0


In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv("barriers_biased_inverse_0253hrs_labeled.csv")

# Define the per-run combination (edit if needed)
COMBO = ["base_model", "variant_id", "bias_type", "iteration"]
IDCOL = "barrier_id"

# Maximum number of rows kept per combo.
MAX_PER_COMBO = 5

# Keep a stable original-row order key
df["_ord"] = np.arange(len(df))

# Ensure types (so drop_duplicates works reliably); non-numeric values become <NA>.
df[IDCOL] = pd.to_numeric(df[IDCOL], errors="coerce").astype("Int64")
_absent = pd.Series(np.nan, index=df.index, dtype="object")  # used if the column is missing
df["iteration"] = pd.to_numeric(df.get("iteration", _absent), errors="coerce").astype("Int64")

# 1) Remove duplicate barrier_id within each combo, keeping the first occurrence in the ORIGINAL order
dedup = df.drop_duplicates(subset=COMBO + [IDCOL], keep="first")

# 2) Keep only the first MAX_PER_COMBO rows per combo (again, by ORIGINAL order).
#    dropna=False keeps rows whose combo key contains a missing value in their
#    own groups; with the default they are excluded from head() and silently lost.
dedup = (
    dedup.sort_values("_ord")              # ensure original order
         .groupby(COMBO, sort=False, dropna=False)
         .head(MAX_PER_COMBO)              # keep at most MAX_PER_COMBO per combo
         .sort_values("_ord")              # restore overall original order
         .drop(columns=["_ord"])           # cleanup
)

# Save the clean CSV
dedup.to_csv("barriers_dedup_first5_preserve_order.csv", index=False)
print("✅ Wrote barriers_dedup_first5_preserve_order.csv")

✅ Wrote barriers_dedup_first5_preserve_order.csv


In [45]:
import pandas as pd
import numpy as np

# Load the reversed-list responses.
df = pd.read_csv("reversedbarrier_anchor_okay.csv")

# Barrier catalogue for the reversed list: numeric id -> label text.
BARRIERS = {
    1: "high costs of DC solutions",
    2: "uncertain regulatory roadmap",
    3: "lack of enough trained personnel in DC systems",
    4: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    5: "incompatibility of DC systems components",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "lack of pilot projects",
    8: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    9: "lack of use-cases in which DC is advantageous",
    10: "reduced reliability in DC devices",
    11: "power losses, quality and safety issues"
}

# Ids of the first five catalogue entries (a dict iterates in insertion order).
CANON_FIRST5 = set(list(BARRIERS)[:5])

# ---------------- Light typing / cleaning ----------------
# Columns used here: base_model, variant_id, model, barrier_id.
df = df.copy()
for _text_col in ("base_model", "variant_id", "model"):
    df[_text_col] = df[_text_col].astype(str).str.strip()
# Non-numeric ids are coerced to <NA> in a nullable integer column.
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- 1) Selection counts & rates per combo ----------------
GROUP = ["base_model", "variant_id", "model"]

# Occurrences of each barrier within each combo (rows without an id are skipped).
_with_barrier = df.dropna(subset=["barrier_id"])
counts = (
    _with_barrier.groupby(GROUP + ["barrier_id"], dropna=False)
                 .size()
                 .reset_index(name="count")
)

# Total selections per combo.
totals = counts.groupby(GROUP, dropna=False)["count"].sum().reset_index(name="total_count")

# Share of each barrier within its combo; a zero total yields NaN instead of
# a division by zero.
counts = counts.merge(totals, on=GROUP, how="left")
counts["rate"] = counts["count"].div(counts["total_count"].replace(0, np.nan))

# Wide tables: one row per combo, one column per barrier_id.
_pivot_axes = dict(index=GROUP, columns="barrier_id")
counts_wide = counts.pivot_table(values="count", fill_value=0, **_pivot_axes)
counts_wide = counts_wide.rename_axis(None, axis=1).reset_index()
rates_wide = counts.pivot_table(values="rate", fill_value=0.0, **_pivot_axes)
rates_wide = rates_wide.rename_axis(None, axis=1).reset_index()

# Append total_count to both for convenience.
counts_wide = counts_wide.merge(totals, on=GROUP, how="left")
rates_wide = rates_wide.merge(totals, on=GROUP, how="left")

# Save selection tables
counts_wide.to_csv("reversedlist_selection_counts_by_combo_wide.csv", index=False)
rates_wide.to_csv("reversedlist_selection_rates_by_combo_wide.csv", index=False)

# --- canonical first 5, as an ordered list and as a set ---
CANON_FIRST5_LIST = list(BARRIERS)[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)

# --- build top-5 per GROUP: both set and ordered list ---
# Sort by GROUP (asc), then count (desc), then barrier_id (asc). The final key
# is a tie-breaker: without it, which of several equally frequent barriers
# lands in the top 5 depends on the incoming row order.
sorted_counts = counts.sort_values(
    GROUP + ["count", "barrier_id"],
    ascending=[True]*len(GROUP) + [False, True]
)

# take top-5 rows per combo
top5_rows = (
    sorted_counts
    .groupby(GROUP, dropna=False)
    .head(5)
)

# ordered list (by count desc, ties by barrier_id) ...
top5_ordered = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: [int(x) for x in s.tolist()])           # keep order
             .reset_index(name="top5_selected")                        # ordered list
)

# ... and the same ids as a set, which is what the Jaccard index compares
top5_sets = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: set(int(x) for x in s.dropna().tolist()))
             .reset_index(name="top5_set")                             # set for Jaccard
)

# Similarity between each combo's top 5 and the first five catalogue ids.
top5 = top5_ordered.merge(top5_sets, on=GROUP, how="left")
top5["jaccard_top5_vs_first5"] = top5["top5_set"].map(lambda S: jaccard(S, CANON_FIRST5))

# Every distinct barrier selected at least once per combo.
unique_sets = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP, dropna=False)["barrier_id"]
      .apply(lambda s: set(int(x) for x in s.dropna().tolist()))
      .reset_index(name="selected_set")
)
unique_sets["n_selected_unique"] = unique_sets["selected_set"].map(len)

# --- assemble output with easy-to-audit columns ---
jaccard_out = (
    totals
    .merge(unique_sets, on=GROUP, how="left")
    .merge(top5,        on=GROUP, how="left")
    .assign(
        # readable versions: sorted list instead of a set, and the reference
        # ids repeated on every row
        selected_set=lambda d: d["selected_set"].map(lambda s: sorted(list(s)) if isinstance(s, set) else []),
        canon_first5=lambda d: [CANON_FIRST5_LIST]*len(d)
    )[
        GROUP
        + ["total_count",
           "n_selected_unique", "selected_set",
           "top5_selected", "jaccard_top5_vs_first5", "canon_first5"]
    ]
    .sort_values(GROUP)
)

# The file name carries the "reversedlist_" prefix so that it matches the
# name reported below and the other outputs of this cell.
jaccard_out.to_csv("reversedlist_jaccard_vs_canonical_first5_by_combo.csv", index=False)


print("✅ Wrote:")
print(" - reversedlist_selection_counts_by_combo_wide.csv")
print(" - reversedlist_selection_rates_by_combo_wide.csv")
print(" - reversedlist_jaccard_vs_canonical_first5_by_combo.csv")


✅ Wrote:
 - reversedlist_selection_counts_by_combo_wide.csv
 - reversedlist_selection_rates_by_combo_wide.csv
 - reversedlist_jaccard_vs_canonical_first5_by_combo.csv


In [7]:
#COMPRESSED FILES REVERSED BARRIER LIST
import pandas as pd

import pandas as pd
import numpy as np
# Load the reversed-list responses.
df = pd.read_csv("reversed_list/reversedbarrier_anchor_okay.csv")

# ---------------- Canonical barrier list (source of truth) ----------------
# Reversed ordering: numeric id -> label text.
BARRIERS = {
    1: "high costs of DC solutions",
    2: "uncertain regulatory roadmap",
    3: "lack of enough trained personnel in DC systems",
    4: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    5: "incompatibility of DC systems components",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "lack of pilot projects",
    8: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    9: "lack of use-cases in which DC is advantageous",
    10: "reduced reliability in DC devices",
    11: "power losses, quality and safety issues"
}

# ---------------- Light typing / cleaning ----------------
# Columns used here: base_model, variant_id, model, barrier_id.
df = df.copy()
for _text_col in ("base_model", "variant_id", "model"):
    df[_text_col] = df[_text_col].astype(str).str.strip()
# Non-numeric ids are coerced to <NA> in a nullable integer column.
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- Grouping keys ----------------
GROUP = ["base_model", "variant_id", "model"]   # per-variant combo
GROUP_BASE = ["base_model"]                     # all variants of a base model pooled

# --- canonical first 5 ids (a dict iterates in insertion order) ---
CANON_FIRST5_LIST = list(BARRIERS)[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)

# Rows that carry a barrier id; everything below is built from these.
_with_barrier = df.dropna(subset=["barrier_id"])

# Occurrences of each barrier per base model.
counts_base = (
    _with_barrier.groupby(GROUP_BASE + ["barrier_id"], dropna=False)
                 .size()
                 .reset_index(name="count")
)

# Total selections per base model.
totals_base = (
    counts_base.groupby(GROUP_BASE, dropna=False)["count"]
               .sum()
               .reset_index(name="total_count")
)

# Top 5 by aggregated count: base (asc), count (desc), barrier_id (asc) as tie-breaker.
_rank_keys = GROUP_BASE + ["count", "barrier_id"]
_rank_ascending = [True] * len(GROUP_BASE) + [False, True]
sorted_counts_base = counts_base.sort_values(_rank_keys, ascending=_rank_ascending)

top5_rows_base = sorted_counts_base.groupby(GROUP_BASE, dropna=False).head(5)

# Ordered id list per base model (order of the ranking above is kept).
top5_ordered_base = (
    top5_rows_base.groupby(GROUP_BASE, dropna=False)["barrier_id"]
                  .apply(lambda ids: list(map(int, ids.tolist())))
                  .reset_index(name="top5_selected")
)

# Union of all selected barriers for each base model.
unique_sets_base = (
    _with_barrier.groupby(GROUP_BASE, dropna=False)["barrier_id"]
                 .apply(lambda ids: {int(i) for i in ids.dropna().tolist()})
                 .reset_index(name="selected_set")
)
unique_sets_base["n_selected_unique"] = unique_sets_base["selected_set"].map(len)

# Assemble the base-level report: totals + union of selections + top 5.
_base_report_cols = [
    "base_model",
    "total_count",
    "n_selected_unique", "selected_set",
    "top5_selected", "jaccard_top5_vs_first5", "canon_first5",
]
_base_joined = (
    totals_base
    .merge(unique_sets_base, on=GROUP_BASE, how="left")
    .merge(top5_ordered_base, on=GROUP_BASE, how="left")
)

jaccard_out_base = (
    _base_joined.assign(
        # similarity between each base model's top 5 and the first five catalogue ids
        jaccard_top5_vs_first5=_base_joined["top5_selected"].map(
            lambda lst: jaccard(set(lst or []), set(CANON_FIRST5_LIST))
        ),
        # readable version: sorted list instead of a set
        selected_set=_base_joined["selected_set"].map(
            lambda s: sorted(s) if isinstance(s, set) else []
        ),
        # reference ids repeated on every row
        canon_first5=[CANON_FIRST5_LIST] * len(totals_base),
    )[_base_report_cols]
    .sort_values("base_model")
)

jaccard_out_base.to_csv("reversed_list/reversedlist_jaccard_compressed.csv", index=False)

print(" - reversed_list/reversedlist_jaccard_compressed.csv")

# All barrier ids present in the data, ascending; fixes the column order below.
barrier_ids_sorted = sorted(map(int, df["barrier_id"].dropna().unique()))

# Aggregated counts per base model (all variants pooled).
_with_barrier = df.dropna(subset=["barrier_id"])
counts_base = (
    _with_barrier.groupby(GROUP_BASE + ["barrier_id"], dropna=False)
                 .size()
                 .reset_index(name="count")
)

# Total selections per base model.
totals_base = (
    counts_base.groupby(GROUP_BASE, dropna=False)["count"]
               .sum()
               .reset_index(name="total_count")
)

# ---- COUNTS wide (one row per base, one column per barrier) ----
counts_base_wide = counts_base.pivot_table(
    index=GROUP_BASE, columns="barrier_id", values="count", fill_value=0
)
counts_base_wide = counts_base_wide.rename_axis(None, axis=1).reset_index()

# Barrier columns in numeric order (raw int labels); any id without a column gets zeros.
ordered_cols_counts = ["base_model"] + barrier_ids_sorted
for _absent_id in [b for b in barrier_ids_sorted if b not in counts_base_wide.columns]:
    counts_base_wide[_absent_id] = 0
counts_base_wide = counts_base_wide.reindex(columns=ordered_cols_counts)

# Append total_count
counts_base_wide = counts_base_wide.merge(totals_base, on="base_model", how="left")

# ---- RATES wide (counts / total_count per base) ----
# A zero total yields NaN instead of a division by zero.
rate_base = counts_base.merge(totals_base, on="base_model", how="left")
rate_base["rate"] = rate_base["count"].div(rate_base["total_count"].replace(0, np.nan))

rates_base_wide = rate_base.pivot_table(
    index=GROUP_BASE, columns="barrier_id", values="rate", fill_value=0.0
)
rates_base_wide = rates_base_wide.rename_axis(None, axis=1).reset_index()

# Same barrier column order as the counts table.
ordered_cols_rates = ["base_model"] + barrier_ids_sorted
for _absent_id in [b for b in barrier_ids_sorted if b not in rates_base_wide.columns]:
    rates_base_wide[_absent_id] = 0.0
rates_base_wide = rates_base_wide.reindex(columns=ordered_cols_rates)

# Append total_count for convenience/verification
rates_base_wide = rates_base_wide.merge(totals_base, on="base_model", how="left")

# ---- Write CSVs ----
counts_base_wide.to_csv("reversed_list/reversedlist_selection_counts_compressed.csv", index=False)
rates_base_wide.to_csv("reversed_list/reversedlist_selection_rates_compressed.csv", index=False)

print(" - reversed_list/reversedlist_selection_counts_compressed.csv")
print(" - reversed_list/reversedlist_selection_rates_compressed.csv")


 - reversed_list/reversedlist_jaccard_compressed.csv
 - reversed_list/reversedlist_selection_counts_compressed.csv
 - reversed_list/reversedlist_selection_rates_compressed.csv


In [46]:
import pandas as pd
import numpy as np
# Load the ordered-list responses.
df = pd.read_csv("orderedbarrier_anchor_okay.csv")

# ---------------- Canonical barrier list (source of truth) ----------------
# Numeric id -> label text.
BARRIERS = {
    1: "power losses, quality and safety issues",
    2: "reduced reliability in DC devices",
    3: "lack of use-cases in which DC is advantageous",
    4: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    5: "lack of pilot projects",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "incompatibility of DC systems components",
    8: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    9: "lack of enough trained personnel in DC systems",
    10: "uncertain regulatory roadmap",
    11: "high costs of DC solutions",
}

# Ids of the first five catalogue entries (a dict iterates in insertion order).
CANON_FIRST5 = set(list(BARRIERS)[:5])

# ---------------- Light typing / cleaning ----------------
# Columns used here: base_model, variant_id, model, barrier_id.
df = df.copy()
for _text_col in ("base_model", "variant_id", "model"):
    df[_text_col] = df[_text_col].astype(str).str.strip()
# Non-numeric ids are coerced to <NA> in a nullable integer column.
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- 1) Selection counts & rates per combo ----------------
GROUP = ["base_model", "variant_id", "model"]

# Occurrences of each barrier within each combo (rows without an id are skipped).
_with_barrier = df.dropna(subset=["barrier_id"])
counts = (
    _with_barrier.groupby(GROUP + ["barrier_id"], dropna=False)
                 .size()
                 .reset_index(name="count")
)

# Total selections per combo.
totals = counts.groupby(GROUP, dropna=False)["count"].sum().reset_index(name="total_count")

# Share of each barrier within its combo; a zero total yields NaN instead of
# a division by zero.
counts = counts.merge(totals, on=GROUP, how="left")
counts["rate"] = counts["count"].div(counts["total_count"].replace(0, np.nan))

# Wide tables: one row per combo, one column per barrier_id.
_pivot_axes = dict(index=GROUP, columns="barrier_id")
counts_wide = counts.pivot_table(values="count", fill_value=0, **_pivot_axes)
counts_wide = counts_wide.rename_axis(None, axis=1).reset_index()
rates_wide = counts.pivot_table(values="rate", fill_value=0.0, **_pivot_axes)
rates_wide = rates_wide.rename_axis(None, axis=1).reset_index()

# Append total_count to both for convenience.
counts_wide = counts_wide.merge(totals, on=GROUP, how="left")
rates_wide = rates_wide.merge(totals, on=GROUP, how="left")

# Save selection tables
counts_wide.to_csv("orderedlist_selection_counts_by_combo_wide.csv", index=False)
rates_wide.to_csv("orderedlist_selection_rates_by_combo_wide.csv", index=False)

# --- canonical first 5, as an ordered list and as a set ---
CANON_FIRST5_LIST = list(BARRIERS)[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)

# --- build top-5 per GROUP: both set and ordered list ---
# Sort by GROUP (asc), then count (desc), then barrier_id (asc). The final key
# is a tie-breaker: without it, which of several equally frequent barriers
# lands in the top 5 depends on the incoming row order.
sorted_counts = counts.sort_values(
    GROUP + ["count", "barrier_id"],
    ascending=[True]*len(GROUP) + [False, True]
)

# take top-5 rows per combo
top5_rows = (
    sorted_counts
    .groupby(GROUP, dropna=False)
    .head(5)
)

# ordered list (by count desc, ties by barrier_id) ...
top5_ordered = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: [int(x) for x in s.tolist()])           # keep order
             .reset_index(name="top5_selected")                        # ordered list
)

# ... and the same ids as a set, which is what the Jaccard index compares
top5_sets = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: set(int(x) for x in s.dropna().tolist()))
             .reset_index(name="top5_set")                             # set for Jaccard
)

# Similarity between each combo's top 5 and the first five catalogue ids.
top5 = top5_ordered.merge(top5_sets, on=GROUP, how="left")
top5["jaccard_top5_vs_first5"] = top5["top5_set"].map(lambda S: jaccard(S, CANON_FIRST5))

# Every distinct barrier selected at least once per combo.
_with_barrier = df.dropna(subset=["barrier_id"])
unique_sets = (
    _with_barrier.groupby(GROUP, dropna=False)["barrier_id"]
                 .apply(lambda ids: {int(i) for i in ids.dropna().tolist()})
                 .reset_index(name="selected_set")
)
unique_sets["n_selected_unique"] = unique_sets["selected_set"].map(len)

# --- assemble output with easy-to-audit columns ---
_report_cols = GROUP + [
    "total_count",
    "n_selected_unique", "selected_set",
    "top5_selected", "jaccard_top5_vs_first5", "canon_first5",
]
_joined = (
    totals
    .merge(unique_sets, on=GROUP, how="left")
    .merge(top5, on=GROUP, how="left")
)
_joined = _joined.assign(
    # readable versions: sorted list instead of a set, and the reference ids
    # repeated on every row
    selected_set=_joined["selected_set"].map(lambda s: sorted(s) if isinstance(s, set) else []),
    canon_first5=[CANON_FIRST5_LIST] * len(_joined),
)
jaccard_out = _joined[_report_cols].sort_values(GROUP)

jaccard_out.to_csv("orderedlist_jaccard_vs_canonical_first5_by_combo.csv", index=False)


print("✅ Wrote:")
for _written in (
    "orderedlist_selection_counts_by_combo_wide.csv",
    "orderedlist_selection_rates_by_combo_wide.csv",
    "orderedlist_jaccard_vs_canonical_first5_by_combo.csv",
):
    print(" - " + _written)

✅ Wrote:
 - orderedlist_selection_counts_by_combo_wide.csv
 - orderedlist_selection_rates_by_combo_wide.csv
 - orderedlist_jaccard_vs_canonical_first5_by_combo.csv


In [None]:
# compressed files - ordered barrier list
import pandas as pd
import numpy as np
# Load the ordered-list responses.
df = pd.read_csv("ordered_list/orderedbarrier_anchor_okay.csv")

# ---------------- Canonical barrier list (source of truth) ----------------
# Numeric id -> label text.
BARRIERS = {
    1: "power losses, quality and safety issues",
    2: "reduced reliability in DC devices",
    3: "lack of use-cases in which DC is advantageous",
    4: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    5: "lack of pilot projects",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "incompatibility of DC systems components",
    8: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    9: "lack of enough trained personnel in DC systems",
    10: "uncertain regulatory roadmap",
    11: "high costs of DC solutions",
}

# ---------------- Light typing / cleaning ----------------
# Columns used here: base_model, variant_id, model, barrier_id.
df = df.copy()
for _text_col in ("base_model", "variant_id", "model"):
    df[_text_col] = df[_text_col].astype(str).str.strip()
# Non-numeric ids are coerced to <NA> in a nullable integer column.
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- Grouping keys ----------------
GROUP = ["base_model", "variant_id", "model"]   # per-variant combo
GROUP_BASE = ["base_model"]                     # all variants of a base model pooled

# --- canonical first 5 ids (a dict iterates in insertion order) ---
CANON_FIRST5_LIST = list(BARRIERS)[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0.0, 1.0]. Two empty sets are treated as identical and
        yield 1.0 (this also avoids a division by zero).
    """
    union = a | b
    if not union:
        # Both inputs are empty.
        return 1.0
    return len(a & b) / len(union)

# ---------------- Base-level selection counts ----------------
# Rows per (base_model, barrier_id), pooling every variant of a base model.
counts_base = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP_BASE + ["barrier_id"], dropna=False)
      .size()
      .reset_index(name="count")
)

# Total number of selections per base model.
totals_base = (
    counts_base.groupby(GROUP_BASE, dropna=False)["count"]
               .sum()
               .reset_index(name="total_count")
)

# ---------------- Top-5 barriers per base model ----------------
# Order by count (descending); barrier_id (ascending) breaks ties so the
# top-5 is deterministic.
sorted_counts_base = counts_base.sort_values(
    GROUP_BASE + ["count", "barrier_id"],
    ascending=[True] * len(GROUP_BASE) + [False, True],
)

top5_rows_base = sorted_counts_base.groupby(GROUP_BASE, dropna=False).head(5)

top5_ordered_base = (
    top5_rows_base.groupby(GROUP_BASE, dropna=False)["barrier_id"]
                  .apply(lambda s: [int(x) for x in s.tolist()])   # keep order
                  .reset_index(name="top5_selected")
)

# Union of all selected barriers across variants for each base.
# Missing barrier ids are already removed by dropna(subset=...) above.
unique_sets_base = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP_BASE, dropna=False)["barrier_id"]
      .apply(lambda s: {int(x) for x in s.tolist()})
      .reset_index(name="selected_set")
)
unique_sets_base["n_selected_unique"] = unique_sets_base["selected_set"].map(len)

# ---------------- Assemble base-level output ----------------
jaccard_out_base = (
    totals_base
    .merge(unique_sets_base, on=GROUP_BASE, how="left")
    .merge(top5_ordered_base, on=GROUP_BASE, how="left")
    .assign(
        # A left merge marks a missing top-5 with NaN, which is truthy, so an
        # `lst or []` guard would not catch it; test the type explicitly.
        jaccard_top5_vs_first5=lambda d: d["top5_selected"].map(
            lambda lst: jaccard(
                set(lst) if isinstance(lst, list) else set(),
                set(CANON_FIRST5_LIST),
            )
        ),
        # readable versions
        selected_set=lambda d: d["selected_set"].map(
            lambda s: sorted(s) if isinstance(s, set) else []
        ),
        # Sized from the merged frame itself, not from an outer variable.
        canon_first5=lambda d: [CANON_FIRST5_LIST] * len(d),
    )[
        ["base_model",
         "total_count",
         "n_selected_unique", "selected_set",
         "top5_selected", "jaccard_top5_vs_first5", "canon_first5"]
    ]
    .sort_values("base_model")
)

jaccard_out_base.to_csv("ordered_list/orderedlist_jaccard_compressed.csv", index=False)

print(" - ordered_list/orderedlist_jaccard_compressed.csv")

# ---------------- Wide per-base tables: counts and rates ----------------
# All barrier ids present in the data, ascending (fixes a consistent column order).
barrier_ids_sorted = sorted(int(x) for x in df["barrier_id"].dropna().unique())

# counts_base (count per base_model/barrier_id) and totals_base (total per
# base_model) are built earlier in this cell from the same df and are reused
# here instead of being recomputed.

# ---- COUNTS wide (one row per base, one column per barrier) ----
ordered_cols_counts = ["base_model"] + barrier_ids_sorted
counts_base_wide = (
    counts_base.pivot_table(
        index=GROUP_BASE, columns="barrier_id", values="count",
        aggfunc="sum",      # counts are additive; the default aggregator is the mean
        fill_value=0,
    )
    .astype("int64")        # keep counts integral regardless of how the pivot filled gaps
    .rename_axis(None, axis=1)
    .reset_index()
    # Barrier columns in numeric order (raw int labels); a barrier absent
    # from the pivot is added as an all-zero column.
    .reindex(columns=ordered_cols_counts, fill_value=0)
    # Append total_count
    .merge(totals_base, on="base_model", how="left")
)

# ---- RATES wide (counts / total_count per base) ----
rate_base = counts_base.merge(totals_base, on="base_model", how="left")
# A zero total would give NaN instead of a division error.
rate_base["rate"] = rate_base["count"] / rate_base["total_count"].replace(0, np.nan)

ordered_cols_rates = ["base_model"] + barrier_ids_sorted
rates_base_wide = (
    rate_base.pivot_table(
        index=GROUP_BASE, columns="barrier_id", values="rate",
        aggfunc="sum",      # each (base_model, barrier_id) pair has exactly one row
        fill_value=0.0,
    )
    .rename_axis(None, axis=1)
    .reset_index()
    # Same barrier column order as the counts table.
    .reindex(columns=ordered_cols_rates, fill_value=0.0)
    # Append total_count for convenience/verification
    .merge(totals_base, on="base_model", how="left")
)

# ---- Write CSVs ----
counts_base_wide.to_csv("ordered_list/orderedlist_selection_counts_compressed.csv", index=False)
rates_base_wide.to_csv("ordered_list/orderedlist_selection_rates_compressed.csv", index=False)

print(" - ordered_list/orderedlist_selection_counts_compressed.csv")
print(" - ordered_list/orderedlist_selection_rates_compressed.csv")


 - ordered_list/orderedlist_jaccard_compressed.csv
 - ordered_list/orderedlist_selection_counts_compressed.csv
 - ordered_list/orderedlist_selection_rates_compressed.csv


In [48]:
import pandas as pd
import numpy as np
# Load the unnumbered-list selections that this cell aggregates per combo.
df = pd.read_csv("unnumbered_list/unnumbered_barriers_max5_clean.csv")

# ---------------- Canonical barrier list (source of truth) ----------------
BARRIERS = {
    1: "power losses, quality and safety issues",
    2: "reduced reliability in DC devices",
    3: "lack of use-cases in which DC is advantageous",
    4: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    5: "lack of pilot projects",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "incompatibility of DC systems components",
    8: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    9: "lack of enough trained personnel in DC systems",
    10: "uncertain regulatory roadmap",
    11: "high costs of DC solutions",
}

# Canonical "first five" barrier ids, taken from the dict's insertion order.
# Defined exactly once: the list keeps the order for reporting, the set is
# the form used for set arithmetic.
CANON_FIRST5_LIST = list(BARRIERS.keys())[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

# ---------------- Light typing / cleaning ----------------
# Required columns: base_model, variant_id, model, barrier_id.
# Any other column (bias_type, iteration, timestamp, ...) is left untouched.
# NOTE: astype(str) turns a missing identifier into the literal string "nan".
for _col in ("base_model", "variant_id", "model"):
    df[_col] = df[_col].astype(str).str.strip()
# Values that cannot be parsed as numbers become <NA> (nullable Int64).
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- 1) Selection counts & rates per combo ----------------
GROUP = ["base_model", "variant_id", "model"]

# Rows per (combo, barrier_id); rows without a barrier id are ignored.
counts = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP + ["barrier_id"], dropna=False)
      .size()
      .reset_index(name="count")
)

# Total selections per combo.
totals = counts.groupby(GROUP, dropna=False)["count"].sum().reset_index(name="total_count")

# Share of a combo's selections that went to each barrier.
counts = counts.merge(totals, on=GROUP, how="left")
counts["rate"] = counts["count"] / counts["total_count"].replace(0, np.nan)

# Wide tables: one column per barrier_id (counts and rates), with
# total_count appended to both for convenience.
counts_wide = (
    counts.pivot_table(
        index=GROUP, columns="barrier_id", values="count",
        aggfunc="sum",      # counts are additive; the default aggregator is the mean
        fill_value=0,
    )
    .astype("int64")        # keep counts integral regardless of how the pivot filled gaps
    .rename_axis(None, axis=1)
    .reset_index()
    .merge(totals, on=GROUP, how="left")
)
rates_wide = (
    counts.pivot_table(
        index=GROUP, columns="barrier_id", values="rate",
        aggfunc="sum",      # each (combo, barrier_id) pair has exactly one row
        fill_value=0.0,
    )
    .rename_axis(None, axis=1)
    .reset_index()
    .merge(totals, on=GROUP, how="left")
)

# Save selection tables
counts_wide.to_csv("unnumbered_list/unnumberedlist_selection_counts_by_combo_wide.csv", index=False)
rates_wide.to_csv("unnumbered_list/unnumberedlist_selection_rates_by_combo_wide.csv", index=False)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0.0, 1.0]. Two empty sets are treated as identical and
        yield 1.0 (this also avoids a division by zero).
    """
    union = a | b
    if not union:
        # Both inputs are empty.
        return 1.0
    return len(a & b) / len(union)

# --- build top-5 per GROUP: both set and ordered list ---
# Sort by GROUP (asc), then count (desc), then barrier_id (asc). The final
# key is a tie-breaker: without it, barriers with equal counts would enter
# the top-5 in an unspecified order and the Jaccard score could change
# between runs.
sorted_counts = counts.sort_values(
    GROUP + ["count", "barrier_id"],
    ascending=[True] * len(GROUP) + [False, True],
)

# take top-5 rows per combo
top5_rows = sorted_counts.groupby(GROUP, dropna=False).head(5)

# ordered list (by count desc) and set for Jaccard
top5_ordered = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: [int(x) for x in s.tolist()])           # keep order
             .reset_index(name="top5_selected")                        # ordered list
)

top5_sets = (
    top5_rows.groupby(GROUP, dropna=False)["barrier_id"]
             .apply(lambda s: set(int(x) for x in s.dropna().tolist()))
             .reset_index(name="top5_set")                             # set for Jaccard
)

top5 = top5_ordered.merge(top5_sets, on=GROUP, how="left")
top5["jaccard_top5_vs_first5"] = top5["top5_set"].map(lambda S: jaccard(S, CANON_FIRST5))

# Every distinct barrier a combo ever selected (reported for context; it is
# not an input to the Jaccard score).
unique_sets = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP, dropna=False)["barrier_id"]
      .apply(lambda s: set(int(x) for x in s.dropna().tolist()))
      .reset_index(name="selected_set")
)
unique_sets["n_selected_unique"] = unique_sets["selected_set"].map(len)

# --- assemble output with easy-to-audit columns ---
jaccard_out = (
    totals
    .merge(unique_sets, on=GROUP, how="left")
    .merge(top5,        on=GROUP, how="left")
    .assign(
        # readable versions
        selected_set=lambda d: d["selected_set"].map(lambda s: sorted(list(s)) if isinstance(s, set) else []),
        canon_first5=lambda d: [CANON_FIRST5_LIST]*len(d)
    )[
        GROUP
        + ["total_count",
           "n_selected_unique", "selected_set",
           "top5_selected", "jaccard_top5_vs_first5", "canon_first5"]
    ]
    .sort_values(GROUP)
)

jaccard_out.to_csv("unnumbered_list/unnumberedlist_jaccard_vs_canonical_first5_by_combo.csv", index=False)


print("✅ Wrote:")
print(" - unnumbered_list/unnumberedlist_selection_counts_by_combo_wide.csv")
print(" - unnumbered_list/unnumberedlist_selection_rates_by_combo_wide.csv")
print(" - unnumbered_list/unnumberedlist_jaccard_vs_canonical_first5_by_combo.csv")

✅ Wrote:
 - unnumbered_list/unnumberedlist_selection_counts_by_combo_wide.csv
 - unnumbered_list/unnumberedlist_selection_rates_by_combo_wide.csv
 - unnumbered_list/unnumberedlist_jaccard_vs_canonical_first5_by_combo.csv


In [50]:
#COMPRESSED FILES UNNUMBERED BARRIER LIST
import pandas as pd

import pandas as pd
import numpy as np
# Load the unnumbered-list selections that the rest of this cell aggregates.
df = pd.read_csv("unnumbered_list/unnumbered_barriers_max5_clean.csv")

# ---------------- Canonical barrier list (source of truth) ----------------
BARRIERS = {
    1: "power losses, quality and safety issues",
    2: "reduced reliability in DC devices",
    3: "lack of use-cases in which DC is advantageous",
    4: "uncertain utility interaction (net metering, utility ownership, and agreed standards)",
    5: "lack of pilot projects",
    6: "public perception of DC and readiness to 'champion' installations from DC projects",
    7: "incompatibility of DC systems components",
    8: "misconception and lack of knowledge leads to lengthy/expensive design and permit process",
    9: "lack of enough trained personnel in DC systems",
    10: "uncertain regulatory roadmap",
    11: "high costs of DC solutions",
}

# Canonical "first five" barrier ids, taken from the dict's insertion order.
# Defined exactly once: the list keeps the order for reporting, the set is
# the form used for set arithmetic.
CANON_FIRST5_LIST = list(BARRIERS.keys())[:5]
CANON_FIRST5 = set(CANON_FIRST5_LIST)

# ---------------- Light typing / cleaning ----------------
# Required columns: base_model, variant_id, model, barrier_id.
# Any other column (bias_type, iteration, timestamp, ...) is left untouched.
# NOTE: astype(str) turns a missing identifier into the literal string "nan".
for _col in ("base_model", "variant_id", "model"):
    df[_col] = df[_col].astype(str).str.strip()
# Values that cannot be parsed as numbers become <NA> (nullable Int64).
df["barrier_id"] = pd.to_numeric(df["barrier_id"], errors="coerce").astype("Int64")

# ---------------- Grouping keys ----------------
GROUP = ["base_model", "variant_id", "model"]  # per-combo key (same definition as the per-combo cells)
GROUP_BASE = ["base_model"]                    # compressed key: one row per base model

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0.0, 1.0]. Two empty sets are treated as identical and
        yield 1.0 (this also avoids a division by zero).
    """
    union = a | b
    if not union:
        # Both inputs are empty.
        return 1.0
    return len(a & b) / len(union)

# ---------------- Base-level selection counts ----------------
# Rows per (base_model, barrier_id), pooling every variant of a base model.
counts_base = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP_BASE + ["barrier_id"], dropna=False)
      .size()
      .reset_index(name="count")
)

# Total number of selections per base model.
totals_base = (
    counts_base.groupby(GROUP_BASE, dropna=False)["count"]
               .sum()
               .reset_index(name="total_count")
)

# ---------------- Top-5 barriers per base model ----------------
# Order by count (descending); barrier_id (ascending) breaks ties so the
# top-5 is deterministic.
sorted_counts_base = counts_base.sort_values(
    GROUP_BASE + ["count", "barrier_id"],
    ascending=[True] * len(GROUP_BASE) + [False, True],
)

top5_rows_base = sorted_counts_base.groupby(GROUP_BASE, dropna=False).head(5)

top5_ordered_base = (
    top5_rows_base.groupby(GROUP_BASE, dropna=False)["barrier_id"]
                  .apply(lambda s: [int(x) for x in s.tolist()])   # keep order
                  .reset_index(name="top5_selected")
)

# Union of all selected barriers across variants for each base.
# Missing barrier ids are already removed by dropna(subset=...) above.
unique_sets_base = (
    df.dropna(subset=["barrier_id"])
      .groupby(GROUP_BASE, dropna=False)["barrier_id"]
      .apply(lambda s: {int(x) for x in s.tolist()})
      .reset_index(name="selected_set")
)
unique_sets_base["n_selected_unique"] = unique_sets_base["selected_set"].map(len)

# ---------------- Assemble base-level output ----------------
jaccard_out_base = (
    totals_base
    .merge(unique_sets_base, on=GROUP_BASE, how="left")
    .merge(top5_ordered_base, on=GROUP_BASE, how="left")
    .assign(
        # A left merge marks a missing top-5 with NaN, which is truthy, so an
        # `lst or []` guard would not catch it; test the type explicitly.
        jaccard_top5_vs_first5=lambda d: d["top5_selected"].map(
            lambda lst: jaccard(
                set(lst) if isinstance(lst, list) else set(),
                set(CANON_FIRST5_LIST),
            )
        ),
        # readable versions
        selected_set=lambda d: d["selected_set"].map(
            lambda s: sorted(s) if isinstance(s, set) else []
        ),
        # Sized from the merged frame itself, not from an outer variable.
        canon_first5=lambda d: [CANON_FIRST5_LIST] * len(d),
    )[
        ["base_model",
         "total_count",
         "n_selected_unique", "selected_set",
         "top5_selected", "jaccard_top5_vs_first5", "canon_first5"]
    ]
    .sort_values("base_model")
)

jaccard_out_base.to_csv("unnumbered_list/unnumberedlist_jaccard_compressed.csv", index=False)

print(" - unnumbered_list/unnumberedlist_jaccard_compressed.csv")

# ---------------- Wide per-base tables: counts and rates ----------------
# All barrier ids present in the data, ascending (fixes a consistent column order).
barrier_ids_sorted = sorted(int(x) for x in df["barrier_id"].dropna().unique())

# counts_base (count per base_model/barrier_id) and totals_base (total per
# base_model) are built earlier in this cell from the same df and are reused
# here instead of being recomputed.

# ---- COUNTS wide (one row per base, one column per barrier) ----
ordered_cols_counts = ["base_model"] + barrier_ids_sorted
counts_base_wide = (
    counts_base.pivot_table(
        index=GROUP_BASE, columns="barrier_id", values="count",
        aggfunc="sum",      # counts are additive; the default aggregator is the mean
        fill_value=0,
    )
    .astype("int64")        # keep counts integral regardless of how the pivot filled gaps
    .rename_axis(None, axis=1)
    .reset_index()
    # Barrier columns in numeric order (raw int labels); a barrier absent
    # from the pivot is added as an all-zero column.
    .reindex(columns=ordered_cols_counts, fill_value=0)
    # Append total_count
    .merge(totals_base, on="base_model", how="left")
)

# ---- RATES wide (counts / total_count per base) ----
rate_base = counts_base.merge(totals_base, on="base_model", how="left")
# A zero total would give NaN instead of a division error.
rate_base["rate"] = rate_base["count"] / rate_base["total_count"].replace(0, np.nan)

ordered_cols_rates = ["base_model"] + barrier_ids_sorted
rates_base_wide = (
    rate_base.pivot_table(
        index=GROUP_BASE, columns="barrier_id", values="rate",
        aggfunc="sum",      # each (base_model, barrier_id) pair has exactly one row
        fill_value=0.0,
    )
    .rename_axis(None, axis=1)
    .reset_index()
    # Same barrier column order as the counts table.
    .reindex(columns=ordered_cols_rates, fill_value=0.0)
    # Append total_count for convenience/verification
    .merge(totals_base, on="base_model", how="left")
)

# ---- Write CSVs ----
counts_base_wide.to_csv("unnumbered_list/unnumberedlist_selection_counts_compressed.csv", index=False)
rates_base_wide.to_csv("unnumbered_list/unnumberedlist_selection_rates_compressed.csv", index=False)

print(" - unnumbered_list/unnumberedlist_selection_counts_compressed.csv")
print(" - unnumbered_list/unnumberedlist_selection_rates_compressed.csv")

 - unnumbered_list/unnumberedlist_jaccard_compressed.csv
 - unnumbered_list/unnumberedlist_selection_counts_compressed.csv
 - unnumbered_list/unnumberedlist_selection_rates_compressed.csv
